实战项目二:自动摘要生成器

目录

项目概述

自动摘要生成是自然语言处理中的重要任务,旨在从长文本中提取关键信息并生成简洁准确的摘要。本项目将实现多种摘要技术,包括经典的抽取式方法和先进的生成式方法,并构建完整的生产级服务。

项目目标

def project_objectives():
    """Print the goals of the automatic summarization project.

    The goals are grouped into three categories (technical, performance and
    business). A title line is printed first; every category then follows as
    a blank line, the category heading, and one check-marked line per goal.

    Returns:
        None. The function only writes to standard output.
    """
    goal_groups = (
        (
            "技术目标",
            (
                "实现抽取式摘要(TextRank、BERT-based)",
                "实现生成式摘要(T5、BART、GPT)",
                "建立ROUGE评估体系",
                "构建FastAPI服务接口",
            ),
        ),
        (
            "性能目标",
            (
                "ROUGE-1分数 > 0.40",
                "ROUGE-2分数 > 0.15",
                "摘要长度控制在原文10-20%",
                "单文档处理时间 < 2秒",
            ),
        ),
        (
            "业务目标",
            (
                "支持多种摘要风格(简洁、详细、技术)",
                "处理不同长度文本(100-5000字)",
                "提供高质量摘要结果",
                "支持批量处理需求",
            ),
        ),
    )

    print("项目目标:")
    for heading, goals in goal_groups:
        # The leading newline separates consecutive categories.
        print("\n" + heading + ":")
        print("\n".join("  ✓ " + goal for goal in goals))

project_objectives()

摘要任务定义

def summary_task_definition():
    """Print the definition of the two families of summarization tasks.

    For extractive and abstractive summarization the function prints a
    heading followed by three labelled lines: definition, characteristics
    and typical use cases.

    Returns:
        None. The function only writes to standard output.
    """
    field_labels = ("定义", "特点", "适用")
    task_types = (
        (
            "抽取式摘要 (Extractive Summary)",
            (
                "从原文中直接选择重要的句子组成摘要",
                "保留原文表达,不产生新词汇",
                "新闻、学术论文、技术文档",
            ),
        ),
        (
            "生成式摘要 (Abstractive Summary)",
            (
                "理解原文内容后重新组织语言生成摘要",
                "可改写、压缩、生成新表达",
                "多样化文本、创意内容",
            ),
        ),
    )

    print("摘要任务定义:")
    for title, field_values in task_types:
        print("\n" + title + ":")
        for label, value in zip(field_labels, field_values):
            print("  " + label + ": " + value)

summary_task_definition()

摘要技术分类

抽取式vs生成式对比

def extraction_vs_generation():
    """Print a comparison of extractive and abstractive summarization.

    The output is a three-column table (dimension, extractive, abstractive)
    with a header row and a dashed separator, followed by a short list of
    recommendations on which approach to pick for a given scenario.

    Returns:
        None. The function only writes to standard output.
    """

    def render_row(dimension, extractive, abstractive):
        # Left-justify each cell to the fixed column widths 12 / 20 / 20.
        cells = (dimension.ljust(12), extractive.ljust(20), abstractive.ljust(20))
        return " ".join(cells)

    comparison_rows = (
        ("词汇保留", "保留原文词汇", "可生成新词汇"),
        ("语法正确性", "保证语法正确", "可能存在语法错误"),
        ("内容忠实性", "高度忠实原文", "可能存在偏差"),
        ("处理速度", "相对较快", "相对较慢"),
        ("模型复杂度", "较低", "较高"),
    )

    print("抽取式vs生成式摘要对比:")
    print(render_row("对比维度", "抽取式", "生成式"))
    print("-" * 55)
    for row in comparison_rows:
        print(render_row(*row))

    print("\n2026年应用场景选择:")
    recommendations = (
        "短文本(<500字): 抽取式(TextRank/BERT) - 快速、成本低",
        "中等文本(500-2000字): BERT抽取式或T5生成式",
        "长文本(>2000字): 分段处理+GPT生成式",
        "实时需求: 抽取式优先",
        "质量要求高: 生成式优先",
    )
    print("\n".join("  ✓ " + entry for entry in recommendations))

extraction_vs_generation()

摘要技术发展脉络

def summary_technology_timeline():
    """Print the historical milestones of summarization technology.

    Each milestone is printed on one line in the form
    ``<year>: <event> - <significance>``, in chronological order.

    Returns:
        None. The function only writes to standard output.
    """
    milestones = (
        ("1958", "Luhn提出基于词频的自动摘要概念", "开创性工作"),
        ("2004", "TextRank算法提出", "图算法在摘要中的应用"),
        ("2017", "Transformer架构提出", "奠定现代摘要基础"),
        ("2018", "BERT模型发布", "预训练模型在摘要中的应用"),
        ("2019", "T5模型发布", "统一文本生成框架"),
        ("2020", "BART模型发布", "双向去噪自编码器"),
        ("2022-至今", "大语言模型(GPT系列)在摘要中的应用", "生成式摘要质量显著提升"),
    )

    print("摘要技术发展历程:")
    for year, event, significance in milestones:
        print("  " + year + ": " + event + " - " + significance)

summary_technology_timeline()

抽取式摘要实现

TextRank算法详解

def textrank_algorithm():
    """Print an explanation of TextRank and a reference implementation.

    The function prints a title, a four-step description of the algorithm,
    and then the source code of a ``TextRankSummarizer`` class as text. The
    embedded code is only printed, never executed here.

    The embedded source is delimited with ''' because it contains
    triple-double-quoted docstrings of its own, which would otherwise
    terminate the outer literal early.

    Returns:
        None. The function only writes to standard output.
    """
    print("TextRank算法详解:")

    algorithm_principle = """
    TextRank算法原理:

    1. 句子表示
       - 将文本分割为句子集合
       - 每个句子作为一个节点

    2. 相似度计算
       - 使用TF-IDF或其他方法计算句子间相似度
       - 构建相似度矩阵

    3. PageRank迭代
       - 使用类似PageRank的公式迭代计算节点重要性
       - 直到收敛或达到最大迭代次数

    4. 摘要生成
       - 根据节点得分排序选择Top-K句子
       - 按原文顺序输出摘要
    """

    print(algorithm_principle)

    # Reference implementation shown to the reader. Compared with a naive
    # version it row-normalizes the similarity matrix (so the iteration
    # follows the TextRank formula and cannot diverge) and tokenizes each
    # sentence exactly once.
    textrank_implementation = '''
    import numpy as np
    import jieba
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    class TextRankSummarizer:
        def __init__(self, top_k=3, max_iter=50, damping_factor=0.85, min_diff=1e-5):
            self.top_k = top_k
            self.max_iter = max_iter
            self.damping_factor = damping_factor
            self.min_diff = min_diff

        def preprocess_text(self, text):
            """
            文本预处理:分句、过滤
            """
            # 按句号、问号、感叹号分句
            sentences = []
            for sent in text.replace('!', '。').replace('?', '。').split('。'):
                sent = sent.strip()
                if len(sent) > 10:  # 过滤过短句子
                    sentences.append(sent)
            return sentences

        def calculate_similarity(self, sentences):
            """
            计算句子间的相似度矩阵
            """
            # TF-IDF向量化: 直接用jieba作为分词器,每个句子只分词一次
            # (token_pattern=None 表示不使用默认的正则分词)
            vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern=None)
            tfidf_matrix = vectorizer.fit_transform(sentences)

            # 计算余弦相似度
            similarity_matrix = cosine_similarity(tfidf_matrix)

            # 自己与自己的相似度设为0
            np.fill_diagonal(similarity_matrix, 0)

            return similarity_matrix

        def textrank(self, similarity_matrix):
            """
            TextRank核心算法
            """
            n = len(similarity_matrix)

            # 按行归一化: 每个句子分配给其他句子的权重之和为1,
            # 否则得分会随迭代不断放大而无法收敛
            row_sums = similarity_matrix.sum(axis=1, keepdims=True)
            row_sums[row_sums == 0] = 1.0  # 与其他句子都不相似的孤立句子,避免除以0
            transition = similarity_matrix / row_sums

            scores = np.ones(n) / n  # 初始化得分

            for iteration in range(self.max_iter):
                new_scores = (1 - self.damping_factor) / n
                new_scores = new_scores + self.damping_factor * transition.T.dot(scores)

                # 检查收敛
                converged = np.abs(new_scores - scores).sum() <= self.min_diff
                scores = new_scores
                if converged:
                    break

            return scores

        def summarize(self, text):
            """
            生成摘要
            """
            sentences = self.preprocess_text(text)

            if len(sentences) <= self.top_k:
                return '。'.join(sentences)

            similarity_matrix = self.calculate_similarity(sentences)
            scores = self.textrank(similarity_matrix)

            # 获取top-k句子索引
            top_indices = scores.argsort()[-self.top_k:][::-1]
            top_indices.sort()  # 按原文顺序排序

            summary_sentences = [sentences[i] for i in top_indices]
            return '。'.join(summary_sentences)

    # 使用示例
    summarizer = TextRankSummarizer(top_k=3)
    text = "自然语言处理是人工智能的重要分支。它研究计算机与人类语言的交互。自然语言处理包括语音识别、文本分类、机器翻译等任务。近年来,大语言模型取得了突破性进展。BERT和GPT是代表性模型。ChatGPT在对话系统中表现出色。"
    summary = summarizer.summarize(text)
    print(f"摘要: {summary}")
    '''

    print("TextRank实现代码:")
    print(textrank_implementation)

textrank_algorithm()

BERT抽取式摘要

def bert_extractive_summary():
    """Print a reference implementation of BERT-based extractive summarization.

    The function prints a title and then the source code of a
    ``BertExtractiveSummarizer`` class as text. The embedded code is only
    printed, never executed here.

    The embedded source is delimited with ''' because it contains
    triple-double-quoted docstrings of its own, which would otherwise
    terminate the outer literal early.

    Returns:
        None. The function only writes to standard output.
    """
    print("BERT抽取式摘要:")

    # Reference implementation shown to the reader. ``summarize`` passes the
    # sentence embeddings it already computed into
    # ``calculate_sentence_scores`` so the model encodes each sentence once.
    bert_implementation = '''
    import torch
    from transformers import AutoTokenizer, AutoModel
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    class BertExtractiveSummarizer:
        def __init__(self, model_name='bert-base-chinese', top_k=3):
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModel.from_pretrained(model_name)
            self.top_k = top_k
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            self.model.to(self.device)
            self.model.eval()

        def encode_sentences(self, sentences):
            """
            使用BERT编码句子
            """
            encoded_sentences = []

            for sentence in sentences:
                inputs = self.tokenizer(
                    sentence,
                    return_tensors='pt',
                    max_length=512,
                    truncation=True,
                    padding=True
                ).to(self.device)

                with torch.no_grad():
                    outputs = self.model(**inputs)
                    # 使用[CLS]标记的表示
                    sentence_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                    encoded_sentences.append(sentence_embedding.flatten())

            return np.array(encoded_sentences)

        def calculate_sentence_scores(self, sentences, document_embedding, sentence_embeddings=None):
            """
            计算句子重要性得分

            sentence_embeddings: 可选,已经计算好的句子向量;传入后不再重复编码
            """
            if sentence_embeddings is None:
                sentence_embeddings = self.encode_sentences(sentences)

            # 计算句子与文档的相似度
            similarities = cosine_similarity(sentence_embeddings, document_embedding.reshape(1, -1)).flatten()

            return similarities, sentence_embeddings

        def summarize(self, text):
            """
            生成抽取式摘要
            """
            # 分句
            sentences = []
            for sent in text.replace('!', '。').replace('?', '。').split('。'):
                sent = sent.strip()
                if len(sent) > 10:
                    sentences.append(sent)

            if len(sentences) <= self.top_k:
                return '。'.join(sentences)

            # 计算文档向量(句子向量的平均)
            sentence_embeddings = self.encode_sentences(sentences)
            document_embedding = np.mean(sentence_embeddings, axis=0)

            # 计算句子得分(复用上面的句子向量,避免BERT重复前向计算)
            scores, _ = self.calculate_sentence_scores(
                sentences, document_embedding, sentence_embeddings
            )

            # 选择top-k句子
            top_indices = scores.argsort()[-self.top_k:][::-1]
            top_indices.sort()  # 按原文顺序排序

            summary_sentences = [sentences[i] for i in top_indices]
            return '。'.join(summary_sentences)

    # 使用示例(text为待摘要的中文文本)
    summarizer = BertExtractiveSummarizer(top_k=3)
    summary = summarizer.summarize(text)
    print(f"BERT抽取式摘要: {summary}")
    '''

    print("BERT抽取式摘要实现代码:")
    print(bert_implementation)

bert_extractive_summary()

其他抽取式方法

def other_extractive_methods():
    """
    其他抽取式摘要方法
    """
    print("其他抽取式摘要方法:")
    
    methods = [
        {
            "方法": "TF-IDF",
            "原理": "基于词频-逆文档频率选择关键词句",
            "优点": "简单高效,易于理解",
            "缺点": "忽略语义关系"
        },
        {
            "方法": "LexRank",
            "原理": "基于词汇重叠的图排序算法",
            "优点": "考虑句子间词汇重叠",
            "缺点": "对长文档效果有限"
        },
        {
            "方法": "SummaRuNNer",
            "原理": "使用双向RNN判断句子重要性",
            "优点": "考虑上下文信息",
            "缺点": "需要训练数据"
        },
        {
            "方法": "BERTScore",
            "原理": "使用BERT计算句子间语义相似度",
            "优点": "语义理解能力强",
            "缺点": "计算复杂度高"
        }
    ]
    
    for method in methods:
        print(f"\n{method['方法']}:")
        print(f"  原理: {method['原理']}")
        print(f"  优点: {method['优点']}")
        print(f"  缺点: {method['缺点']}")

other_extractive_methods()

生成式摘要实现

T5模型摘要

def t5_summarization():
    """Print a reference implementation of abstractive summarization with T5.

    The function prints a title and then the source code of a
    ``T5Summarizer`` class as text. The embedded code is only printed, never
    executed here.

    The embedded source is delimited with ''' because it contains
    triple-double-quoted docstrings of its own, which would otherwise
    terminate the outer literal early.

    Returns:
        None. The function only writes to standard output.
    """
    print("T5生成式摘要:")

    # Reference implementation shown to the reader. Input length is limited
    # by the tokenizer (in tokens) only; the text is not additionally sliced
    # by characters, which would discard far more input than necessary.
    t5_implementation = '''
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    import torch

    class T5Summarizer:
        def __init__(self, model_name='t5-base', max_input_length=512, max_output_length=150):
            self.tokenizer = T5Tokenizer.from_pretrained(model_name)
            self.model = T5ForConditionalGeneration.from_pretrained(model_name)
            self.max_input_length = max_input_length
            self.max_output_length = max_output_length
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            self.model.to(self.device)
            self.model.eval()

        def summarize(self, text, min_length=50, style='default'):
            """
            使用T5生成摘要
            """
            # T5通过任务前缀区分任务,摘要任务统一使用"summarize: "
            # (style参数目前不影响前缀,保留它是为了兼容已有调用方)
            input_text = "summarize: " + text

            # 由tokenizer按token数截断到max_input_length,不要先按字符数切片
            inputs = self.tokenizer(
                input_text,
                return_tensors='pt',
                max_length=self.max_input_length,
                truncation=True,
                padding=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_length=self.max_output_length,
                    min_length=min_length,
                    num_beams=4,
                    length_penalty=2.0,
                    early_stopping=True,
                    no_repeat_ngram_size=3,
                    do_sample=False
                )

            summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return summary

    # 使用示例(text为待摘要的文本)
    # 注意: t5-base以英文语料为主,中文摘要请换用中文或多语言检查点
    t5_summarizer = T5Summarizer(model_name='t5-base')
    summary = t5_summarizer.summarize(text)
    print(f"T5摘要: {summary}")
    '''

    print("T5摘要实现代码:")
    print(t5_implementation)

t5_summarization()

BART模型摘要

def bart_summarization():
    """Print a reference implementation of abstractive summarization with BART.

    The function prints a title and then the source code of a
    ``BartSummarizer`` class as text. The embedded code is only printed,
    never executed here.

    The embedded source is delimited with ''' because it contains
    triple-double-quoted docstrings of its own, which would otherwise
    terminate the outer literal early.

    Returns:
        None. The function only writes to standard output.
    """
    print("BART生成式摘要:")

    # Reference implementation shown to the reader.
    bart_implementation = '''
    from transformers import BartTokenizer, BartForConditionalGeneration
    import torch

    class BartSummarizer:
        def __init__(self, model_name='facebook/bart-large-cnn', max_input_length=1024, max_output_length=200):
            self.tokenizer = BartTokenizer.from_pretrained(model_name)
            self.model = BartForConditionalGeneration.from_pretrained(model_name)
            self.max_input_length = max_input_length
            self.max_output_length = max_output_length
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            self.model.to(self.device)
            self.model.eval()

        def summarize(self, text, min_length=50):
            """
            使用BART生成摘要
            """
            inputs = self.tokenizer(
                text,
                return_tensors='pt',
                max_length=self.max_input_length,
                truncation=True,
                padding=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_length=self.max_output_length,
                    min_length=min_length,
                    num_beams=4,
                    early_stopping=True,
                    do_sample=False
                )

            summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return summary

    # 使用示例(text为待摘要的文本)
    # 注意: facebook/bart-large-cnn是在英文新闻数据上微调的模型,中文摘要请换用中文检查点
    bart_summarizer = BartSummarizer()
    summary = bart_summarizer.summarize(text)
    print(f"BART摘要: {summary}")
    '''

    print("BART摘要实现代码:")
    print(bart_implementation)

bart_summarization()

GPT系列摘要

def gpt_summarization():
    """Print a reference implementation of summarization via the OpenAI API.

    The function prints a title and then the source code of a
    ``GPTSummarizer`` class as text. The embedded code is only printed,
    never executed here.

    The embedded source is delimited with ''' because it contains a
    triple-double-quoted docstring and a triple-double-quoted f-string
    prompt, either of which would otherwise terminate the outer literal
    early. The prompt lines start at column 0 on purpose: they are the
    content of the inner f-string.

    Returns:
        None. The function only writes to standard output.
    """
    print("GPT生成式摘要:")

    # Reference implementation shown to the reader.
    gpt_implementation = '''
    from openai import OpenAI

    class GPTSummarizer:
        def __init__(self, api_key, model="gpt-4o-mini"):
            self.client = OpenAI(api_key=api_key)
            self.model = model

        def summarize(self, text, style="简洁", max_tokens=200):
            """
            使用GPT生成摘要
            """
            prompt = f"""请为以下文章生成一个{style}摘要:

{text}

要求:
1. 保留核心信息和关键数据
2. 语言简洁流畅
3. 字数控制在合理范围内
4. 直接输出摘要,不要解释"""

            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": "你是一个专业的文本摘要助手,能够生成简洁准确的摘要。"},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=max_tokens,
                    temperature=0.3
                )

                return response.choices[0].message.content

            except Exception as e:
                print(f"GPT摘要生成失败: {e}")
                return "摘要生成失败"

        def batch_summarize(self, texts, style="简洁"):
            """
            批量摘要生成
            """
            summaries = []
            for text in texts:
                summary = self.summarize(text, style)
                summaries.append(summary)
            return summaries

    # 使用示例
    # gpt_summarizer = GPTSummarizer(api_key="your-api-key")
    # summary = gpt_summarizer.summarize(text, style="详细")
    # print(f"GPT摘要: {summary}")
    '''

    print("GPT摘要实现代码:")
    print(gpt_implementation)

gpt_summarization()

评估指标ROUGE

ROUGE指标详解

def rouge_metrics():
    """Print an explanation of the ROUGE metrics and a reference implementation.

    The function prints a title, a description of ROUGE-N / ROUGE-L /
    ROUGE-S including the ROUGE-N recall formula, and then the source code
    of ``calculate_rouge_n`` and ``calculate_rouge_l`` as text. The embedded
    code is only printed, never executed here.

    The embedded source is delimited with ''' because it contains
    triple-double-quoted docstrings of its own, which would otherwise
    terminate the outer literal early.

    Returns:
        None. The function only writes to standard output.
    """
    print("ROUGE评估指标:")

    # The numerator of ROUGE-N is the number of n-grams shared by candidate
    # and reference, not the candidate's total n-gram count.
    rouge_explanation = """
    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) 是摘要评估的标准指标:

    1. ROUGE-N (N-gram Recall)
       - ROUGE-1: 单词级别召回率
       - ROUGE-2: 二元组级别召回率
       - ROUGE-N: N元组级别召回率

    2. ROUGE-L (Longest Common Subsequence)
       - 基于最长公共子序列的评估
       - 考虑句子结构和语法

    3. ROUGE-S (Skip-Bigram)
       - 考虑跳跃二元组的评估
       - 考虑词语间的顺序关系

    计算公式:
    ROUGE-N = (候选摘要与参考摘要中重叠的N-gram数量) / (参考摘要中N-gram的总数)
    """

    print(rouge_explanation)

    # Reference implementation shown to the reader. It tokenizes through a
    # shared helper that falls back to characters for text without
    # whitespace (e.g. Chinese), and always returns a dict so callers get
    # one consistent result shape.
    rouge_implementation = '''
    from collections import Counter

    def tokenize(text):
        """
        分词: 文本含空格时按空格切分;否则(如未分词的中文)按字符切分
        """
        text = text.strip()
        tokens = text.split()
        return tokens if len(tokens) > 1 else list(text)

    def calculate_rouge_n(candidate, reference, n=1):
        """
        计算ROUGE-N指标
        """
        def get_ngrams(text, n):
            tokens = tokenize(text)
            return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

        candidate_ngrams = get_ngrams(candidate, n)
        reference_ngrams = get_ngrams(reference, n)

        # 任意一方没有N-gram时统一返回全0的字典,保持返回类型一致
        if not candidate_ngrams or not reference_ngrams:
            return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

        candidate_counter = Counter(candidate_ngrams)
        reference_counter = Counter(reference_ngrams)

        # 计算重叠
        overlap = sum((candidate_counter & reference_counter).values())

        # ROUGE-N召回率
        recall = overlap / len(reference_ngrams)

        # ROUGE-N精确率
        precision = overlap / len(candidate_ngrams)

        # F1分数
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        return {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    def calculate_rouge_l(candidate, reference):
        """
        计算ROUGE-L指标
        """
        def lcs_length(s1, s2):
            m, n = len(s1), len(s2)
            dp = [[0] * (n + 1) for _ in range(m + 1)]

            for i in range(1, m + 1):
                for j in range(1, n + 1):
                    if s1[i-1] == s2[j-1]:
                        dp[i][j] = dp[i-1][j-1] + 1
                    else:
                        dp[i][j] = max(dp[i-1][j], dp[i][j-1])

            return dp[m][n]

        c_tokens = tokenize(candidate)
        r_tokens = tokenize(reference)

        if len(c_tokens) == 0 or len(r_tokens) == 0:
            return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

        lcs_len = lcs_length(c_tokens, r_tokens)

        precision = lcs_len / len(c_tokens)
        recall = lcs_len / len(r_tokens)
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        return {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    # 使用示例
    candidate = "自然语言处理是人工智能的重要分支"
    reference = "自然语言处理是人工智能的一个重要领域"

    rouge1 = calculate_rouge_n(candidate, reference, n=1)
    rouge2 = calculate_rouge_n(candidate, reference, n=2)
    rougel = calculate_rouge_l(candidate, reference)

    print(f"ROUGE-1: {rouge1}")
    print(f"ROUGE-2: {rouge2}")
    print(f"ROUGE-L: {rougel}")
    '''

    print("ROUGE指标实现代码:")
    print(rouge_implementation)

rouge_metrics()

使用py-rouge库

def py_rouge_usage():
    """Print example code that evaluates summaries with the ``rouge`` library.

    The function prints a title and then example source code as text. The
    embedded code is only printed, never executed here.

    The embedded source is delimited with ''' because it contains
    triple-double-quoted docstrings of its own, which would otherwise
    terminate the outer literal early. Inside the snippet the install
    command is a comment (a bare ``pip install`` line is not valid Python)
    and names the ``rouge`` distribution, which is the one the file's own
    requirements list pins and whose API (``Rouge().get_scores(..., avg=True)``)
    the snippet uses.

    Returns:
        None. The function only writes to standard output.
    """
    print("使用py-rouge库:")

    # Example code shown to the reader. Chinese text is segmented with jieba
    # and joined with spaces first, because the library splits tokens on
    # whitespace.
    rouge_library_usage = '''
    # 安装(提供 from rouge import Rouge 接口的PyPI包名是 rouge):
    # pip install rouge jieba

    import jieba
    from rouge import Rouge

    def tokenize_zh(text):
        """
        rouge库按空格切分token,中文需要先分词再用空格连接
        """
        return ' '.join(jieba.cut(text))

    def evaluate_summaries(candidates, references):
        """
        使用rouge库评估摘要质量
        """
        rouge = Rouge()

        hyps = [tokenize_zh(candidate) for candidate in candidates]
        refs = [tokenize_zh(reference) for reference in references]

        # 计算ROUGE分数
        scores = rouge.get_scores(hyps, refs, avg=True)

        # 返回详细的ROUGE分数
        return {
            'rouge-1': scores['rouge-1'],
            'rouge-2': scores['rouge-2'],
            'rouge-l': scores['rouge-l']
        }

    # 使用示例
    candidates = ["这是生成的摘要内容"]
    references = ["这是标准的摘要内容"]

    scores = evaluate_summaries(candidates, references)
    print(f"ROUGE分数: {scores}")

    # 更详细的评估函数
    def detailed_evaluation(generated_summaries, reference_summaries):
        """
        详细评估摘要质量
        """
        rouge = Rouge()

        hyps = [tokenize_zh(summary) for summary in generated_summaries]
        refs = [tokenize_zh(summary) for summary in reference_summaries]

        # 批量评估
        scores = rouge.get_scores(hyps, refs, avg=True)

        # 按类型分别输出
        for rouge_type in ['rouge-1', 'rouge-2', 'rouge-l']:
            print(f"{rouge_type.upper()}:")
            print(f"  Precision: {scores[rouge_type]['p']:.4f}")
            print(f"  Recall: {scores[rouge_type]['r']:.4f}")
            print(f"  F1: {scores[rouge_type]['f']:.4f}")

        return scores
    '''

    print("py-rouge库使用代码:")
    print(rouge_library_usage)

py_rouge_usage()

模型融合策略

混合摘要策略

def hybrid_summary_strategy():
    """Print an overview of hybrid summarization strategies and example code.

    The function prints a title, four strategies (each with description,
    advantage and use case), and then the source code of a
    ``HybridSummarizer`` class as text. The embedded code is only printed,
    never executed here.

    The embedded source is delimited with ''' because it contains
    triple-double-quoted docstrings of its own, which would otherwise
    terminate the outer literal early.

    Returns:
        None. The function only writes to standard output.
    """
    print("混合摘要策略:")

    hybrid_approaches = [
        {
            "策略": "抽取+生成",
            "描述": "先用抽取式选择关键句子,再用生成式润色",
            "优势": "结合两种方法的优势",
            "适用": "高质量摘要需求"
        },
        {
            "策略": "多模型投票",
            "描述": "多个模型生成摘要后投票选择",
            "优势": "提高稳定性",
            "适用": "可靠性要求高的场景"
        },
        {
            "策略": "层次化摘要",
            "描述": "先粗粒度摘要再细粒度摘要",
            "优势": "处理长文档效果好",
            "适用": "长文本摘要"
        },
        {
            "策略": "自适应选择",
            "描述": "根据文本特征自动选择摘要方法",
            "优势": "灵活性强",
            "适用": "多样化文本处理"
        }
    ]

    for approach in hybrid_approaches:
        print(f"\n{approach['策略']}:")
        print(f"  描述: {approach['描述']}")
        print(f"  优势: {approach['优势']}")
        print(f"  适用: {approach['适用']}")

    # Example code shown to the reader. The snippet refers to the
    # TextRankSummarizer and T5Summarizer classes without defining them.
    hybrid_implementation = '''
    class HybridSummarizer:
        def __init__(self):
            self.extractive_summarizer = TextRankSummarizer(top_k=5)
            self.abstractive_summarizer = T5Summarizer()

        def hybrid_summarize(self, text, text_length_threshold=1000):
            """
            混合摘要生成策略
            """
            if len(text) < text_length_threshold:
                # 短文本使用抽取式
                return self.extractive_summarizer.summarize(text)
            else:
                # 长文本先抽取关键句,再生成摘要
                extracted = self.extractive_summarizer.summarize(text)
                generated = self.abstractive_summarizer.summarize(extracted)
                return generated

        def adaptive_summarize(self, text, style='balanced'):
            """
            自适应摘要生成
            """
            # 根据style选择策略
            if style == 'fast':
                # 快速模式:使用抽取式
                return self.extractive_summarizer.summarize(text)
            elif style == 'quality':
                # 质量模式:使用生成式
                return self.abstractive_summarizer.summarize(text)
            else:
                # 平衡模式:混合策略
                return self.hybrid_summarize(text)
    '''

    print("\n混合摘要实现代码:")
    print(hybrid_implementation)

hybrid_summary_strategy()

API服务部署

FastAPI摘要服务

def fastapi_summary_service():
    """Print example code for a FastAPI summarization service.

    The function prints a title and then the source code of a FastAPI
    application (request/response models, a ``/summarize`` endpoint, a
    health check and a method listing) as text. The embedded code is only
    printed, never executed here.

    The embedded source is delimited with ''' because it contains
    triple-double-quoted docstrings of its own, which would otherwise
    terminate the outer literal early.

    Returns:
        None. The function only writes to standard output.
    """
    print("FastAPI摘要服务:")

    # Example code shown to the reader. Relative to a naive version it
    # imports os, reads the API key from the environment, re-raises
    # HTTPException so 4xx responses are not rewritten to 500, reports the
    # "GPT not configured" case before the generic lookup, uses a plain
    # ``def`` endpoint for the blocking model call, and measures length in
    # characters.
    fastapi_implementation = '''
    import os
    import time
    import logging
    from typing import Optional, Literal

    from fastapi import FastAPI, HTTPException
    from pydantic import BaseModel

    app = FastAPI(
        title="自动摘要生成API",
        description="基于多种模型的自动摘要生成服务",
        version="1.0.0"
    )

    # 请求模型
    class SummarizeRequest(BaseModel):
        text: str
        method: Literal['textrank', 'bert', 't5', 'bart', 'gpt'] = 'textrank'
        max_length: Optional[int] = 150
        min_length: Optional[int] = 50
        top_k: Optional[int] = 3
        style: Optional[str] = 'balanced'

    # 响应模型
    class SummarizeResponse(BaseModel):
        summary: str
        method: str
        processing_time: float
        word_count: int
        compression_ratio: float

    # 初始化各摘要模型(GPT仅在设置了OPENAI_API_KEY环境变量时启用)
    openai_api_key = os.environ.get('OPENAI_API_KEY')
    summarizers = {
        'textrank': TextRankSummarizer(),
        'bert': BertExtractiveSummarizer(),
        't5': T5Summarizer(),
        'bart': BartSummarizer(),
        'gpt': GPTSummarizer(api_key=openai_api_key) if openai_api_key else None
    }

    @app.post("/summarize", response_model=SummarizeResponse)
    def summarize_endpoint(request: SummarizeRequest):
        """
        摘要生成接口

        这里使用普通def而不是async def: 模型推理是阻塞调用,
        FastAPI会把普通def的接口放到线程池中执行,不会阻塞事件循环
        """
        start_time = time.time()

        try:
            text = request.text.strip()
            if not text:
                raise HTTPException(status_code=400, detail="输入文本不能为空")

            if len(text) > 10000:  # 限制文本长度
                raise HTTPException(status_code=400, detail="文本长度超过限制(10000字符)")

            method = request.method

            # GPT未配置时给出明确提示,而不是笼统的"不支持的摘要方法"
            if method == 'gpt' and summarizers['gpt'] is None:
                raise HTTPException(status_code=400, detail="GPT模型未配置")

            summarizer = summarizers.get(method)

            if not summarizer:
                raise HTTPException(status_code=400, detail=f"不支持的摘要方法: {method}")

            if method == 't5':
                summary = summarizer.summarize(
                    text,
                    min_length=request.min_length,
                    style=request.style
                )
            elif method == 'bart':
                summary = summarizer.summarize(text, min_length=request.min_length)
            elif method == 'gpt':
                summary = summarizer.summarize(text, style=request.style)
            else:
                # textrank / bert
                summary = summarizer.summarize(text)

            processing_time = time.time() - start_time

            # 计算压缩比: 中文没有空格分词,按字符数统计长度
            original_length = len(text)
            summary_length = len(summary)
            compression_ratio = summary_length / original_length if original_length > 0 else 0

            return SummarizeResponse(
                summary=summary,
                method=method,
                processing_time=round(processing_time, 4),
                word_count=summary_length,
                compression_ratio=round(compression_ratio, 4)
            )

        except HTTPException:
            # 参数校验等错误保持原有状态码,不要被下面的兜底处理改写成500
            raise
        except Exception as e:
            logging.error(f"摘要生成失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"摘要生成失败: {str(e)}")

    @app.get("/health")
    async def health_check():
        """
        健康检查接口
        """
        return {"status": "healthy", "timestamp": time.time()}

    @app.get("/methods")
    async def get_methods():
        """
        获取支持的摘要方法
        """
        return {
            "methods": ["textrank", "bert", "t5", "bart", "gpt"],
            "description": {
                "textrank": "基于TextRank算法的抽取式摘要",
                "bert": "基于BERT的抽取式摘要",
                "t5": "基于T5的生成式摘要",
                "bart": "基于BART的生成式摘要",
                "gpt": "基于GPT的生成式摘要"
            }
        }

    # 启动命令(代码保存为app.py): uvicorn app:app --host 0.0.0.0 --port 8000 --reload
    '''

    print("FastAPI摘要服务实现:")
    print(fastapi_implementation)

fastapi_summary_service()

Docker部署

def docker_deployment():
    """Print a Docker deployment recipe for the summarization service.

    Three files are printed as text, each under its own heading: a
    Dockerfile, a requirements.txt and a docker-compose.yml.

    The Dockerfile installs ``curl`` because the compose healthcheck runs
    ``curl -f http://localhost:8000/health`` inside the container; without
    it the healthcheck command cannot succeed.

    Returns:
        None. The function only writes to standard output.
    """
    print("Docker部署方案:")

    # "\\" is an escaped backslash: the printed Dockerfile shows a single
    # "\" line continuation.
    dockerfile_content = """
    # Dockerfile
    FROM python:3.9-slim

    WORKDIR /app

    # 安装系统依赖(curl供docker-compose中的健康检查使用)
    RUN apt-get update \\
        && apt-get install -y gcc g++ curl \\
        && rm -rf /var/lib/apt/lists/*

    # 复制依赖文件
    COPY requirements.txt .

    # 安装Python依赖
    RUN pip install --no-cache-dir -r requirements.txt

    # 复制应用代码
    COPY . .

    # 创建模型缓存目录
    RUN mkdir -p /root/.cache/huggingface

    # 暴露端口
    EXPOSE 8000

    # 启动命令
    CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
    """

    requirements_content = """
    # requirements.txt
    fastapi==0.104.1
    uvicorn[standard]==0.24.0
    torch==2.1.0
    transformers==4.35.0
    scikit-learn==1.3.0
    jieba==0.42.1
    rouge==1.0.1
    numpy==1.24.3
    pandas==2.1.0
    pydantic==2.4.2
    python-multipart==0.0.6
    openai==1.3.0
    """

    docker_compose_content = """
    # docker-compose.yml
    version: '3.8'

    services:
      summary-service:
        build: .
        ports:
          - "8000:8000"
        environment:
          - OPENAI_API_KEY=${OPENAI_API_KEY}
          - TRANSFORMERS_CACHE=/root/.cache/huggingface
        volumes:
          - ./logs:/app/logs
          - ~/.cache/huggingface:/root/.cache/huggingface
        deploy:
          resources:
            limits:
              memory: 8G
              cpus: '4'
        restart: unless-stopped
        healthcheck:
          test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
          interval: 30s
          timeout: 10s
          retries: 3
          start_period: 60s
    """

    print("Dockerfile:")
    print(dockerfile_content)
    print("\nrequirements.txt:")
    print(requirements_content)
    print("\ndocker-compose.yml:")
    print(docker_compose_content)

docker_deployment()

性能优化

模型优化策略

def performance_optimization():
    """Print performance optimization strategies and example code.

    The function prints a title, five strategies (each with a description
    and its expected effect), and then the source code of an
    ``OptimizedSummarizer`` class as text. The embedded code is only
    printed, never executed here.

    The embedded source is delimited with ''' because it contains
    triple-double-quoted docstrings of its own, which would otherwise
    terminate the outer literal early.

    Returns:
        None. The function only writes to standard output.
    """
    print("性能优化策略:")

    optimization_strategies = [
        {
            "策略": "模型量化",
            "描述": "INT8或FP16量化减少模型大小",
            "效果": "推理速度提升2-3倍,内存减少50%"
        },
        {
            "策略": "模型蒸馏",
            "描述": "训练小模型模仿大模型行为",
            "效果": "显著提升推理速度,轻微精度损失"
        },
        {
            "策略": "批处理优化",
            "描述": "批量处理多个摘要请求",
            "效果": "提高吞吐量"
        },
        {
            "策略": "缓存机制",
            "描述": "缓存常见文本的摘要结果",
            "效果": "减少重复计算,提升响应速度"
        },
        {
            "策略": "异步处理",
            "描述": "使用异步IO处理长文本",
            "效果": "提高并发处理能力"
        }
    ]

    for strategy in optimization_strategies:
        print(f"\n{strategy['策略']}:")
        print(f"  描述: {strategy['描述']}")
        print(f"  效果: {strategy['效果']}")

    # Example code shown to the reader. It defines the ``summarize`` entry
    # point that the async and batch helpers call, returns batch results in
    # input order, and obtains the loop with ``get_running_loop`` inside the
    # coroutine.
    optimization_examples = '''
    import asyncio
    import concurrent.futures
    from functools import lru_cache

    class OptimizedSummarizer:
        def __init__(self):
            # 使用LRU缓存(每个实例各自维护一份)
            self.cache_summarize = lru_cache(maxsize=1000)(self._summarize_internal)

        def _summarize_internal(self, text, method):
            """
            内部摘要生成方法(结果由cache_summarize缓存)
            """
            # 实际的摘要生成逻辑
            pass

        def summarize(self, text, method='textrank'):
            """
            带缓存的摘要生成入口
            """
            return self.cache_summarize(text, method)

        async def async_summarize(self, text, method='textrank'):
            """
            异步摘要生成
            """
            loop = asyncio.get_running_loop()
            # executor传None表示使用事件循环的默认线程池,不必每次请求新建线程池
            return await loop.run_in_executor(None, self.summarize, text, method)

        def batch_summarize(self, texts, method='textrank'):
            """
            批量摘要生成
            """
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                # executor.map按输入顺序返回结果;
                # as_completed按完成顺序返回,会打乱结果与输入的对应关系
                results = list(executor.map(lambda t: self.summarize(t, method), texts))
            return results
    '''

    print("\n优化实现代码:")
    print(optimization_examples)

performance_optimization()

实际应用案例

企业应用案例

def enterprise_case_studies():
    """
    企业应用案例分析
    """
    print("企业应用案例:")
    
    case_studies = [
        {
            "行业": "新闻媒体",
            "应用场景": "新闻文章自动摘要",
            "技术方案": "BERT抽取式 + T5生成式混合",
            "效果": "摘要准确率85%,处理速度<1秒/篇"
        },
        {
            "行业": "金融",
            "应用场景": "研报摘要生成",
            "技术方案": "BART生成式 + 领域微调",
            "效果": "ROUGE-1: 0.42, ROUGE-2: 0.18"
        },
        {
            "行业": "电商",
            "应用场景": "商品描述摘要",
            "技术方案": "TextRank抽取式 + 规则优化",
            "效果": "提升点击率15%,转化率8%"
        },
        {
            "行业": "教育",
            "应用场景": "学术论文摘要",
            "技术方案": "GPT-4生成式 + 专业术语保留",
            "效果": "专家评分4.2/5.0"
        }
    ]
    
    for case in case_studies:
        print(f"\n{case['行业']}行业:")
        print(f"  场景: {case['应用场景']}")
        print(f"  方案: {case['技术方案']}")
        print(f"  效果: {case['效果']}")

enterprise_case_studies()

最佳实践

def best_practices():
    """Print a numbered list of best practices for summarization systems.

    A title line is printed first, followed by one line per recommendation,
    numbered from 1.

    Returns:
        None. The function only writes to standard output.
    """
    recommendations = (
        "根据文本类型选择合适的摘要方法",
        "建立完善的评估体系(Rouge+人工评估)",
        "考虑摘要的可读性和连贯性",
        "处理特殊领域术语和专有名词",
        "平衡摘要长度和信息保留度",
        "定期更新模型以适应语言变化",
        "建立用户反馈机制持续优化",
    )

    print("最佳实践建议:")

    position = 0
    for recommendation in recommendations:
        position += 1
        print("  " + str(position) + ". " + recommendation)

best_practices()

相关教程

自动摘要是一个综合性很强的NLP任务,建议先掌握基础的抽取式方法,再深入学习生成式方法。实际应用中,混合策略往往能取得更好的效果。

总结

自动摘要生成系统的核心要点:

  1. 方法选择: 根据文本长度和质量要求选择合适的方法
  2. 评估指标: ROUGE是标准评估指标,但也需结合人工评估
  3. 混合策略: 抽取+生成的混合方法效果通常更好
  4. 工程实践: 考虑性能、稳定性、可扩展性等因素
  5. 持续优化: 根据实际应用效果持续改进模型

💡 核心要点: 2026年的摘要系统应结合抽取式方法的速度优势和生成式方法的质量优势,根据具体应用场景选择最适合的策略。


🔗 扩展阅读

📂 所属阶段:第六阶段 — 工业级 NLP 项目实战
🔗 相关章节:BERT家族详解 · Prompt Engineering基础