#文本分类实战:基于BERT的企业级情感分析引擎完整开发指南
#目录
#项目概述与目标
构建一个企业级中文情感分析引擎,能够准确识别用户评论中的情感倾向,为企业提供客户反馈分析能力。
#项目目标设定
企业级情感分析引擎需求分析:
业务目标:
- 准确率:> 95% 的情感分类准确率
- 响应时间:单次预测 < 100ms
- 数据来源:电商评论、社交媒体、客服对话等
- 情感类别:正面、负面、中性(扩展支持细粒度情感)
技术架构:
- 模型:基于BERT的中文预训练模型
- 框架:Transformers + PyTorch
- 部署:FastAPI + Docker
- 监控:性能指标追踪

#技术栈选择
def tech_stack_overview():
    """Print the rationale behind each technology choice for the project."""
    overview = (
        "技术栈选择理由:",
        "\n模型选择:",
        "- BERT-base-chinese: 中文预训练,效果稳定",
        "- RoBERTa: 更好的中文理解能力",
        "- ALBERT: 参数少,推理快",
        "\n框架选择:",
        "- Transformers: 预训练模型管理",
        "- PyTorch: 模型训练灵活性",
        "- FastAPI: 高性能API服务",
        "- Docker: 容器化部署",
    )
    for line in overview:
        print(line)

tech_stack_overview()

#数据准备与预处理
#数据收集与清洗
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import jieba
from typing import List, Tuple, Dict
class DataPreprocessor:
    """Text preprocessing utilities: cleaning, jieba segmentation, dataset stats."""

    def __init__(self):
        # Build the stop-word set once per instance.
        self.stop_words = self.load_stop_words()

    def load_stop_words(self) -> set:
        """Return a small built-in set of common Chinese stop words."""
        return {
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
            '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
            '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '里',
        }

    def clean_text(self, text: str) -> str:
        """Strip punctuation/special characters and collapse runs of whitespace."""
        no_punct = re.sub(r'[^\w\s]', '', text)
        return re.sub(r'\s+', ' ', no_punct).strip()

    def segment_text(self, text: str) -> List[str]:
        """Tokenize with jieba, dropping stop words and single-character tokens."""
        tokens = jieba.lcut(text)
        return [t for t in tokens if len(t) > 1 and t not in self.stop_words]

    def preprocess_dataset(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean and segment the 'text' column; print basic length statistics.

        Returns a copy of *df* with 'cleaned_text', 'segmented_text' and
        'text_length' columns added.
        """
        df = df.copy()
        df['cleaned_text'] = df['text'].apply(self.clean_text)
        df['segmented_text'] = df['cleaned_text'].apply(self.segment_text)
        df['text_length'] = df['cleaned_text'].apply(len)
        # Quick sanity report on the processed corpus.
        print(f"数据集大小: {len(df)}")
        print(f"平均文本长度: {df['text_length'].mean():.2f}")
        print(f"文本长度分布:\n{df['text_length'].describe()}")
        return df
def load_and_prepare_data():
    """Build the toy demo dataset and run it through the preprocessing pipeline.

    In a real project the data would come from CSV files, a database or an API.
    """
    raw = {
        'text': [
            '这个产品质量很好,值得推荐!',
            '服务态度很差,完全不推荐。',
            '物流很快,包装也不错',
            '质量一般,性价比不高',
            '非常满意,下次还会购买',
            '商品与描述不符,很失望'
        ],
        'label': [1, 0, 1, 0, 1, 0],  # 1: positive, 0: negative
        'category': ['product', 'service', 'delivery', 'value', 'rebuy', 'quality'],
    }
    frame = pd.DataFrame(raw)
    print("原始数据分布:")
    print(frame['label'].value_counts())
    # Clean + segment via the shared preprocessor.
    return DataPreprocessor().preprocess_dataset(frame)

# Example usage
df = load_and_prepare_data()

#数据划分与验证
def data_split_and_validation(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split *df* into stratified train/val/test (70/15/15) and report sizes.

    NOTE(review): stratified splitting raises ValueError when a class has a
    single member in a subset — too small a demo dataset will fail here.
    """
    # First cut: 70% train, 30% held out, stratified on the label.
    train_df, temp_df = train_test_split(
        df, test_size=0.3, random_state=42, stratify=df['label']
    )
    # Second cut: split the hold-out evenly into validation and test.
    val_df, test_df = train_test_split(
        temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
    )
    print("数据集划分结果:")
    print(f"训练集: {len(train_df)} ({len(train_df)/len(df)*100:.1f}%)")
    print(f"验证集: {len(val_df)} ({len(val_df)/len(df)*100:.1f}%)")
    print(f"测试集: {len(test_df)} ({len(test_df)/len(df)*100:.1f}%)")
    # Verify the class balance survived the split.
    for name, subset in (("训练集", train_df), ("验证集", val_df), ("测试集", test_df)):
        print(f"{name}标签分布: {subset['label'].value_counts().to_dict()}")
    return train_df, val_df, test_df

# Example usage
train_df, val_df, test_df = data_split_and_validation(df)

#数据增强技术
#高级数据增强方法
import random
from transformers import pipeline
import synonyms # 需要安装: pip install synonyms
class DataAugmenter:
    """
    EDA-style text augmentation: synonym replacement, random insertion,
    random swap and random deletion.
    """

    # Hoisted to class level: previously this table was rebuilt on every
    # iteration of the replacement loop.
    # NOTE(review): the single-character keys "好"/"坏" can never match,
    # because candidate words are filtered to len(word) > 1 below.
    SYNONYMS = {
        "好": ["优秀", "棒", "不错", "良好", "优质"],
        "坏": ["差", "糟糕", "不好", "恶劣", "劣质"],
        "喜欢": ["喜爱", "爱好", "欣赏", "钟爱", "青睐"],
        "讨厌": ["厌恶", "反感", "嫌弃", "不满", "排斥"]
    }

    def __init__(self):
        # All operators, used when method='all' in augment().
        self.synonym_methods = [
            self.synonym_replacement,
            self.random_insertion,
            self.random_swap,
            self.random_deletion
        ]

    def synonym_replacement(self, text: str, n: int = 1) -> str:
        """Replace up to *n* distinct known words with a random synonym.

        Returns the text unchanged when nothing matches or on any error.
        """
        try:
            words = text.split()
            new_words = words.copy()
            candidates = list(set(word for word in words if len(word) > 1))
            random.shuffle(candidates)
            replaced_count = 0
            for candidate in candidates:
                if replaced_count >= n:
                    break
                if candidate in self.SYNONYMS:
                    synonym = random.choice(self.SYNONYMS[candidate])
                    new_words = [synonym if word == candidate else word for word in new_words]
                    replaced_count += 1
            return " ".join(new_words)
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit propagate.
            return text

    def random_insertion(self, text: str, n: int = 1) -> str:
        """Insert *n* randomly chosen duplicate words at random positions."""
        new_words = text.split()
        for _ in range(n):
            if len(new_words) == 0:
                continue  # nothing to duplicate for empty input
            random_word = new_words[random.randint(0, len(new_words) - 1)]
            new_words.insert(random.randint(0, len(new_words)), random_word)
        return " ".join(new_words)

    def random_swap(self, text: str, n: int = 1) -> str:
        """Swap two random word positions, *n* times."""
        new_words = text.split()
        for _ in range(n):
            if len(new_words) < 2:
                break
            idx1, idx2 = random.sample(range(len(new_words)), 2)
            new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]
        return " ".join(new_words)

    def random_deletion(self, text: str, p: float = 0.1) -> str:
        """Drop each word independently with probability *p*; never empty the text."""
        words = text.split()
        # Fix: was `== 1`, which let empty input fall through to
        # random.choice([]) and raise IndexError.
        if len(words) <= 1:
            return text
        new_words = [word for word in words if random.uniform(0, 1) > p]
        if len(new_words) == 0:
            # Everything was deleted: keep one word so the sample stays non-empty.
            return random.choice(words)
        return " ".join(new_words)

    def augment(self, text: str, n_aug: int = 2, method: str = 'all') -> List[str]:
        """Produce *n_aug* augmented variants of *text*.

        method: 'all' (random operator each round) or one of
        'synonym' / 'insertion' / 'swap' / 'deletion'.
        Returns only the augmented texts, excluding the original.
        """
        augmented_texts = [text]
        if method == 'all':
            methods = self.synonym_methods
        else:
            method_map = {
                'synonym': self.synonym_replacement,
                'insertion': self.random_insertion,
                'swap': self.random_swap,
                'deletion': self.random_deletion
            }
            methods = [method_map[method]] if method in method_map else [self.synonym_replacement]
        for _ in range(n_aug):
            base = random.choice(augmented_texts)
            op = random.choice(methods)
            if op == self.random_deletion:
                augmented_texts.append(op(base, p=0.1))
            else:
                augmented_texts.append(op(base, n=1))
        return augmented_texts[1:]
def apply_data_augmentation(train_df: pd.DataFrame) -> pd.DataFrame:
    """Expand the training set with augmented copies of every row."""
    augmenter = DataAugmenter()
    rows = []
    for _, row in train_df.iterrows():
        text, label = row['text'], row['label']
        # Keep the original sample...
        rows.append({'text': text, 'label': label})
        # ...and append its augmented variants with the same label.
        for aug_text in augmenter.augment(text, n_aug=2):
            rows.append({'text': aug_text, 'label': label})
    augmented_df = pd.DataFrame(rows)
    print(f"数据增强前: {len(train_df)} 条")
    print(f"数据增强后: {len(augmented_df)} 条")
    print(f"增强比例: {len(augmented_df)/len(train_df):.2f}x")
    return augmented_df

# Example usage (commented out: the synonyms package must be installed)
# augmented_train_df = apply_data_augmentation(train_df)
augmented_train_df = train_df  # fall back to the raw data for the demo

#模型微调完整流程
#使用Transformers进行微调
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
Trainer,
TrainingArguments,
EarlyStoppingCallback
)
from datasets import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
class SentimentClassifier:
    """
    BERT-based sentiment classifier: tokenization, fine-tuning and metrics.
    """

    def __init__(self, model_name: str = "bert-base-chinese", num_labels: int = 2):
        self.model_name = model_name
        self.num_labels = num_labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def tokenize_data(self, texts: List[str], labels: List[int] = None) -> Dataset:
        """Tokenize *texts* into a HF Dataset (max_length=128, padded/truncated).

        When *labels* is None, each sample gets the placeholder label -1.
        """
        encodings = self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors='pt'
        )
        return Dataset.from_dict({
            'input_ids': encodings['input_ids'].tolist(),
            'attention_mask': encodings['attention_mask'].tolist(),
            'labels': labels if labels is not None else [-1] * len(texts)  # -1 = unlabeled
        })

    def compute_metrics(self, eval_pred):
        """Compute accuracy / weighted precision-recall-F1, plus AUC for binary."""
        predictions, labels = eval_pred
        predictions = torch.tensor(predictions)
        labels = torch.tensor(labels)
        preds = torch.argmax(predictions, dim=-1)
        probs = torch.softmax(predictions, dim=-1)
        accuracy = accuracy_score(labels.numpy(), preds.numpy())
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels.numpy(), preds.numpy(), average='weighted'
        )
        if self.num_labels == 2:
            try:
                auc = roc_auc_score(labels.numpy(), probs[:, 1].numpy())
            except ValueError:
                # Was a bare `except:`. roc_auc_score raises ValueError when
                # only one class is present in the labels; report 0.0 then.
                auc = 0.0
        else:
            # AUC is only computed for the binary case.
            auc = 0.0
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': auc
        }

    def train(self, train_dataset: Dataset, val_dataset: Dataset, output_dir: str = "./sentiment_model"):
        """Fine-tune the model with early stopping; save the best checkpoint.

        Returns the fitted Trainer instance.
        """
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir=f'{output_dir}/logs',
            logging_steps=10,
            evaluation_strategy="steps",
            eval_steps=50,
            save_strategy="steps",
            save_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=2,
            seed=42,
            fp16=torch.cuda.is_available(),  # mixed precision when a GPU is available
            report_to=None  # disable wandb & co.
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )
        print("开始模型训练...")
        trainer.train()
        # Persist the best model and its tokenizer together.
        trainer.save_model(f"{output_dir}/best_model")
        self.tokenizer.save_pretrained(f"{output_dir}/best_model")
        print(f"模型已保存到: {output_dir}/best_model")
        return trainer
def train_sentiment_model():
    """End-to-end training: tokenize the train/val splits and fine-tune BERT.

    Returns the (classifier, trainer) pair.
    """
    classifier = SentimentClassifier()
    # Training split (possibly augmented).
    train_dataset = classifier.tokenize_data(
        augmented_train_df['text'].tolist(),
        augmented_train_df['label'].tolist(),
    )
    # Validation split.
    val_dataset = classifier.tokenize_data(
        val_df['text'].tolist(),
        val_df['label'].tolist(),
    )
    trainer = classifier.train(train_dataset, val_dataset)
    return classifier, trainer

# NOTE: the demo dataset is tiny, so only the code structure is shown here.
# trained_classifier, trainer = train_sentiment_model()

#模型调优与超参数优化
def hyperparameter_tuning():
    """Print the tuning playbook and define a linear-warmup scheduler factory."""
    notes = (
        "超参数调优策略:",
        "\n1. 学习率调优:",
        " - 初始值: 2e-5, 3e-5, 5e-5",
        " - 调整策略: 学习率调度器",
        "\n2. 批次大小:",
        " - GPU内存限制下的最大批次",
        " - 影响训练稳定性",
        "\n3. 训练轮数:",
        " - 早停机制防止过拟合",
        " - 通常3-5轮足够",
        "\n4. 权重衰减:",
        " - 防止过拟合: 0.01-0.1",
    )
    for note in notes:
        print(note)
    # Learning-rate scheduler example (imported lazily).
    from transformers import get_linear_schedule_with_warmup

    def create_scheduler(optimizer, num_training_steps, warmup_steps=500):
        # Linear warmup followed by linear decay.
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps
        )

    print("\n学习率调度器已准备就绪")

hyperparameter_tuning()

#模型评估与分析
#全面的模型评估
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
class ModelEvaluator:
    """Evaluation helpers: batched prediction, reports, confusion matrix, error review."""

    def __init__(self, classifier):
        self.classifier = classifier

    def evaluate_model(self, test_dataset, test_labels):
        """Predict over the dataset in batches of 32 and print a full report.

        Returns (classification_report dict, confusion matrix).
        """
        preds_all = []
        self.classifier.model.eval()
        with torch.no_grad():
            for start in range(0, len(test_dataset), 32):
                stop = min(start + 32, len(test_dataset))
                batch = test_dataset.select(range(start, stop))
                model_inputs = {
                    'input_ids': torch.tensor(batch['input_ids']),
                    'attention_mask': torch.tensor(batch['attention_mask'])
                }
                logits = self.classifier.model(**model_inputs).logits
                preds_all.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        # Align prediction count with the number of labels provided.
        predictions = np.array(preds_all)[:len(test_labels)]
        if self.classifier.num_labels == 2:
            target_names = ['负面', '正面']
        else:
            target_names = [f'类别{i}' for i in range(self.classifier.num_labels)]
        report = classification_report(test_labels, predictions, target_names=target_names, output_dict=True)
        cm = confusion_matrix(test_labels, predictions)
        print("模型评估结果:")
        print(f"准确率: {report['accuracy']:.4f}")
        print(f"精确率: {report['weighted avg']['precision']:.4f}")
        print(f"召回率: {report['weighted avg']['recall']:.4f}")
        print(f"F1分数: {report['weighted avg']['f1-score']:.4f}")
        print("\n详细分类报告:")
        print(classification_report(test_labels, predictions, target_names=target_names))
        return report, cm

    def plot_confusion_matrix(self, cm, class_names):
        """Render the confusion matrix as a seaborn heatmap."""
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names)
        plt.title('混淆矩阵')
        plt.xlabel('预测标签')
        plt.ylabel('真实标签')
        plt.tight_layout()
        plt.show()

    def error_analysis(self, test_texts, test_labels, predictions):
        """Collect misclassified samples and print the first five.

        Returns the full list of error dicts.
        """
        errors = [
            {'text': text, 'true_label': true_label, 'pred_label': pred_label}
            for text, true_label, pred_label in zip(test_texts, test_labels, predictions)
            if true_label != pred_label
        ]
        print(f"\n错误分析 - 总共 {len(errors)} 个错误预测:")
        for i, error in enumerate(errors[:5]):
            true_name = '负面' if error['true_label'] == 0 else '正面'
            pred_name = '负面' if error['pred_label'] == 0 else '正面'
            print(f"{i+1}. 文本: {error['text'][:50]}...")
            print(f" 真实: {true_name}, 预测: {pred_name}")
        return errors
def comprehensive_evaluation():
    """Print the end-to-end evaluation checklist."""
    steps = (
        "模型评估流程:",
        "1. 准备测试数据",
        "2. 模型预测",
        "3. 计算评估指标",
        "4. 混淆矩阵分析",
        "5. 错误分析",
        "6. 性能瓶颈识别",
    )
    for step in steps:
        print(step)

comprehensive_evaluation()

#模型部署与服务化
#构建生产级API服务
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import uvicorn
from typing import List, Optional
import time
import logging
# Configure module-wide logging for the API service (INFO level, module-named logger).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SentimentRequest(BaseModel):
    """Request payload for single-text sentiment analysis."""
    # Raw input text to classify.
    text: str
    # Confidence threshold supplied by the caller.
    # NOTE(review): not consumed by the demo /predict handler — confirm intended use.
    threshold: Optional[float] = 0.5
class SentimentResponse(BaseModel):
    """Response payload for single-text sentiment analysis."""
    # Echo of the input text.
    text: str
    # Predicted label, e.g. "正面" / "负面".
    sentiment: str
    # Probability of the predicted label.
    confidence: float
    # Per-label probability map.
    probabilities: dict
    # Wall-clock prediction latency in seconds.
    processing_time: float
class SentimentAPI:
    """Loads a fine-tuned model and serves single-text sentiment predictions."""

    def __init__(self, model_path: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.model.eval()
        # Class index -> human-readable sentiment label.
        self.label_map = {0: "负面", 1: "正面"}
        logger.info("模型加载完成")

    def predict(self, text: str) -> dict:
        """Run one text through the model.

        Returns a dict with the label, its confidence, the full probability
        map and the measured latency in seconds.
        """
        started = time.time()
        # Tokenize with the same settings used during training.
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )
        with torch.no_grad():
            logits = self.model(**encoded).logits
            probabilities = torch.softmax(logits, dim=-1).squeeze().cpu().numpy()
        predicted_class = torch.argmax(logits, dim=-1).item()
        confidence = float(probabilities[predicted_class])
        elapsed = time.time() - started
        prob_map = {
            self.label_map[i]: round(float(p), 4)
            for i, p in enumerate(probabilities)
        }
        return {
            "text": text,
            "sentiment": self.label_map[predicted_class],
            "confidence": round(confidence, 4),
            "probabilities": prob_map,
            "processing_time": round(elapsed, 4)
        }
# Create the FastAPI application with OpenAPI metadata.
app = FastAPI(
    title="情感分析API",
    description="基于BERT的企业级中文情感分析服务",
    version="1.0.0"
)
# Model initialization is deferred; supply the real checkpoint path in deployment.
# sentiment_api = SentimentAPI("./best_model")
@app.get("/")
async def health_check():
    """Liveness probe: confirms the API process is up."""
    payload = {"status": "healthy", "message": "情感分析API服务正常运行"}
    return payload
@app.post("/predict", response_model=SentimentResponse)
async def predict_sentiment(request: SentimentRequest):
    """Single-text sentiment prediction (demo stub using a keyword heuristic)."""
    try:
        # A real deployment would call the loaded model instead:
        # result = sentiment_api.predict(request.text)
        is_positive = "好" in request.text or "棒" in request.text
        result = {
            "text": request.text,
            "sentiment": "正面" if is_positive else "负面",
            "confidence": 0.85,
            "probabilities": {"负面": 0.15, "正面": 0.85},
            "processing_time": 0.05
        }
        return SentimentResponse(**result)
    except Exception as e:
        logger.error(f"预测错误: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/batch_predict")
async def batch_predict(texts: List[str]):
    """Batch sentiment prediction (demo stub using a keyword heuristic)."""
    results = [
        {
            "text": text,
            "sentiment": "正面" if "好" in text or "棒" in text else "负面",
            "confidence": 0.85,
            "probabilities": {"负面": 0.15, "正面": 0.85}
        }
        for text in texts
    ]
    return {"results": results}
def run_api_server():
    """Print the service endpoints; the uvicorn launch stays commented out."""
    for line in (
        "启动情感分析API服务...",
        "服务地址: http://localhost:8000",
        "API文档: http://localhost:8000/docs",
    ):
        print(line)
    # NOTE: uncomment for a real deployment.
    # uvicorn.run(app, host="0.0.0.0", port=8000)

# run_api_server()  # left disabled so importing this module does not start a server

#Docker容器化部署
# Dockerfile
# Container image for the sentiment-analysis API service.
FROM python:3.9-slim
WORKDIR /app
# Install system build dependencies (compilers needed by some Python wheels).
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*
# Copy the dependency manifest first to leverage Docker layer caching.
COPY requirements.txt .
# Install Python dependencies.
RUN pip install --no-cache-dir -r requirements.txt
# Copy the application code.
COPY . .
# Create the model directory.
RUN mkdir -p /app/models
# Optionally pre-download the pretrained model at build time (uncomment for offline deploys).
# RUN python -c "from transformers import AutoTokenizer, AutoModelForSequenceClassification; \
#     AutoTokenizer.from_pretrained('bert-base-chinese'); \
#     AutoModelForSequenceClassification.from_pretrained('bert-base-chinese')"
# Expose the API port.
EXPOSE 8000
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

# requirements.txt
fastapi==0.104.1
uvicorn==0.24.0
transformers==4.35.2
torch>=1.13.0
pandas==2.1.3
scikit-learn==1.3.2
jieba==0.42.1
numpy==1.24.3
seaborn==0.13.0
matplotlib==3.8.1
pydantic==2.5.0

#性能优化与监控
#模型优化技术
def model_optimization_techniques():
    """Print a summary of inference optimization techniques."""
    summary = (
        "模型优化技术:",
        "\n1. 模型量化 (Quantization):",
        " - INT8量化: 减少模型大小,提升推理速度",
        " - FP16半精度: GPU推理加速",
        "\n2. 知识蒸馏 (Knowledge Distillation):",
        " - DistilBERT: 保持95%性能,推理速度快2-3倍",
        " - TinyBERT: 更小更快的BERT变体",
        "\n3. 模型剪枝 (Pruning):",
        " - 移除不重要的权重",
        " - 减少模型大小和计算量",
        "\n4. ONNX转换:",
        " - 转换为ONNX格式",
        " - 使用ONNX Runtime进行推理",
        "\n5. 缓存机制:",
        " - 结果缓存减少重复计算",
        " - 批量处理提升吞吐量",
    )
    for line in summary:
        print(line)
def performance_monitoring():
    """Print the monitoring metrics the service should track."""
    metrics = (
        "性能监控指标:",
        "\n1. 推理性能:",
        " - 响应时间: P50, P95, P99延迟",
        " - 吞吐量: QPS (Queries Per Second)",
        " - 并发数: 同时处理的请求数",
        "\n2. 模型性能:",
        " - 准确率: 持续监控模型准确性",
        " - 数据漂移: 输入分布变化检测",
        " - 概率校准: 预测概率的可靠性",
        "\n3. 系统性能:",
        " - CPU/GPU利用率",
        " - 内存使用情况",
        " - 磁盘I/O",
    )
    for line in metrics:
        print(line)
# Print both overviews when the script is executed.
model_optimization_techniques()
performance_monitoring()

#实际应用案例
#电商评论情感分析
class ECommerceSentimentAnalyzer:
    """E-commerce review sentiment analysis with business-level insights."""

    def __init__(self, model_path: str):
        self.api = SentimentAPI(model_path)

    def analyze_review(self, review: str, metadata: dict = None) -> dict:
        """Score one review, attach a coarse confidence tier and any metadata."""
        prediction = self.api.predict(review)
        # Bucket the model confidence into three business-facing tiers.
        if prediction['confidence'] < 0.7:
            tier = 'low'
        elif prediction['confidence'] < 0.9:
            tier = 'medium'
        else:
            tier = 'high'
        prediction['confidence_level'] = tier
        return {
            'review': review,
            'sentiment_analysis': prediction,
            'metadata': metadata or {}
        }

    def batch_analyze(self, reviews: List[str]) -> List[dict]:
        """Analyze each review independently."""
        return [self.analyze_review(review) for review in reviews]

    def generate_insights(self, analysis_results: List[dict]) -> dict:
        """Aggregate sentiment counts and average confidence over a batch."""
        sentiments = [item['sentiment_analysis']['sentiment'] for item in analysis_results]
        positive_count = sentiments.count('正面')
        negative_count = sentiments.count('负面')
        total_count = len(sentiments)
        if total_count > 0:
            positive_pct = round(positive_count / total_count * 100, 2)
            negative_pct = round(negative_count / total_count * 100, 2)
        else:
            positive_pct = negative_pct = 0
        if analysis_results:
            avg_conf = round(
                sum(item['sentiment_analysis']['confidence'] for item in analysis_results)
                / len(analysis_results),
                4,
            )
        else:
            avg_conf = 0
        return {
            'total_reviews': total_count,
            'positive_percentage': positive_pct,
            'negative_percentage': negative_pct,
            'average_confidence': avg_conf
        }
def ecommerce_demo():
    """Demo: keyword-based simulated sentiment scoring of sample reviews."""
    print("电商评论情感分析演示:")
    # Simulated review data.
    reviews = [
        "这个产品质量很好,物流也很快,非常满意!",
        "服务态度很差,商品质量也不行,不推荐购买。",
        "价格合理,功能齐全,性价比很高。",
        "包装破损严重,客服态度也不好,很失望。",
        "使用体验不错,会推荐给朋友。"
    ]
    print("原始评论:")
    for i, review in enumerate(reviews, 1):
        print(f"{i}. {review}")
    print("\n情感分析结果:")
    # The model is not actually loaded, so results are simulated.
    # NOTE(review): substring matching marks "不推荐" as positive because it contains "推荐".
    positive_markers = ["好", "满意", "不错", "推荐", "喜欢"]
    for i, review in enumerate(reviews, 1):
        sentiment = "正面" if any(word in review for word in positive_markers) else "负面"
        confidence = 0.8 if sentiment == "正面" else 0.75
        print(f"{i}. 情感: {sentiment}, 置信度: {confidence}")

ecommerce_demo()

#客服对话情感分析
class CustomerServiceAnalyzer:
    """Sentiment analysis over customer-service conversations."""

    def __init__(self, model_path: str):
        self.api = SentimentAPI(model_path)

    def analyze_conversation(self, conversation: List[dict]) -> dict:
        """Analyze overall / customer-only / agent-only sentiment of a dialogue.

        Each message is a dict with 'role' ('customer' or 'agent') and 'text'.
        """
        all_messages, customer_messages, agent_messages = [], [], []
        for msg in conversation:
            all_messages.append(msg['text'])
            if msg['role'] == 'customer':
                customer_messages.append(msg['text'])
            elif msg['role'] == 'agent':
                agent_messages.append(msg['text'])
        return {
            'overall_analysis': self._analyze_group(all_messages),
            'customer_analysis': self._analyze_group(customer_messages),
            'agent_analysis': self._analyze_group(agent_messages),
            'conversation_summary': self._generate_summary(conversation)
        }

    def _analyze_group(self, messages: List[str]) -> dict:
        """Aggregate per-message predictions into one dominant sentiment."""
        if not messages:
            return {'sentiment': '中性', 'count': 0, 'avg_confidence': 0.0}
        sentiments, confidences = [], []
        for msg in messages:
            prediction = self.api.predict(msg)
            sentiments.append(prediction['sentiment'])
            confidences.append(prediction['confidence'])
        pos_count = sentiments.count('正面')
        neg_count = sentiments.count('负面')
        # Majority vote; ties are reported as neutral.
        if pos_count > neg_count:
            main_sentiment = '正面'
        elif neg_count > pos_count:
            main_sentiment = '负面'
        else:
            main_sentiment = '中性'
        return {
            'sentiment': main_sentiment,
            'count': len(messages),
            'avg_confidence': sum(confidences) / len(confidences) if confidences else 0.0,
            'breakdown': {'正面': pos_count, '负面': neg_count, '总数': len(messages)}
        }

    def _generate_summary(self, conversation: List[dict]) -> str:
        """Build a short summary from the first three messages."""
        parts = []
        for msg in conversation[:3]:
            speaker = "客户" if msg['role'] == 'customer' else "客服"
            parts.append(f"{speaker}: {msg['text'][:50]}...")
        return " | ".join(parts)
def customer_service_demo():
    """Demo: print a sample support dialogue and canned analysis results."""
    # Simulated conversation data.
    conversation = [
        {'role': 'customer', 'text': '你好,我想咨询一下订单状态'},
        {'role': 'agent', 'text': '您好,很高兴为您服务,请问您的订单号是多少?'},
        {'role': 'customer', 'text': '订单号是123456,为什么还没有发货?'},
        {'role': 'agent', 'text': '非常抱歉给您带来不便,我马上为您查询'},
        {'role': 'agent', 'text': '系统显示明天就会发货,我们会尽快处理'},
        {'role': 'customer', 'text': '好的,谢谢你的耐心解答'}
    ]
    print("客服对话分析演示:")
    print("对话内容:")
    for msg in conversation:
        speaker = "客户" if msg['role'] == 'customer' else "客服"
        print(f" {speaker}: {msg['text']}")
    print("\n分析结果:")
    print(" 整体情感: 正面 (基于模拟分析)")
    print(" 客户情感: 中性 (基于模拟分析)")
    print(" 客服情感: 正面 (基于模拟分析)")

customer_service_demo()

#相关教程
#总结
企业级情感分析系统开发的关键要素:
- 数据质量:高质量的标注数据是模型性能的基础
- 模型选择:根据业务需求选择合适的预训练模型
- 评估体系:建立全面的模型评估和监控机制
- 工程实践:注重代码质量和系统稳定性
- 持续优化:根据业务反馈不断优化模型性能
💡 核心要点:企业级NLP应用不仅需要优秀的模型性能,更需要可靠的工程实现和完善的运维体系。
🔗 扩展阅读
- BERT Fine-tuning Guide
- Transformers Documentation
- Production ML Best Practices
- FastAPI Documentation
📂 所属阶段:第四阶段 — 预训练模型与迁移学习(应用篇)
🔗 相关章节:Hugging Face实战 · 命名实体识别NER

