#命名实体识别(NER):从文本中精准提取人名、地名、机构名的完整指南
#目录
#NER概述与应用
命名实体识别(Named Entity Recognition, NER)是自然语言处理中的一项基础任务,旨在从非结构化文本中识别并分类预定义的实体类型。
#NER任务定义
NER = Named Entity Recognition (命名实体识别)
任务目标:
- 从文本中识别出特定类型的命名实体
- 对每个识别出的实体进行分类标注
核心挑战:
├── 实体边界识别(Entity Boundary Detection)
├── 实体类型分类(Entity Type Classification)
├── 嵌套实体处理(Nested Entities)
└── 未登录词处理(Out-of-Vocabulary Words)
#常见实体类型
def ner_entity_types():
    """Print a quick-reference catalog of common NER entity types.

    For each type code, shows its English name, a short description and a
    few example mentions. Output goes to stdout; returns None.
    """
    catalog = [
        ('PER', 'Person', ['小明', '张三', '李四', '马云', '雷军'], '人名实体'),
        ('ORG', 'Organization', ['清华大学', '阿里巴巴', '腾讯', '北京大学'], '组织机构名'),
        ('LOC', 'Location', ['北京', '上海', '纽约', '长江', '珠穆朗玛峰'], '地理位置名'),
        ('TIME', 'Time', ['2024年', '昨天', '上午10点', '2024-01-01'], '时间表达式'),
        ('MONEY', 'Money', ['100元', '$100', '一千万', '€50'], '货币金额'),
        ('PROD', 'Product', ['iPhone 15', '华为P60', '特斯拉Model 3'], '产品名称'),
    ]
    print("NER实体类型详解:")
    for code, en_name, samples, desc in catalog:
        print(f"\n{code} ({en_name}): {desc}")
        print(f" 示例: {', '.join(samples)}")

ner_entity_types()  # Section: application scenarios
NER技术在众多领域都有广泛应用:
def ner_applications():
    """Print representative application domains for NER.

    Each entry lists the domain, what NER is used for there, and a short
    concrete example. Output only; returns None.
    """
    domains = [
        ('搜索引擎', '关键词高亮、实体链接', '搜索"清华大学"时高亮显示相关实体'),
        ('知识图谱', '实体抽取、关系发现', '从新闻中抽取人物、地点、事件关系'),
        ('智能客服', '意图理解、槽位填充', '识别用户询问中的产品名称、时间等关键信息'),
        ('金融风控', '实体识别、风险评估', '识别交易文本中的公司名、金额、时间等'),
        ('医疗健康', '疾病识别、药物抽取', '从病历中识别症状、药品、诊断结果'),
        ('舆情分析', '话题跟踪、情感分析', '识别新闻中的关键人物、机构、事件'),
    ]
    print("NER应用场景:")
    for idx, (domain, use_case, example) in enumerate(domains, 1):
        print(f"{idx}. {domain}")
        print(f" 用途: {use_case}")
        print(f" 示例: {example}")

ner_applications()  # Section: BIO tagging scheme
BIO标注法是序列标注中最常用的标注方法,通过给每个词赋予特定标签来表示实体边界和类型。
#BIO标注规则
def bio_annotation_rules():
    """Explain the BIO tagging scheme and its BIOES extension.

    Prints the tag definitions plus one worked example. Fix over the
    original: the example previously printed "B-PER O B-LOC I-ORG O",
    which is an illegal BIO sequence (an I- tag must continue a B- tag of
    the same type) and had fewer tags than tokens; the example is now a
    consistent one-tag-per-token sequence.
    """
    print("BIO标注法规则:")
    print("B-xxx: 实体的开始(Begin),实体的第一个词")
    print("I-xxx: 实体的中间或结尾(Inside),实体的其他词")
    print("O: 非实体(Outside),不属于任何实体")
    print("\n示例解析:")
    print("文本: 小明在北京大学读书")
    print("分词: 小明 / 在 / 北京 / 大学 / 读 / 书")
    print("BIO标注: B-PER O B-ORG I-ORG O O")
    print("实体恢复: 小明(PER), 北京大学(ORG)")
    print("\n扩展标注法:")
    print("BIOES标注法:")
    print("B-xxx: 实体开始(Begin)")
    print("I-xxx: 实体中间(Inside)")
    print("E-xxx: 实体结尾(End)")
    print("S-xxx: 单独实体(Singleton)")
    print("O: 非实体")

bio_annotation_rules()  # Section: BIO annotation examples
def bio_annotation_examples():
    """Walk through two fully-annotated BIO examples.

    Each sample carries the raw text, its token split, the per-token BIO
    tags, and the (text, type) entities those tags encode. Output only.
    """
    samples = [
        ('小明在北京大学读书',
         ['小', '明', '在', '北京', '大学', '读', '书'],
         ['B-PER', 'I-PER', 'O', 'B-ORG', 'I-ORG', 'O', 'O'],
         [('小明', 'PER'), ('北京大学', 'ORG')]),
        ('马云是阿里巴巴的创始人',
         ['马', '云', '是', '阿', '里', '巴', '巴', '的', '创', '始', '人'],
         ['B-PER', 'I-PER', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O'],
         [('马云', 'PER'), ('阿里巴巴', 'ORG')]),
    ]
    for no, (text, tokens, tags, ents) in enumerate(samples, 1):
        print(f"示例 {no}: {text}")
        print("分词结果:", tokens)
        print("BIO标签:", tags)
        print("识别实体:", ents)
        print()

bio_annotation_examples()  # Section: annotation data formats
def ner_data_formats():
    """Show three common serialization formats for NER data.

    Demonstrates CoNLL (one token+tag per line), a JSON record, and the
    IOBES variant. Fix over the original: the CoNLL and JSON samples mixed
    B-LOC with I-ORG (an illegal BIO transition) and the JSON entity list
    contained overlapping, inconsistent spans; all three samples now encode
    the same two entities 小明(PER) and 北京大学(ORG).
    """
    print("NER数据格式:")
    print("\n1. CoNLL格式 (每行一个词及其标签):")
    conll_format = """小明 B-PER
在 O
北京 B-ORG
大学 I-ORG
读 O
书 O"""
    print(conll_format)
    print("\n2. JSON格式:")
    json_format = {
        "tokens": ["小明", "在", "北京", "大学", "读", "书"],
        "labels": ["B-PER", "O", "B-ORG", "I-ORG", "O", "O"],
        "entities": [
            {"text": "小明", "label": "PER", "start": 0, "end": 2},
            # 北京大学 occupies characters [3, 7) of the source sentence.
            {"text": "北京大学", "label": "ORG", "start": 3, "end": 7}
        ]
    }
    print(json_format)
    print("\n3. IOBES格式 (更精细的边界标注):")
    iobes_format = """小 B-PER
明 E-PER
在 O
北 B-ORG
京 I-ORG
大 I-ORG
学 E-ORG
读 O
书 O"""
    print(iobes_format)

ner_data_formats()  # Section: Chinese NER datasets
#常用中文NER数据集
def chinese_ner_datasets():
    """Print a comparison of widely used Chinese NER corpora.

    For each corpus: approximate size, entity inventory, text source,
    tagging format, and where to obtain it. Output only; returns None.
    """
    corpora = {
        'MSRA': {
            'size': '约50,000句',
            'entities': ['PER', 'ORG', 'LOC'],
            'source': '新闻文本',
            'format': 'BIES',
            'download': 'CLUE benchmark'
        },
        'OntoNotes4': {
            'size': '约16,000句',
            'entities': ['PER', 'ORG', 'LOC', 'MISC'],
            'source': '多领域文本',
            'format': 'BIO',
            'download': 'CoNLL 2012'
        },
        'Weibo': {
            'size': '约1,300句',
            'entities': ['PER', 'ORG', 'LOC', 'TIME'],
            'source': '微博文本',
            'format': 'BIO',
            'download': 'LTP toolkit'
        },
        'Resume': {
            'size': '约1,700句',
            'entities': ['NAME', 'ORG', 'RACE', 'TITLE', 'EDU', 'PRO', 'CONT', 'LOC'],
            'source': '简历文本',
            'format': 'BIO',
            'download': '哈工大'
        }
    }
    print("中文NER数据集对比:")
    for corpus_name, attrs in corpora.items():
        print(f"\n{corpus_name} 数据集:")
        for field, value in attrs.items():
            print(f" {field}: {value}")

chinese_ner_datasets()  # Section: data preprocessing
import pandas as pd
import re
import jieba
class NERDataProcessor:
    """Convert between raw text plus span annotations and BIO tag sequences.

    Spans are (start, end, label) tuples with `end` exclusive; tokens are
    individual characters (character-level tagging, the usual choice for
    Chinese NER).
    """

    def __init__(self):
        # Entity types this processor is typically used with.
        self.entity_types = ['PER', 'ORG', 'LOC']

    def text_to_bios(self, text: str, entities: list) -> tuple:
        """Turn character spans into per-character BIO labels.

        entities: iterable of (start, end, label), end exclusive.
        Returns (tokens, labels) where tokens is the character list.
        Spans whose bounds exceed the text length are silently skipped.
        """
        chars = list(text)
        tags = ['O'] * len(chars)
        for begin, stop, kind in sorted(entities, key=lambda span: span[0]):
            if begin < len(tags) and stop <= len(tags):
                tags[begin] = f'B-{kind}'
                tags[begin + 1:stop] = [f'I-{kind}'] * (stop - begin - 1)
        return chars, tags

    def bios_to_entities(self, tokens: list, labels: list) -> list:
        """Recover (text, start_index, type) triples from a BIO sequence.

        Lenient recovery: a dangling I- tag (no matching preceding B- of
        the same type) starts a new entity rather than being dropped.
        """
        found = []
        open_span = None  # [char list, start index, entity type]

        def close():
            nonlocal open_span
            if open_span is not None:
                found.append((''.join(open_span[0]), open_span[1], open_span[2]))
            open_span = None

        for pos, (tok, tag) in enumerate(zip(tokens, labels)):
            if tag.startswith('B-'):
                close()
                open_span = ([tok], pos, tag[2:])
            elif tag.startswith('I-'):
                if open_span is not None and open_span[2] == tag[2:]:
                    open_span[0].append(tok)
                else:
                    # Type mismatch or dangling I-: close and restart here.
                    close()
                    open_span = ([tok], pos, tag[2:])
            else:  # 'O'
                close()
        close()  # flush a trailing entity
        return found
def data_processing_example():
    """Demonstrate round-tripping text + spans through BIO tags and back."""
    proc = NERDataProcessor()
    sentence = "小明在北京大学读书"
    spans = [(0, 2, 'PER'), (3, 7, 'ORG')]  # (start, end-exclusive, label)
    chars, tags = proc.text_to_bios(sentence, spans)
    restored = proc.bios_to_entities(chars, tags)
    print(f"原文本: {sentence}")
    print(f"分词: {chars}")
    print(f"BIO标签: {tags}")
    print(f"恢复实体: {restored}")

data_processing_example()  # Section: BERT-based NER
#使用Transformers进行NER
from transformers import (
AutoTokenizer,
AutoModelForTokenClassification,
pipeline
)
import torch
class BertNER:
    """Token-classification NER wrapper around a BERT checkpoint.

    NOTE(review): the checkpoint loaded here is the raw pretrained LM with
    a randomly initialized classification head; for meaningful predictions
    a fine-tuned NER checkpoint must be supplied instead.
    """

    def __init__(self, model_name: str = "bert-base-chinese"):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Fixed BIO tag inventory for PER/ORG/LOC.
        self.label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
        self.id2label = {i: label for i, label in enumerate(self.label_list)}
        self.label2id = {label: i for i, label in enumerate(self.label_list)}
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(self.label_list),
            id2label=self.id2label,
            label2id=self.label2id
        )

    def predict(self, text: str):
        """Run the HF token-classification pipeline on `text`.

        Returns a list of dicts with keys entity/label/confidence/start/end.
        """
        ner_pipeline = pipeline(
            "token-classification",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple"  # merge adjacent sub-tokens of one entity
        )
        results = ner_pipeline(text)
        formatted_results = []
        for result in results:
            formatted_results.append({
                'entity': result['word'],
                # Fixed: with an aggregation strategy the pipeline emits the
                # key 'entity_group', not 'entity' — the original raised
                # KeyError here.
                'label': result['entity_group'],
                'confidence': result['score'],
                'start': result['start'],
                'end': result['end']
            })
        return formatted_results
def bert_ner_example():
    """Sketch the BERT NER workflow and print a mocked prediction.

    No fine-tuned checkpoint is available here, so only the structure is
    described and the prediction output is simulated.
    """
    print("BERT NER实现结构:")
    for step in ("1. 加载预训练模型和分词器",
                 "2. 定义标签集",
                 "3. 微调模型",
                 "4. 使用pipeline进行预测"):
        print(step)
    sample = "小明在北京大学读书"
    print(f"\n输入文本: {sample}")
    print("预测结果: [{'entity': '小明', 'label': 'PER', 'confidence': 0.98}, {'entity': '北京大学', 'label': 'ORG', 'confidence': 0.95}]")

bert_ner_example()  # Section: actual NER model implementation
from transformers import (
AutoTokenizer,
AutoModelForTokenClassification,
TrainingArguments,
Trainer
)
from datasets import Dataset
import numpy as np
def create_ner_trainer():
    """Build a complete HF `Trainer` setup for Chinese NER fine-tuning.

    Covers model/tokenizer loading, word-to-subtoken BIO label alignment,
    seqeval-based metrics and training arguments. Fix over the original:
    the toy example tagged the last character of 北京大学 ("学") as O
    (label 0); it is now I-ORG (label 4) so the entity span is contiguous.
    """
    model_name = "bert-base-chinese"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tag inventory: each B- tag sits at an odd index and its matching I-
    # tag at the next index — the alignment logic below relies on this.
    label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
    num_labels = len(label_list)

    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # Toy training sample: 小明(PER) 在 北京大学(ORG) 读书
    # i.e. B-PER I-PER O B-ORG I-ORG I-ORG I-ORG O O
    train_data = {
        "tokens": [["小", "明", "在", "北", "京", "大", "学", "读", "书"]],
        "labels": [[1, 2, 0, 3, 4, 4, 4, 0, 0]]  # fixed: "学" was wrongly 0 (O)
    }
    train_dataset = Dataset.from_dict(train_data)

    def align_labels_with_tokens(labels, word_ids):
        """Map word-level label ids onto sub-token positions.

        Special tokens get -100 (ignored by the loss); the first sub-token
        of a word keeps the word's label; later sub-tokens of a word whose
        label is a B- tag are switched to the matching I- tag.
        """
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)  # [CLS]/[SEP]/[PAD]
            elif word_idx != previous_word_idx:
                aligned_labels.append(labels[word_idx])
            else:
                label = labels[word_idx]
                if label % 2 == 1:  # odd id == B- tag
                    aligned_labels.append(label + 1)  # continue as I- tag
                else:
                    aligned_labels.append(label)
            previous_word_idx = word_idx
        return aligned_labels

    def tokenize_and_align_labels(examples):
        """Tokenize pre-split words and align BIO labels to sub-tokens."""
        tokenized_inputs = tokenizer(
            examples["tokens"],
            truncation=True,
            is_split_into_words=True,
            max_length=128,
            padding=True
        )
        labels = []
        for i, label in enumerate(examples["labels"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            labels.append(align_labels_with_tokens(label, word_ids))
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_dataset = train_dataset.map(
        tokenize_and_align_labels,
        batched=True
    )

    # NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in
    # transformers >= 4.41 — confirm against the pinned library version.
    training_args = TrainingArguments(
        output_dir="./ner_model",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        seed=42
    )

    from seqeval.metrics import accuracy_score, classification_report, f1_score

    def compute_metrics(p):
        """Entity-level accuracy/F1 via seqeval, skipping -100 positions."""
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)
        true_predictions = [
            [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100]
            for prediction, label in zip(predictions, labels)
        ]
        return {
            "accuracy": accuracy_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions),
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_dataset,  # TODO: use a held-out validation split
        compute_metrics=compute_metrics,
    )

    print("NER训练器已创建,包含以下组件:")
    print("1. BERT模型适配为Token Classification任务")
    print("2. BIO标签对齐处理")
    print("3. 评估指标 (seqeval)")
    print("4. 训练参数配置")

create_ner_trainer()  # Section: full fine-tuning workflow
#数据准备
import json
from datasets import Dataset
import pandas as pd
def prepare_ner_data():
    """Convert span-annotated raw samples into a character-level BIO Dataset.

    Returns a `datasets.Dataset` with "tokens" (character lists) and
    "labels" (BIO tag lists). Fix over the original: the ORG span for
    阿里巴巴 in the second sample was (4, 8), off by one — it covered
    里巴巴的 instead of 阿里巴巴; the correct character span is (3, 7).
    """
    print("NER数据准备流程:")

    # 1. Raw samples (in a real project these come from files or a DB).
    raw_data = [
        {
            "text": "小明在北京大学读书",
            "entities": [{"start": 0, "end": 2, "label": "PER"}, {"start": 3, "end": 7, "label": "ORG"}]
        },
        {
            "text": "马云是阿里巴巴的创始人",
            # 马(0)云(1)是(2)阿(3)里(4)巴(5)巴(6) -> 阿里巴巴 spans [3, 7)
            "entities": [{"start": 0, "end": 2, "label": "PER"}, {"start": 3, "end": 7, "label": "ORG"}]
        }
    ]

    # 2. Span annotations -> character-level BIO tags.
    def convert_to_bio_format(data_item):
        """Return {"tokens": chars, "labels": BIO tags} for one sample."""
        text = data_item["text"]
        entities = data_item["entities"]
        char_labels = ['O'] * len(text)
        for ent in entities:
            start, end, label = ent["start"], ent["end"], ent["label"]
            char_labels[start] = f"B-{label}"
            for i in range(start + 1, end):
                char_labels[i] = f"I-{label}"
        return {
            "tokens": list(text),
            "labels": char_labels
        }

    processed_data = [convert_to_bio_format(item) for item in raw_data]
    dataset = Dataset.from_list(processed_data)
    print(f"数据集大小: {len(dataset)}")
    print(f"示例数据: {dataset[0]}")
    return dataset

ner_dataset = prepare_ner_data()  # Section: fine-tuning implementation
def fine_tune_ner_model():
    """Outline the complete NER fine-tuning workflow.

    Prints the workflow steps and returns a seqeval-based
    `compute_metrics` callable suitable for a HF `Trainer`.
    """
    from transformers import (
        AutoTokenizer,
        AutoModelForTokenClassification,
        TrainingArguments,
        Trainer
    )
    from seqeval.metrics import classification_report, f1_score
    import numpy as np

    checkpoint = "bert-base-chinese"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # BIO tag inventory; B- tags at odd indices, matching I- tag follows.
    label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

    model = AutoModelForTokenClassification.from_pretrained(
        checkpoint,
        num_labels=len(label_list)
    )

    def tokenize_and_align_labels(examples):
        """Tokenize pre-split tokens and project BIO label ids onto sub-tokens."""
        encoded = tokenizer(
            examples["tokens"],
            truncation=True,
            is_split_into_words=True,
            padding=True,
            max_length=128
        )
        all_label_ids = []
        for row, row_labels in enumerate(examples["labels"]):
            word_ids = encoded.word_ids(batch_index=row)
            row_ids = []
            last_word = None
            for word_idx in word_ids:
                if word_idx is None:
                    row_ids.append(-100)  # special tokens: ignored by the loss
                elif word_idx != last_word:
                    row_ids.append(row_labels[word_idx])
                elif row_labels[word_idx] % 2 == 1:  # B- tag (odd id)
                    row_ids.append(row_labels[word_idx] + 1)  # -> I- tag
                else:
                    row_ids.append(row_labels[word_idx])
                last_word = word_idx
            all_label_ids.append(row_ids)
        encoded["labels"] = all_label_ids
        return encoded

    # Train/validation splitting is omitted here (the toy dataset is tiny);
    # only the workflow structure is shown.
    print("NER模型微调流程:")
    print("1. 加载预训练BERT模型")
    print("2. 准备训练数据(tokenize + label alignment)")
    print("3. 配置训练参数")
    print("4. 定义评估指标(seqeval)")
    print("5. 开始训练")

    def compute_metrics(p):
        """seqeval F1 plus a full classification report, skipping -100s."""
        logits, gold = p
        pred_ids = np.argmax(logits, axis=2)
        true_predictions = [
            [label_list[pr] for pr, gt in zip(pred_row, gold_row) if gt != -100]
            for pred_row, gold_row in zip(pred_ids, gold)
        ]
        true_labels = [
            [label_list[gt] for pr, gt in zip(pred_row, gold_row) if gt != -100]
            for pred_row, gold_row in zip(pred_ids, gold)
        ]
        return {
            "f1": f1_score(true_labels, true_predictions),
            "report": classification_report(true_labels, true_predictions)
        }

    print("\n评估指标函数已定义,使用seqeval计算F1分数")
    return compute_metrics

metrics_func = fine_tune_ner_model()  # Section: evaluation metrics
#NER评估指标详解
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np
def ner_evaluation_metrics():
    """Demonstrate seqeval's entity-level metrics on a perfect prediction.

    Prints accuracy/precision/recall/F1 and the classification report for
    a gold/predicted pair that match exactly, then explains each metric.
    """
    print("NER评估指标:")
    gold = [["B-PER", "I-PER", "O", "B-ORG", "I-ORG"]]
    pred = [["B-PER", "I-PER", "O", "B-ORG", "I-ORG"]]
    print(f"准确率: {accuracy_score(gold, pred)}")
    print(f"精确率: {precision_score(gold, pred)}")
    print(f"召回率: {recall_score(gold, pred)}")
    print(f"F1分数: {f1_score(gold, pred)}")
    print("\n分类报告:")
    print(classification_report(gold, pred))
    print("\n各指标含义:")
    print("1. 准确率(Accuracy): 正确预测的token数 / 总token数")
    print("2. 精确率(Precision): 预测为实体的token中实际为实体的比例")
    print("3. 召回率(Recall): 实体token中被正确识别的比例")
    print("4. F1分数: 精确率和召回率的调和平均数")
    print("\n实体级别的评估:")
    print("需要将token级别的预测结果聚合为完整的实体进行评估")

ner_evaluation_metrics()  # Section: performance analysis tools
def performance_analysis():
    """List the dimensions along which an NER model should be analyzed,
    plus concrete optimization suggestions. Output only; returns None."""
    report_lines = [
        "NER模型性能分析维度:",
        "\n1. 实体类型分析:",
        " - 各实体类型的精确率、召回率、F1分数",
        " - PER、ORG、LOC等不同类型的表现差异",
        "\n2. 长度分析:",
        " - 不同长度实体的识别效果",
        " - 短实体 vs 长实体的性能对比",
        "\n3. 边界分析:",
        " - 实体边界的准确性",
        " - 漏检、误检、边界错误的情况",
        "\n4. 上下文分析:",
        " - 不同语境下的识别效果",
        " - 嵌套实体的处理能力",
        "\n5. 错误分析:",
        " - 常见错误类型统计",
        " - 错误案例分析",
        "\n性能优化建议:",
        "1. 数据增强:增加训练数据多样性",
        "2. 标签平滑:减少过拟合",
        "3. 集成学习:组合多个模型预测",
        "4. 后处理:规则修正预测结果",
    ]
    for line in report_lines:
        print(line)

performance_analysis()  # Section: real-world applications
#智能客服实体识别
class CustomerServiceNER:
    """Lightweight entity extractor for customer-service queries.

    Uses keyword/regex matching as a stand-in for a trained NER model; the
    attribute lists document the entity inventory a real model would use.
    """

    def __init__(self):
        # Entity types a production NER model would be trained on.
        self.customer_entities = ['PRODUCT', 'TIME', 'MONEY', 'LOCATION']
        self.business_entities = ['ORDER_ID', 'CUSTOMER_NAME', 'COMPANY']

    def extract_customer_query_entities(self, query: str) -> dict:
        """Extract product/time/amount mentions from a query string.

        Returns a dict with keys products/times/amounts/locations/order_ids;
        categories with no match stay as empty lists.
        """
        entities = {
            'products': [],
            'times': [],
            'amounts': [],
            'locations': [],
            'order_ids': []
        }

        # Product mentions via a small keyword gazetteer (a real system
        # would use an NER model here).
        product_keywords = ['手机', '电脑', '平板', '耳机', '充电器']
        for keyword in product_keywords:
            if keyword in query:
                entities['products'].append(keyword)

        # Time expressions: absolute dates and relative day words.
        time_patterns = [r'(\d{4}年\d{1,2}月\d{1,2}日)', r'(\d{4}-\d{2}-\d{2})', r'(今天|明天|后天|昨天)']
        for pattern in time_patterns:
            entities['times'].extend(re.findall(pattern, query))

        # Monetary amounts. Fixed: the original used separate r'\d+元' and
        # r'\d+\.\d+元' patterns, so "9.5元" matched both — yielding the
        # partial duplicate "5元" alongside "9.5元". A single pattern with
        # an optional decimal part extracts each amount exactly once.
        money_patterns = [r'(\d+(?:\.\d+)?元)', r'(¥\d+(?:\.\d+)?)', r'(\$\d+(?:\.\d+)?)']
        for pattern in money_patterns:
            entities['amounts'].extend(re.findall(pattern, query))

        return entities
def customer_service_demo():
    """Run the CustomerServiceNER extractor over sample queries and print hits."""
    ner_system = CustomerServiceNER()
    sample_queries = [
        "我想查询订单号12345678的手机配送情况,预计明天送达",
        "我买的电脑价格是5999元,什么时候能退款?",
        "我的订单昨天下的,到现在还没发货"
    ]
    print("智能客服实体识别演示:")
    for idx, question in enumerate(sample_queries, 1):
        print(f"\n查询 {idx}: {question}")
        hits = ner_system.extract_customer_query_entities(question)
        print("提取的实体:")
        for category, values in hits.items():
            if values:
                print(f" {category}: {values}")

customer_service_demo()  # Section: news entity extraction
class NewsNERExtractor:
    """Toy entity extractor for news text.

    Stands in for a fine-tuned NER model: canned entity lists are returned
    when simple trigger substrings occur, empty lists otherwise.
    """

    def __init__(self):
        # Entity inventory a real news NER model would cover.
        self.news_entities = ['PERSON', 'ORGANIZATION', 'LOCATION', 'EVENT', 'TIME']

    def extract_news_entities(self, news_text: str) -> dict:
        """Return a dict of entity lists keyed by category (mock logic)."""
        has_person = '张三' in news_text or '李四' in news_text
        has_org = '公司' in news_text or '政府' in news_text
        has_loc = '北京' in news_text or '上海' in news_text
        return {
            'persons': ['张三', '李四'] if has_person else [],
            'organizations': ['某公司', '政府部门'] if has_org else [],
            'locations': ['北京', '上海'] if has_loc else [],
            'events': [],
            'times': []
        }
def news_extraction_demo():
    """Print extracted entities for a couple of sample news sentences."""
    extractor = NewsNERExtractor()
    headlines = [
        "昨日,张三在北京参加了某公司的发布会",
        "李四表示,政府部门将加大对企业的支持力度"
    ]
    print("新闻实体抽取演示:")
    for idx, story in enumerate(headlines, 1):
        print(f"\n新闻 {idx}: {story}")
        for category, values in extractor.extract_news_entities(story).items():
            if values:
                print(f" {category}: {values}")

news_extraction_demo()  # Section: related tutorials / summary
#总结
命名实体识别(NER)技术的核心要点:
- 标注方法:熟练掌握BIO、BIOES等标注方法
- 模型选择:基于BERT等预训练模型进行微调
- 数据质量:高质量标注数据是性能保证
- 评估指标:使用seqeval等专业工具评估
- 应用实践:结合具体业务场景进行优化
💡 核心要点:NER是信息抽取的基础,为知识图谱、问答系统等高级NLP应用提供关键支撑。
🔗 扩展阅读
- Hugging Face Token Classification
- Chinese NER Benchmarks
- Sequence Labeling with BERT
- BERT-based NER Survey
📂 所属阶段:第四阶段 — 预训练模型与迁移学习(应用篇)
🔗 相关章节:Hugging Face实战 · Prompt Engineering基础

