# 🔥 Hands-on deep learning tutorial: YOLO object detection + a Siamese network for CAPTCHA matching
This tutorial covers two classic tasks: YOLO-based object detection (locating label, title, char and other elements in the CAPTCHA image) and Siamese-network similarity learning (deciding whether two CAPTCHA images show the same character). The code has been tested and is ready for use in industrial projects.
#📦 Environment preparation (general)
Before running any scripts, make sure to install the following dependencies:
pip install ultralytics torch torchvision opencv-python numpy onnxruntime onnx
# Optional: install openvino-dev if you plan to export to OpenVINO
pip install openvino-dev
GPU check (an NVIDIA GPU is strongly recommended):
import torch
print(torch.cuda.is_available())  # prints True when CUDA is available

# Module 1: YOLO target detection (training → export → prediction)
This module uses the Ultralytics YOLO framework (which supports YOLOv5, YOLOv8, YOLO26, etc.) to train a custom detector, and demonstrates exporting an ONNX model and running CPU/GPU inference.
#1. Prepare data set
#Directory structure
datasets/
└── bilbil/
    ├── train/
    │   ├── images/   # training images (.jpg, .png)
    │   └── labels/   # matching .txt label files
    └── detect/
        ├── images/   # validation/test images
        └── labels/   # matching label files

# Label format (YOLO format)
Each image has a label file with the same base name and a .txt extension; each line has the format:
<class_id> <x_center> <y_center> <width> <height>
(All coordinates are normalized to the range 0–1.)
# Dataset configuration file: bilbil.yaml
path: D:\captcha\yolo_img_bilbil  # dataset root directory
train: train/images               # training images, relative to path
val: detect/images                # validation images, relative to path
names:
  0: label
  1: title
  2: char

# 2. Training script (detailed explanation of key parameters)
| Parameter | Default value | Description |
|---|---|---|
| imgsz | 640 | Input size; for small targets it can be increased to 800/1024 (uses more GPU memory) |
| optimizer | 'MuSGD' | Newer YOLO optimizer with smooth convergence; if the loss becomes NaN, switch to AdamW |
| close_mosaic | 10 | Disable mosaic augmentation for the last 10 epochs to improve accuracy |
| patience | 50 | Stop early if the validation metric does not improve for 50 consecutive epochs |
| batch | 16 | Adjust to the available GPU memory (16 suits 8 GB of GPU memory) |
Complete training script, train.py:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from ultralytics import YOLO
import torch
def main():
    """Train a YOLO detector, validate the result and export it to ONNX.

    All settings are the local constants below; edit them to match your
    dataset and hardware. GPU 0 is used when CUDA is available, otherwise
    training runs on the CPU.
    """
    # Configuration
    data_yaml = r"D:\captcha\yolov5\data\bilbil.yaml"
    pretrained_model = "yolo26n.pt"  # size variants: n/s/m/l/x
    epochs = 100
    imgsz = 640
    batch = 16
    workers = 8
    lr0 = 0.01
    lrf = 0.01
    optimizer = 'MuSGD'
    momentum = 0.937
    weight_decay = 0.0005
    warmup_epochs = 3
    close_mosaic = 10
    patience = 50
    device = 0 if torch.cuda.is_available() else 'cpu'

    # Load the pretrained weights and train
    model = YOLO(pretrained_model)
    model.train(
        data=data_yaml,
        epochs=epochs,
        imgsz=imgsz,
        batch=batch,
        workers=workers,
        lr0=lr0,
        lrf=lrf,
        optimizer=optimizer,
        momentum=momentum,
        weight_decay=weight_decay,
        warmup_epochs=warmup_epochs,
        close_mosaic=close_mosaic,
        patience=patience,
        device=device,
        plots=True,
        save=True,
    )

    # Validate the trained model on the dataset's validation split
    val_results = model.val(data=data_yaml, imgsz=imgsz, batch=batch)
    print(f"mAP50: {val_results.box.map50:.4f}, mAP50-95: {val_results.box.map:.4f}")

    # Export to ONNX with a fixed input shape
    model.export(format="onnx", imgsz=imgsz, dynamic=False)
    print("✅ 训练完成,模型已导出为 ONNX")


if __name__ == "__main__":
    main()

# The training output is saved in runs/detect/train*/; key files:
- weights/best.pt – best weights
- results.png – training curves
- confusion_matrix.png – confusion matrix
#3. Export model (ONNX/OpenVINO/TensorRT)
Standalone export script, export.py:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from ultralytics import YOLO
def main():
    """Export trained YOLO weights to ONNX (other formats are shown commented out)."""
    model_path = r"weights\best.pt"  # trained weights
    imgsz = 640
    data_yaml = r"bilbil.yaml"  # only needed for INT8 calibration (see below)

    model = YOLO(model_path)

    # ONNX export
    model.export(format="onnx", imgsz=imgsz, batch=1, device="cpu", opset=14)

    # OpenVINO FP32 export
    # model.export(format="openvino", imgsz=imgsz, half=False)
    # OpenVINO INT8 export (uses data_yaml for calibration)
    # model.export(format="openvino", imgsz=imgsz, int8=True, data=data_yaml)
    # TensorRT export (requires a GPU)
    # model.export(format="engine", imgsz=imgsz)


if __name__ == "__main__":
    main()

# 4. ONNX inference (pure CPU/GPU example)
import cv2
import numpy as np
import onnxruntime as ort
class YOLO26ONNX:
    """ONNX Runtime wrapper around an exported YOLO detection model.

    The first model output is read as rows of
    ``[x1, y1, x2, y2, conf, cls_id]`` expressed in letterboxed input
    coordinates; ``inference`` maps them back to the original image.
    """

    def __init__(self, model_path, conf_threshold=0.5, providers=None):
        """Create the session.

        Args:
            model_path: Path to the ``.onnx`` file.
            conf_threshold: Detections below this confidence are dropped.
            providers: Optional ONNX Runtime execution providers, e.g.
                ``['CUDAExecutionProvider', 'CPUExecutionProvider']``.
                Defaults to CPU only.
        """
        if providers is None:
            providers = ['CPUExecutionProvider']
        self.session = ort.InferenceSession(model_path, providers=list(providers))
        self.conf_threshold = conf_threshold
        model_input = self.session.get_inputs()[0]
        self.input_name = model_input.name
        self.input_shape = model_input.shape  # e.g. [1, 3, 640, 640]
        self.output_name = self.session.get_outputs()[0].name

    def _input_wh(self):
        """Return the model input (width, height); dynamic axes fall back to 640."""
        height, width = self.input_shape[2], self.input_shape[3]
        # Dynamic axes are reported as strings/None rather than ints.
        if not isinstance(height, int):
            height = 640
        if not isinstance(width, int):
            width = 640
        return width, height

    def letterbox(self, image, target_size=(640, 640)):
        """Resize keeping the aspect ratio and pad with gray (114) to ``target_size``.

        Args:
            image: HxWx3 uint8 image.
            target_size: ``(width, height)`` of the padded canvas.

        Returns:
            ``(canvas, scale, (dw, dh, new_w, new_h))`` where ``dw``/``dh``
            are the left/top padding in pixels.
        """
        h, w = image.shape[:2]
        scale = min(target_size[0] / w, target_size[1] / h)
        # Never let a dimension collapse to 0 for extreme aspect ratios.
        new_w, new_h = max(1, int(w * scale)), max(1, int(h * scale))
        resized = cv2.resize(image, (new_w, new_h))
        canvas = np.full((target_size[1], target_size[0], 3), 114, dtype=np.uint8)
        dw, dh = (target_size[0] - new_w) // 2, (target_size[1] - new_h) // 2
        canvas[dh:dh + new_h, dw:dw + new_w] = resized
        return canvas, scale, (dw, dh, new_w, new_h)

    def preprocess(self, image):
        """Convert a BGR image to a normalized NCHW float32 tensor.

        Returns:
            ``(tensor, scale, (dw, dh))`` with the letterbox parameters
            needed to undo the transform.
        """
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        padded, scale, (dw, dh, _, _) = self.letterbox(image_rgb, self._input_wh())
        padded = padded.astype(np.float32) / 255.0
        tensor = np.ascontiguousarray(np.transpose(padded, (2, 0, 1))[None])  # NCHW
        return tensor, scale, (dw, dh)

    @staticmethod
    def _restore_box(box, scale, dw, dh, img_w, img_h):
        """Map a letterboxed ``(x1, y1, x2, y2)`` box to integer image coordinates.

        Every corner is clamped to ``[0, img_w] x [0, img_h]``.
        """
        x1, y1, x2, y2 = box
        xs = [min(max((x - dw) / scale, 0), img_w) for x in (x1, x2)]
        ys = [min(max((y - dh) / scale, 0), img_h) for y in (y1, y2)]
        return [int(xs[0]), int(ys[0]), int(xs[1]), int(ys[1])]

    def inference(self, image):
        """Run detection on a BGR image.

        Returns:
            A list of ``[x1, y1, x2, y2, conf, cls_id]`` in original image
            pixel coordinates.

        Raises:
            ValueError: If ``image`` is None (e.g. ``cv2.imread`` failed).
        """
        if image is None:
            raise ValueError("image is None; check that the file exists and is readable")
        img_h, img_w = image.shape[:2]
        tensor, scale, (dw, dh) = self.preprocess(image)
        # Assumes an end-to-end export whose output is (1, num_dets, 6) — TODO confirm for other exports.
        outputs = self.session.run([self.output_name], {self.input_name: tensor})[0]
        detections = []
        for det in outputs[0]:
            x1, y1, x2, y2, conf, cls_id = det.tolist()
            if conf < self.conf_threshold:
                continue
            box = self._restore_box((x1, y1, x2, y2), scale, dw, dh, img_w, img_h)
            detections.append([*box, conf, int(cls_id)])
        return detections
if __name__ == "__main__":
    # Demo: run the detector on one image and print every detection.
    image_path = "test.jpg"
    yolo = YOLO26ONNX("best.onnx", conf_threshold=0.5)
    img = cv2.imread(image_path)
    if img is None:  # cv2.imread returns None instead of raising
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    results = yolo.inference(img)
    for r in results:
        print(f"类别{r[5]}, 置信度{r[4]:.2f}, 坐标{r[:4]}")

# Module 2: Twin network (verification code text matching)
Applicable to click-based CAPTCHAs and text-matching scenarios: given two images (a prompt image and a candidate image), the model outputs whether they show the same character.
#Dataset structure
数据集根目录/
├── 验证码A/ # 文件夹名任意,代表一个字符/类别
│ ├── char001.jpg # 包含 'char' 字样,数字编号 001
│ ├── plan001.jpg # 包含 'plan' 字样,数字编号 001
│ ├── char002.png
│ ├── plan002.png
│ └── ... # 可以有多个不同编号的配对
├── 验证码B/
│ ├── char001.jpeg
│ ├── plan001.jpeg
│ └── ...
└── ...

# 1. Model structure: SiameseMobileNetV4
Adopt twin-tower shared weight + multi-feature fusion design:
# !/usr/bin/env python
# -*-coding:utf-8 -*-
"""
# File : SiameseMobileNetV4.py
# Time :2026/4/30 17:37
# Author :yujia
# version :python 3.6
# Description:
"""
import torch
import torch.nn as nn
import timm
class SiameseMobileNetV4(nn.Module):
    """Siamese network on a shared MobileNetV4-Conv-Medium backbone.

    Both inputs go through the same backbone; the two feature vectors are
    fused and scored by a small MLP that outputs one logit per pair.

    Feature sizes noted by the original author:
        mobilenetv4_conv_small     960
        mobilenetv4_conv_medium    1280  (used here)
        mobilenetv4_hybrid_medium  1280  (ONNX export not supported yet)
    """

    def __init__(self, pretrained=True):
        super().__init__()
        # num_classes=0 removes the classifier so the backbone returns pooled features.
        self.backbone = timm.create_model('mobilenetv4_conv_medium', pretrained=pretrained, num_classes=0)
        self.feature_dim = 1280  # backbone output size; must match the chosen backbone
        self.dropout = nn.Dropout(0.2)
        # Similarity head over the 4-way fused feature vector.
        # (A single nn.Linear(self.feature_dim * 4, 1) is a lighter alternative.)
        self.fusion_head = nn.Sequential(
            nn.Linear(self.feature_dim * 4, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 1),  # raw logit: no Sigmoid, pair with a *WithLogits loss
        )

    def extract_feature(self, x):
        """Return the dropout-regularized backbone features, shape [B, feature_dim]."""
        return self.dropout(self.backbone(x))

    def forward(self, x1, x2):
        """Score each (x1[i], x2[i]) pair.

        Returns:
            Logits of shape [B, 1]; apply a sigmoid to get a match probability.
        """
        v1 = self.extract_feature(x1)
        v2 = self.extract_feature(x2)
        # Fuse: both vectors, their absolute difference, and their product -> [B, feature_dim * 4]
        fused = torch.cat([v1, v2, torch.abs(v1 - v2), v1 * v2], dim=1)
        return self.fusion_head(fused)
if __name__ == "__main__":
    # Smoke test: one forward pass on random inputs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Testing SiameseMobileNetV4...")
    model1 = SiameseMobileNetV4(pretrained=True).to(device)
    model1.eval()  # deterministic check: no dropout, BatchNorm uses running stats
    img1 = torch.rand([4, 3, 112, 112]).to(device)
    img2 = torch.rand([4, 3, 112, 112]).to(device)
    with torch.no_grad():
        out1 = model1(img1, img2)
    print(f"Output shape: {out1.shape}, values: {out1.squeeze().tolist()}")

# Design Highlights:
- Use torch.abs(v1 - v2) to capture differences directly and v1 * v2 to capture commonalities.
- MobileNetV4 is a pure-convolution architecture, so it is easy to export to ONNX/OpenVINO.
- Add BatchNorm and Dropout to the discriminant head to prevent overfitting.
#2. Data loading and positive and negative sample construction
The data set is divided into training/validation by character folder to prevent data leakage. Each pair of sample generation rules:
- Positive sample (label=1): In the same character folder, the prompt image and the candidate image have the same number.
- Negative sample (label=0): Random combination of different numbers in the same folder or different folders.
Core processing flow:
import os
import re
import random
import numpy as np
import cv2
import torch
from torch.utils.data.dataset import Dataset
# ---------------------------------------------------#
# 图像预处理工具函数(完全保留)
# ---------------------------------------------------#
def cvtColor(image_np):
    """Convert an OpenCV image to 3-channel RGB.

    3-channel input is treated as BGR, 4-channel as BGRA (alpha dropped);
    anything else is treated as grayscale.
    """
    if len(image_np.shape) == 3:
        channels = image_np.shape[2]
        if channels == 3:
            return cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
        if channels == 4:
            bgr = cv2.cvtColor(image_np, cv2.COLOR_BGRA2BGR)
            return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    return cv2.cvtColor(image_np, cv2.COLOR_GRAY2RGB)
def letterbox_image(image_np, target_size):
    """Resize keeping the aspect ratio, then center on a gray (128) canvas.

    Args:
        image_np: HxWx3 uint8 image.
        target_size: ``(height, width)`` of the output.

    Returns:
        A ``(height, width, 3)`` uint8 array.
    """
    h, w = target_size
    ih, iw = image_np.shape[:2]
    scale = min(w / iw, h / ih)
    # Keep at least one pixel so cv2.resize never gets a zero-sized target.
    nw = max(1, int(iw * scale))
    nh = max(1, int(ih * scale))
    resized = cv2.resize(image_np, (nw, nh), interpolation=cv2.INTER_CUBIC)
    new_image = np.full((h, w, 3), 128, dtype=np.uint8)
    dx = (w - nw) // 2
    dy = (h - nh) // 2
    new_image[dy:dy + nh, dx:dx + nw] = resized
    return new_image
def preprocess_input(x):
    """Scale 0-255 pixel values to float32 in [0, 1]."""
    return np.asarray(x).astype(np.float32) / 255.0
def rand(a=0, b=1):
    """Return a uniform random float in [a, b), drawn from NumPy's global RNG."""
    return np.random.rand() * (b - a) + a
# ---------------------------------------------------#
# 新的 load_dataset:构建正负样本对并划分
# ---------------------------------------------------#
def _collect_folder_pairs(dataset_path):
    """Walk ``dataset_path`` and return ``[(folder, [(char_path, plan_path), ...]), ...]``.

    Within a folder, a 'char' file and a 'plan' file are paired when the
    first number in their file names is equal.
    """
    folder_pairs = []
    for root, dirs, files in os.walk(dataset_path):
        dirs.sort()  # fixed traversal order so the seeded split is reproducible
        char_by_num = {}
        plan_by_num = {}
        for fname in sorted(files):
            nums = re.findall(r'\d+', fname)
            if not nums:
                continue
            lowered = fname.lower()
            if 'char' in lowered:
                char_by_num[nums[0]] = os.path.join(root, fname)
            if 'plan' in lowered:
                plan_by_num[nums[0]] = os.path.join(root, fname)
        pairs = [(char_path, plan_by_num[num])
                 for num, char_path in char_by_num.items() if num in plan_by_num]
        if pairs:
            folder_pairs.append((root, pairs))
    return folder_pairs


def _build_samples(folders, rng):
    """Emit one positive and, when possible, one negative sample per pair.

    The negative pairs the char image with the plan image of a different
    number from the same folder; if the folder holds a single pair, a plan
    image from another folder of the same split is used instead.
    """
    samples = []
    for fidx, (_, pairs) in enumerate(folders):
        for pos, (char_path, plan_path) in enumerate(pairs):
            samples.append((char_path, plan_path, 1))
            if len(pairs) > 1:
                # Uniform pick among the other pairs: draw from n-1 slots and skip our own.
                j = rng.randrange(len(pairs) - 1)
                if j >= pos:
                    j += 1
                neg_plan = pairs[j][1]
            elif len(folders) > 1:
                k = rng.randrange(len(folders) - 1)
                if k >= fidx:
                    k += 1
                neg_plan = rng.choice(folders[k][1])[1]
            else:
                continue  # nothing to contrast against
            samples.append((char_path, neg_plan, 0))
    return samples


def load_dataset(dataset_path, train_ratio=0.8, seed=42):
    """Build train/validation sample pairs from a folder-per-character dataset.

    Whole folders are assigned to either split, so no validation folder is
    seen during training.

    Args:
        dataset_path: Root directory that is searched recursively.
        train_ratio: Fraction of folders (rounded down) used for training.
        seed: Seed of the private RNG used for the split and the negative
            sampling; the global ``random`` state is left untouched.

    Returns:
        ``(train_samples, val_samples)``; each sample is
        ``(img1_path, img2_path, label)`` with label 1 for a matching
        char/plan pair and 0 for a mismatched one.
    """
    # 1. Collect every folder and its numbered char/plan pairs
    folder_pairs = _collect_folder_pairs(dataset_path)
    print(f"共找到 {sum(len(p) for _, p in folder_pairs)} 个有效图像对,文件夹总数: {len(folder_pairs)}")

    # 2. Split by whole folder
    rng = random.Random(seed)
    rng.shuffle(folder_pairs)
    num_train_folders = int(len(folder_pairs) * train_ratio)
    train_folders = folder_pairs[:num_train_folders]
    val_folders = folder_pairs[num_train_folders:]

    # 3. Build positive/negative samples inside each split
    train_samples = _build_samples(train_folders, rng)
    val_samples = _build_samples(val_folders, rng)
    print(f"训练集样本数: {len(train_samples)} (其中正: {sum(l for _,_,l in train_samples)}, 负: {sum(1 for _,_,l in train_samples if l==0)})")
    print(f"验证集样本数: {len(val_samples)} (其中正: {sum(l for _,_,l in val_samples)}, 负: {sum(1 for _,_,l in val_samples if l==0)})")
    return train_samples, val_samples
# ---------------------------------------------------#
# 全新的 SiameseDataset:只负责图像读取和增强
# ---------------------------------------------------#
class SiameseDataset(Dataset):
    """Dataset over prebuilt ``(img1_path, img2_path, label)`` samples.

    It only reads, augments and normalizes images; building the pairs is
    the caller's job.
    """

    def __init__(self, samples, input_shape=(112, 112), random=True):
        """
        Args:
            samples: List of ``(img1_path, img2_path, label)``.
            input_shape: ``(height, width)`` every image is letterboxed to.
            random: Apply random augmentation when True (training).
        """
        self.samples = samples
        self.input_shape = input_shape
        self.random = random

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        """Return ``([img1, img2], label)``; images are CHW float32 in [0, 1]."""
        img1_path, img2_path, label = self.samples[index]
        img1 = self._load_and_preprocess(img1_path)
        img2 = self._load_and_preprocess(img2_path)
        return [img1, img2], np.float32(label)

    def _load_and_preprocess(self, img_path):
        """Read one image and return it as a normalized CHW float32 array.

        Raises:
            FileNotFoundError: If OpenCV cannot read ``img_path``.
        """
        image = cv2.imread(img_path)
        if image is None:  # cv2.imread signals failure with None
            raise FileNotFoundError(f"Cannot read image: {img_path}")
        image = cvtColor(image)
        image = letterbox_image(image, self.input_shape)
        # Augment while still uint8 to avoid float color-range overflow.
        if self.random:
            image = self._apply_augment(image)
        # Normalize last: uint8 -> float32 in [0, 1]
        image = preprocess_input(image)
        image = np.transpose(image, (2, 0, 1))
        return image

    def _apply_augment(self, image):
        """Randomly flip, rotate and color-jitter an RGB uint8 image.

        Each step fires independently with probability 0.5. This can be
        replaced by a stronger pipeline (e.g. albumentations).
        """
        # NOTE(review): the flip is drawn independently for each image of a pair,
        # so a matching pair may end up mirrored on one side only — confirm this is intended.
        h, w = image.shape[:2]
        if rand() < 0.5:
            image = cv2.flip(image, 1)
        if rand() < 0.5:
            angle = np.random.randint(-15, 16)  # upper bound is exclusive: -15..15 degrees
            center = (w // 2, h // 2)
            M = cv2.getRotationMatrix2D(center, angle, 1.0)
            # Fill exposed corners with the letterbox gray.
            image = cv2.warpAffine(image, M, (w, h), borderValue=(128, 128, 128))
        if rand() < 0.5:
            hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV).astype(np.float32)
            h_shift = rand(-0.1, 0.1) * 180
            s_scale = rand(1 - 0.7, 1 + 0.7)
            v_scale = rand(1 - 0.3, 1 + 0.3)
            hsv[:, :, 0] = (hsv[:, :, 0] + h_shift) % 180  # uint8 hue wraps at 180 in OpenCV
            hsv[:, :, 1] = np.clip(hsv[:, :, 1] * s_scale, 0, 255)
            hsv[:, :, 2] = np.clip(hsv[:, :, 2] * v_scale, 0, 255)
            image = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB)
        return image
# ---------------------------------------------------#
# 新的 collate_fn,与训练代码接口完全兼容
# ---------------------------------------------------#
def dataset_collate(batch):
    """Collate ``([img1, img2], label)`` items into batch tensors.

    Returns:
        ``(images, labels)`` where ``images`` is float32 of shape
        (2, B, C, H, W) — index 0 holds the left images, index 1 the right
        ones — and ``labels`` is float32 of shape (B, 1).
    """
    left = np.stack([pair[0] for pair, _ in batch])
    right = np.stack([pair[1] for pair, _ in batch])
    labels = np.array([label for _, label in batch])
    images = torch.stack(
        [torch.from_numpy(left).float(), torch.from_numpy(right).float()],
        dim=0,
    )
    labels_tensor = torch.from_numpy(labels).float().view(-1, 1)
    return images, labels_tensor
# ---------------------------------------------------#
# 测试:显示几对样本供人工检查
# ---------------------------------------------------#
if __name__ == '__main__':
    # Manual check: show the first batch of sample pairs.
    import matplotlib.pyplot as plt

    def to_display(tensor):
        """Convert a CHW float tensor in [0, 1] to an HWC uint8 image."""
        img = tensor.numpy().transpose(1, 2, 0)
        img = (img * 255).clip(0, 255).astype(np.uint8)
        return img

    data_path = r"D:\captcha\Siamese\data\jiyan"
    train_samples, val_samples = load_dataset(data_path)
    train_dataset = SiameseDataset(train_samples, input_shape=(112, 112), random=True)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=4,
        shuffle=True,
        collate_fn=dataset_collate
    )
    for images, labels in train_loader:
        x1, x2 = images  # images is (2, B, C, H, W)
        labs = labels.view(-1).numpy()  # stays 1-D even for a single-sample batch
        num = min(4, len(x1))
        # squeeze=False keeps axes 2-D so axes[i, j] also works when num == 1
        fig, axes = plt.subplots(num, 2, figsize=(8, 4 * num), squeeze=False)
        for i in range(num):
            axes[i, 0].imshow(to_display(x1[i]))
            axes[i, 1].imshow(to_display(x2[i]))
            axes[i, 0].set_title(f"Left (label={labs[i]})")
            axes[i, 1].set_title(f"Right (label={labs[i]})")
            for ax in axes[i]:
                ax.axis('off')
        plt.tight_layout()
        plt.show()
        break

# 3. Training script (hierarchical learning rate + Focal Loss)
# !/usr/bin/env python
# -*-coding:utf-8 -*-
"""
# File : train.py
# Time :2026/4/30 16:48
# Author :yujia
# version :python 3.6
# Description:
"""
import torch.nn.functional as F
import os
import time
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score
from torch.optim.lr_scheduler import CosineAnnealingLR
# 导入你的数据集相关函数(确保它们在同一目录或被正确导入)
from dataloader import load_dataset, SiameseDataset, dataset_collate
from SiameseEfficientNet import SiameseEfficientNet
from SiameseEdgeNeXt import SiameseEdgeNeXt
from SiameseMobileNetV4 import SiameseMobileNetV4
# ---------------------------- Configuration ----------------------------
DATA_PATH = r"D:\captcha\Siamese\data\vercode1117"
MODEL_TYPE = "MobileNetV4" # one of "edgenext", "efficientnet", "MobileNetV4"
PRETRAINED = True
INPUT_SIZE = (112, 112)
BATCH_SIZE = 32 # sample pairs per batch; each pair contributes 2 images, i.e. 64 images per batch
EPOCHS = 80
LR_BACKBONE = 1e-4
LR_HEAD = 1e-3
WEIGHT_DECAY = 1e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Checkpoint directory (created at import time if it does not exist)
SAVE_DIR = "./checkpoints"
os.makedirs(SAVE_DIR, exist_ok=True)
# ---------------------------- 工具函数 ----------------------------
def compute_metrics(labels, logits):
    """Compute binary accuracy and ROC AUC from raw logits.

    Args:
        labels: Nested list/array of 0/1 targets.
        logits: Nested list/array of raw model outputs (same length).

    Returns:
        ``(acc, auc, preds, probs)``: accuracy at threshold 0.5, ROC AUC
        (0.5 when it is undefined), the 0/1 predictions and the sigmoid
        probabilities.
    """
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs >= 0.5).astype(int).flatten()
    labels = np.array(labels).flatten()
    acc = accuracy_score(labels, preds)
    try:
        auc = roc_auc_score(labels, probs.flatten())
    except ValueError:
        # roc_auc_score rejects inputs that contain a single class.
        auc = 0.5
    return acc, auc, preds, probs
def train_one_epoch(model, loader, criterion, optimizer, device, epoch, total_epochs):
    """Run one training epoch and return ``(avg_loss, acc, auc)``.

    Args:
        model: Siamese model called as ``model(x1, x2)``, returning logits.
        loader: Yields ``(images, labels)`` with images stacked as (2, B, C, H, W).
        criterion: Accepted for interface compatibility but NOT used; the
            loss is always ``focal_bce_loss``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        epoch: 1-based epoch number (progress-bar text only).
        total_epochs: Total number of epochs (progress-bar text only).
    """
    model.train()
    total_loss, all_labels, all_logits = [], [], []
    pbar = tqdm(loader, desc=f"Train Epoch {epoch}/{total_epochs}", leave=False)
    for images, labels_tensor in pbar:
        x1 = images[0].to(device)
        x2 = images[1].to(device)
        targets = labels_tensor.to(device).float().view(-1, 1)

        logits = model(x1, x2)
        loss = focal_bce_loss(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss.append(loss.item())
        all_labels.extend(targets.cpu().tolist())
        all_logits.extend(logits.detach().cpu().tolist())
        # Show the current batch loss on the progress bar
        pbar.set_postfix({'loss': f"{loss.item():.4f}"})

    avg_loss = np.mean(total_loss)
    acc, auc, _, _ = compute_metrics(all_labels, all_logits)
    return avg_loss, acc, auc
def validate(model, loader, criterion, device):
    """Evaluate the model on ``loader`` and return ``(avg_loss, acc, auc)``.

    Runs in eval mode without gradients. ``criterion`` is accepted for
    interface compatibility but NOT used; the loss is always
    ``focal_bce_loss``, the same as in training.
    """
    model.eval()
    total_loss, all_labels, all_logits = [], [], []
    pbar = tqdm(loader, desc="Validation", leave=False)
    with torch.no_grad():
        for images, labels_tensor in pbar:
            x1 = images[0].to(device)
            x2 = images[1].to(device)
            targets = labels_tensor.to(device).float().view(-1, 1)

            logits = model(x1, x2)
            loss = focal_bce_loss(logits, targets)

            total_loss.append(loss.item())
            all_labels.extend(targets.cpu().tolist())
            all_logits.extend(logits.cpu().tolist())
            pbar.set_postfix({'loss': f"{loss.item():.4f}"})

    avg_loss = np.mean(total_loss)
    acc, auc, _, _ = compute_metrics(all_labels, all_logits)
    return avg_loss, acc, auc
def focal_bce_loss(logits, targets, gamma=2.0, alpha=0.25, smoothing=0.1):
    """Focal-weighted binary cross-entropy on logits with label smoothing.

    Args:
        logits: Raw model outputs.
        targets: 0/1 targets, same shape as ``logits``.
        gamma: Focusing exponent; 0 disables the focal weighting.
        alpha: Constant scale applied to every element (not a per-class weight).
        smoothing: Label smoothing; targets become ``t * (1 - s) + 0.5 * s``.

    Returns:
        Scalar mean loss.
    """
    # Label smoothing pulls the targets toward 0.5
    targets = targets * (1 - smoothing) + 0.5 * smoothing
    bce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    # exp(-bce) serves as the "probability of being right" that drives the focal weight
    pt = torch.exp(-bce)
    focal = alpha * (1 - pt) ** gamma * bce
    return focal.mean()
def train():
    """Train the Siamese model selected by ``MODEL_TYPE``.

    Uses AdamW with separate learning rates for backbone and head, cosine
    annealing over ``EPOCHS``, and early stopping on validation accuracy.
    The latest and the best weights are written to ``SAVE_DIR``.

    Raises:
        ValueError: If ``MODEL_TYPE`` is not a supported model name.
    """
    # ---------------------------- Data ----------------------------
    print("Loading dataset...")
    train_samples, val_samples = load_dataset(DATA_PATH, train_ratio=0.8)

    train_dataset = SiameseDataset(train_samples, input_shape=INPUT_SIZE, random=True)
    val_dataset = SiameseDataset(val_samples, input_shape=INPUT_SIZE, random=False)

    # Pinned memory only helps host-to-GPU copies; on CPU it just triggers a warning.
    pin_memory = DEVICE.type == "cuda"
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              collate_fn=dataset_collate, num_workers=0, pin_memory=pin_memory)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                            collate_fn=dataset_collate, num_workers=0, pin_memory=pin_memory)

    # ---------------------------- Model ----------------------------
    if MODEL_TYPE == "edgenext":
        model = SiameseEdgeNeXt(pretrained=PRETRAINED)
    elif MODEL_TYPE == "efficientnet":
        model = SiameseEfficientNet(pretrained=PRETRAINED)
    elif MODEL_TYPE == "MobileNetV4":
        model = SiameseMobileNetV4(pretrained=PRETRAINED)
    else:
        raise ValueError("MODEL_TYPE must be 'edgenext', 'efficientnet' or 'MobileNetV4'")
    model = model.to(DEVICE)

    # ---------------------------- Loss, optimizer, scheduler ----------------------------
    # Passed through to the epoch functions, which currently use focal_bce_loss instead.
    criterion = nn.BCEWithLogitsLoss()

    # Layer-wise learning rates: smaller for the backbone, larger for the fusion head
    optimizer = optim.AdamW([
        {'params': model.backbone.parameters(), 'lr': LR_BACKBONE},
        {'params': model.fusion_head.parameters(), 'lr': LR_HEAD},
    ], weight_decay=WEIGHT_DECAY)

    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    # Early stopping state
    best_val_acc = 0
    patience = 15
    early_stop_counter = 0

    # ---------------------------- Training loop ----------------------------
    print("\nStart training...")
    for epoch in range(EPOCHS):
        start_time = time.time()

        # epoch + 1 and EPOCHS are only used for the progress-bar text
        train_loss, train_acc, train_auc = train_one_epoch(
            model, train_loader, criterion, optimizer, DEVICE, epoch + 1, EPOCHS
        )
        val_loss, val_acc, val_auc = validate(model, val_loader, criterion, DEVICE)

        scheduler.step()
        lr_backbone = optimizer.param_groups[0]['lr']
        lr_head = optimizer.param_groups[1]['lr']

        print(f"\nEpoch {epoch + 1:03d}/{EPOCHS} | Time: {time.time() - start_time:.1f}s | "
              f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} AUC: {train_auc:.4f} | "
              f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f} AUC: {val_auc:.4f} | "
              f"LR: backbone={lr_backbone:.2e}, head={lr_head:.2e}")
        torch.save(model.state_dict(), os.path.join(SAVE_DIR, f"last_{MODEL_TYPE}.pth"))

        # Keep the checkpoint with the best validation accuracy
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            early_stop_counter = 0
            torch.save(model.state_dict(), os.path.join(SAVE_DIR, f"best_{MODEL_TYPE}.pth"))
            print(f"\n => Best model saved (val_acc={val_acc:.4f})")
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f"Early stopping triggered after {epoch + 1} epochs.")
                break

    # The tracked value is validation accuracy, so report it as such.
    print("Training finished. Best val acc: {:.4f}".format(best_val_acc))


if __name__ == '__main__':
    train()
Training monitoring metrics: validation accuracy (Val Acc) and AUC. If the training accuracy is much higher than the validation accuracy, increase Dropout or reduce the model capacity.
# 4. Inference and Deployment
Export ONNX after training is complete:
# !/usr/bin/env python
# -*-coding:utf-8 -*-
"""
# File : export.py.py
# Time :2026/4/30 17:53
# Author :yujia
# version :python 3.6
# Description:
"""
import os
import torch
import torch.nn as nn
import timm
from SiameseMobileNetV4 import SiameseMobileNetV4
def export_onnx(model, onnx_path, input_size=(112, 112)):
    """Export a two-input Siamese model to ONNX with a dynamic batch axis.

    Args:
        model: Module called as ``model(x1, x2)``; it is moved to the
            available device and switched to eval mode.
        onnx_path: Destination ``.onnx`` file.
        input_size: ``(height, width)`` of the dummy inputs.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    x1 = torch.randn(1, 3, *input_size, device=device)
    x2 = torch.randn(1, 3, *input_size, device=device)

    # Declare the batch axis dynamic on both inputs and on the output,
    # so the graph metadata matches what batched inference returns.
    dynamic_axes = {
        "input1": {0: "batch"},
        "input2": {0: "batch"},
        "logits": {0: "batch"},
    }

    torch.onnx.export(
        model,
        (x1, x2),
        onnx_path,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["input1", "input2"],
        output_names=["logits"],
        dynamic_axes=dynamic_axes,
        dynamo=False  # use the classic TorchScript-based exporter
    )
    print(f"✅ ONNX exported to: {onnx_path}")
def validate_onnx(onnx_path, input_size=(112, 112)):
    """Check that the exported model runs with a batch size other than 1.

    Feeds random batch-of-2 inputs through ONNX Runtime on the CPU.

    Raises:
        RuntimeError: If the output batch dimension is not 2.
    """
    import onnxruntime
    import numpy as np

    batch = 2  # differs from the export batch (1) to exercise the dynamic axis
    session = onnxruntime.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    x1 = np.random.randn(batch, 3, *input_size).astype(np.float32)
    x2 = np.random.randn(batch, 3, *input_size).astype(np.float32)
    ort_inputs = {
        session.get_inputs()[0].name: x1,
        session.get_inputs()[1].name: x2
    }
    outputs = session.run(None, ort_inputs)
    if outputs[0].shape[0] != batch:
        raise RuntimeError(
            f"Dynamic batch check failed: expected {batch} outputs, got shape {outputs[0].shape}"
        )
    print(f"✅ Validate OK. Output shape: {outputs[0].shape}")
if __name__ == "__main__":
    WEIGHT_PATH = "checkpoints/best_MobileNetV4.pth"
    ONNX_PATH = "onnx/siamese_mobilenetv4_hybrid_medium.onnx"
    os.makedirs("onnx", exist_ok=True)

    # Build the model and load the trained weights
    model = SiameseMobileNetV4(pretrained=False)
    # weights_only=True: the checkpoint is a plain state_dict, so no arbitrary unpickling is needed
    state_dict = torch.load(WEIGHT_PATH, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict, strict=True)

    # Export to ONNX
    export_onnx(model, ONNX_PATH, input_size=(112, 112))

    # Verify the dynamic batch axis
    validate_onnx(ONNX_PATH, input_size=(112, 112))

# Inference using ONNX Runtime:
# !/usr/bin/env python
# -*-coding:utf-8 -*-
"""
# File : val_onnx.py
# Time :2026/4/30 17:54
# Author :yujia
# version :python 3.6
# Description:
"""
import os
import cv2
import numpy as np
import onnxruntime as ort
def cvtColor(image_np):
    """Return the image as 3-channel RGB.

    3-channel input is treated as BGR, 4-channel as BGRA (alpha dropped);
    anything else is treated as grayscale.
    """
    if len(image_np.shape) == 3:
        channels = image_np.shape[2]
        if channels == 3:
            return cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
        if channels == 4:
            bgr = cv2.cvtColor(image_np, cv2.COLOR_BGRA2BGR)
            return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    return cv2.cvtColor(image_np, cv2.COLOR_GRAY2RGB)
def letterbox_image(image_np, target_size):
    """Resize keeping the aspect ratio, then center on a gray (128) canvas.

    Args:
        image_np: HxWx3 uint8 image.
        target_size: ``(height, width)`` of the output.

    Returns:
        A ``(height, width, 3)`` uint8 array.
    """
    h, w = target_size
    ih, iw = image_np.shape[:2]
    scale = min(w / iw, h / ih)
    # Keep at least one pixel so cv2.resize never gets a zero-sized target.
    nw = max(1, int(iw * scale))
    nh = max(1, int(ih * scale))
    resized = cv2.resize(image_np, (nw, nh), interpolation=cv2.INTER_CUBIC)
    new_image = np.full((h, w, 3), 128, dtype=np.uint8)
    dx = (w - nw) // 2
    dy = (h - nh) // 2
    new_image[dy:dy + nh, dx:dx + nw] = resized
    return new_image
def preprocess_input(x):
    """Scale 0-255 pixel values to float32 in [0, 1]."""
    return np.asarray(x).astype(np.float32) / 255.0
def preprocess_image(img: np.ndarray, input_size=(112, 112)) -> np.ndarray:
    """Turn an already-loaded OpenCV image into a model input.

    Args:
        img: Image array as returned by ``cv2.imread`` (not a file path).
        input_size: ``(height, width)`` the image is letterboxed to.

    Returns:
        float32 array of shape [1, 3, H, W] with values in [0, 1].

    Raises:
        ValueError: If ``img`` is None (e.g. ``cv2.imread`` failed).
    """
    if img is None:
        raise ValueError("img is None; check that the image was read successfully")
    img = cvtColor(img)
    img = letterbox_image(img, input_size)
    img = preprocess_input(img)  # normalize to [0, 1]
    img = np.transpose(img, (2, 0, 1)).astype(np.float32)  # HWC -> CHW
    return np.expand_dims(img, axis=0)  # [1, 3, H, W]
# ===================== ONNX 推理接口 =====================
class ONNXInference:
    """ONNX Runtime front end for the two-input Siamese similarity model."""

    def __init__(self, onnx_path: str, device: str = 'cpu', input_size=(112, 112)):
        """
        Args:
            onnx_path: Path to the ONNX model file.
            device: 'cpu' or 'cuda' ('cuda' needs onnxruntime-gpu).
            input_size: ``(height, width)`` of the model input; must match the export.
        """
        self.input_size = input_size
        # CUDA goes first when requested; CPU stays as the fallback.
        providers = ['CPUExecutionProvider']
        if device == 'cuda':
            providers.insert(0, 'CUDAExecutionProvider')
        self.session = ort.InferenceSession(onnx_path, providers=providers)
        self.input_names = [inp.name for inp in self.session.get_inputs()]
        self.output_names = [out.name for out in self.session.get_outputs()]
        print(f"ONNX model loaded. Inputs: {self.input_names}, Outputs: {self.output_names}")

    @staticmethod
    def _sigmoid(logits):
        """Numerically stable sigmoid: exp(-log(1 + exp(-x))) never overflows."""
        return np.exp(-np.logaddexp(0, -logits))

    def _prepare(self, image):
        """Return a [1, 3, H, W] model input from an image array or an image path."""
        if isinstance(image, (str, os.PathLike)):
            loaded = cv2.imread(os.fspath(image))
            if loaded is None:  # cv2.imread returns None instead of raising
                raise FileNotFoundError(f"Cannot read image: {image}")
            image = loaded
        return preprocess_image(image, self.input_size)

    def predict_pair(self, img1_path, img2_path) -> float:
        """Compare two images and return their match probability in [0, 1].

        Each argument may be an image array (as returned by ``cv2.imread``)
        or a path to an image file.
        """
        img1 = self._prepare(img1_path)
        img2 = self._prepare(img2_path)
        ort_inputs = {
            self.input_names[0]: img1,
            self.input_names[1]: img2
        }
        logits = self.session.run(self.output_names, ort_inputs)[0]  # shape [1, 1]
        prob = self._sigmoid(logits)
        return float(prob[0, 0])

    def reason_all_batch(self, image_1_list, image_2_list):
        """Score every combination of two image lists in one inference call.

        Args:
            image_1_list: N images (arrays or file paths).
            image_2_list: M images (arrays or file paths).

        Returns:
            Nested list ``scores[N][M]``; ``scores[i][j]`` is the match
            probability of ``image_1_list[i]`` and ``image_2_list[j]``.
        """
        N = len(image_1_list)
        M = len(image_2_list)
        if N == 0 or M == 0:
            # Nothing to compare; np.concatenate would fail on an empty list.
            return [[] for _ in range(N)]

        # 1. Preprocess with this instance's input size
        stack_1 = np.concatenate([self._prepare(img) for img in image_1_list], axis=0)  # (N, C, H, W)
        stack_2 = np.concatenate([self._prepare(img) for img in image_2_list], axis=0)  # (M, C, H, W)

        # 2. Cartesian product: each left image repeated M times, the right set tiled N times
        x1_batch = np.repeat(stack_1, M, axis=0)      # (N*M, C, H, W)
        x2_batch = np.tile(stack_2, (N, 1, 1, 1))     # (N*M, C, H, W)

        # 3. Single inference call
        ort_inputs = {self.input_names[0]: x1_batch, self.input_names[1]: x2_batch}
        logits = self.session.run(self.output_names, ort_inputs)[0]  # (N*M, 1)

        # 4. Probabilities, reshaped to N rows of M scores
        probs = self._sigmoid(logits).flatten().tolist()
        return [probs[i * M: (i + 1) * M] for i in range(N)]
# ===================== 使用示例 =====================
if __name__ == '__main__':
    # Configuration
    ONNX_PATH = "onnx/siamese_mobilenetv4_hybrid_medium.onnx"  # path to your ONNX model
    DEVICE = "cpu"  # or "cuda"

    # Create the ONNX inference wrapper
    infer = ONNXInference(ONNX_PATH, device=DEVICE)

    images = {}
    for image_path in ("char_1.jpg", "plan_1.jpg", "plan_2.jpg"):
        loaded = cv2.imread(image_path)
        if loaded is None:  # cv2.imread returns None instead of raising
            raise FileNotFoundError(f"Cannot read image: {image_path}")
        images[image_path] = loaded
    char_1 = images["char_1.jpg"]
    plan_1 = images["plan_1.jpg"]
    plan_2 = images["plan_2.jpg"]

    # Single-pair similarity prediction
    prob = infer.predict_pair(char_1, plan_1)
    print(f"两图相似概率: {prob:.4f}")

    prob = infer.predict_pair(char_1, plan_2)
    print(f"两图相似概率: {prob:.4f}")
#✅ Summary
| Module | Applicable scenarios | Key technologies |
|---|---|---|
| YOLO detection | Locating multiple targets in captchas (characters, titles, etc.) | Mosaic enhancements, MuSGD optimizer, ONNX deployment |
| Twin network | Click verification code text matching, similarity judgment | Feature difference fusion, hierarchical learning rate, Focal Loss |
Both modules support GPU acceleration and lightweight deployment, and can be integrated directly into production environments. If you run into problems, adjust the code using the comments in it as a guide.

