🔥 Deep learning practical tutorial: YOLO object detection + Siamese network for CAPTCHA matching

This tutorial covers two classic tasks: YOLO-based object detection (locating label, title, char and other elements in the image) and Siamese-network similarity learning (deciding whether two CAPTCHA images show the same character). The code has been tested and is intended to be usable in industrial projects.


📦 Environment preparation (general)

Before running any scripts, make sure to install the following dependencies:

pip install ultralytics torch torchvision opencv-python numpy onnxruntime onnx
# Optional: install openvino-dev if you want to export to OpenVINO
pip install openvino-dev

GPU Check (NVIDIA GPU strongly recommended):

import torch
print(torch.cuda.is_available())  # prints True when a CUDA GPU is usable

Module 1: YOLO target detection (training → export → prediction)

This module uses the Ultralytics YOLO framework (which supports YOLOv5, YOLOv8, YOLO26, etc.) to train a custom detector, and demonstrates exporting the model to ONNX and running CPU/GPU inference.

1. Prepare data set

Directory structure

datasets/
└── bilbil/
    ├── train/
    │   ├── images/      # training images (.jpg, .png)
    │   └── labels/      # matching .txt label files
    └── detect/
        ├── images/      # validation/test images
        └── labels/      # matching label files

Tag format (YOLO format)

Each image has a `.txt` label file with the same base name. Each line has the format: `<class_id> <x_center> <y_center> <width> <height>`
(All coordinates are normalized to the range 0–1.)

Dataset configuration file `bilbil.yaml`

path: D:\captcha\yolo_img_bilbil   # dataset root directory
train: train/images                # training images, relative to path
val: detect/images                 # validation images, relative to path

names:
  0: label
  1: title
  2: char

2. Training script (detailed explanation of key parameters)

| Parameter | Default value | Description |
| --- | --- | --- |
| imgsz | 640 | Input size; if the targets are small it can be increased to 800/1024 (uses more GPU memory) |
| optimizer | 'MuSGD' | Newer YOLO optimizer with smooth convergence; if the loss becomes NaN, switch to AdamW |
| close_mosaic | 10 | Turn off mosaic augmentation for the last 10 epochs to improve accuracy |
| patience | 50 | Stop early if the validation metric does not improve for 50 consecutive epochs |
| batch | 16 | Adjust to the available GPU memory (16 is suitable for 8 GB) |

Complete training script `train.py`

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from ultralytics import YOLO
import torch

def main():
    """Fine-tune a pretrained YOLO model, validate it and export it to ONNX.

    All settings are hard-coded below; edit them to match your dataset and
    hardware. Training runs on GPU 0 when CUDA is available, otherwise on CPU.
    """
    data_yaml = r"D:\captcha\yolov5\data\bilbil.yaml"
    image_size = 640
    batch_size = 16

    # Everything passed to model.train() lives in one place.
    train_options = dict(
        data=data_yaml,
        epochs=100,
        imgsz=image_size,
        batch=batch_size,
        workers=8,
        lr0=0.01,
        lrf=0.01,
        optimizer='MuSGD',
        momentum=0.937,
        weight_decay=0.0005,
        warmup_epochs=3,
        close_mosaic=10,
        patience=50,
        device=0 if torch.cuda.is_available() else 'cpu',
        plots=True,
        save=True,
    )

    # Starting checkpoint; the n/s/m/l/x variants can be swapped in here.
    model = YOLO("yolo26n.pt")
    model.train(**train_options)

    # Validate on the dataset's val split and report the box mAP values.
    metrics = model.val(data=data_yaml, imgsz=image_size, batch=batch_size)
    print(f"mAP50: {metrics.box.map50:.4f}, mAP50-95: {metrics.box.map:.4f}")

    # Export with a fixed input shape.
    model.export(format="onnx", imgsz=image_size, dynamic=False)
    print("✅ 训练完成,模型已导出为 ONNX")


if __name__ == "__main__":
    main()

The training output is saved in `runs/detect/train*/`. Key files:

  • `weights/best.pt` – best weights
  • `results.png` – training curves
  • `confusion_matrix.png` – confusion matrix

3. Export model (ONNX/OpenVINO/TensorRT)

Standalone export script `export.py`

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from ultralytics import YOLO

def main():
    """Export trained YOLO weights to ONNX.

    The commented calls at the end show the OpenVINO and TensorRT variants.
    """
    model_path = r"weights\best.pt"   # trained weights
    imgsz = 640
    data_yaml = r"bilbil.yaml"        # only needed for INT8 calibration (see below)

    onnx_options = {
        "format": "onnx",
        "imgsz": imgsz,
        "batch": 1,
        "device": "cpu",
        "opset": 14,
    }
    YOLO(model_path).export(**onnx_options)

    # Other export targets (uncomment and adapt as needed):
    #   OpenVINO FP32:
    #     YOLO(model_path).export(format="openvino", imgsz=imgsz, half=False)
    #   OpenVINO INT8 (needs data_yaml for calibration):
    #     YOLO(model_path).export(format="openvino", imgsz=imgsz, int8=True, data=data_yaml)
    #   TensorRT (requires a GPU):
    #     YOLO(model_path).export(format="engine", imgsz=imgsz)


if __name__ == "__main__":
    main()

4. ONNX inference (pure CPU/GPU example)

import cv2
import numpy as np
import onnxruntime as ort

class YOLO26ONNX:
    """Run an exported YOLO ONNX model with ONNX Runtime.

    Post-processing assumes the first model output holds rows of
    ``x1, y1, x2, y2, conf, cls_id`` expressed in letterboxed-input pixels,
    which is how ``inference`` unpacks each row.
    NOTE(review): the input shape is read from the model and is expected to
    be static integers (e.g. ``[1, 3, 640, 640]``) - confirm for your export.
    """

    def __init__(self, model_path, conf_threshold=0.5, providers=None):
        """Create the inference session.

        Args:
            model_path: Path to the ``.onnx`` file.
            conf_threshold: Detections scoring below this value are dropped.
            providers: ONNX Runtime execution providers. Defaults to CPU only;
                pass e.g. ``['CUDAExecutionProvider', 'CPUExecutionProvider']``
                to run on GPU (requires onnxruntime-gpu).
        """
        if providers is None:
            providers = ['CPUExecutionProvider']
        self.session = ort.InferenceSession(model_path, providers=list(providers))
        self.conf_threshold = conf_threshold
        self.input_name = self.session.get_inputs()[0].name
        self.input_shape = self.session.get_inputs()[0].shape          # e.g. [1,3,640,640]
        self.output_name = self.session.get_outputs()[0].name

    def letterbox(self, image, target_size=(640, 640)):
        """Resize keeping the aspect ratio, then pad with gray (114).

        Args:
            image: HxWx3 uint8 image.
            target_size: ``(width, height)`` of the padded output.

        Returns:
            ``(canvas, scale, (dw, dh, new_w, new_h))`` where ``dw``/``dh``
            are the left/top padding offsets in pixels.
        """
        h, w = image.shape[:2]
        scale = min(target_size[0] / w, target_size[1] / h)
        # Never let truncation produce a 0-pixel side (cv2.resize rejects it).
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        resized = cv2.resize(image, (new_w, new_h))
        canvas = np.full((target_size[1], target_size[0], 3), 114, dtype=np.uint8)
        dw, dh = (target_size[0] - new_w) // 2, (target_size[1] - new_h) // 2
        canvas[dh:dh + new_h, dw:dw + new_w] = resized
        return canvas, scale, (dw, dh, new_w, new_h)

    def preprocess(self, image):
        """Convert a BGR image to a normalized NCHW float32 tensor.

        Returns:
            ``(tensor, scale, (dw, dh))`` - the letterbox scale and offsets
            are needed to map boxes back to the original image.
        """
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        padded, scale, (dw, dh, _, _) = self.letterbox(
            image_rgb, (self.input_shape[3], self.input_shape[2]))
        padded = padded.astype(np.float32) / 255.0
        tensor = np.transpose(padded, (2, 0, 1))[None]   # HWC -> NCHW
        return tensor, scale, (dw, dh)

    def inference(self, image):
        """Detect objects in a BGR image.

        Args:
            image: HxWx3 BGR image as returned by ``cv2.imread``.

        Returns:
            List of ``[x1, y1, x2, y2, conf, cls_id]`` with integer pixel
            coordinates in the original image, clamped to its bounds.

        Raises:
            ValueError: If ``image`` is None (typically a failed imread).
        """
        if image is None:
            raise ValueError("image is None - check that cv2.imread() succeeded")
        img_h, img_w = image.shape[:2]

        tensor, scale, (dw, dh) = self.preprocess(image)
        outputs = self.session.run([self.output_name], {self.input_name: tensor})[0]

        detections = []
        for det in outputs[0]:
            x1, y1, x2, y2, conf, cls_id = det.tolist()
            if conf < self.conf_threshold:
                continue
            # Undo the letterbox, then clamp BOTH corners into the image so a
            # box can never end up with coordinates outside the frame.
            x1 = min(max(0.0, (x1 - dw) / scale), img_w)
            y1 = min(max(0.0, (y1 - dh) / scale), img_h)
            x2 = min(max(0.0, (x2 - dw) / scale), img_w)
            y2 = min(max(0.0, (y2 - dh) / scale), img_h)
            detections.append([int(x1), int(y1), int(x2), int(y2), conf, int(cls_id)])
        return detections

if __name__ == "__main__":
    # Minimal usage example: detect objects in a single image on disk.
    yolo = YOLO26ONNX("best.onnx", conf_threshold=0.5)
    img = cv2.imread("test.jpg")
    if img is None:
        # cv2.imread returns None instead of raising when the file is
        # missing or unreadable; fail early with a clear message.
        raise FileNotFoundError("Cannot read image: test.jpg")
    results = yolo.inference(img)
    for r in results:
        print(f"类别{r[5]}, 置信度{r[4]:.2f}, 坐标{r[:4]}")

Module 2: Twin network (verification code text matching)

Applicable to click-based CAPTCHA or text-matching scenarios: given two images (a prompt image and a candidate image), the model outputs whether they show the same character.

Dataset structure

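数据集根目录/                      # dataset root directory
├── 验证码A/                  # folder name is arbitrary; represents one character/category
│   ├── char001.jpg              # file name contains 'char', numeric ID 001
│   ├── plan001.jpg              # file name contains 'plan', numeric ID 001
│   ├── char002.png
│   ├── plan002.png
│   └── ...                      # any number of pairs with different IDs
├── 验证码B/
│   ├── char001.jpeg
│   ├── plan001.jpeg
│   └── ...
└── ...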

1. Model structure: SiameseMobileNetV4

The model uses a two-tower design with shared weights plus multi-feature fusion:

# !/usr/bin/env python
# -*-coding:utf-8 -*-

"""
# File       : SiameseMobileNetV4.py
# Time       :2026/4/30 17:37
# Author     :yujia
# version    :python 3.6
# Description:
"""
import torch
import torch.nn as nn
import timm

class SiameseMobileNetV4(nn.Module):
    """Siamese similarity network on a shared MobileNetV4 backbone.

    Both inputs go through the same timm ``mobilenetv4_conv_medium`` backbone
    (created with ``num_classes=0`` so it returns features). ``feature_dim``
    is set to 1280 for this backbone.
    Author's notes on alternatives: the conv-small variant has 960 features,
    and ``mobilenetv4_hybrid_medium`` (1280) could not be exported to ONNX at
    the time of writing.
    """

    def __init__(self, pretrained=True):
        super().__init__()
        self.backbone = timm.create_model(
            'mobilenetv4_conv_medium', pretrained=pretrained, num_classes=0
        )
        self.feature_dim = 1280   # length of one backbone embedding
        self.dropout = nn.Dropout(0.2)

        # Similarity head. It sees four concatenated vectors (see forward),
        # hence the 4 * feature_dim input width. The final layer emits raw
        # logits with no Sigmoid, so pair it with a logits-based loss.
        # (A single nn.Linear(fused_width, 1) is a lighter alternative.)
        fused_width = self.feature_dim * 4
        head_layers = [
            nn.Linear(fused_width, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 1),
        ]
        self.fusion_head = nn.Sequential(*head_layers)

    def extract_feature(self, x):
        """Return the dropout-regularized backbone embedding, [B, feature_dim]."""
        embedding = self.backbone(x)
        return self.dropout(embedding)

    def forward(self, x1, x2):
        """Score a batch of image pairs; returns logits of shape [B, 1]."""
        emb_a = self.extract_feature(x1)
        emb_b = self.extract_feature(x2)
        # Fuse both embeddings with their absolute difference and product.
        parts = (emb_a, emb_b, torch.abs(emb_a - emb_b), emb_a * emb_b)
        fused = torch.cat(parts, dim=1)          # [B, 4 * feature_dim]
        return self.fusion_head(fused)


if __name__ == "__main__":
    # Smoke test: push one random batch through the network and print the
    # output shape. (The original nested the __main__ guard twice and
    # announced the wrong model name.)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print("Testing SiameseMobileNetV4...")
    model1 = SiameseMobileNetV4(pretrained=True).to(device)
    img1 = torch.rand([4, 3, 112, 112]).to(device)
    img2 = torch.rand([4, 3, 112, 112]).to(device)
    out1 = model1(img1, img2)
    print(f"Output shape: {out1.shape}, values: {out1.squeeze().tolist()}")

Design Highlights:

  • Use `torch.abs(v1 - v2)` to capture differences directly and `v1 * v2` to capture commonalities.
  • MobileNetV4 is a pure-convolution structure, which makes it easy to export to ONNX/OpenVINO.
  • BatchNorm in the discriminator head and Dropout on the extracted features help prevent overfitting.

2. Data loading and positive and negative sample construction

The dataset is split into training/validation sets by whole character folder to prevent data leakage. Sample pairs are generated by the following rules:

  • Positive sample (label=1): within the same character folder, the prompt image and the candidate image have the same number.
  • Negative sample (label=0): the prompt image is paired with a candidate image that has a different number, drawn from the same folder (the code comments also describe drawing from a different folder as a fallback).

Core processing flow:

import os
import re
import random
import numpy as np
import cv2
import torch
from torch.utils.data.dataset import Dataset

# ---------------------------------------------------#
# 图像预处理工具函数(完全保留)
# ---------------------------------------------------#
def cvtColor(image_np):
    """Return a 3-channel RGB copy of an OpenCV image.

    3-channel input is treated as BGR, 4-channel input as BGRA (alpha is
    dropped), and anything else is handed to the grayscale conversion.
    """
    channels = image_np.shape[2] if len(image_np.shape) == 3 else None

    if channels == 3:
        return cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)

    if channels == 4:
        without_alpha = cv2.cvtColor(image_np, cv2.COLOR_BGRA2BGR)
        return cv2.cvtColor(without_alpha, cv2.COLOR_BGR2RGB)

    return cv2.cvtColor(image_np, cv2.COLOR_GRAY2RGB)

def letterbox_image(image_np, target_size):
    """Resize an image to fit ``target_size`` without distortion and pad it.

    Args:
        image_np: HxWx3 uint8 image.
        target_size: ``(height, width)`` of the output canvas.

    Returns:
        A ``(height, width, 3)`` uint8 array with the resized image centered
        on a gray (128) background.
    """
    h, w = target_size
    ih, iw = image_np.shape[:2]
    scale = min(w / iw, h / ih)
    # int() truncation can yield 0 for extreme aspect ratios, which
    # cv2.resize rejects; keep at least one pixel per side.
    nw = max(1, int(iw * scale))
    nh = max(1, int(ih * scale))
    resized = cv2.resize(image_np, (nw, nh), interpolation=cv2.INTER_CUBIC)
    new_image = np.full((h, w, 3), 128, dtype=np.uint8)
    dx = (w - nw) // 2
    dy = (h - nh) // 2
    new_image[dy:dy + nh, dx:dx + nw] = resized
    return new_image

def preprocess_input(x):
    """Scale pixel values from [0, 255] to float32 in [0, 1]."""
    scaled = x.astype(np.float32)   # astype copies, so the input is untouched
    scaled /= 255.0
    return scaled

def rand(a=0, b=1):
    """Draw one uniform random float from [a, b) using NumPy's global RNG."""
    width = b - a
    return np.random.rand() * width + a

# ---------------------------------------------------#
# 新的 load_dataset:构建正负样本对并划分
# ---------------------------------------------------#
def _collect_folder_pairs(dataset_path):
    """Find every folder that holds matched (char, plan) image pairs.

    A file is a "char" or "plan" image when its lower-cased name contains
    that word; files are matched by the first run of digits in their names.
    Returns ``[(folder_path, [(char_path, plan_path), ...]), ...]``.
    """
    folder_pairs = []
    for root, dirs, files in os.walk(dataset_path):
        # os.walk order is filesystem-dependent; sorting keeps the seeded
        # split below reproducible across machines.
        dirs.sort()
        char_by_num = {}
        plan_by_num = {}
        for fname in sorted(files):
            nums = re.findall(r'\d+', fname)
            if not nums:
                continue
            lowered = fname.lower()
            if 'char' in lowered:
                char_by_num[nums[0]] = os.path.join(root, fname)
            if 'plan' in lowered:
                plan_by_num[nums[0]] = os.path.join(root, fname)

        pairs = [(char_path, plan_by_num[num])
                 for num, char_path in char_by_num.items()
                 if num in plan_by_num]
        if pairs:
            folder_pairs.append((root, pairs))
    return folder_pairs


def _build_pair_samples(folders, cross_folder_negatives):
    """Turn one split's folders into (img1_path, img2_path, label) samples."""
    all_pairs = []          # (char_path, plan_path, folder_idx)
    for fidx, (_, pairs) in enumerate(folders):
        for char_path, plan_path in pairs:
            all_pairs.append((char_path, plan_path, fidx))

    folder_to_indices = {}
    for i, (_, _, fidx) in enumerate(all_pairs):
        folder_to_indices.setdefault(fidx, []).append(i)

    samples = []
    for idx, (char_path, plan_path, fidx) in enumerate(all_pairs):
        # Positive: the char image with its own same-numbered plan image.
        samples.append((char_path, plan_path, 1))

        same_folder = folder_to_indices[fidx]
        neg_idx = None
        if len(same_folder) > 1:
            # Preferred negative: a different-numbered pair from the same
            # folder (rejection sampling; terminates because len > 1).
            while True:
                neg_idx = random.choice(same_folder)
                if neg_idx != idx:
                    break
        elif cross_folder_negatives and len(all_pairs) > len(same_folder):
            # Opt-in fallback: borrow a plan image from another folder.
            while True:
                neg_idx = random.randrange(len(all_pairs))
                if all_pairs[neg_idx][2] != fidx:
                    break

        if neg_idx is not None:
            samples.append((char_path, all_pairs[neg_idx][1], 0))
    return samples


def load_dataset(dataset_path, train_ratio=0.8, seed=42, cross_folder_negatives=False):
    """Build positive/negative image pairs and split them by folder.

    Whole folders are assigned to either the training or the validation
    split, so validation folders are never seen during training.

    Args:
        dataset_path: Root directory, searched recursively.
        train_ratio: Fraction of folders (rounded down) used for training.
        seed: Seed applied to the global ``random`` module before shuffling
            folders and drawing negatives.
        cross_folder_negatives: When a folder holds only one pair it has no
            same-folder negative. Set this to True to draw the negative from
            another folder of the same split instead. Off by default, which
            keeps the previous behavior (such pairs get no negative).
            NOTE(review): different folders may contain the same character,
            in which case cross-folder negatives would be mislabeled -
            confirm against your data before enabling.

    Returns:
        ``(train_samples, val_samples)``; each sample is
        ``(img1_path, img2_path, label)`` with label 1 for a same-folder,
        same-number pair and 0 otherwise.
    """
    # 1. Collect every folder together with its numbered pairs.
    folder_pairs = _collect_folder_pairs(dataset_path)

    print(f"共找到 {sum(len(p) for _, p in folder_pairs)} 个有效图像对,文件夹总数: {len(folder_pairs)}")

    # 2. Split by whole folder.
    random.seed(seed)
    random.shuffle(folder_pairs)
    num_train_folders = int(len(folder_pairs) * train_ratio)
    train_folders = folder_pairs[:num_train_folders]
    val_folders = folder_pairs[num_train_folders:]

    # 3. Build positive and negative samples inside each split.
    train_samples = _build_pair_samples(train_folders, cross_folder_negatives)
    val_samples = _build_pair_samples(val_folders, cross_folder_negatives)

    print(f"训练集样本数: {len(train_samples)} (其中正: {sum(l for _,_,l in train_samples)}, 负: {sum(1 for _,_,l in train_samples if l==0)})")
    print(f"验证集样本数: {len(val_samples)} (其中正: {sum(l for _,_,l in val_samples)}, 负: {sum(1 for _,_,l in val_samples if l==0)})")
    return train_samples, val_samples


# ---------------------------------------------------#
# 全新的 SiameseDataset:只负责图像读取和增强
# ---------------------------------------------------#
class SiameseDataset(Dataset):
    """Dataset of image pairs; only handles image loading and augmentation.

    Each item is ``([img1, img2], label)`` where the images are float32
    CHW arrays in [0, 1] and the label is a ``np.float32`` scalar.
    """

    def __init__(self, samples, input_shape=(112,112), random=True):
        """
        Args:
            samples: List of ``(img1_path, img2_path, label)`` tuples.
            input_shape: ``(height, width)`` every image is letterboxed to.
            random: Whether to apply random augmentation (use False for
                validation).
        """
        self.samples = samples
        self.input_shape = input_shape
        self.random = random

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        img1_path, img2_path, label = self.samples[index]

        img1 = self._load_and_preprocess(img1_path)
        img2 = self._load_and_preprocess(img2_path)

        return [img1, img2], np.float32(label)

    def _load_and_preprocess(self, img_path):
        """Read one image and return it as a float32 CHW array in [0, 1].

        Raises:
            FileNotFoundError: If the file cannot be read or decoded.
        """
        # Read the bytes ourselves and decode with cv2.imdecode:
        # cv2.imread may fail on non-ASCII paths on Windows (the sample
        # dataset uses Chinese folder names).
        try:
            raw = np.fromfile(img_path, dtype=np.uint8)
        except OSError as err:
            raise FileNotFoundError(f"Cannot read image file: {img_path}") from err
        image = cv2.imdecode(raw, cv2.IMREAD_COLOR) if raw.size else None
        if image is None:
            raise FileNotFoundError(f"Cannot decode image file: {img_path}")
        image = cvtColor(image)
        image = letterbox_image(image, self.input_shape)

        # Augment while the image is still uint8 to avoid float color overflow.
        if self.random:
            image = self._apply_augment(image)

        # Normalize last: uint8 HWC -> float32 [0, 1] -> CHW.
        image = preprocess_input(image)
        image = np.transpose(image, (2, 0, 1))
        return image

    def _apply_augment(self, image):
        """Randomly flip, rotate and color-jitter an RGB uint8 image.

        Each of the three steps is applied independently with probability
        0.5. This can be swapped for a stronger albumentations pipeline.
        """
        h, w = image.shape[:2]
        if rand() < 0.5:
            image = cv2.flip(image, 1)
        if rand() < 0.5:
            # randint excludes the upper bound; use 16 so the rotation
            # range is the symmetric [-15, 15] degrees.
            angle = np.random.randint(-15, 16)
            center = (w//2, h//2)
            M = cv2.getRotationMatrix2D(center, angle, 1.0)
            image = cv2.warpAffine(image, M, (w, h), borderValue=(128,128,128))
        if rand() < 0.5:
            hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV).astype(np.float32)
            h_shift = rand(-0.1, 0.1) * 180
            s_scale = rand(1-0.7, 1+0.7)
            v_scale = rand(1-0.3, 1+0.3)
            # Hue wraps around at 180; saturation and value are clipped.
            hsv[:,:,0] = (hsv[:,:,0] + h_shift) % 180
            hsv[:,:,1] = np.clip(hsv[:,:,1] * s_scale, 0, 255)
            hsv[:,:,2] = np.clip(hsv[:,:,2] * v_scale, 0, 255)
            image = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB)
        return image


# ---------------------------------------------------#
# 新的 collate_fn,与训练代码接口完全兼容
# ---------------------------------------------------#
def dataset_collate(batch):
    """Collate ``([img1, img2], label)`` items into batch tensors.

    Returns:
        ``(images, labels)`` where ``images`` is a float tensor of shape
        (2, B, C, H, W) - index 0 holds all left images, index 1 all right
        images - and ``labels`` is a float tensor of shape (B, 1).
    """
    lefts, rights, targets = [], [], []
    for sample in batch:
        pair = sample[0]
        lefts.append(pair[0])       # each is a (C, H, W) numpy array
        rights.append(pair[1])
        targets.append(sample[1])

    def as_float_tensor(arrays):
        return torch.from_numpy(np.array(arrays)).float()

    left_tensor = as_float_tensor(lefts)
    right_tensor = as_float_tensor(rights)
    labels_tensor = as_float_tensor(targets).view(-1, 1)

    return torch.stack([left_tensor, right_tensor], dim=0), labels_tensor


# ---------------------------------------------------#
# 测试:显示几对样本供人工检查
# ---------------------------------------------------#
if __name__ == '__main__':
    # Visual sanity check: show the first batch of training pairs.
    import matplotlib.pyplot as plt

    data_path = r"D:\captcha\Siamese\data\jiyan"
    train_samples, val_samples = load_dataset(data_path)
    train_dataset = SiameseDataset(train_samples, input_shape=(112,112), random=True)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=4,
        shuffle=True,
        collate_fn=dataset_collate
    )

    def to_display(tensor):
        """Convert a CHW float tensor in [0, 1] to an HWC uint8 image."""
        img = tensor.numpy().transpose(1,2,0)
        img = (img * 255).clip(0,255).astype(np.uint8)
        return img

    for batch_idx, (images, labels) in enumerate(train_loader):
        x1, x2 = images  # images is (2, B, C, H, W)
        # view(-1) keeps a 1-D array even for a single-sample batch, where
        # squeeze() would produce a 0-d array that cannot be indexed.
        labs = labels.view(-1).numpy()
        num = min(4, len(x1))
        # squeeze=False keeps `axes` 2-D even when there is only one row.
        fig, axes = plt.subplots(num, 2, figsize=(8, 4*num), squeeze=False)
        for i in range(num):
            axes[i,0].imshow(to_display(x1[i]))
            axes[i,1].imshow(to_display(x2[i]))
            axes[i,0].set_title(f"Left (label={labs[i]})")
            axes[i,1].set_title(f"Right (label={labs[i]})")
            for ax in axes[i]:
                ax.axis('off')
        plt.tight_layout()
        plt.show()
        break

3. Training script (hierarchical learning rate + Focal Loss)

# !/usr/bin/env python
# -*-coding:utf-8 -*-

"""
# File       : train.py
# Time       :2026/4/30 16:48
# Author     :yujia
# version    :python 3.6
# Description:
"""
import torch.nn.functional as F
import os
import time
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score
from torch.optim.lr_scheduler import CosineAnnealingLR

# 导入你的数据集相关函数(确保它们在同一目录或被正确导入)
from dataloader import load_dataset, SiameseDataset, dataset_collate
from SiameseEfficientNet import SiameseEfficientNet
from SiameseEdgeNeXt import SiameseEdgeNeXt
from SiameseMobileNetV4 import SiameseMobileNetV4
# ---------------------------- Configuration ----------------------------
DATA_PATH = r"D:\captcha\Siamese\data\vercode1117"
MODEL_TYPE = "MobileNetV4"        # one of "edgenext", "efficientnet", "MobileNetV4"
PRETRAINED = True
INPUT_SIZE  = (112, 112)
BATCH_SIZE  = 32               # pairs per batch; each pair holds 2 images, i.e. 64 images per batch
EPOCHS      = 80
LR_BACKBONE = 1e-4
LR_HEAD     = 1e-3
WEIGHT_DECAY = 1e-4
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint directory (created at import time if missing)
SAVE_DIR = "./checkpoints"
os.makedirs(SAVE_DIR, exist_ok=True)



# ---------------------------- 工具函数 ----------------------------
def compute_metrics(labels, logits):
    """Compute binary accuracy and ROC AUC from raw logits.

    Args:
        labels: Ground-truth 0/1 labels (nested lists are flattened).
        logits: Raw model outputs; a sigmoid is applied here.

    Returns:
        ``(acc, auc, preds, probs)``. ``preds`` thresholds ``probs`` at 0.5.
        ``auc`` falls back to 0.5 when it is undefined, e.g. when only one
        class is present.
    """
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs >= 0.5).astype(int).flatten()
    labels = np.array(labels).flatten()
    acc = accuracy_score(labels, preds)

    if np.unique(labels).size < 2:
        # ROC AUC is undefined with a single class; report chance level.
        auc = 0.5
    else:
        try:
            auc = roc_auc_score(labels, probs.flatten())
        except ValueError:
            # Only swallow the "metric undefined / invalid input" case
            # instead of every exception (the original used a bare except).
            auc = 0.5
    return acc, auc, preds, probs

def train_one_epoch(model, loader, criterion, optimizer, device, epoch, total_epochs):
    """Run one training epoch and return ``(avg_loss, acc, auc)``.

    The loss is always ``focal_bce_loss``; ``criterion`` is accepted but not
    used. ``epoch`` and ``total_epochs`` only label the progress bar.
    """
    model.train()
    batch_losses = []
    seen_targets = []
    seen_logits = []

    progress = tqdm(loader, desc=f"Train Epoch {epoch}/{total_epochs}", leave=False)
    for pair_batch, label_batch in progress:
        # pair_batch is indexed as [0] = left images, [1] = right images.
        left = pair_batch[0].to(device)
        right = pair_batch[1].to(device)
        y = label_batch.to(device).float().view(-1, 1)

        out = model(left, right)
        loss = focal_bce_loss(out, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        batch_losses.append(loss_value)
        seen_targets += y.cpu().tolist()
        seen_logits += out.detach().cpu().tolist()

        # Show the current batch loss on the progress bar.
        progress.set_postfix({'loss': f"{loss_value:.4f}"})

    avg_loss = np.mean(batch_losses)
    acc, auc, _, _ = compute_metrics(seen_targets, seen_logits)
    return avg_loss, acc, auc

def validate(model, loader, criterion, device):
    """Evaluate the model on ``loader`` and return ``(avg_loss, acc, auc)``.

    Runs in eval mode without gradients. As in training, the loss is
    ``focal_bce_loss``; ``criterion`` is accepted but not used.
    """
    model.eval()
    batch_losses = []
    seen_targets = []
    seen_logits = []

    progress = tqdm(loader, desc="Validation", leave=False)
    with torch.no_grad():
        for pair_batch, label_batch in progress:
            left = pair_batch[0].to(device)
            right = pair_batch[1].to(device)
            y = label_batch.to(device).float().view(-1, 1)

            out = model(left, right)
            loss_value = focal_bce_loss(out, y).item()

            batch_losses.append(loss_value)
            seen_targets += y.cpu().tolist()
            seen_logits += out.cpu().tolist()

            progress.set_postfix({'loss': f"{loss_value:.4f}"})

    avg_loss = np.mean(batch_losses)
    acc, auc, _, _ = compute_metrics(seen_targets, seen_logits)
    return avg_loss, acc, auc


def focal_bce_loss(logits, targets, gamma=2.0, alpha=0.25, smoothing=0.1):
    """Focal binary cross-entropy on logits, with label smoothing.

    Targets are first pulled toward 0.5 by ``smoothing``. Each element's BCE
    is then weighted by ``alpha * (1 - exp(-bce)) ** gamma`` and the mean is
    returned. ``alpha`` is applied uniformly to both classes, so it acts as
    a plain scale factor here.
    """
    soft_targets = targets * (1 - smoothing) + 0.5 * smoothing
    per_element = F.binary_cross_entropy_with_logits(
        logits, soft_targets, reduction='none'
    )
    # exp(-bce) is the probability assigned to the (soft) target; elements
    # that are already well predicted get down-weighted.
    modulator = (1 - torch.exp(-per_element)) ** gamma
    return (alpha * modulator * per_element).mean()

def train():
    """Train the Siamese model selected by ``MODEL_TYPE``.

    Uses the module-level configuration constants. After every epoch the
    latest weights are saved as ``last_<MODEL_TYPE>.pth``; the weights with
    the best validation accuracy are saved as ``best_<MODEL_TYPE>.pth``.
    Training stops early after ``patience`` epochs without improvement.

    Raises:
        ValueError: If ``MODEL_TYPE`` is not a supported model name.
    """
    # ---------------------------- Data ----------------------------
    print("Loading dataset...")
    train_samples, val_samples = load_dataset(DATA_PATH, train_ratio=0.8)
    train_dataset = SiameseDataset(train_samples, input_shape=INPUT_SIZE, random=True)
    val_dataset = SiameseDataset(val_samples, input_shape=INPUT_SIZE, random=False)

    # Pinned memory only helps (and only avoids a warning) when using CUDA.
    pin_memory = DEVICE.type == "cuda"
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              collate_fn=dataset_collate, num_workers=0, pin_memory=pin_memory)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                            collate_fn=dataset_collate, num_workers=0, pin_memory=pin_memory)

    # ---------------------------- Model ----------------------------
    if MODEL_TYPE == "edgenext":
        model = SiameseEdgeNeXt(pretrained=PRETRAINED)
    elif MODEL_TYPE == "efficientnet":
        model = SiameseEfficientNet(pretrained=PRETRAINED)
    elif MODEL_TYPE == "MobileNetV4":
        model = SiameseMobileNetV4(pretrained=PRETRAINED)
    else:
        raise ValueError("MODEL_TYPE must be 'edgenext', 'efficientnet' or 'MobileNetV4'")

    model = model.to(DEVICE)

    # ---------------- Loss, optimizer, scheduler ----------------
    # NOTE: train_one_epoch/validate compute focal_bce_loss themselves and
    # do not use this criterion; it is only passed to satisfy their signature.
    criterion = nn.BCEWithLogitsLoss()

    # Layered learning rates: small for the backbone, larger for the head.
    optimizer = optim.AdamW([
        {'params': model.backbone.parameters(), 'lr': LR_BACKBONE},
        {'params': model.fusion_head.parameters(), 'lr': LR_HEAD},
    ], weight_decay=WEIGHT_DECAY)

    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    # Early-stopping state.
    best_val_acc = 0
    patience = 15
    early_stop_counter = 0

    # ---------------------------- Training loop ----------------------------
    print("\nStart training...")
    for epoch in range(EPOCHS):
        start_time = time.time()
        train_loss, train_acc, train_auc = train_one_epoch(
            model, train_loader, criterion, optimizer, DEVICE, epoch + 1, EPOCHS
        )
        val_loss, val_acc, val_auc = validate(model, val_loader, criterion, DEVICE)

        # Read the learning rates BEFORE stepping the scheduler so the log
        # shows the values actually used during this epoch.
        lr_backbone = optimizer.param_groups[0]['lr']
        lr_head = optimizer.param_groups[1]['lr']
        scheduler.step()

        print(f"\nEpoch {epoch + 1:03d}/{EPOCHS} | Time: {time.time() - start_time:.1f}s | "
              f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} AUC: {train_auc:.4f} | "
              f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f} AUC: {val_auc:.4f} | "
              f"LR: backbone={lr_backbone:.2e}, head={lr_head:.2e}")
        torch.save(model.state_dict(), os.path.join(SAVE_DIR, f"last_{MODEL_TYPE}.pth"))

        # Keep the checkpoint with the best validation accuracy.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            early_stop_counter = 0
            torch.save(model.state_dict(), os.path.join(SAVE_DIR, f"best_{MODEL_TYPE}.pth"))
            print(f"\n  => Best model saved (val_acc={val_acc:.4f})")
        else:
            early_stop_counter += 1

        if early_stop_counter >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs.")
            break

    # The tracked quantity is validation accuracy (the original message
    # mislabeled it as a loss).
    print("Training finished. Best val acc: {:.4f}".format(best_val_acc))


if __name__ == '__main__':
    train()

Metrics to monitor during training: validation accuracy (Val Acc) and AUC. If the training accuracy is much higher than the validation accuracy, increase Dropout or reduce the model capacity.

4. Reasoning and Deployment

Export ONNX after training is complete:

# !/usr/bin/env python
# -*-coding:utf-8 -*-

"""
# File       : export.py.py
# Time       :2026/4/30 17:53
# Author     :yujia
# version    :python 3.6
# Description:
"""
import os
import torch
import torch.nn as nn
import timm

from SiameseMobileNetV4 import SiameseMobileNetV4



def export_onnx(model, onnx_path, input_size=(112, 112)):
    """Export the two-input Siamese model to ONNX with a dynamic batch axis.

    The model is moved to CUDA when available (otherwise CPU) and switched
    to eval mode before tracing with a pair of random ``1 x 3 x H x W``
    inputs. Only the two inputs declare a dynamic batch axis; the output
    axis is not declared.
    """
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(target)
    model.eval()

    dummy_pair = tuple(
        torch.randn(1, 3, *input_size, device=target) for _ in range(2)
    )
    input_names = ["input1", "input2"]
    dynamic_axes = {name: {0: "batch"} for name in input_names}

    export_options = dict(
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=input_names,
        output_names=["logits"],
        dynamic_axes=dynamic_axes,
        dynamo=False,   # use the classic TorchScript exporter, not dynamo
    )
    torch.onnx.export(model, dummy_pair, onnx_path, **export_options)
    print(f"✅ ONNX exported to: {onnx_path}")


def validate_onnx(onnx_path, input_size=(112, 112)):
    """Load the exported model and run it once with a batch of two.

    Using batch size 2 (the export traced batch size 1) exercises the
    dynamic batch axis. Only the output shape is printed; values are not
    compared against the PyTorch model.
    """
    import onnxruntime
    import numpy as np

    session = onnxruntime.InferenceSession(onnx_path)

    probe_shape = (2, 3) + tuple(input_size)
    probes = [np.random.randn(*probe_shape).astype(np.float32) for _ in range(2)]

    model_inputs = session.get_inputs()
    feeds = {
        model_inputs[0].name: probes[0],
        model_inputs[1].name: probes[1],
    }
    outputs = session.run(None, feeds)
    print(f"✅ Validate OK. Output shape: {outputs[0].shape}")


if __name__ == "__main__":
    WEIGHT_PATH = "checkpoints/best_MobileNetV4.pth"
    ONNX_PATH   = "onnx/siamese_mobilenetv4_hybrid_medium.onnx"
    EXPORT_SIZE = (112, 112)

    os.makedirs("onnx", exist_ok=True)

    # Build the architecture without pretrained weights, then load the
    # trained checkpoint; strict=True fails loudly on any key mismatch.
    model = SiameseMobileNetV4(pretrained=False)
    model.load_state_dict(torch.load(WEIGHT_PATH, map_location="cpu"), strict=True)

    # Export, then confirm the dynamic batch axis works.
    export_onnx(model, ONNX_PATH, input_size=EXPORT_SIZE)
    validate_onnx(ONNX_PATH, input_size=EXPORT_SIZE)

Inference using ONNX Runtime:

# !/usr/bin/env python
# -*-coding:utf-8 -*-

"""
# File       : val_onnx.py
# Time       :2026/4/30 17:54
# Author     :yujia
# version    :python 3.6
# Description:
"""
import os
import cv2
import numpy as np
import onnxruntime as ort


def cvtColor(image_np):
    """Make sure an OpenCV image is 3-channel RGB.

    3-channel input is treated as BGR and 4-channel input as BGRA (alpha is
    dropped); any other layout goes through the grayscale conversion.
    """
    is_multichannel = len(image_np.shape) == 3

    if is_multichannel and image_np.shape[2] == 3:
        return cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)

    if is_multichannel and image_np.shape[2] == 4:
        return cv2.cvtColor(
            cv2.cvtColor(image_np, cv2.COLOR_BGRA2BGR), cv2.COLOR_BGR2RGB
        )

    return cv2.cvtColor(image_np, cv2.COLOR_GRAY2RGB)

def letterbox_image(image_np, target_size):
    """Resize an image to fit ``target_size`` without distortion and pad it.

    Must stay in sync with the preprocessing used for training.

    Args:
        image_np: HxWx3 uint8 image.
        target_size: ``(height, width)`` of the output canvas.

    Returns:
        A ``(height, width, 3)`` uint8 array with the resized image centered
        on a gray (128) background.
    """
    h, w = target_size
    ih, iw = image_np.shape[:2]
    scale = min(w / iw, h / ih)
    # int() truncation can yield 0 for extreme aspect ratios, which
    # cv2.resize rejects; keep at least one pixel per side.
    nw = max(1, int(iw * scale))
    nh = max(1, int(ih * scale))
    resized = cv2.resize(image_np, (nw, nh), interpolation=cv2.INTER_CUBIC)
    new_image = np.full((h, w, 3), 128, dtype=np.uint8)
    dx = (w - nw) // 2
    dy = (h - nh) // 2
    new_image[dy:dy + nh, dx:dx + nw] = resized
    return new_image

def preprocess_input(x):
    """Convert pixel values to float32 and scale them into [0, 1]."""
    as_float = x.astype(np.float32)
    return as_float / 255.0

def preprocess_image(img, input_size=(112, 112)) -> np.ndarray:
    """Preprocess one image into a ``[1, 3, H, W]`` float32 array.

    Args:
        img: Either an image array as returned by ``cv2.imread`` (BGR, BGRA
            or grayscale) or a path to an image file.
        input_size: ``(height, width)`` expected by the model.

    Returns:
        The letterboxed RGB image scaled to [0, 1], in NCHW layout.

    Raises:
        FileNotFoundError: If ``img`` is a path that cannot be read/decoded.
        ValueError: If ``img`` is None (typically a failed ``cv2.imread``).
    """
    if isinstance(img, (str, os.PathLike)):
        path = os.fspath(img)
        # Decode from bytes rather than cv2.imread, which may fail on
        # non-ASCII paths on Windows.
        try:
            raw = np.fromfile(path, dtype=np.uint8)
        except OSError as err:
            raise FileNotFoundError(f"Cannot read image file: {path}") from err
        img = cv2.imdecode(raw, cv2.IMREAD_COLOR) if raw.size else None
        if img is None:
            raise FileNotFoundError(f"Cannot decode image file: {path}")
    elif img is None:
        raise ValueError("img is None - check that cv2.imread() succeeded")

    img = cvtColor(img)
    img = letterbox_image(img, input_size)
    img = preprocess_input(img)                     # scale to [0, 1]
    img = np.transpose(img, (2, 0, 1)).astype(np.float32)  # HWC -> CHW
    return np.expand_dims(img, axis=0)              # [1, 3, H, W]


# ===================== ONNX 推理接口 =====================
class ONNXInference:
    """ONNX Runtime wrapper for the exported two-input Siamese model."""

    def __init__(self, onnx_path: str, device: str = 'cpu', input_size=(112, 112)):
        """
        Args:
            onnx_path: Path to the ONNX model file.
            device: 'cpu' or 'cuda' ('cuda' requires onnxruntime-gpu; the
                CPU provider is always kept as a fallback).
            input_size: ``(height, width)`` of the model input; must match
                the size used at export time.
        """
        self.input_size = input_size

        # Execution providers, in priority order.
        providers = ['CPUExecutionProvider']
        if device == 'cuda':
            providers.insert(0, 'CUDAExecutionProvider')

        self.session = ort.InferenceSession(onnx_path, providers=providers)
        self.input_names = [inp.name for inp in self.session.get_inputs()]
        self.output_names = [out.name for out in self.session.get_outputs()]
        print(f"ONNX model loaded. Inputs: {self.input_names}, Outputs: {self.output_names}")

    def predict_pair(self, img1_path, img2_path) -> float:
        """Compare two images and return their similarity probability (0~1).

        Despite the parameter names, both arguments are passed straight to
        ``preprocess_image``, so they must be whatever that function accepts
        (the usage example passes arrays loaded with ``cv2.imread``).
        """
        img1 = preprocess_image(img1_path, self.input_size)
        img2 = preprocess_image(img2_path, self.input_size)

        # Inputs are fed in the order the model declares them
        # ('input1', 'input2' for the provided export script).
        ort_inputs = {
            self.input_names[0]: img1,
            self.input_names[1]: img2
        }
        logits = self.session.run(self.output_names, ort_inputs)[0]   # shape: [1, 1]
        prob = 1.0 / (1.0 + np.exp(-logits))  # sigmoid
        return float(prob[0, 0])

    def reason_all_batch(self, image_1_list, image_2_list):
        """Score every combination of two image lists in one inference call.

        Args:
            image_1_list: N images accepted by ``preprocess_image``.
            image_2_list: M images accepted by ``preprocess_image``.

        Returns:
            A nested list ``scores[N][M]`` where ``scores[i][j]`` is the
            similarity probability of ``image_1_list[i]`` and
            ``image_2_list[j]``. If either list is empty, each row is empty.
        """
        N = len(image_1_list)
        M = len(image_2_list)
        if N == 0 or M == 0:
            # Nothing to compare; np.concatenate would fail on empty input.
            return [[] for _ in range(N)]

        # 1. Preprocess with the configured input size (the original fell
        #    back to preprocess_image's default and ignored self.input_size).
        processed_1 = np.concatenate(
            [preprocess_image(img, self.input_size) for img in image_1_list], axis=0)
        processed_2 = np.concatenate(
            [preprocess_image(img, self.input_size) for img in image_2_list], axis=0)

        # 2. Build the Cartesian product along the batch axis: each image of
        #    list 1 is repeated M times and paired with all of list 2.
        x1_batch = np.repeat(processed_1, M, axis=0)        # (N*M, C, H, W)
        x2_batch = np.tile(processed_2, (N, 1, 1, 1))       # (N*M, C, H, W)
        print(x1_batch.shape, x2_batch.shape)

        # 3. Single inference call.
        ort_inputs = {self.input_names[0]: x1_batch, self.input_names[1]: x2_batch}
        logits = self.session.run(self.output_names, ort_inputs)[0]  # (N*M, 1)

        # 4. Plain sigmoid; for very negative logits np.exp may emit an
        #    overflow warning but the result still saturates to 0.
        probs = 1.0 / (1.0 + np.exp(-logits))
        probs = probs.flatten().tolist()

        # 5. Reshape into an N x M nested list.
        scores = [probs[i * M: (i + 1) * M] for i in range(N)]
        return scores



# ===================== 使用示例 =====================
if __name__ == '__main__':
    # Settings
    ONNX_PATH = "onnx/siamese_mobilenetv4_hybrid_medium.onnx"   # path to your ONNX model
    DEVICE = "cpu"                                 # or "cuda"

    infer = ONNXInference(ONNX_PATH, device=DEVICE)

    # Load the prompt image and the candidates it is compared against.
    char_1 = cv2.imread("char_1.jpg")
    candidates = [cv2.imread(name) for name in ("plan_1.jpg", "plan_2.jpg")]

    # Score the prompt against each candidate in turn.
    for candidate in candidates:
        prob = infer.predict_pair(char_1, candidate)
        print(f"两图相似概率: {prob:.4f}")


✅ Summary

| Module | Applicable scenarios | Key technologies |
| --- | --- | --- |
| YOLO detection | Locating multiple targets in CAPTCHAs (characters, titles, etc.) | Mosaic augmentation, MuSGD optimizer, ONNX deployment |
| Siamese network | Click-based CAPTCHA text matching, similarity judgment | Feature-difference fusion, layered learning rates, Focal Loss |

Both codes support GPU acceleration and lightweight deployment, and can be directly integrated into production environments. If you have any questions, you are welcome to make adjustments based on the comments in the code.