Qwen2.5-VL算法优化:提升目标检测准确率
1. 理解Qwen2.5-VL的目标检测能力
Qwen2.5-VL不是传统意义上的目标检测模型,而是一个视觉语言大模型,它通过自然语言指令完成视觉理解任务。当我们说"提升目标检测准确率",实际上是指优化它在物体定位、边界框生成和关键点识别等空间感知任务上的表现。这种能力体现在它能精准输出bbox_2d坐标、point_2d坐标,并以结构化JSON格式返回结果。
与YOLO或Faster R-CNN这类专用检测模型不同,Qwen2.5-VL的定位能力源于其多模态架构设计。它采用原生动态分辨率视觉编码器,能够直接处理不同尺寸的图像,不再依赖固定分辨率缩放。更重要的是,它使用基于图像实际尺寸的绝对坐标表示法,而不是传统的相对坐标(0-1范围),这使得定位结果更精确、更稳定。
在实际应用中,你会发现Qwen2.5-VL对复杂场景的适应性很强。比如在一张包含多个重叠物体的图片中,它不仅能识别出所有目标,还能准确区分它们的空间关系;在文档解析场景中,它不仅能定位文字区域,还能理解表格结构和图表布局;在视频理解中,它结合动态帧率训练和绝对时间编码,实现了秒级事件定位能力。
不过需要明确一点:Qwen2.5-VL的"目标检测"是通过提示工程和后处理实现的,而不是传统意义上的端到端检测网络。因此,我们的优化工作主要围绕如何让这个强大的基础模型更好地完成空间感知任务展开,而不是修改其核心架构。
2. 损失函数调整策略
Qwen2.5-VL作为预训练好的大模型,通常不直接调整其内部损失函数。但我们可以从两个层面进行"损失函数优化":一是微调阶段的损失设计,二是推理阶段的后处理损失。
2.1 微调阶段的损失函数设计
当需要针对特定领域(如医疗影像、工业质检)进行微调时,可以设计复合损失函数:
import torch
import torch.nn as nn
from torch.nn import functional as F


class QwenVLDetectionLoss(nn.Module):
    """Composite loss for fine-tuning Qwen2.5-VL on detection-style tasks.

    Combines Smooth L1 box regression, cross-entropy label classification,
    and an IoU-aware term that rewards higher overlap with the ground truth.
    """

    def __init__(self, bbox_weight=1.0, label_weight=0.5,
                 iou_threshold=0.5, smooth_l1_beta=0.1):
        super().__init__()
        self.bbox_weight = bbox_weight      # weight of the box-regression term
        self.label_weight = label_weight    # weight of the classification term
        self.iou_threshold = iou_threshold  # reserved for matching logic
        self.smooth_l1_loss = nn.SmoothL1Loss(beta=smooth_l1_beta)

    def forward(self, pred_boxes, pred_labels, target_boxes, target_labels):
        """Return the weighted sum of box, label and IoU losses.

        pred_boxes / target_boxes: (N, 4) tensors in [x1, y1, x2, y2] order.
        pred_labels: (N, C) class logits; target_labels: (N,) class indices.
        """
        # Bounding-box regression loss (Smooth L1)
        bbox_loss = self.smooth_l1_loss(pred_boxes, target_boxes)
        # Label classification loss (cross entropy)
        label_loss = F.cross_entropy(pred_labels, target_labels)
        # IoU-aware loss: encourages higher overlap between predicted and true boxes
        iou_loss = self._iou_loss(pred_boxes, target_boxes)
        # Weighted total
        total_loss = (self.bbox_weight * bbox_loss +
                      self.label_weight * label_loss +
                      0.3 * iou_loss)
        return total_loss

    def _iou_loss(self, pred_boxes, target_boxes):
        """Compute mean (1 - IoU) over paired boxes."""
        # Unpack [x1, y1, x2, y2] columns
        pred_x1, pred_y1, pred_x2, pred_y2 = pred_boxes.T
        target_x1, target_y1, target_x2, target_y2 = target_boxes.T
        # Intersection rectangle
        inter_x1 = torch.max(pred_x1, target_x1)
        inter_y1 = torch.max(pred_y1, target_y1)
        inter_x2 = torch.min(pred_x2, target_x2)
        inter_y2 = torch.min(pred_y2, target_y2)
        inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * \
                     torch.clamp(inter_y2 - inter_y1, min=0)
        # Union area; widths/heights are clamped so degenerate (reversed)
        # boxes cannot produce negative areas and push IoU outside [0, 1]
        pred_area = (torch.clamp(pred_x2 - pred_x1, min=0) *
                     torch.clamp(pred_y2 - pred_y1, min=0))
        target_area = (torch.clamp(target_x2 - target_x1, min=0) *
                       torch.clamp(target_y2 - target_y1, min=0))
        union_area = pred_area + target_area - inter_area
        # IoU = intersection / union (epsilon avoids division by zero)
        iou = inter_area / (union_area + 1e-6)
        # IoU loss = 1 - IoU
        return torch.mean(1 - iou)


# Usage example (self-contained; the original snippet referenced undefined tensors)
criterion = QwenVLDetectionLoss()
pred_boxes = torch.tensor([[10.0, 10.0, 50.0, 50.0]])
pred_labels = torch.tensor([[2.0, 0.5]])
target_boxes = torch.tensor([[12.0, 8.0, 48.0, 52.0]])
target_labels = torch.tensor([0])
loss = criterion(pred_boxes, pred_labels, target_boxes, target_labels)

# 2.2 推理阶段的后处理损失优化
由于Qwen2.5-VL输出的是文本形式的JSON,我们需要设计后处理逻辑来提高定位精度:
import json
import numpy as np
from typing import List, Dict, Tuple


def refine_bbox_predictions(raw_output: str,
                            image_width: int,
                            image_height: int,
                            confidence_threshold: float = 0.7) -> List[Dict]:
    """Post-process the raw text output of Qwen2.5-VL into clean detections.

    Parses the (possibly noisy) JSON array emitted by the model, clips each
    bbox to the image bounds, normalizes corner order, attaches a heuristic
    confidence score and filters by ``confidence_threshold``.

    Returns a list of {'bbox_2d', 'label', 'confidence'} dicts, or an empty
    list when the output cannot be parsed.
    """
    try:
        stripped = raw_output.strip()
        if stripped.startswith('['):
            predictions = json.loads(stripped)
        else:
            # Tolerate extra text around the JSON: take the outermost [...] span
            start_idx = raw_output.find('[')
            end_idx = raw_output.rfind(']') + 1
            if start_idx != -1 and end_idx != 0:
                predictions = json.loads(raw_output[start_idx:end_idx])
            else:
                return []
    except json.JSONDecodeError:
        return []

    # Guard against a well-formed but unexpected payload (e.g. a bare object)
    if not isinstance(predictions, list):
        return []

    refined_predictions = []
    for pred in predictions:
        # Skip malformed entries instead of crashing on them
        if not isinstance(pred, dict) or 'bbox_2d' not in pred:
            continue
        bbox = pred['bbox_2d']
        if not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
            continue
        # Clamp every coordinate into the image (even indices are x, odd are y)
        x1, y1, x2, y2 = [
            max(0, min(int(coord), image_width if i % 2 == 0 else image_height))
            for i, coord in enumerate(bbox)
        ]
        # Normalize corner order (ensure x1 < x2, y1 < y2)
        x1, x2 = min(x1, x2), max(x1, x2)
        y1, y2 = min(y1, y2), max(y1, y2)
        # Heuristic confidence derived from box area (larger box => higher score)
        area_ratio = ((x2 - x1) * (y2 - y1)) / (image_width * image_height)
        confidence = min(0.95, 0.5 + area_ratio * 0.5)
        if confidence >= confidence_threshold:
            refined_predictions.append({
                'bbox_2d': [x1, y1, x2, y2],
                'label': pred.get('label', 'object'),
                'confidence': confidence
            })
    return refined_predictions


# Usage example
raw_response = '''[
    {"bbox_2d": [19, 3, 84, 125], "label": "ice cream"},
    {"bbox_2d": [167, 0, 288, 134], "label": "flip flops"}
]'''
refined = refine_bbox_predictions(raw_response, 1024, 768)
print(f"优化后得到 {len(refined)} 个高质量检测结果")

# 3. 数据增强策略
Qwen2.5-VL的数据增强策略与传统CV模型有所不同。由于它是多模态大模型,数据增强需要同时考虑图像和文本两个维度,重点在于提升模型对各种视觉变化的鲁棒性。
3.1 图像增强策略
针对Qwen2.5-VL的特点,我们推荐以下增强方法:
import cv2
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
import random


class QwenVLDataAugmentation:
    """Joint image/text augmentation for Qwen2.5-VL fine-tuning data.

    Each augmentation fires independently with probability ``p``.
    """

    def __init__(self, p=0.5):
        self.p = p  # per-augmentation application probability

    def __call__(self, image: Image.Image, text_prompt: str) -> tuple:
        """Apply random augmentations to the image and text prompt jointly."""
        augmented_image = image.copy()
        augmented_text = text_prompt
        # Randomly pick image-level augmentations
        if random.random() < self.p:
            augmented_image = self._random_brightness_contrast(augmented_image)
        if random.random() < self.p:
            augmented_image = self._random_noise(augmented_image)
        if random.random() < self.p:
            augmented_image = self._random_blur(augmented_image)
        if random.random() < self.p:
            augmented_image = self._random_occlusion(augmented_image)
        # Text augmentation: synonym replacement and phrasing variation
        if random.random() < self.p:
            augmented_text = self._augment_text(text_prompt)
        return augmented_image, augmented_text

    def _random_brightness_contrast(self, image: Image.Image) -> Image.Image:
        """Randomly jitter brightness and contrast (each within ±20%)."""
        enhancer = ImageEnhance.Brightness(image)
        factor = random.uniform(0.8, 1.2)
        image = enhancer.enhance(factor)
        enhancer = ImageEnhance.Contrast(image)
        factor = random.uniform(0.8, 1.2)
        image = enhancer.enhance(factor)
        return image

    def _random_noise(self, image: Image.Image) -> Image.Image:
        """Add mild Gaussian noise (std ~2.5 on the 0-255 scale)."""
        img_array = np.array(image)
        noise = np.random.normal(0, 0.01, img_array.shape)
        noisy_img = np.clip(img_array + noise * 255, 0, 255).astype(np.uint8)
        return Image.fromarray(noisy_img)

    def _random_blur(self, image: Image.Image) -> Image.Image:
        """Apply Gaussian blur with a random radius (0 = no blur)."""
        blur_radius = random.choice([0, 1, 2, 3])
        if blur_radius > 0:
            image = image.filter(ImageFilter.GaussianBlur(blur_radius))
        return image

    def _random_occlusion(self, image: Image.Image) -> Image.Image:
        """Paint a random gray rectangle (5%-20% of each side) over the image."""
        img_array = np.array(image)
        h, w = img_array.shape[:2]
        # Pick occlusion size and position
        occlusion_h = random.randint(h // 20, h // 5)
        occlusion_w = random.randint(w // 20, w // 5)
        x = random.randint(0, w - occlusion_w)
        y = random.randint(0, h - occlusion_h)
        # Fill with neutral gray
        img_array[y:y + occlusion_h, x:x + occlusion_w] = 128
        return Image.fromarray(img_array)

    def _augment_text(self, text: str) -> str:
        """Text augmentation: synonym replacement preserving punctuation/case."""
        # Simple synonym substitution (a real pipeline could use an NLP library)
        replacements = {
            'locate': ['find', 'detect', 'identify', 'spot'],
            'every': ['all', 'each', 'the'],
            'output': ['return', 'provide', 'generate', 'give'],
            'coordinates': ['positions', 'locations', 'points', 'boxes']
        }
        words = text.split()
        augmented_words = []
        for word in words:
            clean_word = word.strip('.,!?;:')
            if clean_word.lower() in replacements:
                replacement = random.choice(replacements[clean_word.lower()])
                # Preserve capitalization of the original word
                if clean_word[0].isupper():
                    replacement = replacement.capitalize()
                augmented_words.append(replacement + word[len(clean_word):])
            else:
                augmented_words.append(word)
        return ' '.join(augmented_words)


# Usage example — guarded so importing this module does not require sample.jpg
if __name__ == "__main__":
    augmentor = QwenVLDataAugmentation(p=0.7)
    original_image = Image.open("sample.jpg")
    original_prompt = "Locate every cake and describe their features, output the bbox coordinates in JSON format."
    augmented_image, augmented_prompt = augmentor(original_image, original_prompt)
    print(f"增强后的提示:{augmented_prompt}")

# 3.2 提示工程增强
Qwen2.5-VL的核心优势在于其强大的提示理解能力,因此提示工程本身就是一种重要的"数据增强":
class PromptEngineeringEnhancer:
    """Generates diverse phrasings of a detection prompt.

    Prompt variation acts as a form of "data augmentation" for Qwen2.5-VL:
    the same request is expressed in several instruction styles, levels of
    detail and constraint sets.
    """

    def __init__(self):
        self.variations = [
            # Different instruction styles
            "Find all instances of {object} in the image and provide bounding box coordinates.",
            "Detect {object} in this picture and return precise location information.",
            "Where are all the {object} located? Give me exact pixel coordinates.",
            "Identify every {object} and output their positions as [x1,y1,x2,y2] format.",
            # Different levels of detail
            "Locate {object} and describe their features including size, color, and position.",
            "Find {object} and give me bounding boxes with labels.",
            "Just output the bounding box coordinates for {object} in JSON format.",
            # Different constraints
            "Only detect {object} that are fully visible in the image.",
            "Find {object} and ignore any partially occluded ones.",
            "Detect {object} with high confidence, filter out uncertain detections."
        ]

    def generate_variations(self, base_prompt: str, objects: List[str]) -> List[str]:
        """Return de-duplicated prompt variants for `base_prompt` and `objects`.

        The output order is deterministic: templated variants per object
        (in template order), then rewrites of the base prompt. The original
        ``list(set(...))`` made the order vary between runs.
        """
        variations = []
        # Object-specific variants from the templates
        for obj in objects:
            for variation in self.variations:
                if '{object}' in variation:
                    variations.append(variation.format(object=obj))
        # Rewrites of the base prompt itself
        variations.append(base_prompt)
        variations.append(base_prompt.replace("Locate", "Find"))
        variations.append(base_prompt.replace("output", "return"))
        variations.append(base_prompt + " Be precise and accurate.")
        # De-duplicate while preserving insertion order (dicts keep order)
        return list(dict.fromkeys(variations))


# Usage example
enhancer = PromptEngineeringEnhancer()
base_prompt = "Locate every cake and describe their features, output the bbox coordinates in JSON format."
objects = ["cake", "cupcake", "pastry"]
prompt_variations = enhancer.generate_variations(base_prompt, objects)
print(f"生成了 {len(prompt_variations)} 种不同的提示变体")

# 4. 模型蒸馏实践
Qwen2.5-VL系列包含3B、7B、72B等多个参数规模的模型,模型蒸馏可以帮助我们在保持性能的同时降低部署成本。这里介绍两种实用的蒸馏策略:
4.1 轻量级模型微调蒸馏
使用Qwen2.5-VL-7B作为学生模型,Qwen2.5-VL-72B作为教师模型:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset, DataLoader


class QwenVLDistillationDataset(Dataset):
    """Dataset pairing images/prompts with teacher-model outputs (soft labels)."""

    def __init__(self, image_paths, prompts, teacher_outputs,
                 tokenizer, image_processor, max_length=512):
        self.image_paths = image_paths
        self.prompts = prompts
        self.teacher_outputs = teacher_outputs
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.max_length = max_length

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load and preprocess the image
        # NOTE(review): relies on PIL's `Image` being imported elsewhere in the file
        image = self.image_processor(
            Image.open(self.image_paths[idx]).convert('RGB')
        )
        # Tokenize the prompt
        prompt = self.prompts[idx]
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length,
            padding="max_length"
        )
        # Teacher output tokens serve as soft labels
        soft_label = self.tokenizer.encode(
            self.teacher_outputs[idx],
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length,
            padding="max_length"
        )
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': soft_label.squeeze(),
            'pixel_values': image.pixel_values.squeeze()
        }


class DistillationLoss(nn.Module):
    """Hard-label cross entropy plus temperature-scaled KL distillation."""

    def __init__(self, alpha=0.7, temperature=2.0):
        super().__init__()
        self.alpha = alpha              # weight of the hard-label term
        self.temperature = temperature  # softening temperature
        self.ce_loss = nn.CrossEntropyLoss()
        self.kl_loss = nn.KLDivLoss(reduction='batchmean')

    def forward(self, student_logits, teacher_logits, labels):
        # Hard-label loss against the ground-truth tokens
        hard_loss = self.ce_loss(
            student_logits.view(-1, student_logits.size(-1)),
            labels.view(-1)
        )
        # Distillation loss: KL divergence between softened distributions
        student_log_probs = torch.log_softmax(student_logits / self.temperature, dim=-1)
        teacher_probs = torch.softmax(teacher_logits / self.temperature, dim=-1)
        distillation_loss = self.kl_loss(student_log_probs, teacher_probs)
        # T^2 rescales the KL gradient back to the hard-loss magnitude
        return (self.alpha * hard_loss
                + (1 - self.alpha) * distillation_loss * (self.temperature ** 2))


def distill_qwen_vl(teacher_model, student_model, train_dataloader,
                    optimizer, device, distillation_loss):
    """Run one distillation epoch; returns the mean loss over the dataloader."""
    student_model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Move the batch to the target device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)
        # Teacher forward pass (no gradients needed)
        with torch.no_grad():
            teacher_outputs = teacher_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values,
                labels=labels
            )
            teacher_logits = teacher_outputs.logits
        # Student forward pass
        student_outputs = student_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            labels=labels
        )
        student_logits = student_outputs.logits
        # Distillation loss and optimizer step
        loss = distillation_loss(student_logits, teacher_logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_dataloader)

# 4.2 知识蒸馏的实用技巧
在实际蒸馏过程中,我们发现以下技巧特别有效:
class PracticalDistillationTricks:
    """Tricks that make Qwen2.5-VL distillation work better in practice."""

    @staticmethod
    def adaptive_temperature_schedule(epoch: int, total_epochs: int) -> float:
        """Adaptive temperature schedule: hot early, cool late.

        High temperature early in training transfers broad knowledge;
        low temperature later focuses the student on fine detail.
        """
        if epoch < total_epochs * 0.3:
            return 4.0  # early phase: high temperature aids knowledge transfer
        elif epoch < total_epochs * 0.7:
            return 2.0  # mid phase: moderate temperature
        else:
            return 1.0  # late phase: low temperature, focus on details

    @staticmethod
    def selective_distillation_mask(teacher_logits: torch.Tensor,
                                    student_logits: torch.Tensor,
                                    threshold: float = 0.1) -> torch.Tensor:
        """Mask keeping only positions where the teacher is confident.

        Returns a float mask with a trailing singleton dim so it can be
        broadcast against the logits.
        """
        # Teacher's per-position max softmax probability
        teacher_probs = torch.softmax(teacher_logits, dim=-1)
        max_probs, _ = torch.max(teacher_probs, dim=-1)
        # 1.0 where the teacher is confident enough, else 0.0
        mask = (max_probs > threshold).float()
        return mask.unsqueeze(-1)  # expand dims to match the logits

    @staticmethod
    def feature_map_distillation(student_features: torch.Tensor,
                                 teacher_features: torch.Tensor,
                                 weight: float = 0.2) -> torch.Tensor:
        """MSE distillation on intermediate visual-encoder features."""
        feature_loss = torch.mean((student_features - teacher_features) ** 2)
        return weight * feature_loss

    @staticmethod
    def consistency_regularization(student_outputs: List[torch.Tensor],
                                   weight: float = 0.1) -> torch.Tensor:
        """Penalize disagreement between outputs for augmented views of one input."""
        if len(student_outputs) < 2:
            return torch.tensor(0.0)
        # Pairwise (1 - cosine similarity)^2 over all output pairs
        consistency_loss = 0
        for i in range(len(student_outputs)):
            for j in range(i + 1, len(student_outputs)):
                cos_sim = torch.nn.functional.cosine_similarity(
                    student_outputs[i].flatten(),
                    student_outputs[j].flatten(),
                    dim=0
                )
                consistency_loss += (1 - cos_sim) ** 2
        # Normalize by the number of pairs
        num_pairs = len(student_outputs) * (len(student_outputs) - 1) / 2
        return weight * consistency_loss / num_pairs


# Usage example
tricks = PracticalDistillationTricks()
current_temp = tricks.adaptive_temperature_schedule(epoch=10, total_epochs=50)
print(f"当前蒸馏温度:{current_temp}")

# Inside a training loop (epoch / total_epochs / distillation_loss come from that
# loop; the original snippet referenced them at module level, raising NameError):
#     if epoch % 5 == 0:
#         current_temp = tricks.adaptive_temperature_schedule(epoch, total_epochs)
#         distillation_loss.temperature = current_temp

# 5. 超参数调优指南
Qwen2.5-VL的超参数调优需要特别关注其多模态特性,以下是我们经过大量实验验证的有效策略:
5.1 关键超参数设置
class QwenVLHyperparameterTuner:
    """Hyperparameter presets and a lightweight random search for Qwen2.5-VL."""

    def __init__(self):
        # Practical ranges distilled from extensive experiments
        self.param_ranges = {
            'learning_rate': [1e-6, 5e-5],
            'batch_size': [1, 4, 8],
            'num_epochs': [3, 5, 10],
            'warmup_ratio': [0.05, 0.1, 0.2],
            'weight_decay': [0.01, 0.05, 0.1],
            'gradient_accumulation_steps': [4, 8, 16],
            'max_length': [256, 512, 1024],
            'temperature': [0.1, 0.5, 1.0],
            'top_p': [0.8, 0.9, 0.95]
        }

    def get_optimal_config(self, task_type: str = "detection") -> dict:
        """Return the recommended configuration for ``task_type``.

        Unknown task types fall back to the 'detection' preset.
        """
        configs = {
            'detection': {
                'learning_rate': 2e-5,
                'batch_size': 4,
                'num_epochs': 5,
                'warmup_ratio': 0.1,
                'weight_decay': 0.01,
                'gradient_accumulation_steps': 8,
                'max_length': 512,
                'temperature': 0.1,  # low temperature improves localization precision
                'top_p': 0.9
            },
            'ocr': {
                'learning_rate': 1e-5,
                'batch_size': 2,
                'num_epochs': 3,
                'warmup_ratio': 0.05,
                'weight_decay': 0.05,
                'gradient_accumulation_steps': 16,
                'max_length': 1024,
                'temperature': 0.01,  # near-greedy decoding for text fidelity
                'top_p': 0.8
            },
            'document_parsing': {
                'learning_rate': 5e-6,
                'batch_size': 1,
                'num_epochs': 10,
                'warmup_ratio': 0.2,
                'weight_decay': 0.1,
                'gradient_accumulation_steps': 16,
                'max_length': 1024,
                'temperature': 0.2,
                'top_p': 0.95
            }
        }
        return configs.get(task_type, configs['detection'])

    def grid_search(self, model, train_dataset, val_dataset,
                    param_grid: dict = None) -> dict:
        """Random-sampled search over ``param_grid`` (a full grid is too costly).

        Returns {'best_params': ..., 'best_score': ...}. The evaluation step
        is simulated; replace ``_simulate_evaluation`` with real training/eval.
        """
        if param_grid is None:
            param_grid = self.param_ranges
        best_score = float('-inf')
        best_params = {}
        param_names = list(param_grid.keys())
        param_values = list(param_grid.values())
        # Sample 20 random combinations instead of the full cartesian product
        import random
        sampled_combinations = []
        for _ in range(20):
            combo = {name: random.choice(values)
                     for name, values in zip(param_names, param_values)}
            sampled_combinations.append(combo)
        for params in sampled_combinations:
            # Placeholder evaluation; swap in real train+validate here
            score = self._simulate_evaluation(model, train_dataset, val_dataset, params)
            if score > best_score:
                best_score = score
                best_params = params
        return {'best_params': best_params, 'best_score': best_score}

    def _simulate_evaluation(self, model, train_dataset, val_dataset, params) -> float:
        """Simulated metric in [0.6, 0.9] (replace with real mAP / F1 evaluation)."""
        import random
        return random.uniform(0.6, 0.9)


# Usage example
tuner = QwenVLHyperparameterTuner()
optimal_config = tuner.get_optimal_config("detection")
print("目标检测任务推荐超参数配置:")
for k, v in optimal_config.items():
    print(f"  {k}: {v}")

# 5.2 学习率调度策略
针对Qwen2.5-VL的多模态特性,我们推荐以下学习率调度策略:
import math
import torch
from torch.optim.lr_scheduler import LambdaLR


class QwenVLLearningRateScheduler:
    """Learning-rate schedules suited to Qwen2.5-VL multimodal fine-tuning."""

    @staticmethod
    def cosine_with_warmup(optimizer, num_training_steps: int,
                           num_warmup_steps: int = 0,
                           min_lr_ratio: float = 0.1,
                           last_epoch: int = -1):
        """Cosine-annealing schedule with linear warmup.

        The lr multiplier rises linearly from 0 to 1 over ``num_warmup_steps``,
        then decays along a cosine curve, floored at ``min_lr_ratio``.
        """
        def lr_lambda(current_step: int):
            if current_step < num_warmup_steps:
                return float(current_step) / float(max(1, num_warmup_steps))
            progress = float(current_step - num_warmup_steps) / float(
                max(1, num_training_steps - num_warmup_steps)
            )
            return max(
                min_lr_ratio,
                0.5 * (1.0 + math.cos(math.pi * progress))
            )

        return LambdaLR(optimizer, lr_lambda, last_epoch)

    @staticmethod
    def layerwise_learning_rate_decay(optimizer, num_layers: int,
                                      decay_rate: float = 0.95,
                                      model=None):
        """Conceptual example of per-module learning rates.

        Builds an AdamW whose vision-encoder parameters train faster than the
        language-model parameters. ``model`` is now an explicit argument (the
        original referenced an undefined global). ``num_layers``/``decay_rate``
        are kept for interface compatibility; a true per-layer decay would
        scale each layer's lr by ``decay_rate ** depth``.
        """
        if model is None:
            raise ValueError("layerwise_learning_rate_decay requires the model "
                             "whose parameters should be grouped")
        visual_params = []    # vision-encoder parameters (higher lr)
        language_params = []  # language-model parameters (lower lr)
        # Split parameters by (heuristic) module name
        for name, param in model.named_parameters():
            if 'vision' in name or 'encoder' in name:
                visual_params.append(param)
            else:
                language_params.append(param)
        param_groups = [
            {'params': visual_params, 'lr': 2e-5},
            {'params': language_params, 'lr': 1e-5}
        ]
        return torch.optim.AdamW(param_groups)

    @staticmethod
    def adaptive_lr_on_plateau(optimizer, patience: int = 3,
                               factor: float = 0.5, min_lr: float = 1e-7):
        """Reduce lr when the (maximized) validation metric plateaus.

        The deprecated ``verbose`` flag was dropped for compatibility with
        current PyTorch releases.
        """
        from torch.optim.lr_scheduler import ReduceLROnPlateau
        return ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=factor,
            patience=patience,
            min_lr=min_lr
        )


# Usage example (stand-in model; substitute the real Qwen2.5-VL model —
# the original snippet referenced an undefined `model`)
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = QwenVLLearningRateScheduler.cosine_with_warmup(
    optimizer, num_training_steps=1000, num_warmup_steps=100
)

# 6. 完整训练与评估脚本
以下是完整的端到端训练和评估脚本,整合了前面介绍的所有优化技术:
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Qwen2.5-VL目标检测优化训练脚本 """ import os import json import time import logging import argparse from pathlib import Path from typing import Dict, List, Tuple, Optional import torch import torch.nn as nn from torch.utils.data import Dataset, DataLoader from torch.cuda.amp import autocast, GradScaler from transformers import ( AutoProcessor, AutoModelForVision2Seq, TrainingArguments, Trainer, set_seed ) from datasets import load_dataset # 配置日志 logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) class QwenVLDetectionDataset(Dataset): """Qwen2.5-VL目标检测数据集""" def __init__(self, dataset_path: str, processor, max_length: int = 512): self.dataset = load_dataset(dataset_path) self.processor = processor self.max_length = max_length def __len__(self): return len(self.dataset['train']) def __getitem__(self, idx): item = self.dataset['train'][idx] # 处理图像 image = item['image'] if hasattr(image, 'convert'): image = image.convert('RGB') # 构建提示 prompt = self._build_detection_prompt(item) # 处理文本 text = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" # 使用processor处理多模态输入 inputs = self.processor( text=text, images=image, return_tensors="pt", truncation=True, max_length=self.max_length, padding="max_length" ) # 准备标签(用于训练) # 注意:Qwen2.5-VL使用自回归方式,标签是文本的token ids labels = self.processor.tokenizer( item['ground_truth_json'], # 假设数据集中有ground truth JSON return_tensors="pt", truncation=True, max_length=self.max_length, padding="max_length" ).input_ids # 设置标签,-100表示忽略位置(遵循transformers惯例) labels[labels == self.processor.tokenizer.pad_token_id] = -100 inputs['labels'] = labels.squeeze() return { 'input_ids': inputs['input_ids'].squeeze(), 'attention_mask': inputs['attention_mask'].squeeze(), 'pixel_values': inputs['pixel_values'].squeeze(), 'labels': inputs['labels'].squeeze() } def _build_detection_prompt(self, item) -> str: """构建检测提示""" # 根据数据集内容动态构建提示 if 
'objects' in item: objects = ', '.join(item['objects']) return f"Locate all {objects} in the image and output bounding box coordinates in JSON format." else: return "Locate every object in the image and output bounding box coordinates in JSON format." def compute_metrics(eval_pred): """计算评估指标""" predictions, labels = eval_pred # 解码预测和标签 decoded_preds = processor.tokenizer.batch_decode(predictions, skip_special_tokens=True) decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True) # 计算准确率(简单字符串匹配) correct = 0 total = len(decoded_preds) for pred, label in zip(decoded_preds, decoded_labels): # 简单的JSON格式检查 if pred.strip().startswith('[') and label.strip().startswith('['): correct += 1 return {'accuracy': correct / total} def main(): parser = argparse.ArgumentParser(description='Qwen2.5-VL目标检测优化训练') parser.add_argument('--model_name', type=str, default='Qwen/Qwen2.5-VL-7B-Instruct', help='预训练模型名称') parser.add_argument('--dataset_path', type=str, required=True, help='数据集路径') parser.add_argument('--output_dir', type=str, default='./qwen25vl-detection-finetuned', help='输出目录') parser.add_argument('--learning_rate', type=float, default=2e-5, help='学习率') parser.add_argument('--per_device_train_batch_size', type=int, default=2, help='每设备训练批次大小') parser.add_argument('--num_train_epochs', type=int, default=5, help='训练轮数') parser.add_argument('--warmup_ratio', type=float, default=0.1, help='预热比例') parser.add_argument('--save_steps', type=int, default=500, help='保存步骤') parser.add_argument('--eval_steps', type=int, default=250, help='评估步骤') parser.add_argument('--seed', type=int, default=42, help='随机种子') args = parser.parse_args() # 设置随机种子 set_seed(args.seed) # 加载处理器和模型 logger.info(f"Loading processor and model: {args.model_name}") processor = AutoProcessor.from_pretrained(args.model_name) model = AutoModelForVision2Seq.from_pretrained( args.model_name, torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16, low_cpu_mem_usage=True 
) # 准备数据集 logger.info("Preparing dataset...") train_dataset = QwenVLDetectionDataset( args.dataset_path, processor, max_length=1024 ) # 训练参数 training_args = TrainingArguments( output_dir=args.output_dir, per_device_train_batch_size=args.per_device_train_batch_size, per_device_eval_batch_size=args.per_device_train_batch_size, num_train_epochs=args.num_train_epochs, warmup_ratio=args.warmup_ratio, learning_rate=args.learning_rate, fp16=not torch.cuda.is_bf16_supported(), bf16=torch.cuda.is_bf16_supported(), save_steps=args.save_steps, eval_steps=args.eval_steps, logging_steps=