
xtuner errors on MetaX GPUs: ValueError: 'K' is not in list #1196

@fak111

Description


Machine environment

(screenshot: MetaX machine environment details)

Steps

# refresh the package index
apt update

# install git
apt install -y git

git clone https://github.com/fak111/xtuner.git
cd xtuner
pip install -e . -i https://pypi.mirrors.ustc.edu.cn/simple/

# download the model and the dataset
modelscope download --model OpenGVLab/InternVL3_5-1B-HF --local_dir /root/data/model/InternVL3_5-1B-HF
modelscope download --dataset JimmyMa99/VLM-formula-recognition-dataset_intern_camp --local_dir ./VLM-formula-recognition-dataset_intern_camp

# launch single-GPU training
torchrun --nproc-per-node 1 vl.py
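Before launching, it is worth confirming that the MetaX device is visible to PyTorch at all; a minimal sanity check, assuming the MetaX software stack exposes the GPU through the standard torch.cuda API:

import torch

print(torch.__version__)               # PyTorch build shipped in this image
print(torch.cuda.is_available())       # should be True if the device is visible
print(torch.cuda.get_device_name(0))   # reported device name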

Error

(screenshot: traceback ending in ValueError: 'K' is not in list)
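For context, this is the generic message Python raises when list.index() cannot find an element; a minimal illustration of the error class (hypothetical names, not the actual xtuner call site):

dims = ["B", "S", "D"]  # hypothetical list of dimension names
dims.index("K")         # raises: ValueError: 'K' is not in list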

vl.py

from xtuner.v1.model import InternVL3P5Dense1BConfig
from xtuner.v1.train import Trainer, TrainerConfig
from xtuner.v1.config import AdamWConfig, LRConfig
from xtuner.v1.datasets import InternS1VLTokenizeFnConfig, DataloaderConfig, DatasetConfig
from xtuner.v1.loss import CELossConfig
import sys
# model config - enable gradient checkpointing
model_cfg = InternVL3P5Dense1BConfig(use_gradient_checkpointing=True)  # added this line

# dataset and dataloader config
sample_max_length = 1024
pack_max_length = 2048

dataset_config = [
    {
        "dataset": DatasetConfig(
            name="formula_recognition",
            anno_path="/root/share/datasets/VLM-formula-recognition-dataset_intern_camp/train/train_mini_abs.jsonl",
            media_root="/root/share/datasets/VLM-formula-recognition-dataset_intern_camp/train/",
            sample_ratio=1.0,
            class_name="VLMJsonlDataset",
        ),
        # use the InternVL3.5 template so the prompt aligns with the visual tokens
        "tokenize_fn": InternS1VLTokenizeFnConfig(
            model_cfg=model_cfg,
            max_length=sample_max_length,
            template_name="internvl-3.5",
        ),
    }
]
dataloader_config = DataloaderConfig(
    dataset_config_list=dataset_config,
    pack_max_length=pack_max_length,
    num_workers=8,
    pack_level="soft",
    collator="intern_s1_vl_sft_collator",
)

# optimizer / learning-rate config - raise the learning rate to speed up convergence
optim_cfg = AdamWConfig(
    lr=3e-5,           # raised from 1e-6 to 3e-5
    weight_decay=0.01, # weight decay to curb overfitting
    betas=(0.9, 0.95), # tuned Adam betas
    foreach=False
)
lr_cfg = LRConfig(
    lr_type="cosine",
    warmup_ratio=0.1,  # larger warmup ratio so training starts more stably
    min_lr_ratio=0.1   # minimum LR as a fraction of the peak LR
)

load_from = "/root/share/new_models/InternVL3.5/InternVL3_5-1B-HF"
tokenizer = "/root/share/new_models/InternVL3.5/InternVL3_5-1B-HF"

# trainer config
trainer = TrainerConfig(
    load_from=load_from,
    model_cfg=model_cfg,
    optim_cfg=optim_cfg,
    dataloader_cfg=dataloader_config,
    lr_cfg=lr_cfg,
    tokenizer_path=tokenizer,
    global_batch_size=16,
    gradient_accumulation_steps=2,
    total_epoch=10,
    work_dir="/root/user/xtuner/test_pr/vlhf/",
    loss_cfg=CELossConfig(mode="chunk", chunk_size=1024),
    # strict_load=False,  # add this line
)
trainer = Trainer.from_config(trainer)
# check that the pretrained weights were loaded correctly
print(f"Model device: {next(trainer._engine.model.parameters()).device}")
print(f"Model dtype: {next(trainer._engine.model.parameters()).dtype}")
# sys.exit(0)
trainer.fit()
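Independently of xtuner, the checkpoint directory can be sanity-checked with a plain transformers load; a minimal sketch, assuming transformers is installed, using the same path as load_from above:

from transformers import AutoConfig

# load only the config to confirm the checkpoint directory is intact
cfg = AutoConfig.from_pretrained(
    "/root/share/new_models/InternVL3.5/InternVL3_5-1B-HF",
    trust_remote_code=True,
)
print(cfg.model_type)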

