From 570548ae9ef0aa8e6d23380550cb8b424242b534 Mon Sep 17 00:00:00 2001
From: sanghui_ilu
Date: Thu, 27 Feb 2025 12:20:32 +0800
Subject: [PATCH] add Yi-VL-6B link #IBGZE4

add Yi-VL-6B
---
 nlp/llm/Yi-VL-6B/LLaMA-Factory/README.md  | 63 +++++++++++++++++++
 .../LLaMA-Factory/yi_vl_6b_full_sft.yaml  | 45 +++++++++++++
 .../LLaMA-Factory/yi_vl_6b_lora_sft.yaml  | 42 +++++++++++++
 3 files changed, 150 insertions(+)
 create mode 100644 nlp/llm/Yi-VL-6B/LLaMA-Factory/README.md
 create mode 100644 nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_full_sft.yaml
 create mode 100644 nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_lora_sft.yaml

diff --git a/nlp/llm/Yi-VL-6B/LLaMA-Factory/README.md b/nlp/llm/Yi-VL-6B/LLaMA-Factory/README.md
new file mode 100644
index 000000000..382212723
--- /dev/null
+++ b/nlp/llm/Yi-VL-6B/LLaMA-Factory/README.md
@@ -0,0 +1,63 @@
+# Yi-VL-6B SFT (LLaMA-Factory)
+
+## Model description
+
+The Yi Visual Language (Yi-VL) model is the open-source, multimodal version of the Yi Large Language Model (LLM) series. It enables content comprehension, recognition, and multi-round conversations about images, and it ranks first among open-source models on benchmarks such as MMMU (English) and CMMMU (Chinese), based on data available up to January 2024.
+
+Yi-VL offers the following features:
+
+- Multi-round text-image conversations: Yi-VL accepts both text and images as input and produces text output. It currently supports multi-round visual question answering with one image.
+- Bilingual text support: Yi-VL supports conversations in both English and Chinese, including text recognition in images.
+- Strong image comprehension: Yi-VL is adept at analyzing visuals, making it an efficient tool for tasks such as extracting, organizing, and summarizing information from images.
+- Fine-grained image resolution: Yi-VL supports image understanding at a resolution of 448×448.
+
+## Step 1: Installation
+
+```sh
+git clone -b main https://github.com/hiyouga/LLaMA-Factory.git
+git -C LLaMA-Factory/ checkout 1481af5dc9bc99807ae0ee5a438bf0a279cafb66
+
+cp yi_vl_6b_full_sft.yaml LLaMA-Factory/examples/train_full/
+cp yi_vl_6b_lora_sft.yaml LLaMA-Factory/examples/train_lora/
+
+cd LLaMA-Factory/
+pip3 install -r requirements.txt
+pip3 install --no-deps -e .
+```
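+
+Optionally, you can verify that the CLI entry point was registered by the editable install before moving on; `llamafactory-cli version` should simply print the installed LLaMA-Factory version:
+
+```sh
+# Quick sanity check of the installation (optional)
+llamafactory-cli version
+```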
+
+## Step 2: Preparing model
+
+```sh
+mkdir -p /home/model_zoos/nlp/Yi-VL-6B-hf
+
+# Download the Yi-VL-6B-hf model (https://huggingface.co/BUAADreamer/Yi-VL-6B-hf)
+# and place it in /home/model_zoos/nlp/Yi-VL-6B-hf.
+```
+
+## Step 3: Training
+
+```sh
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+```
+
+### Full SFT
+
+```sh
+llamafactory-cli train examples/train_full/yi_vl_6b_full_sft.yaml
+```
+
+### LoRA SFT
+
+```sh
+llamafactory-cli train examples/train_lora/yi_vl_6b_lora_sft.yaml
+```
+
+## Results
+
+| GPUs        | Model    | Type | train_samples_per_second |
+|-------------|----------|------|--------------------------|
+| BI-V150 x 8 | Yi-VL-6B | full | 0.546                    |
+| BI-V150 x 8 | Yi-VL-6B | lora | 2.474                    |
+
+## Reference
+
+- [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)

diff --git a/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_full_sft.yaml b/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_full_sft.yaml
new file mode 100644
index 000000000..214dcde09
--- /dev/null
+++ b/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_full_sft.yaml
@@ -0,0 +1,45 @@
+### model
+model_name_or_path: /home/model_zoos/nlp/Yi-VL-6B-hf
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+freeze_vision_tower: true  # choices: [true, false]
+freeze_multi_modal_projector: true  # choices: [true, false]
+train_mm_proj_only: false  # choices: [true, false]
+deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
+
+### dataset
+dataset: mllm_demo
+template: yi_vl
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/Yi-VL-6B/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 1.0e-5
+num_train_epochs: 10
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.01
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500

diff --git a/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_lora_sft.yaml b/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_lora_sft.yaml
new file mode 100644
index 000000000..33e436607
--- /dev/null
+++ b/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_lora_sft.yaml
@@ -0,0 +1,42 @@
+### model
+model_name_or_path: /home/model_zoos/nlp/Yi-VL-6B-hf
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+
+### dataset
+dataset: mllm_demo
+template: llava
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/Yi-VL-6B/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.01
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--
Gitee
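As a usage note for the LoRA recipe above: LLaMA-Factory's `llamafactory-cli export` command can merge a trained adapter back into the base weights. The config below is a minimal sketch, assuming the standard merge-LoRA keys (`adapter_name_or_path`, `export_dir`, and so on); the adapter path follows the `output_dir` used in `yi_vl_6b_lora_sft.yaml`, and the export directory is an illustrative choice.

```yaml
### model
model_name_or_path: /home/model_zoos/nlp/Yi-VL-6B-hf
adapter_name_or_path: saves/Yi-VL-6B/lora/sft  # output_dir of the LoRA run above
template: llava                                # keep consistent with the training template
finetuning_type: lora
trust_remote_code: true

### export
export_dir: saves/Yi-VL-6B/lora/merged         # illustrative destination
export_size: 2
export_device: cpu
export_legacy_format: false
```

Run the merge with `llamafactory-cli export <path-to-this-yaml>` from the LLaMA-Factory directory.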