From 570548ae9ef0aa8e6d23380550cb8b424242b534 Mon Sep 17 00:00:00 2001
From: sanghui_ilu
Date: Thu, 27 Feb 2025 12:20:32 +0800
Subject: [PATCH] add Yi-VL-6B link #IBGZE4

add Yi-VL-6B
---
 nlp/llm/Yi-VL-6B/LLaMA-Factory/README.md  | 63 +++++++++++++++++++
 .../LLaMA-Factory/yi_vl_6b_full_sft.yaml  | 45 +++++++++++++
 .../LLaMA-Factory/yi_vl_6b_lora_sft.yaml  | 42 +++++++++++++
 3 files changed, 150 insertions(+)
 create mode 100644 nlp/llm/Yi-VL-6B/LLaMA-Factory/README.md
 create mode 100644 nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_full_sft.yaml
 create mode 100644 nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_lora_sft.yaml

diff --git a/nlp/llm/Yi-VL-6B/LLaMA-Factory/README.md b/nlp/llm/Yi-VL-6B/LLaMA-Factory/README.md
new file mode 100644
index 000000000..382212723
--- /dev/null
+++ b/nlp/llm/Yi-VL-6B/LLaMA-Factory/README.md
@@ -0,0 +1,63 @@
+# Yi-VL-6B SFT (LLaMA-Factory)
+
+## Model description
+
+The Yi Visual Language (Yi-VL) model is the open-source, multimodal version of the Yi Large Language Model (LLM) series. It enables content comprehension, recognition, and multi-round conversations about images, and it ranks first among open-source models on benchmarks such as MMMU (English) and CMMMU (Chinese), based on data available up to January 2024.
+
+Yi-VL offers the following features:
+
+- Multi-round text-image conversations: Yi-VL accepts both text and images as input and produces text output. It currently supports multi-round visual question answering with one image.
+- Bilingual text support: Yi-VL supports conversations in both English and Chinese, including text recognition in images.
+- Strong image comprehension: Yi-VL is adept at analyzing visuals, making it an efficient tool for tasks such as extracting, organizing, and summarizing information from images.
+- Fine-grained image resolution: Yi-VL supports image understanding at a resolution of 448×448.
+
+## Step 1: Installation
+
+```sh
+git clone -b main https://github.com/hiyouga/LLaMA-Factory.git
+git -C LLaMA-Factory/ checkout 1481af5dc9bc99807ae0ee5a438bf0a279cafb66
+
+cp yi_vl_6b_full_sft.yaml LLaMA-Factory/examples/train_full/
+cp yi_vl_6b_lora_sft.yaml LLaMA-Factory/examples/train_lora/
+
+cd LLaMA-Factory/
+pip3 install -r requirements.txt
+pip3 install --no-deps -e .
+```
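+
+Optionally, you can verify that the CLI entry point was registered by the editable install before moving on; `llamafactory-cli version` should simply print the installed LLaMA-Factory version:
+
+```sh
+# Quick sanity check of the installation (optional)
+llamafactory-cli version
+```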
+
+## Step 2: Preparing model
+
+```sh
+mkdir -p /home/model_zoos/nlp/Yi-VL-6B-hf
+
+# Download the Yi-VL-6B-hf model (https://huggingface.co/BUAADreamer/Yi-VL-6B-hf)
+# and place it in /home/model_zoos/nlp/Yi-VL-6B-hf.
+```
+
+## Step 3: Training
+
+```sh
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+```
+
+### Full SFT
+
+```sh
+llamafactory-cli train examples/train_full/yi_vl_6b_full_sft.yaml
+```
+
+### LoRA SFT
+
+```sh
+llamafactory-cli train examples/train_lora/yi_vl_6b_lora_sft.yaml
+```
+
+## Results
+
+| GPUs        | Model    | Type | train_samples_per_second |
+|-------------|----------|------|--------------------------|
+| BI-V150 x 8 | Yi-VL-6B | full | 0.546                    |
+| BI-V150 x 8 | Yi-VL-6B | lora | 2.474                    |
+
+## Reference
+
+- [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)

diff --git a/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_full_sft.yaml b/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_full_sft.yaml
new file mode 100644
index 000000000..214dcde09
--- /dev/null
+++ b/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_full_sft.yaml
@@ -0,0 +1,45 @@
+### model
+model_name_or_path: /home/model_zoos/nlp/Yi-VL-6B-hf
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+freeze_vision_tower: true  # choices: [true, false]
+freeze_multi_modal_projector: true  # choices: [true, false]
+train_mm_proj_only: false  # choices: [true, false]
+deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
+
+### dataset
+dataset: mllm_demo
+template: yi_vl
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/Yi-VL-6B/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 1.0e-5
+num_train_epochs: 10
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.01
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500

diff --git a/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_lora_sft.yaml b/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_lora_sft.yaml
new file mode 100644
index 000000000..33e436607
--- /dev/null
+++ b/nlp/llm/Yi-VL-6B/LLaMA-Factory/yi_vl_6b_lora_sft.yaml
@@ -0,0 +1,42 @@
+### model
+model_name_or_path: /home/model_zoos/nlp/Yi-VL-6B-hf
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+
+### dataset
+dataset: mllm_demo
+template: llava
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/Yi-VL-6B/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.01
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--
Gitee
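As a usage note for the LoRA recipe above: LLaMA-Factory's `llamafactory-cli export` command can merge a trained adapter back into the base weights. The config below is a minimal sketch, assuming the standard merge-LoRA keys (`adapter_name_or_path`, `export_dir`, and so on); the adapter path follows the `output_dir` used in `yi_vl_6b_lora_sft.yaml`, and the export directory is an illustrative choice.

```yaml
### model
model_name_or_path: /home/model_zoos/nlp/Yi-VL-6B-hf
adapter_name_or_path: saves/Yi-VL-6B/lora/sft  # output_dir of the LoRA run above
template: llava                                # keep consistent with the training template
finetuning_type: lora
trust_remote_code: true

### export
export_dir: saves/Yi-VL-6B/lora/merged         # illustrative destination
export_size: 2
export_device: cpu
export_legacy_format: false
```

Run the merge with `llamafactory-cli export <path-to-this-yaml>` from the LLaMA-Factory directory.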