diff --git a/multimodal/vision-language_model/MoE-LLaVA/README_phi2.md b/multimodal/vision-language_model/MoE-LLaVA/README_phi2.md
new file mode 100644
index 0000000000000000000000000000000000000000..00cac7b01b7a6ac1284e58f56bd99cfbd54bc07a
--- /dev/null
+++ b/multimodal/vision-language_model/MoE-LLaVA/README_phi2.md
@@ -0,0 +1,78 @@
+# MoE-LLaVA-phi-2.7b
+## Model description
+
+MoE-LLaVA: Mixture of Experts for Large Vision-Language Models. The language model used here is phi-2.7b.
+
+
+## Prepare
+
+### Install requirements
+
+```bash
+
+cd MoE-LLaVA
+pip install --upgrade pip  # enable PEP 660 support
+pip3 install -e .
+pip3 install --upgrade pydantic
+
+```
+### Load data and weights
+Link the dataset and weights into the current MoE-LLaVA directory.
+[Dataset download](http://files.deepspark.org.cn:880/deepspark/)
+The expected layout is as follows:
+```bash
+MoE-LLaVA/
+├── gitattributes
+├── llava_image
+├── llava_image.zip
+├── mimicit_tune
+├── README.md
+└── train_json
+```
+[Weights: clip-vit-large-patch14-336](http://files.deepspark.org.cn:880/deepspark/openai/)
+The expected layout is as follows:
+```bash
+openai/
+└── clip-vit-large-patch14-336
+    ├── config.json
+    ├── configuration.json
+    ├── merges.txt
+    ├── preprocessor_config.json
+    ├── pytorch_model.bin
+    ├── README.md
+    ├── special_tokens_map.json
+    ├── tf_model.h5
+    ├── tokenizer_config.json
+    ├── tokenizer.json
+    └── vocab.json
+```
+[Weights: phi-2.7b](http://files.deepspark.org.cn:880/deepspark/phi-2)
+The expected layout is as follows:
+```bash
+phi-2/
+├── added_tokens.json
+├── CODE_OF_CONDUCT.md
+├── config.json
+├── generation_config.json
+├── gitattributes
+├── LICENSE
+├── merges.txt
+├── model-00001-of-00002.safetensors
+├── model-00002-of-00002.safetensors
+├── model.safetensors.index.json
+├── NOTICE.md
+├── README.md
+├── SECURITY.md
+├── special_tokens_map.json
+├── tokenizer_config.json
+├── tokenizer.json
+└── vocab.json
+
+```
+
+
+## Train
+```bash
+cd scripts/v1/phi2
+bash pretrain.sh
+```
diff --git a/multimodal/vision-language_model/MoE-LLaVA/README_qwen.md b/multimodal/vision-language_model/MoE-LLaVA/README_qwen.md
new file mode 100644
index 0000000000000000000000000000000000000000..c9725b959dca6a1d8312b465299f828c5d1b44aa
--- /dev/null
+++ b/multimodal/vision-language_model/MoE-LLaVA/README_qwen.md
@@ -0,0 +1,87 @@
+
+# MoE-LLaVA-Qwen-1_8B
+## Model description
+
+MoE-LLaVA: Mixture of Experts for Large Vision-Language Models. The language model used here is Qwen-1_8B.
+
+
+## Prepare
+
+### Install requirements
+
+```bash
+
+cd MoE-LLaVA
+pip install --upgrade pip  # enable PEP 660 support
+pip3 install -e .
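+# Optional sanity check (hedged suggestion, not part of the original steps' output):
+# the editable install above provides the `moellava` package, so a clean import
+# confirms the environment picked it up.
+python3 -c "import moellava"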
+pip3 install --upgrade pydantic
+
+```
+### Load data and weights
+Link the dataset and weights into the current MoE-LLaVA directory.
+[Dataset download](http://files.deepspark.org.cn:880/deepspark/)
+The expected layout is as follows:
+```bash
+MoE-LLaVA/
+├── gitattributes
+├── llava_image
+├── llava_image.zip
+├── mimicit_tune
+├── README.md
+└── train_json
+```
+[Weights: clip-vit-large-patch14-336](http://files.deepspark.org.cn:880/deepspark/openai/)
+The expected layout is as follows:
+```bash
+openai/
+└── clip-vit-large-patch14-336
+    ├── config.json
+    ├── configuration.json
+    ├── merges.txt
+    ├── preprocessor_config.json
+    ├── pytorch_model.bin
+    ├── README.md
+    ├── special_tokens_map.json
+    ├── tf_model.h5
+    ├── tokenizer_config.json
+    ├── tokenizer.json
+    └── vocab.json
+```
+[Weights: Qwen-1_8B](http://files.deepspark.org.cn:880/deepspark/Qwen-1_8B)
+The expected layout is as follows:
+```bash
+Qwen-1_8B/
+├── assets
+│   ├── logo.jpg
+│   ├── qwen_tokenizer.png
+│   ├── tokenizer.png
+│   └── wechat.png
+├── cache_autogptq_cuda_256.cpp
+├── cache_autogptq_cuda_kernel_256.cu
+├── config.json
+├── configuration_qwen.py
+├── cpp_kernels.py
+├── generation_config.json
+├── gitattributes
+├── LICENSE
+├── model-00001-of-00002.safetensors
+├── model-00002-of-00002.safetensors
+├── modeling_qwen.py
+├── model.safetensors.index.json
+├── NOTICE
+├── qwen_generation_utils.py
+├── qwen.tiktoken
+├── README.md
+├── tokenization_qwen.py
+└── tokenizer_config.json
+```
+
+
+## Train
+```bash
+cd scripts/v1/qwen
+bash pretrain.sh
+```
+
+
+
diff --git a/multimodal/vision-language_model/MoE-LLaVA/README_stablelm.md b/multimodal/vision-language_model/MoE-LLaVA/README_stablelm.md
new file mode 100644
index 0000000000000000000000000000000000000000..7f12257286ac4bf7f1fc10c8ef8854363fa52f95
--- /dev/null
+++ b/multimodal/vision-language_model/MoE-LLaVA/README_stablelm.md
@@ -0,0 +1,72 @@
+# MoE-LLaVA-stablelm-2-1_6b
+## Model description
+
+MoE-LLaVA: Mixture of Experts for Large Vision-Language Models. The language model used here is stablelm-2-1_6b.
+
+## Prepare
+
+### Install requirements
+
+```bash
+
+cd MoE-LLaVA
+pip install --upgrade pip  # enable PEP 660 support
+pip3 install -e .
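+# Optional check (suggested addition): print the installed transformers version;
+# the stablelm (and phi) model classes in moellava/__init__.py are only imported
+# when transformers is >= 4.36.
+python3 -c "import transformers; print(transformers.__version__)"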
+pip3 install --upgrade pydantic
+
+```
+### Load data and weights
+Link the dataset and weights into the current MoE-LLaVA directory.
+[Dataset download](http://files.deepspark.org.cn:880/deepspark/)
+The expected layout is as follows:
+```bash
+MoE-LLaVA/
+├── gitattributes
+├── llava_image
+├── llava_image.zip
+├── mimicit_tune
+├── README.md
+└── train_json
+```
+[Weights: clip-vit-large-patch14-336](http://files.deepspark.org.cn:880/deepspark/openai/)
+The expected layout is as follows:
+```bash
+openai/
+└── clip-vit-large-patch14-336
+    ├── config.json
+    ├── configuration.json
+    ├── merges.txt
+    ├── preprocessor_config.json
+    ├── pytorch_model.bin
+    ├── README.md
+    ├── special_tokens_map.json
+    ├── tf_model.h5
+    ├── tokenizer_config.json
+    ├── tokenizer.json
+    └── vocab.json
+```
+[Weights: stablelm-2-1_6b](http://files.deepspark.org.cn:880/deepspark/stablelm-2-1_6b)
+The expected layout is as follows:
+```bash
+stablelm-2-1_6b/
+├── config.json
+├── configuration_stablelm.py
+├── generation_config.json
+├── gitattributes
+├── LICENSE.md
+├── merges.txt
+├── modeling_stablelm.py
+├── model.safetensors
+├── README.md
+├── special_tokens_map.json
+├── tokenizer_config.json
+├── tokenizer.json
+└── vocab.json
+```
+
+
+## Train
+```bash
+cd scripts/v1/stablelm-2-1_6b
+bash pretrain.sh
+```
diff --git a/multimodal/vision-language_model/MoE-LLaVA/assets/image.jpg b/multimodal/vision-language_model/MoE-LLaVA/assets/image.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..112c0dfd309292189679525cf13b3ba2648d3dab
Binary files /dev/null and b/multimodal/vision-language_model/MoE-LLaVA/assets/image.jpg differ
diff --git a/multimodal/vision-language_model/MoE-LLaVA/assets/intro.jpg b/multimodal/vision-language_model/MoE-LLaVA/assets/intro.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..49ccc4e0864a0d804278e41913a26d8beaff476b
Binary files /dev/null and b/multimodal/vision-language_model/MoE-LLaVA/assets/intro.jpg differ
diff --git a/multimodal/vision-language_model/MoE-LLaVA/assets/intro0.jpg b/multimodal/vision-language_model/MoE-LLaVA/assets/intro0.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..beb1964a16270700dd963c4d158943906ff307da
Binary files /dev/null and b/multimodal/vision-language_model/MoE-LLaVA/assets/intro0.jpg differ
diff --git a/multimodal/vision-language_model/MoE-LLaVA/assets/logo.png b/multimodal/vision-language_model/MoE-LLaVA/assets/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57a23ca9f2309bcebcfbec148a24be4b541ee07
Binary files /dev/null and b/multimodal/vision-language_model/MoE-LLaVA/assets/logo.png differ
diff --git a/multimodal/vision-language_model/MoE-LLaVA/assets/modelscope_logo.png b/multimodal/vision-language_model/MoE-LLaVA/assets/modelscope_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..0286d28f488f6425c9671d2b0ec81db56aede2f8
Binary files /dev/null and b/multimodal/vision-language_model/MoE-LLaVA/assets/modelscope_logo.png differ
diff --git a/multimodal/vision-language_model/MoE-LLaVA/cog.yaml b/multimodal/vision-language_model/MoE-LLaVA/cog.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8b7e0a1b8c249c94720e6e572609fa99bf51794a
--- /dev/null
+++ b/multimodal/vision-language_model/MoE-LLaVA/cog.yaml
@@ -0,0 +1,37 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+
+  python_version: "3.11"
+
+  python_packages:
+    - "torch==2.0.1"
+    - "accelerate==0.21.0"
+    - "bitsandbytes==0.41.0"
+    - "deepspeed==0.9.5"
+    - "einops-exts==0.0.4"
+    - "einops==0.6.1"
+    - "gradio==3.35.2"
+    - 
"gradio_client==0.2.9" + - "httpx==0.24.0" + - "markdown2==2.4.10" + - "numpy==1.26.0" + - "peft==0.4.0" + - "scikit-learn==1.2.2" + - "sentencepiece==0.1.99" + - "shortuuid==1.0.11" + - "timm==0.6.13" + - "tokenizers==0.15.1" + - "torch==2.0.1" + - "torchvision==0.15.2" + - "transformers==4.37.0" + - "wandb==0.15.12" + - "wavedrom==2.0.3.post3" + - "Pygments==2.16.1" + run: + - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget + +# predict.py defines how predictions are run on your model +predict: "predict.py:Predictor" diff --git a/multimodal/vision-language_model/MoE-LLaVA/docs/CUSTOM.md b/multimodal/vision-language_model/MoE-LLaVA/docs/CUSTOM.md new file mode 100644 index 0000000000000000000000000000000000000000..fbc8a62f53d2fbd91cba34eb6788dd534da53593 --- /dev/null +++ b/multimodal/vision-language_model/MoE-LLaVA/docs/CUSTOM.md @@ -0,0 +1,214 @@ + + +- The most **IMPORTANT** thing, make sure you understand the behavior of the tokenizer. +- We provide some samples on how different tokenizer behaviors should be changed. +- At the end it describes how to convert LLaVA style models to the MoE architecture. + +## Don't have special tokens, but can add special tokens + +For those tokenizers that don't have special tokens, but can add special tokens, such as QWenTokenizer or PhiTokenizer. You need to add special tokens. + +### QWenTokenizer + +#### Tokenizer + +Insert the following code after initializing the tokenizer [here](): +```python +tokenizer.add_special_tokens({ + 'unk_token': '<|extra_0|>', + 'eos_token': '<|endoftext|>' +}) +``` + +#### `preprocess_qwen` function + +Copy the `preprocess_qwen` function from the `preprocess_v1` function and modify the following: +``` +round_len = len(tokenizer_image_token(rou, tokenizer)) + 1 # for eos_token +instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 # instruction_len is before the answer +``` + +Defining the use of `preprocess_qwen` in the `preprocess` function [here](). +``` +if conversation_lib.default_conversation.version.startswith("qwen"): # for qwen + return preprocess_qwen(sources, tokenizer, has_image=has_image) +``` + +#### `conv_qwen` conversation template + +Add a new conversation template such as `conv_qwen` [here](), replacing `sep2` with `eos_token`, and modify the value of `version`. + +```python +conv_qwen = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="qwen", # replace + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="<|endoftext|>", # replace with eos_token +) +``` + +Don't forget to register the newly defined conversation template [here](). + +```python +conv_templates = { + ... + "qwen": conv_qwen, # the key is "qwen" + ... +} +``` + +Remember the key for the registered dialogue conversation, such as `qwen`. And modify the `--version qwen` in the commands for Stage 2 and Stage 3. **DO NOT need to modify the `--version plain` in Stage 1.** + +### PhiTokenizer + +#### Tokenizer + +Insert the following code after initializing the tokenizer [here](): +```python +tokenizer.add_special_tokens({ + 'unk_token': '<|extra_0|>', +# 'eos_token': '<|endoftext|>' Not needed because it already exists. 
+}) +``` + +#### `preprocess_phi` function + +Copy the `preprocess_phi` function from the `preprocess_v1` function and modify the following: +``` +round_len = len(tokenizer_image_token(rou, tokenizer)) + 1 # for eos_token +instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 # instruction_len is before the answer +``` + +Defining the use of `preprocess_phi` in the `preprocess` function [here](). +``` +if conversation_lib.default_conversation.version.startswith("phi"): # for phi + return preprocess_phi(sources, tokenizer, has_image=has_image) +``` + +#### `conv_phi` conversation template + +Add a new conversation template such as `conv_phi` [here](), replacing `sep2` with `eos_token`, and modify the value of `version`. + +```python +conv_phi = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="phi", # replace + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="<|endoftext|>", # replace with eos_token +) +``` + +Don't forget to register the newly defined conversation template [here](). + +```python +conv_templates = { + ... + "phi": conv_phi, # the key is "phi" + ... +} +``` + +Remember the key for the registered dialogue conversation, such as `phi`. And modify the `--version phi` in the commands for Stage 2 and Stage 3. **DO NOT need to modify the `--version plain` in Stage 1.** + + +## CAN NOT add special tokens + +### StableLMTokenizer + +#### Tokenizer + +For those tokenizers that can **not** add special tokens, such as `StableLMTokenizer`. + +First find all the special tokens of the tokenizer. + +``` +tokenizer.special_tokens +>>> {'<|endoftext|>': 100257, '<|fim_prefix|>': 100258, '<|fim_middle|>': 100259, '<|fim_suffix|>': 100260, '<|fim_pad|>': 100261, '': 100262, '': 100263, '': 100264, '': 100265, '': 100266, '': 100267, '': 100268, '': 100269, '': 100270, '': 100271, '': 100272, '': 100273, '': 100274, '': 100275, '<|endofprompt|>': 100276, '<|im_start|>': 100277, '<|im_end|>': 100278, '<|pause|>': 100279, '<|reg0|>': 100280, '<|reg1|>': 100281, '<|reg2|>': 100282, '<|reg3|>': 100283, '<|reg4|>': 100284, '<|reg5|>': 100285, '<|reg6|>': 100286, '<|reg7|>': 100287, '<|extra0|>': 100288} +``` + +Choosing a less important token, e.g., `<|reg0|>`. You need to make sure the tokenizer has `unk_token` [here](). + +``` +tokenizer.unk_token = '<|reg0|>' +``` + +#### `preprocess_stablelm` function + +Copy the `preprocess_stablelm` function from the `preprocess_v1` function and modify the following: +``` +total_len = int(target.ne(tokenizer.pad_token_id).sum()) + conversation.count(conv.sep2) # pad_token_id == eos_token_id +... +round_len = len(tokenizer_image_token(rou, tokenizer)) + 1 # for eos_token +instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 # instruction_len is before the answer +``` + +Defining the use of `preprocess_stablelm` in the `preprocess` function [here](). +``` +if conversation_lib.default_conversation.version.startswith("stablelm"): # for stablelm + return preprocess_stablelm(sources, tokenizer, has_image=has_image) +``` + +#### `conv_stablelm` conversation template + +Add a new conversation template such as `conv_stablelm` [here](), replacing `sep2` with `eos_token`, and modify the value of `version`. + +```python +conv_stablelm = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. 
" + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="stablelm", # replace + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="<|endoftext|>", # replace with eos_token +) +``` + +Don't forget to register the newly defined conversation template [here](). + +```python +conv_templates = { + ... + "stablelm": conv_stablelm, # the key is "stablelm" + ... +} +``` + +Remember the key for the registered dialogue conversation, such as `stablelm`. And modify the `--version stablelm` in the commands for Stage 2 and Stage 3. **DO NOT need to modify the `--version plain` in Stage 1.** + +## The behavior of the tokenizer is consistent with `LlamaTokenizer` + +### LlamaTokenizer + +If the behavior of your tokenizer is consistent with `LlamaTokenizer`. You can just use the already defined conversation template. Beware of the differences brought about by different transformers versions, **we strongly recommend using `LlamaTokenizer` on version 4.31.0**. + +For example, for the `LlamaTokenizer`, `bos_token` is ``, `eos_token` is ``, and `unk_token` is ``. +When the tokenizer encodes one sentence, the resulting output should include the `bos_token_id`. In following example, the `bos_token_id` is 1. + + +```python +tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5", cache_dir='cache_dir') +tokenizer(['This is first sentence', 'Test'], return_tensors='pt', padding=True) +# Output: {'input_ids': tensor([[ 1, 910, 338, 937, 10541], +# [ 1, 4321, 0, 0, 0]]), +# 'attention_mask': tensor([[1, 1, 1, 1, 1], +# [1, 1, 0, 0, 0]])} +``` +Passing the `--version v1` in the commands for Stage 2 and Stage 3. **DO NOT need to modify the `--version plain` in Stage 1.** + +## Converting models to MoE architectures + +Refer to [llava_stablelm_moe.py](moellava/model/language_model/llava_stablelm_moe.py), [llava_qwen_moe.py](moellava/model/language_model/llava_llama_moe.py), [llava_phi_moe.py](moellava/model/language_model/llava_phi_moe.py), [llava_mistral_moe.py](moellava/model/language_model/llava_mistral_moe.py) and [llava_llama_moe.py](moellava/model/language_model/llava_llama_moe.py) + diff --git a/multimodal/vision-language_model/MoE-LLaVA/docs/EVAL.md b/multimodal/vision-language_model/MoE-LLaVA/docs/EVAL.md new file mode 100644 index 0000000000000000000000000000000000000000..a02ad02548222ebdcd61cf566a939167a8f7dea8 --- /dev/null +++ b/multimodal/vision-language_model/MoE-LLaVA/docs/EVAL.md @@ -0,0 +1,271 @@ +## Data preparation + +- Following LLaVA's instructions. **You MUST first download [eval.zip](https://drive.google.com/file/d/1atZSBBrAX54yYpxtVVW33zFvcnaHeFPy/view?usp=sharing)**. +- It contains custom annotations, scripts, and the prediction files with LLaVA v1.5. Extract to `eval`. This also provides a general structure for all datasets. + +After downloading all of them, organize the data as follows in `eval`. 
+ +```Shell +eval +├── gqa +│   ├── answers +│   ├── data +│   └── llava_gqa_testdev_balanced.jsonl +├── llava-bench-in-the-wild +│   ├── answers +│   ├── answers_gpt4.jsonl +│   ├── bard_0718.jsonl +│   ├── bing_chat_0629.jsonl +│   ├── context.jsonl +│   ├── images +│   ├── questions.jsonl +│   ├── README.md +│   └── reviews +├── mmbench +│   ├── answers +│   ├── answers_upload +│   ├── mmbench_dev_20230712.tsv +│   └── mmbench_dev_en_20231003.tsv +├── MME +│   ├── answers +│   ├── convert_answer_to_mme.py +│   └── llava_mme.jsonl +├── mm-vet +│   ├── answers +│   ├── bard_set.json +│   ├── convert_answers.py +│   ├── images +│   ├── llava-mm-vet.jsonl +│   ├── mm-vet.json +│   └── results +├── pope +│   ├── answers +│   ├── coco +│   ├── llava_pope_test.jsonl +│   └── val2014 +├── scienceqa +│   ├── answers +│   ├── images +│   ├── llava_test_CQM-A.json +│   ├── pid_splits.json +│   └── problems.json +├── seed_bench +│   ├── answers +│   ├── answers_upload +│   ├── extract_video_frames.py +│   └── llava-seed-bench.jsonl +├── textvqa +│   ├── answers +│   ├── llava_textvqa_val_v051_ocr.jsonl +│   ├── TextVQA_0.5.1_val.json +│   └── train_images +├── vizwiz +│   ├── answers +│   ├── answers_upload +│   ├── llava_test.jsonl +│   ├── test +│   ├── test.json +│   ├── train.json +│   └── val.json +└── vqav2 + ├── answers + ├── answers_upload + ├── llava_vqav2_mscoco_test2015.jsonl + ├── llava_vqav2_mscoco_test-dev2015.jsonl + └── test2015 +``` + + +## Validating +Our image validation code comes from LLaVA, thanks for their contribution! + +You can refer to the official repository for validation, but we also provide [off-the-shelf](scripts/v1/eval) scripts. + + +### VQAv2 + +1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and put it under `eval/vqav2`. +2. Multi-GPU inference. + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1/eval/llava/vqav2.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/vqav2.sh +``` + +3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `eval/vqav2/answers_upload`. + +### GQA + +1. Download the data following the official instructions [here](https://cs.stanford.edu/people/dorarad/gqa/download.html) and put under `eval/gqa/data`. +2. Multi-GPU inference + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1/eval/llava/gqa.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/gqa.sh +``` + +### VisWiz + +1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `eval/vizwiz`. +2. Single-GPU inference. + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1/eval/moe_llava/vizwiz.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/vizwiz.sh +``` + +3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/1911/my-submission): `eval/vizwiz/answers_upload`. + +### ScienceQA + +1. Under `eval/scienceqa`, download `images`, `pid_splits.json`, `problems.json` from the `data/scienceqa` folder of the ScienceQA [repo](https://github.com/lupantech/ScienceQA). +2. Single-GPU inference and evaluate. 
+ +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1/eval/moe_llava/sqa.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/sqa.sh +``` + + +### TextVQA + +1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and [images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and extract to `eval/textvqa`. +2. Single-GPU inference and evaluate. + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1/eval/moe_llava/textvqa.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/textvqa.sh +``` + + +### POPE + +1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put under `eval/pope`. +2. Single-GPU inference and evaluate. + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1/eval/moe_llava/pope.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/pope.sh +``` + +### MME +1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation). +2. Downloaded images to `MME_Benchmark_release_version`. +3. Put the official `eval_tool` and `MME_Benchmark_release_version` under `eval/MME`. +4. Single-GPU inference and evaluate. + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1/eval/llava/mme.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/mme.sh +``` + +### MMBench + +1. Download [`mmbench_dev_20230712.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put under `eval/mmbench`. +2. Single-GPU inference. + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1/eval/llava/mmbench.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/mmbench.sh +``` + +3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `eval/mmbench/answers_upload/mmbench_dev_20230712`. + + +### MMBench-CN + +1. Download [`mmbench_dev_cn_20231003.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_cn_20231003.tsv) and put under `eval/mmbench`. +2. Single-GPU inference. + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1/eval/llava/mmbench_cn.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/mmbench_cn.sh +``` + +3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `eval/mmbench/answers_upload/mmbench_dev_cn_20231003`. + + +### SEED-Bench + +1. Following the official [instructions](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md) to download the images and the videos. Put images under `eval/seed_bench/SEED-Bench-image`. +2. Extract the video frame in the middle from the downloaded videos, and put them under `eval/seed_bench/SEED-Bench-video-image`. +3. Multiple-GPU inference and evaluate. + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1/eval/llava/seed.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/seed.sh +``` + +4. Optionally, submit the results to the leaderboard: `eval/seed_bench/answers_upload` using the official jupyter notebook. + + + +### LLaVA-Bench-in-the-Wild + +1. Extract contents of [`llava-bench-in-the-wild`](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) to `eval/llava-bench-in-the-wild`. +2. 
Single-GPU inference and evaluate. + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1/eval/moe_llava/llavabench.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/llavabench.sh +``` + + +### MM-Vet + +1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `eval/mmvet`. +2. Single-GPU inference. + +**LLaVA-based** model +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1/eval/moe_llava/mmvet.sh +``` +**MoE-based** model +```Shell +bash scripts/v1/eval/moe_llava/mmvet.sh +``` + diff --git a/multimodal/vision-language_model/MoE-LLaVA/docs/LORA.md b/multimodal/vision-language_model/MoE-LLaVA/docs/LORA.md new file mode 100644 index 0000000000000000000000000000000000000000..80c947713def36cfb410a69f1b669945828ea1f8 --- /dev/null +++ b/multimodal/vision-language_model/MoE-LLaVA/docs/LORA.md @@ -0,0 +1,24 @@ +## Training for LoRA tuning models +Coming soon... + +## Evaluation for LoRA tuning models + +You can evaluate the model directly after LoRA tuning as [EVAL.md](../docs/EVAL.md). + +Or you can evaluate it after merging weights as follows. + +### Optional + +You can use `script/merge_moe_lora_weights.py` to merge the LoRA weights. + +```Shell +deepspeed --include localhost:0 script/merge_lora_weights.py \ + --model-path checkpoints/moellava-phi-moe-lora \ + --save-model-path checkpoints/moellava-phi-moe-merge +``` + +> [!Warning] +> 🚨 Please do not have `lora` in `--save-model-path` and `lora` should in `--model-path`. + + +Then evaluate `checkpoints/llavaphi-moe-merge` as [EVAL.md](../docs/EVAL.md) diff --git a/multimodal/vision-language_model/MoE-LLaVA/docs/TRAIN.md b/multimodal/vision-language_model/MoE-LLaVA/docs/TRAIN.md new file mode 100644 index 0000000000000000000000000000000000000000..a1af3f1bd771883af5b4cb77044378c99b4289d5 --- /dev/null +++ b/multimodal/vision-language_model/MoE-LLaVA/docs/TRAIN.md @@ -0,0 +1,69 @@ +## Data preparation + +- The LLaVA-PT is from [LLaVA](https://github.com/haotian-liu/LLaVA). +- The Hybird-FT is from [SViT](https://github.com/BAAI-DCAI/Visual-Instruction-Tuning), [LVIS](https://github.com/X2FD/LVIS-INSTRUCT4V), [LRV](https://github.com/FuxiaoLiu/LRV-Instruction), [MIMIC-IT](https://github.com/Luodian/Otter). +- The LLaVA-FT is from [LLaVA](https://github.com/haotian-liu/LLaVA). +- Download the training annotations. You can download from [Baidu Disk](https://pan.baidu.com/s/1rwub9o0T3_7ZHbPZzCiLZw?pwd=0yhi), [Google Disk](https://drive.google.com/file/d/13YxtVowfhUIpGOCODhKFstoRBvogF4od/view?usp=sharing), [Peking University Disk](https://disk.pku.edu.cn/link/AA10683317FB824FB9B2427A6B268EAADB) or [Hugging Face](https://huggingface.co/datasets/LanguageBind/MoE-LLaVA/tree/main/train_json) + + +We also provide the processed data as follows. The link is to BaiDu Disk. +
+| Data group | Usage   | Link                                          |
+|------------|---------|-----------------------------------------------|
+| LLaVA-PT   | Stage 1 | LLaVA 1.5-558k                                |
+| Hybird-FT  | Stage 2 | SViT-157k, LVIS-220k, LRV-331k, MIMIC-IT-256k |
+| LLaVA-FT   | Stage 3 | LLaVA 1.5-mix-665k                            |
+ +**For those who can not easily access to BaiDu Disk**, you can download data from [Hugging Face](https://huggingface.co/datasets/LanguageBind/MoE-LLaVA). + +After downloading all of them, organize the data as follows in ```IMAGE_FOLDER```. + +```Shell +IMAGE_FOLDER +├── llava_image +├── llava_image_tune +├── lvis_tune +├── lrv_tune +├── svit_tune +└── mimicit_tune + └── LA +``` + + +## Training +Specify your `IMAGE_FOLDER` and `JSON_FOLDER` according to the data preparation. + +For training on 384 resolution, we use `google/siglip-so400m-patch14-384` as `image_tower`. Notably, if you pass the `--image_tower google/siglip-so400m-patch14-384`, you should upgrade the version of transformers to 4.37.0. + +### Qwen +- Stage 1 pretraining script: [pretrain.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/qwen/pretrain.sh). +- Stage 2 tuning script: [finetune.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/qwen/finetune.sh). +- Stage 3 moe-tuning script: [finetune_moe.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/qwen/finetune_moe.sh). + +### Phi2 +- Stage 1 pretraining script: [pretrain.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/phi2/pretrain.sh). +- Stage 2 tuning script: [finetune.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/phi2/finetune.sh). +- Stage 3 moe-tuning script: [finetune_moe.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/phi2/finetune_moe.sh). + +### StableLM +- Stage 1 pretraining script: [pretrain.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/stablelm/pretrain.sh). +- Stage 2 tuning script: [finetune.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/stablelm/finetune.sh). +- Stage 3 moe-tuning script: [finetune_moe.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/stablelm/finetune_moe.sh). + +### OpenChat + + + +- Stage 1 pretraining script: [pretrain.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/openchat/pretrain.sh). +- Stage 2 tuning script: [finetune.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/openchat/finetune.sh). +- Stage 3 moe-tuning script: [finetune_moe.sh](https://github.com/PKU-YuanGroup/MoE-LLaVA/tree/main/scripts/v1/openchat/finetune_moe.sh). diff --git a/multimodal/vision-language_model/MoE-LLaVA/docs/VISUALIZATION.md b/multimodal/vision-language_model/MoE-LLaVA/docs/VISUALIZATION.md new file mode 100644 index 0000000000000000000000000000000000000000..8a70bff50f1fb9dd5e71541dbcec08f06b4028f7 --- /dev/null +++ b/multimodal/vision-language_model/MoE-LLaVA/docs/VISUALIZATION.md @@ -0,0 +1,52 @@ +## Visualization + +Please note that this tutorial is **for MoE models only**. + +### Getting expert logits + +For visualization, the first step is to get the logits of the experts. GQA and VQAv2 are not currently supported as they generally require multi-GPUs to run. Please change to single GPU if needed. + +In [EVAL.md](https://github.com/PKU-YuanGroup/MoE-LLaVA/blob/main/docs/EVAL.md) we describe how to perform validation. Then, for example, we just need to add `--return_gating_logit "phi_sciqa"` to get the expert logits on ScienceQA benchmark. 
+ +```Bash +cd ~/MoE-LLaVA +CKPT_NAME="MoE-LLaVA-Phi2-2.7B-4e" +CKPT="checkpoints/${CKPT_NAME}" +EVAL="eval" +HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 deepspeed --include localhost:0 moellava/eval/model_vqa_science.py \ + --model-path ${CKPT} \ + --question-file ${EVAL}/scienceqa/llava_test_CQM-A.json \ + --image-folder ${EVAL}/scienceqa/images/test \ + --answers-file ${EVAL}/scienceqa/answers/${CKPT_NAME}.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --conv-mode phi \ + --return_gating_logit "phi_sciqa" # add this command +``` + +Then, you will get ``phi_sciqa.pt``. Now you can try the other benchmarks through `--return_gating_logit`. + +### Distribution of expert loadings + +``` +python moellava/vis/vis1.py --input phi_sciqa.pt +``` + +![image](https://github.com/PKU-YuanGroup/MoE-LLaVA/assets/62638829/0a908801-b24a-4e0d-9537-1383c20ea36e) + +### Distribution of modalities across different experts + +``` +python moellava/vis/vis2.py --input phi_sciqa.pt +``` + +![image](https://github.com/PKU-YuanGroup/MoE-LLaVA/assets/62638829/f1e686ef-ecd5-4b21-a096-fa93c3ef4ae2) + +### Activated pathways + +``` +pip install mplsoccer +python moellava/vis/vis3.py --input phi_sciqa.pt +``` + +![image](https://github.com/PKU-YuanGroup/MoE-LLaVA/assets/62638829/7f952f7d-2f2d-47d3-80d5-ca733e422aaa) diff --git a/multimodal/vision-language_model/MoE-LLaVA/moellava/__init__.py b/multimodal/vision-language_model/MoE-LLaVA/moellava/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b0c24586b9a3d809417b113159b48036018d37cf --- /dev/null +++ b/multimodal/vision-language_model/MoE-LLaVA/moellava/__init__.py @@ -0,0 +1,19 @@ +from .model import LlavaLlamaForCausalLM +from .model import MoELLaVALlamaForCausalLM +from .model import LlavaQWenForCausalLM +from .model import MoELLaVALlamaForCausalLM +import transformers +a, b, c = transformers.__version__.split('.')[:3] +if a == '4' and int(b) >= 34: + from .model import LlavaMistralForCausalLM + from .model import MoELLaVAMistralForCausalLM +if a == '4' and int(b) >= 36: + from .model import LlavaMiniCPMForCausalLM + from .model import MoELLaVAMiniCPMForCausalLM + from .model import LlavaPhiForCausalLM + from .model import MoELLaVAPhiForCausalLM + from .model import LlavaStablelmForCausalLM + from .model import MoELLaVAStablelmForCausalLM +if a == '4' and int(b) >= 37: + from .model import LlavaQwen1_5ForCausalLM + from .model import MoELLaVAQwen1_5ForCausalLM diff --git a/multimodal/vision-language_model/MoE-LLaVA/moellava/constants.py b/multimodal/vision-language_model/MoE-LLaVA/moellava/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..eda94423ca53c11637c8bacf9b7a5a3917c06c38 --- /dev/null +++ b/multimodal/vision-language_model/MoE-LLaVA/moellava/constants.py @@ -0,0 +1,27 @@ +CONTROLLER_HEART_BEAT_EXPIRATION = 30 +WORKER_HEART_BEAT_INTERVAL = 15 + +LOGDIR = "." + +# Model Constants +IGNORE_INDEX = -100 + +IMAGE_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = "" +DEFAULT_IMAGE_PATCH_TOKEN = "" +DEFAULT_IM_START_TOKEN = "" +DEFAULT_IM_END_TOKEN = "" +IMAGE_PLACEHOLDER = "" + +# ====================================================================================================== +DEFAULT_VIDEO_TOKEN = "