diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/.dockerignore b/nlp/llm/qwen2.5-7b/LLaMA-Factory/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..23ad75a8af68abd564097d43ae7571e657cce01a --- /dev/null +++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/.dockerignore @@ -0,0 +1,13 @@ +.vscode +.git +.github +.venv +cache +data +docker +saves +hf_cache +output +.dockerignore +.gitattributes +.gitignore diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/.env.local b/nlp/llm/qwen2.5-7b/LLaMA-Factory/.env.local new file mode 100644 index 0000000000000000000000000000000000000000..363317e121ac3c417bab7b18f7480df03d8de6f0 --- /dev/null +++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/.env.local @@ -0,0 +1,34 @@ +# Note: actually we do not support .env, just for reference +# api +API_HOST=0.0.0.0 +API_PORT=8000 +API_KEY= +API_MODEL_NAME=gpt-3.5-turbo +FASTAPI_ROOT_PATH= +# general +DISABLE_VERSION_CHECK= +FORCE_CHECK_IMPORTS= +LLAMAFACTORY_VERBOSITY= +USE_MODELSCOPE_HUB= +RECORD_VRAM= +# torchrun +FORCE_TORCHRUN= +MASTER_ADDR= +MASTER_PORT= +NNODES= +RANK= +NPROC_PER_NODE= +# wandb +WANDB_DISABLED= +WANDB_PROJECT=huggingface +WANDB_API_KEY= +# gradio ui +GRADIO_SHARE=False +GRADIO_SERVER_NAME=0.0.0.0 +GRADIO_SERVER_PORT= +GRADIO_ROOT_PATH= +# setup +ENABLE_SHORT_CONSOLE=1 +# reserved (do not use) +LLAMABOARD_ENABLED= +LLAMABOARD_WORKDIR= diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/.gitattributes b/nlp/llm/qwen2.5-7b/LLaMA-Factory/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..dfe0770424b2a19faf507a501ebfc23be8f54e7b --- /dev/null +++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/.gitignore b/nlp/llm/qwen2.5-7b/LLaMA-Factory/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..8acdb36483a62d6621daa2e31b9b111e4c1827d9 --- /dev/null +++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/.gitignore @@ -0,0 +1,169 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +# custom .gitignore +ms_cache/ +hf_cache/ +cache/ +config/ +saves/ +output/ +wandb/ diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/CITATION.cff b/nlp/llm/qwen2.5-7b/LLaMA-Factory/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..01b4c9fd28aed295d50c71a7a3ed2e97a69434d4 --- /dev/null +++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/CITATION.cff @@ -0,0 +1,44 @@ +cff-version: 1.2.0 +date-released: 2024-03 +message: "If you use this software, please cite it as below." 
+authors: +- family-names: "Zheng" + given-names: "Yaowei" +- family-names: "Zhang" + given-names: "Richong" +- family-names: "Zhang" + given-names: "Junhao" +- family-names: "Ye" + given-names: "Yanhan" +- family-names: "Luo" + given-names: "Zheyan" +- family-names: "Feng" + given-names: "Zhangchi" +- family-names: "Ma" + given-names: "Yongqiang" +title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" +url: "https://arxiv.org/abs/2403.13372" +preferred-citation: + type: conference-paper + conference: + name: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)" + authors: + - family-names: "Zheng" + given-names: "Yaowei" + - family-names: "Zhang" + given-names: "Richong" + - family-names: "Zhang" + given-names: "Junhao" + - family-names: "Ye" + given-names: "Yanhan" + - family-names: "Luo" + given-names: "Zheyan" + - family-names: "Feng" + given-names: "Zhangchi" + - family-names: "Ma" + given-names: "Yongqiang" + title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" + url: "https://arxiv.org/abs/2403.13372" + year: 2024 + publisher: "Association for Computational Linguistics" + address: "Bangkok, Thailand" diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/LICENSE b/nlp/llm/qwen2.5-7b/LLaMA-Factory/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..b09cd7856d58590578ee1a4f3ad45d1310a97f87 --- /dev/null +++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/LICENSE @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/MANIFEST.in b/nlp/llm/qwen2.5-7b/LLaMA-Factory/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..82c51f6348a58f9da0c839a61b0063b9aba66d75 --- /dev/null +++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/MANIFEST.in @@ -0,0 +1 @@ +include LICENSE requirements.txt diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/Makefile b/nlp/llm/qwen2.5-7b/LLaMA-Factory/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..d1e56aaffebf1c06ee71f7ec7958753c23cce23f --- /dev/null +++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/Makefile @@ -0,0 +1,14 @@ +.PHONY: quality style test + +check_dirs := scripts src tests setup.py + +quality: + ruff check $(check_dirs) + ruff format --check $(check_dirs) + +style: + ruff check $(check_dirs) --fix + ruff format $(check_dirs) + +test: + CUDA_VISIBLE_DEVICES= pytest tests/ diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/README.md b/nlp/llm/qwen2.5-7b/LLaMA-Factory/README.md new file mode 100644 index 0000000000000000000000000000000000000000..295ac321dd55531284457d4b6245f1b37df396b7 --- /dev/null +++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/README.md @@ -0,0 +1,44 @@ +# Qwen2.5-7B + +## Model description + +Qwen2.5 is the latest series of Qwen large language models. Qwen2.5 brings the following improvements upon Qwen2: +- Significantly more knowledge and has greatly improved capabilities in coding and mathematics, thanks to our specialized expert models in these domains. +- Significant improvements in instruction following, generating long texts (over 8K tokens), understanding structured data (e.g, tables), and generating structured outputs especially JSON. More resilient to the diversity of system prompts, enhancing role-play implementation and condition-setting for chatbots. +- Long-context Support up to 128K tokens and can generate up to 8K tokens. +- Multilingual support for over 29 languages, including Chinese, English, French, Spanish, Portuguese, German, Italian, Russian, Japanese, Korean, Vietnamese, Thai, Arabic, and more. + +## Step 1: Installation + +```bash +pip3 install --no-deps -e . 
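+# Note: "--no-deps" tells pip to install only the LLaMA-Factory package itself,
+# so its Python dependencies (torch, transformers, datasets, peft, etc.) are
+# expected to be provided by the pre-built environment rather than pulled in here.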
+```
+
+## Step 2: Preparing datasets
+
+```bash
+mkdir -p checkpoint
+wget http://sw.iluvatar.ai/download/apps/pretrained/nlp/qwen2.5/qwen2.5-7b.zip
+unzip qwen2.5-7b.zip
+```
+
+## Step 3: Training
+
+### Full SFT
+
+```bash
+llamafactory-cli train examples/train_full/qwen2_5-7b_full_sft.yaml
+```
+
+### LoRA SFT
+
+```bash
+llamafactory-cli train examples/train_lora/qwen2_5-7b_lora_sft.yaml
+```
+
+## Results
+
+| GPUs | Model | Type | train_samples_per_second |
+| :-----: | :----------------------------: | :------------: | :------------: |
+| BI-V150 x 8 | Qwen2.5-7B | full | 1.889 |
+
+## Reference
+
+- [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)
diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/benchmark.svg b/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/benchmark.svg
new file mode 100644
index 0000000000000000000000000000000000000000..60f0aa4d39fb48df9b36dd4ebcb5b294e2f4ecce
--- /dev/null
+++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/benchmark.svg
@@ -0,0 +1,1216 @@
[1216 lines of SVG markup omitted: image/svg+xml benchmark figure generated by Matplotlib v3.7.1 (https://matplotlib.org/) on 2023-11-18]
diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/logo.png b/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..5fb3dd569342ca3cd30a582fd664145bd88b360c
Binary files /dev/null and b/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/logo.png differ
diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/wechat.jpg b/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/wechat.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f2d57406bf7215c66d62b5c656bf24cb81daa9d3
Binary files /dev/null and b/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/wechat.jpg differ
diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/wechat_npu.jpg b/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/wechat_npu.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7708e35a54c1aae5e26cb3ab15d473a57c111d68
Binary files /dev/null and b/nlp/llm/qwen2.5-7b/LLaMA-Factory/assets/wechat_npu.jpg differ
diff --git a/nlp/llm/qwen2.5-7b/LLaMA-Factory/data/README.md b/nlp/llm/qwen2.5-7b/LLaMA-Factory/data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1786804fa21b28dc85a3ae46bd2272717862c811
--- /dev/null
+++ b/nlp/llm/qwen2.5-7b/LLaMA-Factory/data/README.md
@@ -0,0 +1,419 @@
+The [dataset_info.json](dataset_info.json) file contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.
+
+Currently we support datasets in the **alpaca** and **sharegpt** formats.
+
+```json
+"dataset_name": {
+  "hf_hub_url": "the name of the dataset repository on the Hugging Face hub. (if specified, ignore script_url and file_name)",
+  "ms_hub_url": "the name of the dataset repository on the Model Scope hub. (if specified, ignore script_url and file_name)",
+  "script_url": "the name of the directory containing a dataset loading script. (if specified, ignore file_name)",
+  "file_name": "the name of the dataset folder or dataset file in this directory. (required if the above are not specified)",
+  "formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
+  "ranking": "whether the dataset is a preference dataset or not. (default: False)",
+  "subset": "the name of the subset. (optional, default: None)",
+  "split": "the name of the dataset split to be used. (optional, default: train)",
+  "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
+  "num_samples": "the number of samples in the dataset to be used. (optional, default: None)",
+  "columns (optional)": {
+    "prompt": "the column name in the dataset containing the prompts. (default: instruction)",
+    "query": "the column name in the dataset containing the queries. (default: input)",
+    "response": "the column name in the dataset containing the responses. (default: output)",
+    "history": "the column name in the dataset containing the histories. (default: None)",
+    "messages": "the column name in the dataset containing the messages. (default: conversations)",
+    "system": "the column name in the dataset containing the system prompts. (default: None)",
+    "tools": "the column name in the dataset containing the tool descriptions. (default: None)",
+    "images": "the column name in the dataset containing the image inputs. (default: None)",
+    "videos": "the column name in the dataset containing the video inputs. (default: None)",
+    "chosen": "the column name in the dataset containing the chosen answers. (default: None)",
+    "rejected": "the column name in the dataset containing the rejected answers. (default: None)",
+    "kto_tag": "the column name in the dataset containing the KTO tags. (default: None)"
+  },
+  "tags (optional, used for the sharegpt format)": {
+    "role_tag": "the key in the message that represents the identity. (default: from)",
+    "content_tag": "the key in the message that represents the content. (default: value)",
+    "user_tag": "the value of the role_tag that represents the user. (default: human)",
+    "assistant_tag": "the value of the role_tag that represents the assistant. (default: gpt)",
+    "observation_tag": "the value of the role_tag that represents the tool results. (default: observation)",
+    "function_tag": "the value of the role_tag that represents the function call. (default: function_call)",
+    "system_tag": "the value of the role_tag that represents the system prompt. (default: system, can override the system column)"
+  }
+}
+```
+
+## Alpaca Format
+
+### Supervised Fine-Tuning Dataset
+
+- [Example dataset](alpaca_en_demo.json)
+
+In supervised fine-tuning, the `instruction` column is concatenated with the `input` column and used as the human prompt, so the human prompt becomes `instruction\ninput`. The `output` column represents the model response.
+
+The `system` column will be used as the system prompt if specified.
+
+The `history` column is a list of string tuples representing prompt-response pairs from earlier rounds. Note that the responses in the history **will also be learned by the model** in supervised fine-tuning.
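+For intuition only, the sketch below (hypothetical code, not LLaMA-Factory's actual preprocessing) shows how one alpaca-format record with history could be unrolled into chat turns, with every assistant turn, including those taken from `history`, serving as a training target. The record layout itself is given right after the sketch.
+
+```python
+# Hypothetical alpaca-format record; field names follow the description above.
+record = {
+    "instruction": "Translate the text to French.",
+    "input": "Good morning",
+    "output": "Bonjour",
+    "system": "You are a helpful translator.",
+    "history": [["Say hello.", "Hello!"]],
+}
+
+turns = []
+if record.get("system"):
+    turns.append(("system", record["system"]))
+for user_msg, assistant_msg in record.get("history", []):
+    turns.append(("user", user_msg))
+    turns.append(("assistant", assistant_msg))  # history responses are also learned
+prompt = record["instruction"] + ("\n" + record["input"] if record.get("input") else "")
+turns.append(("user", prompt))                  # human prompt = instruction\ninput
+turns.append(("assistant", record["output"]))   # final response to learn
+print(turns)
+```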
+
+```json
+[
+  {
+    "instruction": "human instruction (required)",
+    "input": "human input (optional)",
+    "output": "model response (required)",
+    "system": "system prompt (optional)",
+    "history": [
+      ["human instruction in the first round (optional)", "model response in the first round (optional)"],
+      ["human instruction in the second round (optional)", "model response in the second round (optional)"]
+    ]
+  }
+]
+```
+
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "columns": {
+    "prompt": "instruction",
+    "query": "input",
+    "response": "output",
+    "system": "system",
+    "history": "history"
+  }
+}
+```
+
+### Pre-training Dataset
+
+- [Example dataset](c4_demo.json)
+
+In pre-training, only the `text` column will be used for model learning.
+
+```json
+[
+  {"text": "document"},
+  {"text": "document"}
+]
+```
+
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "columns": {
+    "prompt": "text"
+  }
+}
+```
+
+### Preference Dataset
+
+Preference datasets are used for reward modeling, DPO training, ORPO training, and SimPO training.
+
+They require a better response in the `chosen` column and a worse response in the `rejected` column.
+
+```json
+[
+  {
+    "instruction": "human instruction (required)",
+    "input": "human input (optional)",
+    "chosen": "chosen answer (required)",
+    "rejected": "rejected answer (required)"
+  }
+]
+```
+
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "ranking": true,
+  "columns": {
+    "prompt": "instruction",
+    "query": "input",
+    "chosen": "chosen",
+    "rejected": "rejected"
+  }
+}
+```
+
+### KTO Dataset
+
+An additional column `kto_tag` is required. Please refer to the [sharegpt](#sharegpt-format) format for details.
+
+### Multimodal Image Dataset
+
+An additional column `images` is required. Please refer to the [sharegpt](#sharegpt-format) format for details.
+
+### Multimodal Video Dataset
+
+An additional column `videos` is required. Please refer to the [sharegpt](#sharegpt-format) format for details.
+
+## Sharegpt Format
+
+### Supervised Fine-Tuning Dataset
+
+- [Example dataset](glaive_toolcall_en_demo.json)
+
+Compared to the alpaca format, the sharegpt format allows the datasets to have **more roles**, such as human, gpt, observation and function. They are presented in a list of objects in the `conversations` column.
+
+Note that the human and observation roles should appear in odd positions, while the gpt and function roles should appear in even positions.
+
+```json
+[
+  {
+    "conversations": [
+      {
+        "from": "human",
+        "value": "human instruction"
+      },
+      {
+        "from": "function_call",
+        "value": "tool arguments"
+      },
+      {
+        "from": "observation",
+        "value": "tool result"
+      },
+      {
+        "from": "gpt",
+        "value": "model response"
+      }
+    ],
+    "system": "system prompt (optional)",
+    "tools": "tool description (optional)"
+  }
+]
+```
+
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "formatting": "sharegpt",
+  "columns": {
+    "messages": "conversations",
+    "system": "system",
+    "tools": "tools"
+  }
+}
+```
+
+### Pre-training Dataset
+
+Not yet supported, please use the [alpaca](#alpaca-format) format.
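+Before moving on to preference data, here is an optional sanity check (hypothetical code, not part of LLaMA-Factory) for the role ordering described above: human/observation messages in odd (1st, 3rd, ...) positions and gpt/function_call messages in even (2nd, 4th, ...) positions. The file name `data.json` and the tag values are assumptions based on the defaults listed at the top of this document.
+
+```python
+import json
+
+# Default sharegpt tags (role_tag="from"); adjust if your dataset overrides them.
+USER_SIDE = {"human", "observation"}       # expected at 0-based indices 0, 2, 4, ...
+ASSISTANT_SIDE = {"gpt", "function_call"}  # expected at 0-based indices 1, 3, 5, ...
+
+with open("data.json", encoding="utf-8") as f:  # hypothetical custom dataset file
+    examples = json.load(f)
+
+for idx, example in enumerate(examples):
+    for pos, message in enumerate(example["conversations"]):
+        expected = USER_SIDE if pos % 2 == 0 else ASSISTANT_SIDE
+        if message["from"] not in expected:
+            print(f"example {idx}: unexpected role {message['from']!r} at position {pos}")
+```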
+
+### Preference Dataset
+
+- [Example dataset](dpo_en_demo.json)
+
+Preference datasets in sharegpt format also require a better message in the `chosen` column and a worse message in the `rejected` column.
+
+```json
+[
+  {
+    "conversations": [
+      {
+        "from": "human",
+        "value": "human instruction"
+      },
+      {
+        "from": "gpt",
+        "value": "model response"
+      },
+      {
+        "from": "human",
+        "value": "human instruction"
+      }
+    ],
+    "chosen": {
+      "from": "gpt",
+      "value": "chosen answer (required)"
+    },
+    "rejected": {
+      "from": "gpt",
+      "value": "rejected answer (required)"
+    }
+  }
+]
+```
+
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "formatting": "sharegpt",
+  "ranking": true,
+  "columns": {
+    "messages": "conversations",
+    "chosen": "chosen",
+    "rejected": "rejected"
+  }
+}
+```
+
+### KTO Dataset
+
+- [Example dataset](kto_en_demo.json)
+
+KTO datasets require an extra `kto_tag` column containing the boolean human feedback.
+
+```json
+[
+  {
+    "conversations": [
+      {
+        "from": "human",
+        "value": "human instruction"
+      },
+      {
+        "from": "gpt",
+        "value": "model response"
+      }
+    ],
+    "kto_tag": "human feedback [true/false] (required)"
+  }
+]
+```
+
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "formatting": "sharegpt",
+  "columns": {
+    "messages": "conversations",
+    "kto_tag": "kto_tag"
+  }
+}
+```
+
+### Multimodal Image Dataset
+
+- [Example dataset](mllm_demo.json)
+
+Multimodal image datasets require an `images` column containing the paths to the input images.
+
+The number of images should be identical to the number of `<image>` tokens in the conversations.
+
+```json
+[
+  {
+    "conversations": [
+      {
+        "from": "human",
+        "value": "human instruction"
+      },
+      {
+        "from": "gpt",
+        "value": "model response"
+      }
+    ],
+    "images": [
+      "image path (required)"
+    ]
+  }
+]
+```
+
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "formatting": "sharegpt",
+  "columns": {
+    "messages": "conversations",
+    "images": "images"
+  }
+}
+```
+
+### Multimodal Video Dataset
+
+- [Example dataset](mllm_video_demo.json)
+
+Multimodal video datasets require a `videos` column containing the paths to the input videos.
+
+The number of videos should be identical to the `