<div align=center>
<img src="assets/logo.png" width="140px">
</div>

# OpenOmni: Large Language Models Pivot Zero-shot Omnimodal Alignment across Language with Real-time Self-Aware Emotional Speech Synthesis

## 👀 Contents
+ Setup
+ Model
+ Preparation
+ Train
+ Evaluation
+ Example
+ Citation

## 📷 Setup
Please follow the instructions below to install the required packages.

1. Clone this repository

```plain
git clone https://github.com/AlibabaResearch/DAMO-ConvAI.git
cd DAMO-ConvAI/OpenOMNI
```

2. Install the package

```plain
conda create -n openomni python=3.10 -y
conda activate openomni
pip install --upgrade pip  # enable PEP 660 support
pip install -e .
pip install openai-whisper
pip install transformers==4.43.4
```

3. Install additional packages for training

```plain
pip install -e ".[train]"
pip install flash-attn --no-build-isolation
```
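
If the installation succeeded, a quick sanity check like the one below should run without errors. This is a minimal sketch: it only verifies that the pinned dependencies import and that a CUDA device is visible for training.

```python
# Sanity check for the environment set up above (verification only).
import torch
import transformers
import whisper  # provided by the openai-whisper package

print("transformers:", transformers.__version__)  # pinned to 4.43.4 above
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
```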

## Model

Here are the pretrained weights and instruction-tuning weights.

| Stage | Model | Speech Projector | Image Projector | IT Data | Download |
| --- | --- | --- | --- | --- | --- |
| 1-1 | OpenOMNI-Qwen2-7B-Stage1-1 | | | aishell2+wenetspeech+librispeech | ckpt |
| 2-1 | OpenOMNI-Qwen2-7B-Stage2-1 | | | llava-pretrain | ckpt |
| 2-2 | OpenOMNI-Qwen2-7B-Stage2-2 | | | mmevol | ckpt |
| 3-1 | OpenOMNI-Qwen2-7B-Stage3-1 | | | openomni-1M | ckpt |
| 3-2 | OpenOMNI-Qwen2-7B-Stage3-2 | | | openomni-prefer | ckpt |
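
Once the `ckpt` links above are filled in, the weights can also be fetched programmatically. Below is a minimal sketch using `huggingface_hub`; the repo ID is a placeholder for illustration, not a published checkpoint name.

```python
# Hypothetical example: fetch a stage checkpoint with huggingface_hub.
# "OpenOmni/OpenOMNI-Qwen2-7B-Stage3-1" is a placeholder repo ID; replace
# it with the actual "ckpt" link from the table once released.
from huggingface_hub import snapshot_download

local_path = snapshot_download(
    repo_id="OpenOmni/OpenOMNI-Qwen2-7B-Stage3-1",  # placeholder
    local_dir="checkpoints/openomni-qwen2-7b-stage3-1",
)
print("weights downloaded to:", local_path)
```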

## Preparation
### Dataset
Please follow [MMEvol](https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/mmevol) to prepare the corresponding image-text datasets. Here we provide the details of the speech-text datasets.

### Data Structure
```plain
datasets
├── json # data recipe
│   ├── openomni_stage1-1.json # speech2text pretraining
│   ├── openomni_stage2-1.json # image2text pretraining
│   ├── openomni_stage2-2.json # image2text instruction tuning
│   ├── openomni_stage3-1.json # text2speech pretraining
│   ├── openomni_stage3-2.json # text2speech emotional injection
├── asr # classic bilingual speech corpus
│   ├── AISHELL-4
│   ├── LibriSpeech
│   ├── WeNetSpeech
├── audio_en # synthetic English speech corpus for questions
├── audio_llava # synthetic bilingual speech corpus for answers
├── audio_zh # synthetic Chinese speech corpus for questions
├── audio_unit # synthetic bilingual speech corpus for answers
├── audio_prefer # synthetic emotional bilingual speech corpus (preferred answers)
├── audio_reject # synthetic emotional bilingual speech corpus (rejected answers)
├── audio_ultrachat # synthetic bilingual speech corpus for answers
├── ai2d
│   ├── abc_images
│   ├── annotations
│   ├── images
│   ├── questions
│   └── categories.json
......
```

+ All files/paths starting with "audio" contain self-synthesized data.
+ The DPO data contains approximately 9k "prefer"/"reject" pairs, covering 9 emotion types.
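
For reference, each entry in the stage recipes pairs a media file with a conversation. The sketch below shows what a speech2text pretraining entry might look like in the common LLaVA-style layout; the field names (`speech`, `conversations`) are assumptions for illustration, not a schema confirmed by this repo.

```python
# Hypothetical openomni_stage1-1.json (speech2text) entry in LLaVA-style
# conversation format. All field names and paths are illustrative.
import json

entry = {
    "id": "librispeech-000001",
    "speech": "asr/LibriSpeech/train-clean-100/sample.flac",  # relative audio path
    "conversations": [
        {"from": "human", "value": "<speech>\nPlease transcribe the audio."},
        {"from": "gpt", "value": "hello world"},  # reference transcript
    ],
}

# Write a one-entry sample recipe to inspect the expected layout.
with open("datasets/json/openomni_stage1-1.sample.json", "w") as f:
    json.dump([entry], f, indent=2, ensure_ascii=False)
```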

## Train
### Speech2Text Pretrain
Please download MMEvol, AISHELL-4, LibriSpeech, WeNetSpeech, and OpenOmniData, and organize the data following Preparation before training. Make sure the corresponding training script is set up correctly (data paths, weight paths, and hyper-parameters).

```plain
bash scripts/train/llama3/speech2text_pretrain.sh
bash scripts/train/qwen2/speech2text_pretrain.sh
```

### Image2Text Pretrain
Please make sure you download and organize the data following [Preparation](https://github.com/RainBowLuoCS/MMEvol#preparation) before training. Make sure the corresponding training script is set up correctly (data paths, weight paths, and hyper-parameters).

```plain
bash scripts/train/llama3/image2text_pretrain.sh
bash scripts/train/qwen2/image2text_pretrain.sh
```

### Image2Text Instruction Tuning
Please make sure you download and organize the data following [Preparation](https://github.com/RainBowLuoCS/MMEvol#preparation) before training. Make sure the corresponding training script is set up correctly (data paths, weight paths, and hyper-parameters).

```plain
bash scripts/train/llama3/image2text_finetune.sh
bash scripts/train/qwen2/image2text_finetune.sh
```

### Text2Speech Pretrain
Please make sure you download and organize the data following [Preparation](https://github.com/RainBowLuoCS/MMEvol#preparation) before training. Make sure the corresponding training script is set up correctly (data paths, weight paths, and hyper-parameters).

```plain
bash scripts/train/llama3/text2speech_pretrain.sh
bash scripts/train/qwen2/text2speech_pretrain.sh
```

### Text2Speech Emotional DPO Tuning
Please make sure you download and organize the data following [Preparation](https://github.com/RainBowLuoCS/MMEvol#preparation) before training. Make sure the corresponding training script is set up correctly (data paths, weight paths, and hyper-parameters).

```plain
bash scripts/train/llama3/text2speech_dpo.sh
bash scripts/train/qwen2/text2speech_dpo.sh
```
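
Since this stage consumes the paired `audio_prefer`/`audio_reject` clips described in Preparation (roughly 9k pairs over 9 emotion types), a single preference record plausibly pairs one preferred and one rejected rendering of the same answer text. The sketch below is an assumption about that layout; the field names and paths are illustrative only.

```python
# Hypothetical text2speech DPO record: preferred vs. rejected speech for
# the same answer text. Field names and paths are illustrative.
preference_pair = {
    "id": "ultrachat-emotion-000420",
    "text": "I'm so happy to hear that!",
    "emotion": "happy",                   # one of the 9 emotion types
    "prefer": "audio_prefer/000420.wav",  # emotionally consistent rendering
    "reject": "audio_reject/000420.wav",  # emotionally inconsistent rendering
}
```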

## Evaluation
### Dataset
Ensure that your api_base, key, and dataset paths are correctly configured before evaluation.
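
For the API-scored parts of the evaluation, the endpoint and key are typically supplied through the environment before launching the scripts. A minimal sketch follows; the exact variable names the eval scripts read are an assumption, so check the scripts themselves.

```python
# Hypothetical configuration: export the API endpoint and key before
# running the eval scripts. Variable names may differ in practice.
import os

os.environ["OPENAI_API_BASE"] = "https://api.openai.com/v1"  # your api_base
os.environ["OPENAI_API_KEY"] = "sk-..."                      # your key
```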

### Data Structure
```plain
datasets
├── json # data recipe
│   ├── aishell2_eval.jsonl # aishell2 evaluation
│   ├── librispeech_eval.jsonl # librispeech evaluation
│   ├── wenetspeech_eval.json # wenetspeech evaluation
│   ├── openomni_emotion_val.json # emotional speech evaluation
├── OmniBench # OmniBench
│   ├── mmdata
│   ├── dataset
│   ├── eval.json
├── Ov-Odyssey # Ov-Odyssey Bench
│   ├── av_odyssey_part1.parquet
│   ├── av_odyssey_part2.parquet
│   ├── av_odyssey_part3.parquet
│   ├── av_odyssey_part4.parquet
│   ├── av_odyssey_part5.parquet
```

### Speech-Text Evaluation
Make sure the evaluation script is set up with the correct settings (data paths, weight paths, and hyper-parameters).

```plain
python openomni/eval/llama3/asr_eavl.py
python openomni/eval/qwen2/asr_eavl.py
```

| Model | LibriSpeech-test-clean | LibriSpeech-test-other | AIShell2-dev | AIShell2-test | WeNetSpeech-testnet | WeNetSpeech-testmeeting |
| --- | --- | --- | --- | --- | --- | --- |
| VITA | 8.1 | 18.4 | | | 12.2 | 16.5 |
| EMOVA | 4.0 | 8.6 | 10.6 | 10.3 | | |
| MINI-OMNI | 4.5 | 9.7 | | | | |
| Freeze-Omni | 3.29 | 7.4 | | | 8.57 | 10.09 |
| Ours | 2.57 | 5.6 | 6.81 | 6.87 | 7.63 | |
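
The scores above are ASR error rates (WER for the English sets, CER for the Chinese sets; lower is better). For reference, a WER can be reproduced from reference/hypothesis transcript pairs with an off-the-shelf implementation such as the `jiwer` package; this is a sketch, not the repo's own scoring code, which may normalize text differently.

```python
# Sketch: compute word error rate over transcript pairs with jiwer
# (pip install jiwer). asr_eavl.py may apply different text normalization.
import jiwer

references = ["hello world", "the cat sat on the mat"]
hypotheses = ["hello word", "the cat sat on a mat"]

print(f"WER: {jiwer.wer(references, hypotheses):.3f}")
```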

### Image-Text Evaluation
Refer to [MMEvol](https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/mmevol) for the detailed OpenCompass vision-language evaluation.

```plain
# run on all 12 datasets
./script/run_inference.sh OpenOmni-Qwen "MME MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA MMStar MMVet AI2D_TEST OCRBench HallusionBench POPE BLINK" all

# The following are instructions for running on a single dataset
# MME
./script/run_inference.sh OpenOmni-Qwen MME all
# MMMU_DEV_VAL
./script/run_inference.sh OpenOmni-Qwen MMMU_DEV_VAL all
# MathVista_MINI
./script/run_inference.sh OpenOmni-Qwen MathVista_MINI all
.....
```

### Speech-Text-Image Evaluation
Please download OmniBench and run the following commands:

```plain
python openomni/eval/llama3/omni_eavl.py
python openomni/eval/qwen2/omni_eavl.py
```

### Speech-Text-Image-Video Evaluation
Please download Ov-Odyssey and run the following commands:

```plain
python openomni/eval/llama3/ov_odyssey_eavl.py
python openomni/eval/qwen2/ov_odyssey_eavl.py
```

### Text-Speech Evaluation
```plain
python openomni/eval/llama3/t2s_eavl.py
python openomni/eval/qwen2/t2s_eavl.py
```

### Emotional Text-Speech Evaluation
```plain
python openomni/eval/llama3/et2s_eavl.py
python openomni/eval/qwen2/et2s_eavl.py
```

## 📚 Citation

If you find this repo useful for your research, please consider citing the paper:

```
@article{luo2024openomni,
  title={OpenOmni: Large Language Models Pivot Zero-shot Omnimodal Alignment across Language with Real-time Self-Aware Emotional Speech Synthesis},
  author={Run Luo and Ting-En Lin and Haonan Zhang and Yuchuan Wu and Xiong Liu and Min Yang and Yongbin Li and Longze Chen and Jiaming Li and Lei Zhang and Yangyi Chen and Hamid Alinejad-Rokny and Fei Huang},
  journal={arXiv preprint arXiv:2409.05840},
  year={2024}
}
```

## 📧 Contact

If you have any questions, please reach out to the following contact for help:

- Haonan Zhang — [email protected]