diff --git a/docs/source/examples/DATASETS.md b/docs/source/examples/DATASETS.md index 6d41e08cc..6a3dd5959 100644 --- a/docs/source/examples/DATASETS.md +++ b/docs/source/examples/DATASETS.md @@ -151,7 +151,10 @@ Conversations should be formatted before feeding into the model. As of now, we'v | ------------- | -------------- | ----------------- | | `chatglm3` | `[gMASK]sop<\|system\|>`
` You are a chatbot developed by LMFlow team.<\|user\|>`
` Who are you?<\|assistant\|>`
` I am a chatbot developed by LMFlow team.<\|user\|>`
` How old are you?<\|assistant\|>`
` I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.` | [Link](./supported_conversation_template.md#chatglm3) | | `chatml` | `<\|im_start\|>system`
`You are a chatbot developed by LMFlow team.<\|im_end\|>`
`<\|im_start\|>user`
`Who are you?<\|im_end\|>`
`<\|im_start\|>assistant`
`I am a chatbot developed by LMFlow team.<\|im_end\|>`
`<\|im_start\|>user`
`How old are you?<\|im_end\|>`
`<\|im_start\|>assistant`
`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<\|im_end\|>`
| [Link](./supported_conversation_template.md#chatml) | -| `deepseek` | `<|begin▁of▁sentence|>You are a chatbot developed by LMFlow team.`

`User: Who are you?`

`Assistant: I am a chatbot developed by LMFlow team.<|end▁of▁sentence|>User: How old are you?`

`Assistant: I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<|end▁of▁sentence|>` | [Link](./supported_conversation_template.md#deepseek) | +| `deepseek_v2` | `<|begin▁of▁sentence|>You are a chatbot developed by LMFlow team.`

`User: Who are you?`

`Assistant: I am a chatbot developed by LMFlow team.<|end▁of▁sentence|>User: How old are you?`

`Assistant: I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<|end▁of▁sentence|>` | [Link](./supported_conversation_template.md#deepseek-v2) | +| `deepseek_v3` | -- | [Link](./supported_conversation_template.md#deepseek-v3) | +| `deepseek_r1` | -- | [Link](./supported_conversation_template.md#deepseek-r1-zero) | +| `deepseek_r1_distill` | -- | [Link](./supported_conversation_template.md#deepseek-r1-distill-llamaqwen) | | `gemma` | `You are a chatbot developed by LMFlow team.user`
`Who are you?`
`model`
`I am a chatbot developed by LMFlow team.`
`user`
`How old are you?`
`model`
`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.`
| [Link](./supported_conversation_template.md#gemma) | | `hymba` | `System`
`You are a chatbot developed by LMFlow team.`
` {"name": "generate_qrcode", "description": "Generate a QR code for a given text", "parameters": {"type": "object", "properties": {"text": {"type": "string", "description": "The text to encode in the QR code"}}, "required": ["text"]}} `

`User`
`Who are you?`
`Assistant`
`I am a chatbot developed by LMFlow team.`
`User`
`How old are you?`
`Assistant`
`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.` | [Link](./supported_conversation_template.md#hymba) | | `internlm2` | `<\|im_start\|>system`
`You are a chatbot developed by LMFlow team.<\|im_end\|>`
`<\|im_start\|>user`
`Who are you?<\|im_end\|>`
`<\|im_start\|>assistant`
`I am a chatbot developed by LMFlow team.<\|im_end\|>`
`<\|im_start\|>user`
`How old are you?<\|im_end\|>`
`<\|im_start\|>assistant`
`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<\|im_end\|>`
| [Link](./supported_conversation_template.md#internlm2) | @@ -159,6 +162,10 @@ Conversations should be formatted before feeding into the model. As of now, we'v | `llama2` | `[INST] <>`
`You are a chatbot developed by LMFlow team.`
`<
>`

`Who are you? [/INST] I am a chatbot developed by LMFlow team.
[INST] How old are you? [/INST] I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.` | [Link](./supported_conversation_template.md#llama-2) | | `phi3` | `<\|system\|>`
`You are a chatbot developed by LMFlow team.<\|end\|>`
`<\|user\|>`
`Who are you?<\|end\|>`
`<\|assistant\|>`
`I am a chatbot developed by LMFlow team.<\|end\|>`
`<\|user\|>`
`How old are you?<\|end\|>`
`<\|assistant\|>`
`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<\|end\|>`
`<\|endoftext\|>` | [Link](./supported_conversation_template.md#phi-3) | | `qwen2` | `<\|im_start\|>system`
`You are a chatbot developed by LMFlow team.<\|im_end\|>`
`<\|im_start\|>user`
`Who are you?<\|im_end\|>`
`<\|im_start\|>assistant`
`I am a chatbot developed by LMFlow team.<\|im_end\|>`
`<\|im_start\|>user`
`How old are you?<\|im_end\|>`
`<\|im_start\|>assistant`
`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<\|im_end\|>`
| [Link](./supported_conversation_template.md#qwen-2) | +| `qwen2_5` | -- | [Link](./supported_conversation_template.md#qwen-25) | +| `qwen2_5_1m` | -- | [Link](./supported_conversation_template.md#qwen-25-1m) | +| `qwen2_5_math` | -- | [Link](./supported_conversation_template.md#qwen-25-math) | +| `qwen_qwq` | -- | [Link](./supported_conversation_template.md#qwen-qwq) | | `yi` | Same as `chatml` | [Link](./supported_conversation_template.md#yi) | | `yi1_5`| `You are a chatbot developed by LMFlow team.<\|im_start\|>user`
`Who are you?<\|im_end\|>`
`<\|im_start\|>assistant`
`I am a chatbot developed by LMFlow team.<\|im_end\|>`
`<\|im_start\|>user`
`How old are you?<\|im_end\|>`
`<\|im_start\|>assistant`
`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.<\|im_end\|>`
| [Link](./supported_conversation_template.md#yi-15) | | `zephyr` | `<\|system\|>`
`You are a chatbot developed by LMFlow team.
`
`<\|user\|>`
`Who are you?
`
`<\|assistant\|>`
`I am a chatbot developed by LMFlow team.`
`<\|user\|>`
`How old are you?`
`<\|assistant\|>`
`I don't age like humans do. I exist as a piece of software, so I don't have a concept of age in the traditional sense.`
| [Link](./supported_conversation_template.md#zephyr) | diff --git a/docs/source/examples/supported_conversation_template.md b/docs/source/examples/supported_conversation_template.md index 1e565fe0b..fb0dead0d 100644 --- a/docs/source/examples/supported_conversation_template.md +++ b/docs/source/examples/supported_conversation_template.md @@ -3,7 +3,10 @@ - [Supported Conversation Template](#supported-conversation-template) - [ChatGLM-3](#chatglm-3) - [ChatML](#chatml) - - [DeepSeek](#deepseek) + - [DeepSeek-V2](#deepseek-v2) + - [DeepSeek-V3](#deepseek-v3) + - [DeepSeek-R1](#deepseek-r1-zero) + - [DeepSeek-R1-Distill](#deepseek-r1-distill-llamaqwen) - [Gemma](#gemma) - [Hymba](#hymba) - [InternLM2](#internlm2) @@ -13,6 +16,10 @@ - [Mixtral 8x7B](#mixtral-8x7b) - [Phi-3](#phi-3) - [Qwen-2](#qwen-2) + - [Qwen-2.5](#qwen-25) + - [Qwen-2.5-1M](#qwen-25-1m) + - [Qwen-2.5-Math](#qwen-25-math) + - [Qwen-QwQ](#qwen-qwq) - [Yi](#yi) - [Yi-1.5](#yi-15) - [Zephyr](#zephyr) @@ -84,7 +91,7 @@ ``` -## DeepSeek +## DeepSeek-V2 **With a system message** ``` <|begin▁of▁sentence|>{{system_message}}\n\nUser: {{user_message_0}}\n\n @@ -117,6 +124,243 @@ ``` +## DeepSeek-V3 +**jinja template** +[[Reference](https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/1d044fd82b15f1cedb197a288e50cc96a2c27205/tokenizer_config.json#L34)] +``` +{% if not add_generation_prompt is defined %} + {% set add_generation_prompt = false %} +{% endif %} +{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %} +{%- for message in messages %} + {%- if message['role'] == 'system' %} + {%- if ns.is_first_sp %} + {% set ns.system_prompt = ns.system_prompt + message['content'] %} + {% set ns.is_first_sp = false %} + {%- else %} + {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %} + {%- endif %} + {%- endif %} +{%- endfor %} +{{bos_token}} +{{ns.system_prompt}} +{%- for message in messages %} + {%- if message['role'] == 'user' %} + 
{%- set ns.is_tool = false -%} + {{'<|User|>' + message['content']}} + {%- endif %} + {%- if message['role'] == 'assistant' and message['content'] is none %} + {%- set ns.is_tool = false -%} + {%- for tool in message['tool_calls']%} + {%- if not ns.is_first %} + {{'<|Assistant|>'}} + {% generation %} + {{'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}} + {% endgeneration %} + {%- set ns.is_first = true -%} + {%- else %} + {% generation %} + {{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}} + {{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} + {% endgeneration %} + {%- endif %} + {%- endfor %} + {%- endif %} + {%- if message['role'] == 'assistant' and message['content'] is not none %} + {%- if ns.is_tool %} + {{'<|tool▁outputs▁end|>'}} + {% generation %} + {{ message['content'] + '<|end▁of▁sentence|>'}} + {%- set ns.is_tool = false -%} + {% endgeneration %} + {%- else %} + {{'<|Assistant|>'}} + {% generation %} + {{ message['content'] + '<|end▁of▁sentence|>'}} + {% endgeneration %} + {%- endif %} + {%- endif %} + {%- if message['role'] == 'tool' %} + {%- set ns.is_tool = true -%} + {%- if ns.is_output_first %} + {{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} + {%- set ns.is_output_first = false %} + {%- else %} + {{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} + {%- endif %} + {%- endif %} +{%- endfor -%} +{% if ns.is_tool %} + {{'<|tool▁outputs▁end|>'}} +{% endif %} +{% if add_generation_prompt and not ns.is_tool %} + {{'<|Assistant|>'}} +{% endif %} +``` + + +## DeepSeek-R1(-Zero) +**jinja template** 
+[[Reference](https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/f7361cd9ff99396dbf6bd644ad846015e59ed4fc/tokenizer_config.json#L34)] +``` +{% if not add_generation_prompt is defined %} + {% set add_generation_prompt = false %} +{% endif %} +{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %} +{%- for message in messages %} + {%- if message['role'] == 'system' %} + {%- if ns.is_first_sp %} + {% set ns.system_prompt = ns.system_prompt + message['content'] %} + {% set ns.is_first_sp = false %} + {%- else %} + {% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %} + {%- endif %} + {%- endif %} +{%- endfor %} +{{ bos_token }} +{{ ns.system_prompt }} +{%- for message in messages %} + {%- if message['role'] == 'user' %} + {%- set ns.is_tool = false -%} + {{'<|User|>' + message['content']}} + {%- endif %} + {%- if message['role'] == 'assistant' and 'tool_calls' in message %} + {%- set ns.is_tool = false -%} + {%- for tool in message['tool_calls'] %} + {%- if not ns.is_first %} + {%- if message['content'] is none %} + {{'<|Assistant|>'}} + {% generation %} + {{'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} + {% endgeneration %} + {%- else %} + {{'<|Assistant|>'}} + {% generation %} + {{ message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} + {% endgeneration %} + {%- endif %} + {%- set ns.is_first = true -%} + {%- else %} + {% generation %} + {{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} + {% endgeneration %} + {%- endif %} + {%- 
endfor %} + {% generation %} + {{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} + {% endgeneration %} + {%- endif %} + {%- if message['role'] == 'assistant' and 'tool_calls' not in message %} + {%- if ns.is_tool %} + {{'<|tool▁outputs▁end|>'}} + {% generation %} + {{ message['content'] + '<|end▁of▁sentence|>'}} + {% endgeneration %} + {%- set ns.is_tool = false -%} + {%- else %} + {% set content = message['content'] %} + {% if '' in content %} + {% set content = content.split('')[-1] %} + {% endif %} + {{'<|Assistant|>'}} + {% generation %} + {{ content + '<|end▁of▁sentence|>'}} + {% endgeneration %} + {%- endif %} + {%- endif %} + {%- if message['role'] == 'tool' %} + {%- set ns.is_tool = true -%} + {%- if ns.is_output_first %} + {{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} + {%- set ns.is_output_first = false %} + {%- else %} + {{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} + {%- endif %} + {%- endif %} +{%- endfor -%} +{% if ns.is_tool %} + {{'<|tool▁outputs▁end|>'}} +{% endif %} +{% if add_generation_prompt and not ns.is_tool %} + {{'<|Assistant|>'}} +{% endif %} +``` + + +## DeepSeek-R1-Distill(-Llama/Qwen) +**jinja template** +[[Reference](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/blob/6393b7559e403fd1d80bfead361586fd6f630a4d/tokenizer_config.json#L34)] +``` +{% if not add_generation_prompt is defined %} + {% set add_generation_prompt = false %} +{% endif %} +{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %} +{%- for message in messages %} + {%- if message['role'] == 'system' %} + {% set ns.system_prompt = message['content'] %} + {%- endif %} +{%- endfor %} +{{bos_token}} +{{ns.system_prompt}} +{%- for message in messages %} + {%- if message['role'] == 'user' %} + {%- set ns.is_tool = false -%} + {{'<|User|>' + message['content']}} + {%- endif %} + {%- if message['role'] == 'assistant' and message['content'] is none %} + 
{%- set ns.is_tool = false -%} + {%- for tool in message['tool_calls']%} + {%- if not ns.is_first %} + {{'<|Assistant|>'}} + {% generation %} + {{'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} + {% endgeneration %} + {%- set ns.is_first = true -%} + {%- else %} + {% generation %} + {{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}} + {{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} + {% endgeneration %} + {%- endif %} + {%- endfor %} + {%- endif %} + {%- if message['role'] == 'assistant' and message['content'] is not none %} + {%- if ns.is_tool %} + {{'<|tool▁outputs▁end|>'}} + {% generation %} + {{ message['content'] + '<|end▁of▁sentence|>'}} + {% endgeneration %} + {%- set ns.is_tool = false -%} + {%- else %} + {% set content = message['content'] %} + {% if '' in content %} + {% set content = content.split('')[-1] %} + {% endif %} + {{'<|Assistant|>'}} + {% generation %} + {{ content + '<|end▁of▁sentence|>'}} + {% endgeneration %} + {%- endif %} + {%- endif %} + {%- if message['role'] == 'tool' %} + {%- set ns.is_tool = true -%} + {%- if ns.is_output_first %} + {{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} + {%- set ns.is_output_first = false %} + {%- else %} + {{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} + {%- endif %} + {%- endif %} +{%- endfor -%} +{% if ns.is_tool %} + {{'<|tool▁outputs▁end|>'}} +{% endif %} +{% if add_generation_prompt and not ns.is_tool %} + {{'<|Assistant|>'}} +{% endif %} +``` + + ## Gemma **With a system message** ```{admonition} NOTICE @@ -405,6 +649,307 @@ The conversation template for Mixtral 8x7B is slightly different from the templa ``` +## Qwen-2.5 +**jinja template** 
+[[Reference](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct/blob/7ae557604adf67be50417f59c2c2f167def9a775/tokenizer_config.json#L198)] +``` +{%- if tools %} + {{- '<|im_start|>system\\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }} + {%- for tool in tools %} + {{- \"\\n\" }} + {{- tool | tojson }} + {%- endfor %} + {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }} + {%- else %} + {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %} + {%- if message.role == \"assistant\" %} + {{- '<|im_start|>' + message.role + '\\n' }} + {% generation %} + {{ message.content + '<|im_end|>' + '\\n' }} + {% endgeneration %} + {%- else %} + {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }} + {%- endif %} + {%- elif message.role == \"assistant\" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {% generation %} + {{- '\\n' + message.content }} + {% endgeneration %} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {% generation %} + {{- '\\n\\n{\"name\": \"' }} + {{- tool_call.name }} + {{- '\", \"arguments\": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\\n' }} + {% endgeneration %} + {%- endfor %} + {% generation %} + {{- '<|im_end|>\\n' }} + {% endgeneration %} + {%- elif message.role == \"tool\" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\\n\\n' }} + {{- message.content }} + {{- '\\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %} + {{- '<|im_end|>\\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\\n' }} +{%- endif %} +``` + + +## Qwen-2.5-1M +```{admonition} NOTICE +:class: warning + +Pay attention to the differences of the system prompt between Qwen-2.5 and Qwen-2.5-1M. 
+``` +**jinja template** +[Reference](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M/blob/e28526f7bb80e2a9c8af03b831a9af3812f18fba/tokenizer_config.json#L197) +``` +{%- if tools %} + {{- '<|im_start|>system\\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are a helpful assistant.' }} + {%- endif %} + {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }} + {%- for tool in tools %} + {{- \"\\n\" }} + {{- tool | tojson }} + {%- endfor %} + {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }} + {%- else %} + {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %} + {%- if message.role == \"assistant\" %} + {{- '<|im_start|>' + message.role + '\\n' }} + {% generation %} + {{ message.content + '<|im_end|>' + '\\n' }} + {% endgeneration %} + {%- else %} + {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }} + {%- endif %} + {%- elif message.role == \"assistant\" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {% generation %} + {{- '\\n' + message.content }} + {% endgeneration %} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {% generation %} + {{- '\\n\\n{\"name\": \"' }} + {{- tool_call.name }} + {{- '\", \"arguments\": ' }} + {{- tool_call.arguments | 
tojson }} + {{- '}\\n' }} + {% endgeneration %} + {%- endfor %} + {% generation %} + {{- '<|im_end|>\\n' }} + {% endgeneration %} + {%- elif message.role == \"tool\" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\\n\\n' }} + {{- message.content }} + {{- '\\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %} + {{- '<|im_end|>\\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\\n' }} +{%- endif %} +``` + + +## Qwen-2.5-Math +**jinja template** +[Reference](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct/blob/aafeb0fc6f22cbf0eaeed126eff8be45b0360a35/tokenizer_config.json#L198) +``` +{%- if tools %} + {{- '<|im_start|>system\\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'Please reason step by step, and put your final answer within \\\\boxed{}.' }} + {%- endif %} + {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }} + {%- for tool in tools %} + {{- \"\\n\" }} + {{- tool | tojson }} + {%- endfor %} + {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }} + {%- else %} + {{- '<|im_start|>system\\nPlease reason step by step, and put your final answer within \\\\boxed{}.<|im_end|>\\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %} + {%- if message.role == \"assistant\" %} + {{- '<|im_start|>' + message.role + '\\n' }} + 
{% generation %} + {{ message.content + '<|im_end|>' + '\\n' }} + {% endgeneration %} + {%- else %} + {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }} + {%- endif %} + {%- elif message.role == \"assistant\" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {% generation %} + {{- '\\n' + message.content }} + {% endgeneration %} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {% generation %} + {{- '\\n\\n{\"name\": \"' }} + {{- tool_call.name }} + {{- '\", \"arguments\": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\\n' }} + {% endgeneration %} + {%- endfor %} + {% generation %} + {{- '<|im_end|>\\n' }} + {% endgeneration %} + {%- elif message.role == \"tool\" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\\n\\n' }} + {{- message.content }} + {{- '\\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %} + {{- '<|im_end|>\\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\\n' }} +{%- endif %} +``` + + +## Qwen-QwQ +**jinja template** +[Reference](https://huggingface.co/Qwen/QwQ-32B-Preview/blob/91906fe41a48b6a89ce2970abfd1269eefee170e/tokenizer_config.json#L197) +``` +{%- if tools %} + {{- '<|im_start|>system\\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.' 
}} + {%- endif %} + {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }} + {%- for tool in tools %} + {{- \"\\n\" }} + {{- tool | tojson }} + {%- endfor %} + {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }} + {%- else %} + {{- '<|im_start|>system\\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.<|im_end|>\\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %} + {%- if message.role == \"assistant\" %} + {{- '<|im_start|>' + message.role + '\\n' }} + {% generation %} + {{ message.content + '<|im_end|>' + '\\n' }} + {% endgeneration %} + {%- else %} + {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }} + {%- endif %} + {%- elif message.role == \"assistant\" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {% generation %} + {{- '\\n' + message.content }} + {% endgeneration %} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {% generation %} + {{- '\\n\\n{\"name\": \"' }} + {{- tool_call.name }} + {{- '\", \"arguments\": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\\n' }} + {% endgeneration %} + {%- endfor %} + {% generation %} + {{- '<|im_end|>\\n' }} + {% endgeneration %} + {%- elif message.role == \"tool\" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %} + {{- 
'<|im_start|>user' }} + {%- endif %} + {{- '\\n\\n' }} + {{- message.content }} + {{- '\\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %} + {{- '<|im_end|>\\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\\n' }} +{%- endif %} +``` + + ## Yi **With a system message** ``` diff --git a/src/lmflow/pipeline/finetuner.py b/src/lmflow/pipeline/finetuner.py index ee9b67c72..a19886189 100644 --- a/src/lmflow/pipeline/finetuner.py +++ b/src/lmflow/pipeline/finetuner.py @@ -7,7 +7,7 @@ import logging import os import sys -from typing import Any, Iterable, Optional, Tuple +from typing import Any, Iterable, Optional, Tuple, Union import datasets import transformers @@ -35,6 +35,9 @@ import lmflow.optim.optimizers as optim from lmflow.args import OptimizerNames, DatasetArguments, ModelArguments, FinetunerArguments from lmflow.datasets.dataset import Dataset +from lmflow.models.hf_decoder_model import HFDecoderModel +from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel +from lmflow.models.hf_text_regression_model import HFTextRegressionModel from lmflow.pipeline.base_tuner import BaseTuner from lmflow.pipeline.utils.peft_trainer import PeftTrainer, PeftSavingCallback @@ -418,8 +421,8 @@ def create_optimizer(self): return CustomizedOptimTrainer def tune(self, - model, - dataset, + model: Union[HFDecoderModel, HFTextRegressionModel, HFEncoderDecoderModel], + dataset: Dataset, transform_dataset_in_place=True, data_collator=None): """ diff --git a/src/lmflow/tokenization/hf_decoder_model.py b/src/lmflow/tokenization/hf_decoder_model.py index 91e690fe4..24b80a977 100644 --- a/src/lmflow/tokenization/hf_decoder_model.py +++ b/src/lmflow/tokenization/hf_decoder_model.py @@ -4,15 +4,15 @@ import logging from logging import Logger -from typing import Dict, Union +from typing import Dict, Union, Optional import transformers from transformers.testing_utils import CaptureLogger 
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +from lmflow.args import DatasetArguments from lmflow.utils.conversation_template import ConversationTemplate from lmflow.utils.constants import CONVERSATION_ROLE_NAMES -from lmflow.args import DatasetArguments logger = logging.getLogger(__name__) @@ -140,7 +140,7 @@ def conversation_tokenize_function( data_args: DatasetArguments, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, - conversation_template: ConversationTemplate, + conversation_template: Union[ConversationTemplate, str] ) -> Dict: """Handels conversation datasets tokenization """ @@ -155,38 +155,63 @@ def conversation_tokenize_function( messages = examples["messages"][i] system = examples.get("system", [None] * num_example)[i] tools = examples.get("tools", [None] * num_example)[i] - if len(messages) < 2 or messages[0]['role'] != CONVERSATION_ROLE_NAMES['user']: - tok_logger.warning( - "Invalid instance encountered. Either the conversation has less than " - "one round or the first message is not from the user." - ) - continue - - if len(messages) % 2 != 0: - logger.warning( - "The number of messages is not even, the last message will be ignored." 
+ + if isinstance(conversation_template, str): # jinja template + conversation = [{"role": "system", "content": system}] + conversation.extend(messages) + encoded_conversation = tokenizer.apply_chat_template( + conversation=conversation, + tools=tools, + chat_template=conversation_template, + return_assistant_tokens_mask=True, + return_dict=True, ) - messages = messages[:-1] - - encoded_conversation = conversation_template.encode_conversation( - tokenizer=tokenizer, - messages=messages, - system=system, - tools=tools, - ) - - input_ids, labels = [], [] - for turn_idx, (user_input, assistant_result) in enumerate(encoded_conversation): - input_ids += user_input + assistant_result if data_args.train_on_prompt: - labels += user_input + assistant_result + labels = encoded_conversation["input_ids"] else: - labels += [-100] * len(user_input) + assistant_result - - token_dict["input_ids"][i].extend(input_ids) - token_dict["attention_mask"][i].extend([1] * len(input_ids)) - token_dict["labels"][i].extend(labels) + labels = [ + encoded_conversation["input_ids"][index] if mask == 1 else -100 + for index, mask in enumerate(encoded_conversation["assistant_masks"]) + ] + + token_dict['input_ids'][i].extend(encoded_conversation['input_ids']) + token_dict['attention_mask'][i].extend(encoded_conversation['attention_mask']) + token_dict['labels'][i].extend(labels) + + else: # lmflow `conversation_template` + if len(messages) < 2 or messages[0]['role'] != CONVERSATION_ROLE_NAMES['user']: + tok_logger.warning( + "Invalid instance encountered. Either the conversation has less than " + "one round or the first message is not from the user." + ) + continue + + if len(messages) % 2 != 0: + logger.warning( + "The number of messages is not even, the last message will be ignored." 
+ ) + messages = messages[:-1] + + encoded_conversation = conversation_template.encode_conversation( + tokenizer=tokenizer, + messages=messages, + system=system, + tools=tools, + ) + + input_ids, labels = [], [] + for turn_idx, (user_input, assistant_result) in enumerate(encoded_conversation): + input_ids += user_input + assistant_result + + if data_args.train_on_prompt: + labels += user_input + assistant_result + else: + labels += [-100] * len(user_input) + assistant_result + + token_dict["input_ids"][i].extend(input_ids) + token_dict["attention_mask"][i].extend([1] * len(input_ids)) + token_dict["labels"][i].extend(labels) if data_args.disable_group_texts: token_dict = blocking( diff --git a/src/lmflow/utils/conversation_template/__init__.py b/src/lmflow/utils/conversation_template/__init__.py index 1c0d58aeb..1dcf29ea0 100644 --- a/src/lmflow/utils/conversation_template/__init__.py +++ b/src/lmflow/utils/conversation_template/__init__.py @@ -1,24 +1,44 @@ #!/usr/bin/env python # coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
+import logging + +from lmflow.utils.versioning import is_package_version_at_least + from .base import EMPTY_TEMPLATE, EMPTY_NO_SPECIAL_TOKENS_TEMPLATE, ConversationTemplate, ConversationTemplateForTool from .chatglm import CHATGLM3_TEMPLATE from .chatml import CHATML_TEMPLATE -from .deepseek import DEEPSEEK_TEMPLATE +from .deepseek import ( + DEEPSEEK_V2_TEMPLATE, + DEEPSEEK_V3_TEMPLATE, + DEEPSEEK_R1_TEMPLATE, + DEEPSEEK_R1_DISTILL_TEMPLATE +) from .gemma import GEMMA_TEMPLATE from .hymba import HYMBA_TEMPLATE from .internlm import INTERNLM2_TEMPLATE from .llama import LLAMA2_TEMPLATE, LLAMA3_TEMPLATE, LLAMA3_TEMPLATE_FOR_TOOL from .phi import PHI3_TEMPLATE -from .qwen import QWEN2_TEMPLATE, QWEN2_TEMPLATE_FOR_TOOL +from .qwen import ( + QWEN2_TEMPLATE, + QWEN2_TEMPLATE_FOR_TOOL, + QWEN2_5_TEMPLATE, + QWEN2_5_1M_TEMPLATE, + QWEN2_5_MATH_TEMPLATE, + QWEN_QWQ_TEMPLATE +) from .yi import YI1_5_TEMPLATE from .zephyr import ZEPHYR_TEMPLATE +logger = logging.getLogger(__name__) + + PRESET_TEMPLATES = { 'chatglm3': CHATGLM3_TEMPLATE, 'chatml': CHATML_TEMPLATE, - 'deepseek': DEEPSEEK_TEMPLATE, + 'deepseek': DEEPSEEK_V2_TEMPLATE, + 'deepseek_v2': DEEPSEEK_V2_TEMPLATE, 'disable': EMPTY_TEMPLATE, 'empty': EMPTY_TEMPLATE, 'empty_no_special_tokens': EMPTY_NO_SPECIAL_TOKENS_TEMPLATE, @@ -34,4 +54,23 @@ 'yi': CHATML_TEMPLATE, 'yi1_5': YI1_5_TEMPLATE, 'zephyr': ZEPHYR_TEMPLATE -} \ No newline at end of file +} + +JINJA_TEMPLATES = { + 'deepseek_r1': DEEPSEEK_R1_TEMPLATE, + 'deepseek_r1_distill': DEEPSEEK_R1_DISTILL_TEMPLATE, + 'deepseek_v3': DEEPSEEK_V3_TEMPLATE, + 'qwen2_5': QWEN2_5_TEMPLATE, + 'qwen2_5_1m': QWEN2_5_1M_TEMPLATE, + 'qwen2_5_math': QWEN2_5_MATH_TEMPLATE, + 'qwen_qwq': QWEN_QWQ_TEMPLATE, +} + +if is_package_version_at_least("transformers", "4.43.0"): + for template_name, template in JINJA_TEMPLATES.items(): + PRESET_TEMPLATES[template_name] = template +else: + logger.warning( + f"The following conversation templates require transformers>=4.43.0: 
{JINJA_TEMPLATES.keys()}. " + f"Please upgrade `transformers` to use them." + ) \ No newline at end of file diff --git a/src/lmflow/utils/conversation_template/base.py b/src/lmflow/utils/conversation_template/base.py index d943e02ac..ecc129171 100644 --- a/src/lmflow/utils/conversation_template/base.py +++ b/src/lmflow/utils/conversation_template/base.py @@ -147,12 +147,6 @@ def format(self, **kwargs) -> list: return formatted_template -@dataclass -class ListFormatter(Formatter): - def format(self, **kwargs) -> list: - pass # Work in progress - - @dataclass class ConversationTemplate: user_formatter: Formatter @@ -167,6 +161,7 @@ class ConversationTemplate: special_starter: Optional[TemplateComponent] = None special_stopper: Optional[TemplateComponent] = None template_name: Optional[str] = None + system_default: Optional[str] = None def __post_init__(self): if self.separator: @@ -217,7 +212,7 @@ def encode_conversation( raise ValueError("Your dataset contains system message but no system formatter is provided. " "Consider either providing a system formatter or removing system prompt from your dataset.") else: - system = None + system = self.system_default if self.system_default else None encoded_pairs = self._encode(tokenizer, messages, system, tools, **kwargs) encoded_pairs = self.post_process_pairs(encoded_pairs=encoded_pairs, tokenizer=tokenizer) @@ -439,11 +434,7 @@ def encode_conversation( ''' assert isinstance(messages, list), "Messages must be a list." - if tools is None: - tools = '' - else: - tools = ','.join(tools) - # logger.warning("Tools are not supported yet. Please include tools in the system message manually.") + tools = self._handle_tools(tools) if system is None: system = "" @@ -453,7 +444,7 @@ def encode_conversation( raise ValueError("Your dataset contains system message but no system formatter is provided. 
" "Consider either providing a system formatter or removing system prompt from your dataset.") else: - system = "" + system = self.system_default if self.system_default else "" encoded_pairs = self._encode(tokenizer, messages, system, tools, **kwargs) encoded_pairs = self.post_process_pairs(encoded_pairs=encoded_pairs, tokenizer=tokenizer) @@ -551,6 +542,10 @@ def _encode_template( else: raise NotImplementedError(f"Component type {component.type} is not supported yet.") return encoded_ids + + def _handle_tools(self, tools: Optional[List[str]]) -> str: + tools_out = ','.join(tools) if tools is not None else '' + return tools_out EMPTY_TEMPLATE = ConversationTemplate( diff --git a/src/lmflow/utils/conversation_template/deepseek.py b/src/lmflow/utils/conversation_template/deepseek.py index 680d2292d..29a02801e 100644 --- a/src/lmflow/utils/conversation_template/deepseek.py +++ b/src/lmflow/utils/conversation_template/deepseek.py @@ -4,8 +4,8 @@ from .base import StringFormatter, TemplateComponent, ConversationTemplate -DEEPSEEK_TEMPLATE = ConversationTemplate( - template_name='deepseek', +DEEPSEEK_V2_TEMPLATE = ConversationTemplate( + template_name='deepseek_v2', user_formatter=StringFormatter( template=[ TemplateComponent(type='string', content='User: {{content}}\n\n') @@ -23,4 +23,232 @@ ] ), special_starter=TemplateComponent(type='token', content='bos_token') +) + + +DEEPSEEK_V3_TEMPLATE = ( + "{% if not add_generation_prompt is defined %}" + "{% set add_generation_prompt = false %}" + "{% endif %}" + "{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}" + "{%- for message in messages %}" + "{%- if message['role'] == 'system' %}" + "{%- if ns.is_first_sp %}" + "{% set ns.system_prompt = ns.system_prompt + message['content'] %}" + "{% set ns.is_first_sp = false %}" + "{%- else %}" + "{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}" + "{%- endif %}" + "{%- endif %}" + "{%- 
endfor %}" + "{{bos_token}}" + "{{ns.system_prompt}}" + "{%- for message in messages %}" + "{%- if message['role'] == 'user' %}" + "{%- set ns.is_tool = false -%}" + "{{'<|User|>' + message['content']}}" + "{%- endif %}" + "{%- if message['role'] == 'assistant' and message['content'] is none %}" + "{%- set ns.is_tool = false -%}" + "{%- for tool in message['tool_calls']%}" + "{%- if not ns.is_first %}" + "{{'<|Assistant|>'}}" + "{% generation %}" + "{{'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}" + "{% endgeneration %}" + "{%- set ns.is_first = true -%}" + "{%- else %}" + "{% generation %}" + "{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}" + "{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}" + "{% endgeneration %}" + "{%- endif %}" + "{%- endfor %}" + "{%- endif %}" + "{%- if message['role'] == 'assistant' and message['content'] is not none %}" + "{%- if ns.is_tool %}" + "{{'<|tool▁outputs▁end|>'}}" + "{% generation %}" + "{{ message['content'] + '<|end▁of▁sentence|>'}}" + "{%- set ns.is_tool = false -%}" + "{% endgeneration %}" + "{%- else %}" + "{{'<|Assistant|>'}}" + "{% generation %}" + "{{ message['content'] + '<|end▁of▁sentence|>'}}" + "{% endgeneration %}" + "{%- endif %}" + "{%- endif %}" + "{%- if message['role'] == 'tool' %}" + "{%- set ns.is_tool = true -%}" + "{%- if ns.is_output_first %}" + "{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}" + "{%- set ns.is_output_first = false %}" + "{%- else %}" + "{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor -%}" + "{% if ns.is_tool %}" + "{{'<|tool▁outputs▁end|>'}}" + "{% endif %}" + "{% if 
add_generation_prompt and not ns.is_tool %}" + "{{'<|Assistant|>'}}" + "{% endif %}" +) + + +DEEPSEEK_R1_TEMPLATE = ( + "{% if not add_generation_prompt is defined %}" + "{% set add_generation_prompt = false %}" + "{% endif %}" + "{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}" + "{%- for message in messages %}" + "{%- if message['role'] == 'system' %}" + "{%- if ns.is_first_sp %}" + "{% set ns.system_prompt = ns.system_prompt + message['content'] %}" + "{% set ns.is_first_sp = false %}" + "{%- else %}" + "{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor %}" + "{{ bos_token }}" + "{{ ns.system_prompt }}" + "{%- for message in messages %}" + "{%- if message['role'] == 'user' %}" + "{%- set ns.is_tool = false -%}" + "{{'<|User|>' + message['content']}}" + "{%- endif %}" + "{%- if message['role'] == 'assistant' and 'tool_calls' in message %}" + "{%- set ns.is_tool = false -%}" + "{%- for tool in message['tool_calls'] %}" + "{%- if not ns.is_first %}" + "{%- if message['content'] is none %}" + " {{'<|Assistant|>'}}" + " {% generation %}" + " {{'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}" + " {% endgeneration %}" + "{%- else %}" + " {{'<|Assistant|>'}}" + " {% generation %}" + " {{ message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}" + " {% endgeneration %}" + "{%- endif %}" + "{%- set ns.is_first = true -%}" + "{%- else %}" + "{% generation %}" + "{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + 
'```' + '<|tool▁call▁end|>'}}" + "{% endgeneration %}" + "{%- endif %}" + "{%- endfor %}" + "{% generation %}" + "{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}" + "{% endgeneration %}" + "{%- endif %}" + "{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}" + "{%- if ns.is_tool %}" + "{{'<|tool▁outputs▁end|>'}}" + "{% generation %}" + "{{ message['content'] + '<|end▁of▁sentence|>'}}" + "{% endgeneration %}" + "{%- set ns.is_tool = false -%}" + "{%- else %}" + "{% set content = message['content'] %}" + "{% if '</think>' in content %}" + "{% set content = content.split('</think>')[-1] %}" + "{% endif %}" + "{{'<|Assistant|>'}}" + "{% generation %}" + "{{ content + '<|end▁of▁sentence|>'}}" + "{% endgeneration %}" + "{%- endif %}" + "{%- endif %}" + "{%- if message['role'] == 'tool' %}" + "{%- set ns.is_tool = true -%}" + "{%- if ns.is_output_first %}" + "{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}" + "{%- set ns.is_output_first = false %}" + "{%- else %}" + "{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor -%}" + "{% if ns.is_tool %}" + "{{'<|tool▁outputs▁end|>'}}" + "{% endif %}" + "{% if add_generation_prompt and not ns.is_tool %}" + "{{'<|Assistant|>'}}" + "{% endif %}" +) + + +DEEPSEEK_R1_DISTILL_TEMPLATE = ( + "{% if not add_generation_prompt is defined %}" + "{% set add_generation_prompt = false %}" + "{% endif %}" + "{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}" + "{%- for message in messages %}" + "{%- if message['role'] == 'system' %}" + "{% set ns.system_prompt = message['content'] %}" + "{%- endif %}" + "{%- endfor %}" + "{{bos_token}}" + "{{ns.system_prompt}}" + "{%- for message in messages %}" + "{%- if message['role'] == 'user' %}" + "{%- set ns.is_tool = false -%}" + "{{'<|User|>' + message['content']}}" + "{%- endif %}" + "{%- if message['role'] == 'assistant' and 
message['content'] is none %}" + "{%- set ns.is_tool = false -%}" + "{%- for tool in message['tool_calls']%}" + "{%- if not ns.is_first %}" + "{{'<|Assistant|>'}}" + "{% generation %}" + "{{'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}" + "{% endgeneration %}" + "{%- set ns.is_first = true -%}" + "{%- else %}" + "{% generation %}" + "{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}" + "{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}" + "{% endgeneration %}" + "{%- endif %}" + "{%- endfor %}" + "{%- endif %}" + "{%- if message['role'] == 'assistant' and message['content'] is not none %}" + "{%- if ns.is_tool %}" + "{{'<|tool▁outputs▁end|>'}}" + "{% generation %}" + "{{ message['content'] + '<|end▁of▁sentence|>'}}" + "{% endgeneration %}" + "{%- set ns.is_tool = false -%}" + "{%- else %}" + "{% set content = message['content'] %}" + "{% if '</think>' in content %}" + "{% set content = content.split('</think>')[-1] %}" + "{% endif %}" + "{{'<|Assistant|>'}}" + "{% generation %}" + "{{ content + '<|end▁of▁sentence|>'}}" + "{% endgeneration %}" + "{%- endif %}" + "{%- endif %}" + "{%- if message['role'] == 'tool' %}" + "{%- set ns.is_tool = true -%}" + "{%- if ns.is_output_first %}" + "{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}" + "{%- set ns.is_output_first = false %}" + "{%- else %}" + "{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor -%}" + "{% if ns.is_tool %}" + "{{'<|tool▁outputs▁end|>'}}" + "{% endif %}" + "{% if add_generation_prompt and not ns.is_tool %}" + "{{'<|Assistant|>'}}" + "{% endif %}" ) \ No newline at end of file diff --git 
a/src/lmflow/utils/conversation_template/hymba.py b/src/lmflow/utils/conversation_template/hymba.py index 81e3ec81e..cb8a827bf 100644 --- a/src/lmflow/utils/conversation_template/hymba.py +++ b/src/lmflow/utils/conversation_template/hymba.py @@ -37,56 +37,12 @@ class HymbaConversationTemplate(ConversationTemplateForTool): - def encode_conversation( - self, - tokenizer: PreTrainedTokenizer, - messages: List[Dict[str, str]], - system: Optional[str] = None, - tools: Optional[List[str]] = None, - **kwargs - ) -> Sequence[Tuple[List[int], List[int]]]: - r''' - Messages here should be guaranteed to be in pairs, with the first message being the user message and the second message being the system message. - Data example: - ```json - { - "conversation_id": 2, - "system": "sysinfo1", - "tools": ["tool_1_desc"], - "messages": [ - { - "role": "user", - "content": "hi" - }, - { - "role": "assistant", - "content": "Hello!" - } - ] - } - ``` - ''' - assert isinstance(messages, list), "Messages must be a list." - + def _handle_tools(self, tools: Optional[List[str]]) -> str: tools_out = '' if tools is not None: for tool in tools: tools_out += "\n " + tool + " " - - if system is None: - system = "" - else: - if system.replace(" ",""): # has actual content - if not self.system_formatter: - raise ValueError("Your dataset contains system message but no system formatter is provided. 
" - "Consider either providing a system formatter or removing system prompt from your dataset.") - system = '\n' + system - else: - system = "" - encoded_pairs = self._encode(tokenizer, messages, system, tools_out, **kwargs) - encoded_pairs = self.post_process_pairs(encoded_pairs=encoded_pairs, tokenizer=tokenizer) - - return encoded_pairs + return tools_out HYMBA_TEMPLATE = HymbaConversationTemplate( diff --git a/src/lmflow/utils/conversation_template/qwen.py b/src/lmflow/utils/conversation_template/qwen.py index 88c8fb581..406278e62 100644 --- a/src/lmflow/utils/conversation_template/qwen.py +++ b/src/lmflow/utils/conversation_template/qwen.py @@ -1,6 +1,10 @@ #!/usr/bin/env python # coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. +from typing import Dict, Set, Sequence, Literal, Union, List, Optional, Tuple + +from transformers import PreTrainedTokenizer + from .base import StringFormatter, TemplateComponent, ConversationTemplate, ConversationTemplateForTool @@ -24,6 +28,7 @@ separator=TemplateComponent(type='string', content='\n') ) + QWEN2_TEMPLATE_FOR_TOOL = ConversationTemplateForTool( template_name='qwen2_for_tool', user_formatter=StringFormatter( @@ -52,4 +57,288 @@ ] ), separator=TemplateComponent(type='string', content='\n') +) + + +QWEN2_5_TEMPLATE = ( + "{%- if tools %}" + "{{- '<|im_start|>system\\n' }}" + "{%- if messages[0]['role'] == 'system' %}" + "{{- messages[0]['content'] }}" + "{%- else %}" + "{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' 
}}" + "{%- endif %}" + "{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}" + "{%- for tool in tools %}" + "{{- \"\\n\" }}" + "{{- tool | tojson }}" + "{%- endfor %}" + "{{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}" + "{%- else %}" + "{%- if messages[0]['role'] == 'system' %}" + "{{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}" + "{%- else %}" + "{{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- for message in messages %}" + "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" + "{%- if message.role == \"assistant\" %}" + "{{- '<|im_start|>' + message.role + '\\n' }}" + "{% generation %}" + "{{ message.content + '<|im_end|>' + '\\n' }}" + "{% endgeneration %}" + "{%- else %}" + "{{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}" + "{%- endif %}" + "{%- elif message.role == \"assistant\" %}" + "{{- '<|im_start|>' + message.role }}" + "{%- if message.content %}" + "{% generation %}" + "{{- '\\n' + message.content }}" + "{% endgeneration %}" + "{%- endif %}" + "{%- for tool_call in message.tool_calls %}" + "{%- if tool_call.function is defined %}" + "{%- set tool_call = tool_call.function %}" + "{%- endif %}" + "{% generation %}" + "{{- '\\n\\n{\"name\": \"' }}" + "{{- tool_call.name }}" + "{{- '\", \"arguments\": ' }}" + "{{- tool_call.arguments | tojson }}" + "{{- '}\\n' }}" + "{% endgeneration %}" + "{%- endfor %}" + "{% generation %}" + "{{- '<|im_end|>\\n' }}" + "{% endgeneration %}" + "{%- elif message.role == \"tool\" %}" + "{%- if (loop.index0 == 0) or 
(messages[loop.index0 - 1].role != \"tool\") %}" + "{{- '<|im_start|>user' }}" + "{%- endif %}" + "{{- '\\n\\n' }}" + "{{- message.content }}" + "{{- '\\n' }}" + "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" + "{{- '<|im_end|>\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor %}" + "{%- if add_generation_prompt %}" + "{{- '<|im_start|>assistant\\n' }}" + "{%- endif %}" +) + + +QWEN2_5_1M_TEMPLATE = ( + "{%- if tools %}" + "{{- '<|im_start|>system\\n' }}" + "{%- if messages[0]['role'] == 'system' %}" + "{{- messages[0]['content'] }}" + "{%- else %}" + "{{- 'You are a helpful assistant.' }}" + "{%- endif %}" + "{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}" + "{%- for tool in tools %}" + "{{- \"\\n\" }}" + "{{- tool | tojson }}" + "{%- endfor %}" + "{{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}" + "{%- else %}" + "{%- if messages[0]['role'] == 'system' %}" + "{{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}" + "{%- else %}" + "{{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- for message in messages %}" + "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" + "{%- if message.role == \"assistant\" %}" + "{{- '<|im_start|>' + message.role + '\\n' }}" + "{% generation %}" + "{{ message.content + '<|im_end|>' + '\\n' }}" + "{% endgeneration %}" + "{%- else %}" + "{{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}" + "{%- endif %}" + "{%- elif message.role == \"assistant\" %}" + "{{- '<|im_start|>' + message.role }}" + "{%- if message.content %}" + "{% generation %}" + "{{- '\\n' + 
message.content }}" + "{% endgeneration %}" + "{%- endif %}" + "{%- for tool_call in message.tool_calls %}" + "{%- if tool_call.function is defined %}" + "{%- set tool_call = tool_call.function %}" + "{%- endif %}" + "{% generation %}" + "{{- '\\n\\n{\"name\": \"' }}" + "{{- tool_call.name }}" + "{{- '\", \"arguments\": ' }}" + "{{- tool_call.arguments | tojson }}" + "{{- '}\\n' }}" + "{% endgeneration %}" + "{%- endfor %}" + "{% generation %}" + "{{- '<|im_end|>\\n' }}" + "{% endgeneration %}" + "{%- elif message.role == \"tool\" %}" + "{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}" + "{{- '<|im_start|>user' }}" + "{%- endif %}" + "{{- '\\n\\n' }}" + "{{- message.content }}" + "{{- '\\n' }}" + "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" + "{{- '<|im_end|>\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor %}" + "{%- if add_generation_prompt %}" + "{{- '<|im_start|>assistant\\n' }}" + "{%- endif %}" +) + + +QWEN2_5_MATH_TEMPLATE = ( + "{%- if tools %}" + "{{- '<|im_start|>system\\n' }}" + "{%- if messages[0]['role'] == 'system' %}" + "{{- messages[0]['content'] }}" + "{%- else %}" + "{{- 'Please reason step by step, and put your final answer within \\\\boxed{}.' 
}}" + "{%- endif %}" + "{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}" + "{%- for tool in tools %}" + "{{- \"\\n\" }}" + "{{- tool | tojson }}" + "{%- endfor %}" + "{{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}" + "{%- else %}" + "{%- if messages[0]['role'] == 'system' %}" + "{{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}" + "{%- else %}" + "{{- '<|im_start|>system\\nPlease reason step by step, and put your final answer within \\\\boxed{}.<|im_end|>\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- for message in messages %}" + "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" + "{%- if message.role == \"assistant\" %}" + "{{- '<|im_start|>' + message.role + '\\n' }}" + "{% generation %}" + "{{ message.content + '<|im_end|>' + '\\n' }}" + "{% endgeneration %}" + "{%- else %}" + "{{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}" + "{%- endif %}" + "{%- elif message.role == \"assistant\" %}" + "{{- '<|im_start|>' + message.role }}" + "{%- if message.content %}" + "{% generation %}" + "{{- '\\n' + message.content }}" + "{% endgeneration %}" + "{%- endif %}" + "{%- for tool_call in message.tool_calls %}" + "{%- if tool_call.function is defined %}" + "{%- set tool_call = tool_call.function %}" + "{%- endif %}" + "{% generation %}" + "{{- '\\n\\n{\"name\": \"' }}" + "{{- tool_call.name }}" + "{{- '\", \"arguments\": ' }}" + "{{- tool_call.arguments | tojson }}" + "{{- '}\\n' }}" + "{% endgeneration %}" + "{%- endfor %}" + "{% generation %}" + "{{- '<|im_end|>\\n' }}" + "{% endgeneration %}" + "{%- elif message.role == \"tool\" %}" + "{%- if (loop.index0 == 0) or 
(messages[loop.index0 - 1].role != \"tool\") %}" + "{{- '<|im_start|>user' }}" + "{%- endif %}" + "{{- '\\n\\n' }}" + "{{- message.content }}" + "{{- '\\n' }}" + "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" + "{{- '<|im_end|>\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor %}" + "{%- if add_generation_prompt %}" + "{{- '<|im_start|>assistant\\n' }}" + "{%- endif %}" +) + + +QWEN_QWQ_TEMPLATE = ( + "{%- if tools %}" + "{{- '<|im_start|>system\\n' }}" + "{%- if messages[0]['role'] == 'system' %}" + "{{- messages[0]['content'] }}" + "{%- else %}" + "{{- 'You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.' }}" + "{%- endif %}" + "{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}" + "{%- for tool in tools %}" + "{{- \"\\n\" }}" + "{{- tool | tojson }}" + "{%- endfor %}" + "{{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}" + "{%- else %}" + "{%- if messages[0]['role'] == 'system' %}" + "{{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}" + "{%- else %}" + "{{- '<|im_start|>system\\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. 
You should think step-by-step.<|im_end|>\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- for message in messages %}" + "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" + "{%- if message.role == \"assistant\" %}" + "{{- '<|im_start|>' + message.role + '\\n' }}" + "{% generation %}" + "{{ message.content + '<|im_end|>' + '\\n' }}" + "{% endgeneration %}" + "{%- else %}" + "{{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}" + "{%- endif %}" + "{%- elif message.role == \"assistant\" %}" + "{{- '<|im_start|>' + message.role }}" + "{%- if message.content %}" + "{% generation %}" + "{{- '\\n' + message.content }}" + "{% endgeneration %}" + "{%- endif %}" + "{%- for tool_call in message.tool_calls %}" + "{%- if tool_call.function is defined %}" + "{%- set tool_call = tool_call.function %}" + "{%- endif %}" + "{% generation %}" + "{{- '\\n\\n{\"name\": \"' }}" + "{{- tool_call.name }}" + "{{- '\", \"arguments\": ' }}" + "{{- tool_call.arguments | tojson }}" + "{{- '}\\n' }}" + "{% endgeneration %}" + "{%- endfor %}" + "{% generation %}" + "{{- '<|im_end|>\\n' }}" + "{% endgeneration %}" + "{%- elif message.role == \"tool\" %}" + "{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}" + "{{- '<|im_start|>user' }}" + "{%- endif %}" + "{{- '\\n\\n' }}" + "{{- message.content }}" + "{{- '\\n' }}" + "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" + "{{- '<|im_end|>\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor %}" + "{%- if add_generation_prompt %}" + "{{- '<|im_start|>assistant\\n' }}" + "{%- endif %}" ) \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..ff5378519 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,16 @@ +def pytest_addoption(parser): + parser.addoption("--cpu-only", action="store_true", help="run tests that 
only requires cpu") + parser.addoption("--skip-slow", action="store_true", help="skip slow tests") + +def pytest_collection_modifyitems(config, items): + new_items = [] + for item in items: + func = item.function + if config.getoption("--cpu-only"): + if not (func.__doc__ and "#cpu" in func.__doc__.lower()): + continue + if config.getoption("--skip-slow"): + if func.__doc__ and "#slow" in func.__doc__.lower(): + continue + new_items.append(item) + items[:] = new_items \ No newline at end of file