Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
73 commits
Select commit Hold shift + click to select a range
0d47c43
gguf: add GGUFReader.read_field(field) method + read template example
Apr 27, 2024
0d1d46e
grammars: add troubleshooting section to readme
Apr 8, 2024
63d1324
server.py: hacky code
Mar 25, 2024
ffc7436
agents: scripts to run scripts as sandboxed fastapi servers
Mar 26, 2024
d5d9993
server.py: default tools work!
Mar 26, 2024
8afd4de
server.py: make tools work w/ mixtral-8x7b-instruct
Mar 27, 2024
aa9605c
server.py: kinda api-compliant output, disabled grammar
Mar 27, 2024
a406293
server.py: reenable grammar, accommodate mistral's escaped underscores
Mar 27, 2024
63a384d
server.py: raise n_predict
Mar 28, 2024
5f3de16
server.py: pass all request options, comments in ts sigs, render tool…
Mar 28, 2024
59b4114
server.py: refactor chat handlers
Mar 29, 2024
253b68d
server.py: crude reactor
Mar 29, 2024
e874565
agent: split code from openai example
Mar 29, 2024
b63f91a
Update agent.py
Mar 29, 2024
c340e8c
Update example_weather_tools.py
Mar 29, 2024
ce2fb01
agent: add --allow_parallel_calls
Mar 29, 2024
ea34bd3
agent/openai: nits
Mar 29, 2024
80c7930
openai: fix message merging for mixtral (parallel calls)
Mar 29, 2024
9ab493f
Update prompting.py
Mar 29, 2024
e0c8af4
agent: --style
Mar 29, 2024
b4e292e
Create requirements.txt
Mar 29, 2024
d1d8602
agent: disable parallel by default
Mar 29, 2024
eb9a552
agent: nits
Mar 29, 2024
3da30ed
agent: fix functionary tool_calls templating
Mar 29, 2024
ff6563a
Delete test.sh
Mar 29, 2024
dd11bb6
agent: format still broken
Mar 29, 2024
22b980f
agent: update readme
Mar 29, 2024
61f35e0
agent: prepare to test various templates
Mar 29, 2024
d8a53ea
openai: test features of templates at runtime, to make sure no bits o…
Mar 30, 2024
ad2f4c1
Update test_chat_handlers.py
Mar 30, 2024
3c3eff5
openai: quiet + update prompt output
Mar 30, 2024
6935503
openai: refactor chat handler vs. template
Mar 30, 2024
d9f30f8
Update test_chat_handlers.md
Mar 30, 2024
da2067a
openai: only special-format assistant in thoughtful mode
Mar 30, 2024
09de4eb
openai: actually use thoughtful examples in tests
Mar 30, 2024
19811a4
openai: tests didn't catch output format
Mar 30, 2024
22fe86d
openai tools: TS signatures work well too at a fraction of the eval cost
Mar 30, 2024
6e52a9c
Update test_chat_handlers.md
Apr 8, 2024
701a66d
agent: fix response_format
Apr 9, 2024
b447a74
agent: revert to json schemas (ts not ready for refs)
Apr 9, 2024
85820f4
agent: fix sandbox dockerfile
Apr 9, 2024
6880f1d
agent: support basic openapi tools (incl. from fastify sandbox)
Apr 9, 2024
0532680
agent: nits
Apr 9, 2024
a634e03
agent: cache_prompt=True
Apr 10, 2024
9fe269e
openai: nit
Apr 10, 2024
a61ebeb
agent: hint at math import in python tool
Apr 10, 2024
24e34f1
agent: nit
Apr 10, 2024
1475b1e
agent: fix killing of subprocesses
Apr 10, 2024
6c00378
agent: nits
Apr 10, 2024
082d54d
agent: rename fake weather tools
Apr 10, 2024
f9afb04
agent: python tool: test serializability of variables
Apr 10, 2024
a98f483
agent: python tool: return errors
Apr 10, 2024
ea0c31b
agent: ensure DATA_DIR exists
Apr 10, 2024
89dcc06
agent: mypy type fixes
Apr 10, 2024
0120f7c
agent: fix wait --std-tools
Apr 10, 2024
09c2565
grammars: early exit when no next_candidates to reject
Apr 21, 2024
00c709e
grammars: cache decoded tokens
Apr 21, 2024
8d503ef
grammars: faster llama_grammar_copy
Apr 21, 2024
b4a00ce
Merge branch 'gguf-read' into agent-example
Apr 27, 2024
7675ac6
Merge remote-tracking branch 'origin/master' into agent-example
Apr 30, 2024
312e20b
openai: update after merge
Apr 30, 2024
ca1a640
server: tool call grammar-constraints
May 2, 2024
2b2127c
agent: url params
May 2, 2024
e41b6ce
server: update tool calling, introduce system prompt for json schema
May 2, 2024
a1d64cf
openai: function call arguments must be returned stringified!
May 18, 2024
3f5a25f
Merge remote-tracking branch 'origin/master' into agent-example
May 18, 2024
5ea637e
openai: fix merge
May 21, 2024
6dadcd2
Merge remote-tracking branch 'origin/master' into agent-example
May 21, 2024
c8458fa
openai: make content optional for tool call grammar gen
May 22, 2024
a39e6e0
openai: pretty indent json response
May 22, 2024
793f4ff
agent: support OpenAI: --endpoint https://api.openai.com --auth "Bear…
May 22, 2024
a1c4aac
server: ultra basic tools, tool_choice, tool_calls support
May 22, 2024
298c098
Merge remote-tracking branch 'origin/master' into agent-example
Jun 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
server.py: pass all request options, comments in ts sigs, render tool…
… calls
  • Loading branch information
ochafik committed Apr 27, 2024
commit 5f3de16116db536fe33d0859a79ff96e4d4f9d7e
29 changes: 26 additions & 3 deletions examples/openai/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Any, Dict, Literal, Optional, Union
from pydantic import BaseModel, Json
from pydantic import BaseModel, Json, TypeAdapter

class FunctionCall(BaseModel):
name: str
Expand Down Expand Up @@ -31,10 +31,33 @@ class ResponseFormat(BaseModel):
class ChatCompletionRequest(BaseModel):
model: str
tools: Optional[list[Tool]] = None
messages: list[Message]
messages: list[Message] = None
prompt: Optional[str] = None
response_format: Optional[ResponseFormat] = None
temperature: float = 1.0

stream: bool = False
cache_prompt: Optional[bool] = None
n_predict: Optional[int] = None
top_k: Optional[int] = None
top_p: Optional[float] = None
min_p: Optional[float] = None
tfs_z: Optional[float] = None
typical_p: Optional[float] = None
temperature: float = 1.0
dynatemp_range: Optional[float] = None
dynatemp_exponent: Optional[float] = None
repeat_last_n: Optional[int] = None
repeat_penalty: Optional[float] = None
frequency_penalty: Optional[float] = None
presense_penalty: Optional[float] = None
mirostat: Optional[bool] = None
mirostat_tau: Optional[float] = None
mirostat_eta: Optional[float] = None
penalize_nl: Optional[bool] = None
n_keep: Optional[int] = None
seed: Optional[int] = None
n_probs: Optional[int] = None
min_keep: Optional[int] = None

class Choice(BaseModel):
index: int
Expand Down
74 changes: 47 additions & 27 deletions examples/openai/prompting.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def add_system_prompt(self, messages: list[Message], system_prompt: Message) ->
system_message = next(((i, m) for i, m in enumerate(messages) if m.role == "system"), None)
if system_message is not None:
(i, m) = system_message
return messages[:i] + [Message(role="system", content=m.content + '\n' + system_prompt.content)] + messages[i+1:]
return messages[:i] + [Message(role="system", content=system_prompt.content + '\n' + m.content)] + messages[i+1:]
else:
return [system_prompt] + messages

Expand All @@ -63,22 +63,32 @@ def render(self, messages: list[Message], add_generation_prompt: bool, omit_bos:
assert messages[i+1].role == 'user'
new_messages.append(Message(
role="user",
content=f'[SYS]{messages[i].content}[/SYS]\n{messages[i+1].content}'))
content=f'[SYS]{messages[i].content}[/SYS]\n{messages[i+1].content}'
))
i += 2
elif messages[i].role == 'assistant' and messages[i].tool_calls and messages[i].content:
tc = '\n'.join(f'<tool_call>{json.dumps(tc.model_dump())}</tool_call>' for tc in messages[i].tool_calls)
new_messages.append(Message(
role="assistant",
content=f'{messages[i].content}\n{tc}'
))
i += 1
else:
new_messages.append(messages[i])
i += 1
# print(f'new_messages={json.dumps(new_messages, indent=2)}')
messages = new_messages
# print(f'messages={messages}')

return self.template.render(
result = self.template.render(
messages=messages,
eos_token=self.eos_token,
bos_token='' if omit_bos else self.bos_token,
raise_exception=raise_exception,
add_generation_prompt=add_generation_prompt,
)
sys.stderr.write(f'\n# RENDERED:\n\n{result}\n\n')
return result

# While the API will be usable with a generic tools usage like OpenAI,
# (see https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models),
Expand Down Expand Up @@ -120,38 +130,29 @@ def make_tools_prompt(chat_format: ChatFormat, tools: list[Tool], indent=2) -> M
return Message(
role="system",
content='\n'.join([
'''You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.''',
# '''You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.''',
'''You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:''',
'''<tools>''',
*(json.dumps(tool.model_dump(), indent=indent) for tool in tools),
_tools_typescript_signatures(tools),
# _tools_schema_signatures(tools, indent=indent),
'''</tools>''',
'',
'''Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}''',
'',
# '''Use the following json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}''',
# '',
# '''For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:''',
'''To call each function, give its name and arguments within <tool_call></tool_call> XML tags as follows:''',
'''<tool_call>''',
'''{"arguments": <args-dict>, "name": <function-name>}''',
'''{"name": <function-name>, "arguments": <args-dict>}''',
'''</tool_call>''',
'''This is not hypothetical, you're not asked what you would do. If you need a tool called, just call it.''',
# '''This is not hypothetical, you're not asked what you would do. If you need a tool called, just call it with <tool_call>...</tool_call>.''',
])
)

elif chat_format.tool_style == ToolsPromptStyle.TYPESCRIPT_FUNCTIONARY_V2:
ts_converter = SchemaToTypeScriptConverter()

return Message(
role="system",
content='\n'.join([
'// Supported function definitions that should be called when necessary.'
'namespace functions {',
*[
'// ' + tool.function.description.replace('\n', '\n// ') + '\n' + ''
'type ' + tool.function.name + ' = (_: ' + ts_converter.visit(tool.function.parameters) + ") => any;\n"
for tool in tools
],
'} // namespace functions',
])
content= '// Supported function definitions that should be called when necessary.\n' +
_tools_typescript_signatures(tools)
)

elif chat_format.tool_style == ToolsPromptStyle.TOOLS_HERMES_2_PRO:
Expand All @@ -170,6 +171,20 @@ def make_tools_prompt(chat_format: ChatFormat, tools: list[Tool], indent=2) -> M
else:
raise ValueError(f"Unsupported tool call style: {chat_format.tool_style}")

def _tools_typescript_signatures(tools: list[Tool]) -> str:
    """Render tools as TypeScript-style function signatures in a namespace.

    Each tool becomes an optional `// description` comment followed by a
    `type <name> = (_: <params>) => any;` declaration, all wrapped in a
    `namespace functions { ... }` block (the functionary prompt style).

    Fixes vs. previous version:
    - emit a newline after the opening brace so the first tool's comment is
      not glued onto the `namespace functions {` line;
    - tolerate a missing/None description (allowed in user tool schemas);
    - drop the dead `+ ''` concatenation.
    """
    ts_converter = SchemaToTypeScriptConverter()
    sigs = []
    for tool in tools:
        fn = tool.function
        # Description may be None/absent; only emit the comment when present.
        desc = ('// ' + fn.description.replace('\n', '\n// ') + '\n') if fn.description else ''
        sigs.append(desc + 'type ' + fn.name + ' = (_: ' + ts_converter.visit(fn.parameters) + ") => any;\n")
    return 'namespace functions {\n' + '\n'.join(sigs) + '} // namespace functions'

def _tools_schema_signatures(tools: list[Tool], indent=None) -> str:
    """Render each tool as its JSON-schema dump, joined with newlines.

    `indent` is forwarded to `json.dumps` (None keeps each schema compact).
    """
    dumped = [json.dumps(tool.model_dump(), indent=indent) for tool in tools]
    return '\n'.join(dumped)

@typechecked
def _outputs_tool_call_tags(style: ToolsPromptStyle) -> bool:
return style in (
Expand Down Expand Up @@ -199,6 +214,8 @@ def make_grammar(chat_format: ChatFormat, tools: list[Tool], response_schema: Op
assert planted_prompt.startswith(empty_prompt), f"Planted prompt does not start with empty prompt: {planted_prompt} vs {empty_prompt}"
[prefix, suffix] = planted_prompt[len(empty_prompt):].split(delimiter)

allow_parallel_calls = False

def strip_suffix(s: str) -> str:
if s.endswith(suffix):
return s[:-len(suffix)]
Expand Down Expand Up @@ -235,17 +252,19 @@ def format_literal(s: str) -> str:

tool_call_rule = converter._add_rule(
'tool_call',
format_literal("<tool_call>") + " (" +
format_literal("<tool_call>") + " space (" +
' | '.join(tool_rules) +
") " + format_literal("</tool_call>"))
") space " + format_literal("</tool_call>"))# + ' space')

# Ideally we'd want a negative lookahead of /<tool\\?_call>/, but it's just too hard to express in GBNF for now.
# So we just over-constrain the content rule to not contain literals dangerously getting close to <tool_call>
content_rule = converter._add_rule('content', '[^<] | "<" [^t<]? | "<t" [^o<]?')
content_rule = converter._add_rule('content', '[^<] | "<" [^t<] | "<t" [^o<]')
# content_rule = converter._add_rule('content', converter.not_literal('<tool_call>'))
converter._add_rule(
'root',
f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?')
# tool_call_rule)
f'{content_rule}* ({tool_call_rule}+ {content_rule}*)?' if allow_parallel_calls \
else f'{content_rule}* {tool_call_rule}?')

# # Constrain the output to be a non-tool-call message (constrained to a JSON schema or not)
# #Β OR a tool-call message respecting the schema of any of the tools
Expand Down Expand Up @@ -285,7 +304,7 @@ def parse(s: str) -> Optional[Message]:
id=gen_callid(),
function=FunctionCall(**fc)))

content = '(...)'.join(content).strip()
content = '\n'.join(content).strip()
return Message(role="assistant", content=content if content else None, tool_calls=tool_calls)

# if '<tool_call>'.startswith(ls) or ls.startswith('<tool_call>'):
Expand Down Expand Up @@ -338,7 +357,8 @@ def parse(s: str) -> Optional[Message]:
converter._add_rule(
'root',
f'{content_without_start_rule} {content_rule}* ({tool_call_rule}+ {content_rule}*)? | '
f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*')
f'{tool_call_without_start_rule} {tool_call_rule}* {content_rule}*' if allow_parallel_calls \
else f'{content_without_start_rule} {tool_call_rule}? | {tool_call_without_start_rule}')

# converter._add_rule(
# "root",
Expand Down
32 changes: 23 additions & 9 deletions examples/openai/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,9 @@ def main(
async def chat_completions(request: Request, chat_request: ChatCompletionRequest):
headers = {
"Content-Type": "application/json",
"Authorization": request.headers.get("Authorization"),
}
if (auth := request.headers.get("Authorization")):
headers["Authorization"] = auth

if chat_request.response_format is not None:
assert chat_request.response_format.type == "json_object", f"Unsupported response format: {chat_request.response_format.type}"
Expand All @@ -75,18 +76,31 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest
(grammar, parser) = make_grammar(chat_format, chat_request.tools, response_schema)

# TODO: Test whether the template supports formatting tool_calls
sys.stderr.write(f'\n{grammar}\n\n')

prompt = chat_format.render(messages, add_generation_prompt=True)

sys.stderr.write(f'\n# PROMPT:\n\n{prompt}\n\n')
sys.stderr.write(f'\n# GRAMMAR:\n\n{grammar}\n\n')

data = LlamaCppServerCompletionRequest(
**{
k: v
for k, v in chat_request.model_dump().items()
if k not in (
"prompt",
"tools",
"messages",
"response_format",
)
},
prompt=prompt,
grammar=grammar,
).model_dump()
sys.stderr.write(json.dumps(data, indent=2) + "\n")
async with httpx.AsyncClient() as client:
response = await client.post(
f"{cpp_server_endpoint}/completions",
json=LlamaCppServerCompletionRequest(
prompt=prompt,
stream=chat_request.stream,
n_predict=1000,
grammar=grammar,
).model_dump(),
json=data,
headers=headers,
timeout=None)

Expand All @@ -96,11 +110,11 @@ async def chat_completions(request: Request, chat_request: ChatCompletionRequest
return StreamingResponse(generate_chunks(response), media_type="text/event-stream")
else:
result = response.json()
sys.stderr.write("# RESULT:\n\n" + json.dumps(result, indent=2) + "\n\n")
if 'content' not in result:
# print(json.dumps(result, indent=2))
return JSONResponse(result)

sys.stderr.write(json.dumps(result, indent=2) + "\n")
# print(json.dumps(result.get('content'), indent=2))
message = parser(result["content"])
assert message is not None, f"Failed to parse response:\n{response.text}\n\n"
Expand Down
13 changes: 11 additions & 2 deletions examples/openai/ts_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,21 @@ class SchemaToTypeScriptConverter:
# // where to get weather.
# location: string,
# }) => any;
def _desc_comment(self, schema: dict):
desc = schema.get("description", "").replace("\n", "\n// ") if 'description' in schema else None
return f'// {desc}\n' if desc else ''

def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], additional_properties: Union[bool, Any]):
if additional_properties == True:
additional_properties = {}
elif additional_properties == False:
additional_properties = None

return "{" + ', '.join([
f'{prop_name}{"" if prop_name in required else "?"}: {self.visit(prop_schema)}'
f'{self._desc_comment(prop_schema)}{prop_name}{"" if prop_name in required else "?"}: {self.visit(prop_schema)}'
for prop_name, prop_schema in properties
] + (
[f"[key: string]: {self.visit(additional_properties)}"]
[f"{self._desc_comment(additional_properties) if additional_properties else ''}[key: string]: {self.visit(additional_properties)}"]
if additional_properties is not None else []
)) + "}"

Expand Down