Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 5 additions & 55 deletions examples/How_to_count_tokens_with_tiktoken.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -197,16 +197,10 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def num_tokens_from_string(string: str, encoding_name: str) -> int:\n",
" \"\"\"Returns the number of tokens in a text string.\"\"\"\n",
" encoding = tiktoken.get_encoding(encoding_name)\n",
" num_tokens = len(encoding.encode(string))\n",
" return num_tokens"
]
"source": "# Import num_tokens_from_string function from our utility module\nfrom utils.token_counting_utils import num_tokens_from_string"
},
{
"cell_type": "code",
Expand Down Expand Up @@ -460,54 +454,10 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def num_tokens_from_messages(messages, model=\"gpt-4o-mini-2024-07-18\"):\n",
" \"\"\"Return the number of tokens used by a list of messages.\"\"\"\n",
" try:\n",
" encoding = tiktoken.encoding_for_model(model)\n",
" except KeyError:\n",
" print(\"Warning: model not found. Using o200k_base encoding.\")\n",
" encoding = tiktoken.get_encoding(\"o200k_base\")\n",
" if model in {\n",
" \"gpt-3.5-turbo-0125\",\n",
" \"gpt-4-0314\",\n",
" \"gpt-4-32k-0314\",\n",
" \"gpt-4-0613\",\n",
" \"gpt-4-32k-0613\",\n",
" \"gpt-4o-mini-2024-07-18\",\n",
" \"gpt-4o-2024-08-06\"\n",
" }:\n",
" tokens_per_message = 3\n",
" tokens_per_name = 1\n",
" elif \"gpt-3.5-turbo\" in model:\n",
" print(\"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.\")\n",
" return num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0125\")\n",
" elif \"gpt-4o-mini\" in model:\n",
" print(\"Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.\")\n",
" return num_tokens_from_messages(messages, model=\"gpt-4o-mini-2024-07-18\")\n",
" elif \"gpt-4o\" in model:\n",
" print(\"Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\")\n",
" return num_tokens_from_messages(messages, model=\"gpt-4o-2024-08-06\")\n",
" elif \"gpt-4\" in model:\n",
" print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
" return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
" else:\n",
" raise NotImplementedError(\n",
" f\"\"\"num_tokens_from_messages() is not implemented for model {model}.\"\"\"\n",
" )\n",
" num_tokens = 0\n",
" for message in messages:\n",
" num_tokens += tokens_per_message\n",
" for key, value in message.items():\n",
" num_tokens += len(encoding.encode(value))\n",
" if key == \"name\":\n",
" num_tokens += tokens_per_name\n",
" num_tokens += 3 # every reply is primed with <|start|>assistant<|message|>\n",
" return num_tokens\n"
]
"source": "# Import the unified token counting function\nimport sys\nimport os\n# Add the utils directory to the path so we can import our utility\nsys.path.append(os.path.join(os.path.dirname(os.path.abspath('.')), 'utils'))\n\nfrom utils.token_counting_utils import num_tokens_from_messages\n\n# The num_tokens_from_messages function is now imported from the shared utility module\n# It supports all current OpenAI models including:\n# - gpt-3.5-turbo variants\n# - gpt-4 variants \n# - gpt-4o and gpt-4o-mini variants"
},
{
"cell_type": "code",
Expand Down Expand Up @@ -811,4 +761,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
49 changes: 3 additions & 46 deletions examples/How_to_format_inputs_to_ChatGPT_models.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -508,53 +508,10 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tiktoken\n",
"\n",
"\n",
"def num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0613\"):\n",
" \"\"\"Return the number of tokens used by a list of messages.\"\"\"\n",
" try:\n",
" encoding = tiktoken.encoding_for_model(model)\n",
" except KeyError:\n",
" print(\"Warning: model not found. Using cl100k_base encoding.\")\n",
" encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
" if model in {\n",
" \"gpt-3.5-turbo-0613\",\n",
" \"gpt-3.5-turbo-16k-0613\",\n",
" \"gpt-4-0314\",\n",
" \"gpt-4-32k-0314\",\n",
" \"gpt-4-0613\",\n",
" \"gpt-4-32k-0613\",\n",
" }:\n",
" tokens_per_message = 3\n",
" tokens_per_name = 1\n",
" elif model == \"gpt-3.5-turbo-0301\":\n",
" tokens_per_message = 4 # every message follows <|start|>{role/name}\\n{content}<|end|>\\n\n",
" tokens_per_name = -1 # if there's a name, the role is omitted\n",
" elif \"gpt-3.5-turbo\" in model:\n",
" print(\"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.\")\n",
" return num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0613\")\n",
" elif \"gpt-4\" in model:\n",
" print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
" return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
" else:\n",
" raise NotImplementedError(\n",
" f\"\"\"num_tokens_from_messages() is not implemented for model {model}.\"\"\"\n",
" )\n",
" num_tokens = 0\n",
" for message in messages:\n",
" num_tokens += tokens_per_message\n",
" for key, value in message.items():\n",
" num_tokens += len(encoding.encode(value))\n",
" if key == \"name\":\n",
" num_tokens += tokens_per_name\n",
" num_tokens += 3 # every reply is primed with <|start|>assistant<|message|>\n",
" return num_tokens\n"
]
"source": "# Import the unified token counting function\nimport sys\nimport os\n# Add the utils directory to the path so we can import our utility\nsys.path.append(os.path.join(os.path.dirname(os.path.abspath('.')), 'utils'))\n\nfrom utils.token_counting_utils import num_tokens_from_messages\n\n# The num_tokens_from_messages function is now imported from the shared utility module\n# It supports all current OpenAI models including:\n# - gpt-3.5-turbo variants\n# - gpt-4 variants \n# - gpt-4o and gpt-4o-mini variants"
},
{
"cell_type": "code",
Expand Down Expand Up @@ -678,4 +635,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
125 changes: 125 additions & 0 deletions examples/utils/token_counting_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
Utility functions for counting tokens used by OpenAI models.

This module provides functions to estimate the number of tokens that will be
used by various OpenAI models when processing messages.
"""

import tiktoken


def num_tokens_from_messages(messages, model="gpt-4o-mini"):
    """
    Return the number of tokens used by a list of messages.

    Args:
        messages: List of message dictionaries with 'role' and 'content' keys
        model: Model name string (e.g., "gpt-4", "gpt-3.5-turbo", "gpt-4o-mini")

    Returns:
        int: Estimated number of tokens used by the messages

    Raises:
        NotImplementedError: If the model is not recognized at all.

    Note:
        Token counts are estimates and may vary slightly from actual API usage.
        The exact token counting method may change between model versions.
    """
    # Pinned model versions with known encodings and message overheads.
    o200k_models = {
        "gpt-4o",
        "gpt-4o-2024-05-13",
        "gpt-4o-2024-08-06",
        "gpt-4o-mini",
        "gpt-4o-mini-2024-07-18",
    }
    cl100k_models = {
        "gpt-3.5-turbo-0125",
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
    }

    # Pick the encoding once. For explicitly supported models we select the
    # known encoding directly (this also works with older tiktoken releases
    # that predate the model name); only unknown models go through
    # encoding_for_model with a warning fallback.
    # NOTE: tiktoken.get_encoding raises ValueError (not KeyError) for an
    # unknown encoding name; these names are valid so no guard is needed.
    if model in o200k_models:
        encoding = tiktoken.get_encoding("o200k_base")
    elif model in cl100k_models:
        encoding = tiktoken.get_encoding("cl100k_base")
    else:
        try:
            encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            print(f"Warning: model {model} not found. Using o200k_base encoding.")
            encoding = tiktoken.get_encoding("o200k_base")

    # Per-message / per-name token overheads depend on the chat format
    # the model uses.
    if model in o200k_models or model in cl100k_models:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        # Special handling for gpt-3.5-turbo-0301
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    # Handle base model names that may update over time by delegating to a
    # pinned snapshot.
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0125")
    elif "gpt-4o-mini" in model:
        print("Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.")
        return num_tokens_from_messages(messages, model="gpt-4o-mini-2024-07-18")
    elif "gpt-4o" in model:
        print("Warning: gpt-4o may update over time. Returning num tokens assuming gpt-4o-2024-08-06.")
        return num_tokens_from_messages(messages, model="gpt-4o-2024-08-06")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"num_tokens_from_messages() is not implemented for model {model}. "
            f"See https://github.com/openai/openai-python/blob/main/chatml.md "
            f"for information on how messages are converted to tokens."
        )

    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """
    Returns the number of tokens in a text string using the specified encoding.

    Args:
        string: The text string to tokenize
        encoding_name: The name of the encoding to use (e.g., "cl100k_base", "o200k_base")

    Returns:
        int: Number of tokens in the string
    """
    # Look up the encoder by name, tokenize, and count the resulting tokens.
    return len(tiktoken.get_encoding(encoding_name).encode(string))