From ab84ceb012ffc14ed4e02079abe5bd307134de48 Mon Sep 17 00:00:00 2001
From: Pinak Panigrahi
Date: Mon, 6 Jan 2025 10:09:05 -0800
Subject: [PATCH 1/3] Add a tokenizer flag to use the relevant tokenizer

---
 token_benchmark_ray.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/token_benchmark_ray.py b/token_benchmark_ray.py
index 63216b1..5640cab 100644
--- a/token_benchmark_ray.py
+++ b/token_benchmark_ray.py
@@ -32,6 +32,7 @@ def get_token_throughput_latencies(
     stddev_input_tokens: int,
     mean_output_tokens: int,
     stddev_output_tokens: int,
+    tokenizer: str,
     additional_sampling_params: Optional[Dict[str, Any]] = None,
     num_concurrent_requests: int = 1,
     max_num_completed_requests: int = 500,
@@ -60,9 +61,7 @@ def get_token_throughput_latencies(
     """
     random.seed(11111)

-    tokenizer = LlamaTokenizerFast.from_pretrained(
-        "hf-internal-testing/llama-tokenizer"
-    )
+    tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer)
     get_token_length = lambda text: len(tokenizer.encode(text))

     if not additional_sampling_params:
@@ -292,6 +291,7 @@ def run_token_benchmark(
     additional_sampling_params: str,
     results_dir: str,
     user_metadata: Dict[str, Any],
+    tokenizer: str
 ):
     """
     Args:
@@ -327,6 +327,7 @@ def run_token_benchmark(
         stddev_output_tokens=stddev_output_tokens,
         num_concurrent_requests=num_concurrent_requests,
         additional_sampling_params=json.loads(additional_sampling_params),
+        tokenizer=tokenizer
     )

     if results_dir:
@@ -462,6 +463,11 @@
         "name=foo,bar=1. These will be added to the metadata field of the results. "
     ),
 )
+args.add_argument(
+    "--tokenizer",
+    type=str,
+    default="hf-internal-testing/llama-tokenizer",
+)

 if __name__ == "__main__":
     env_vars = dict(os.environ)
@@ -488,4 +494,5 @@ run_token_benchmark(
         additional_sampling_params=args.additional_sampling_params,
         results_dir=args.results_dir,
         user_metadata=user_metadata,
+        tokenizer=args.tokenizer,
     )

From ec97f3c6ce266b81c2b5ef242825a63d30af29aa Mon Sep 17 00:00:00 2001
From: Pinak Panigrahi
Date: Mon, 13 Jan 2025 21:51:52 -0800
Subject: [PATCH 2/3] Use the AutoTokenizer class

---
 token_benchmark_ray.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/token_benchmark_ray.py b/token_benchmark_ray.py
index 5640cab..d6bf0cd 100644
--- a/token_benchmark_ray.py
+++ b/token_benchmark_ray.py
@@ -24,7 +24,7 @@
 )
 from tqdm import tqdm

-from transformers import LlamaTokenizerFast
+from transformers import AutoTokenizer

 def get_token_throughput_latencies(
     model: str,
@@ -32,7 +32,6 @@ def get_token_throughput_latencies(
     stddev_input_tokens: int,
     mean_output_tokens: int,
     stddev_output_tokens: int,
-    tokenizer: str,
     additional_sampling_params: Optional[Dict[str, Any]] = None,
     num_concurrent_requests: int = 1,
     max_num_completed_requests: int = 500,
@@ -61,7 +60,7 @@ def get_token_throughput_latencies(
     """
     random.seed(11111)

-    tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer)
+    tokenizer = AutoTokenizer.from_pretrained(model)
     get_token_length = lambda text: len(tokenizer.encode(text))

     if not additional_sampling_params:
@@ -290,8 +289,7 @@ def run_token_benchmark(
     stddev_output_tokens: int,
     additional_sampling_params: str,
     results_dir: str,
-    user_metadata: Dict[str, Any],
-    tokenizer: str
+    user_metadata: Dict[str, Any]
 ):
     """
     Args:
@@ -463,11 +461,6 @@
         "name=foo,bar=1. These will be added to the metadata field of the results. "
" ), ) -args.add_argument( - "--tokenizer", - type=str, - default="hf-internal-testing/llama-tokenizer", -) if __name__ == "__main__": env_vars = dict(os.environ) @@ -493,6 +486,5 @@ def run_token_benchmark( num_concurrent_requests=args.num_concurrent_requests, additional_sampling_params=args.additional_sampling_params, results_dir=args.results_dir, - user_metadata=user_metadata, - tokenizer=args.tokenizer, + user_metadata=user_metadata ) From a24f20cf5751c41e9054d817fabe486614f99cc0 Mon Sep 17 00:00:00 2001 From: Pinak Panigrahi Date: Mon, 20 Jan 2025 16:19:07 -0800 Subject: [PATCH 3/3] revert to using a tokenizer passed through the tokenizer argument --- token_benchmark_ray.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/token_benchmark_ray.py b/token_benchmark_ray.py index d6bf0cd..7f9f4bd 100644 --- a/token_benchmark_ray.py +++ b/token_benchmark_ray.py @@ -34,6 +34,7 @@ def get_token_throughput_latencies( stddev_output_tokens: int, additional_sampling_params: Optional[Dict[str, Any]] = None, num_concurrent_requests: int = 1, + tokenizer: str = "hf-internal-testing/llama-tokenizer", max_num_completed_requests: int = 500, test_timeout_s=90, llm_api="openai", @@ -60,7 +61,7 @@ def get_token_throughput_latencies( """ random.seed(11111) - tokenizer = AutoTokenizer.from_pretrained(model) + tokenizer = AutoTokenizer.from_pretrained(tokenizer) get_token_length = lambda text: len(tokenizer.encode(text)) if not additional_sampling_params: @@ -289,7 +290,8 @@ def run_token_benchmark( stddev_output_tokens: int, additional_sampling_params: str, results_dir: str, - user_metadata: Dict[str, Any] + user_metadata: Dict[str, Any], + tokenizer: str ): """ Args: @@ -461,6 +463,14 @@ def run_token_benchmark( "name=foo,bar=1. These will be added to the metadata field of the results. " ), ) +args.add_argument( + "--tokenizer", + type=str, + default="hf-internal-testing/llama-tokenizer", + help=( + "Tokenizer to use for counting tokens" + ), +) if __name__ == "__main__": env_vars = dict(os.environ) @@ -486,5 +496,6 @@ def run_token_benchmark( num_concurrent_requests=args.num_concurrent_requests, additional_sampling_params=args.additional_sampling_params, results_dir=args.results_dir, - user_metadata=user_metadata + user_metadata=user_metadata, + tokenizer=args.tokenizer )