1 change: 1 addition & 0 deletions .github/workflows/compare_providers_streaming.yml
@@ -30,6 +30,7 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.AWS_REGION }}
MISTRAL_LARGE_API: ${{ secrets.MISTRAL_LARGE_API }}
AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}

steps:
- name: Check out repository
5 changes: 4 additions & 1 deletion .gitignore
@@ -8,7 +8,7 @@ dynamodb/docker/docker/dynamodb/shared-local-instance.db
.coverage
benchmark_graph/*
.pytest_cache/*

*.ipynb

# Testing and coverage
coverage.xml
@@ -17,6 +17,9 @@ htmlcov

# Notebooks
notebooks/.ipynb_checkpoints/
.ipynb_checkpoints/
exp_dir/
experiments/


azure-test.py
186 changes: 129 additions & 57 deletions benchmarking/benchmark_main.py
@@ -4,6 +4,9 @@
import time
from datetime import datetime
from matplotlib.ticker import LogLocator, FormatStrFormatter
import random
from utils.db_utils import save_flattened_metrics_to_csv


class Benchmark:
"""
@@ -25,11 +28,12 @@ def __init__(
providers,
num_requests,
models,
max_output,
prompt,
outputs,
prompts,
streaming=False,
verbosity=False,
vllm_ip=None
vllm_ip=None,
exp_dir=None
):
"""
Initializes the Benchmark instance with provided parameters.
@@ -46,11 +50,15 @@ def __init__(
self.providers = providers
self.num_requests = num_requests
self.models = models
self.prompt = prompt
self.prompts = prompts
self.streaming = streaming
self.max_output = max_output
self.outputs = outputs
self.verbosity = verbosity
self.vllm_ip = vllm_ip
self.exp_dir = exp_dir
# New parameters for retry mechanism
self.max_retries = 3
self.base_timeout = 10

base_dir = "streaming" if streaming else "end_to_end"

@@ -77,33 +85,40 @@ def plot_metrics(self, metric, filename_suffix):

for provider in self.providers:
provider_name = provider.__class__.__name__
for model, latencies in provider.metrics[metric].items():
# Convert to milliseconds and sort for CDF
latencies_sorted = np.sort(latencies) * 1000
cdf = np.arange(1, len(latencies_sorted) + 1) / len(latencies_sorted)
model_name = provider.get_model_name(model)
save_flattened_metrics_to_csv(provider, metric, f"{self.exp_dir}/{metric}_logs.csv")
print(provider.metrics[metric].items())
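# Metrics are now nested per model, input size, and output cap:
# provider.metrics[metric][model][input_size][max_output] -> list of raw per-request values
# (latencies in seconds for the timing metrics; converted to ms below)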
for model, input_dict in provider.metrics[metric].items():
for input_size, output_dict in input_dict.items():
for max_output, latencies in output_dict.items():
# Convert to milliseconds and sort for CDF
print(model, input_size, len(latencies))
latencies_sorted = np.sort(latencies) * 1000
cdf = np.arange(1, len(latencies_sorted) + 1) / len(latencies_sorted)
model_name = provider.get_model_name(model)

if provider_name.lower() == "vllm":
plt.plot(
latencies_sorted,
cdf,
marker="o",
linestyle="-",
markersize=6, # Slightly larger marker size
color="black", # Black color for the marker
label=f"{provider_name} - {model_name}",
linewidth=2, # Bold line
)
else:
plt.plot(
latencies_sorted,
cdf,
marker="o",
linestyle="-",
markersize=5,
label=f"{provider_name} - {model_name}",
)

if provider_name.lower() == "vllm":
plt.plot(
latencies_sorted,
cdf,
marker="o",
linestyle="-",
markersize=6, # Slightly larger marker size
color="black", # Black color for the marker
# label=f"{provider_name} - {model_name}",
label=f"{provider_name}",
linewidth=2, # Bold line
)
else:
plt.plot(
latencies_sorted,
cdf,
marker="o",
linestyle="-",
markersize=5,
# label=f"{provider_name} - {model_name}",
label=f"{provider_name} - {max_output}",
)

plt.xlabel("Latency (ms)", fontsize=12)
plt.ylabel("Portion of requests", fontsize=12)
plt.grid(True)
@@ -147,38 +162,94 @@ def run(self):
for provider in self.providers:
provider_name = provider.__class__.__name__
# logging.debug(f"{provider_name}")
print(f"{provider_name}")
# print(f"{provider_name}")
for model in self.models:
model_name = provider.get_model_name(model)
print(f"Model: {model_name}\nPrompt: {self.prompt}")
# print(f"Model: {model_name}")

for prompt in self.prompts:
# print(f"Prompt: {prompt}")
for max_output in self.outputs:
for i in range(self.num_requests):
if self.verbosity:
print(f"Request {i + 1}/{self.num_requests}")
print(f"{provider_name}" + f" Model: {model_name}")

for i in range(self.num_requests):
if self.verbosity:
print(f"Request {i + 1}/{self.num_requests}")
# if ((i+1) % 10) == 0:
# # print("[DEBUG] Sleeping for 2 mins to bypass rate limit...")
# time.sleep(120)

if i % 20 == 0:
# print("[DEBUG] Sleeping for 2 mins to bypass rate limit...")
time.sleep(120)
if self.streaming:
if provider_name == "vLLM":
provider.perform_inference_streaming(
model, prompt, self.vllm_ip, max_output, self.verbosity
)
else:
provider.perform_inference_streaming(
model, prompt, max_output, self.verbosity
)
else:
if provider_name == "vLLM":
provider.perform_inference(
model, prompt, self.vllm_ip, max_output, self.verbosity
)
else:
provider.perform_inference(
model, prompt, max_output, self.verbosity
)

if self.streaming:
if provider_name == "vLLM":
provider.perform_inference_streaming(
model, self.prompt, self.vllm_ip, self.max_output, self.verbosity
)
else:
provider.perform_inference_streaming(
model, self.prompt, self.max_output, self.verbosity
)
else:
if provider_name == "vLLM":
provider.perform_inference(
model, self.prompt, self.vllm_ip, self.max_output, self.verbosity
)
else:
provider.perform_inference(
model, self.prompt, self.max_output, self.verbosity
)

# # Simple retry mechanism

# success = False
# attempts = 0

# while not success and attempts < self.max_retries:
# try:
# # Calculate timeout with exponential backoff
# timeout = self.base_timeout * (2 ** attempts)
# print(timeout)

# if self.streaming:
# if provider_name == "vLLM":
# provider.perform_inference_streaming(
# model, self.prompt, self.vllm_ip,
# self.max_output, self.verbosity, timeout=timeout
# )
# else:
# provider.perform_inference_streaming(
# model, self.prompt, timeout, self.max_output,
# self.verbosity
# )
# else:
# if provider_name == "vLLM":
# provider.perform_inference(
# model, self.prompt, self.vllm_ip,
# self.max_output, self.verbosity, timeout=timeout
# )
# else:
# provider.perform_inference(
# model, self.prompt, self.max_output,
# self.verbosity, timeout=timeout
# )

# # If we get here, the request was successful
# success = True

# except Exception as e:
# attempts += 1
# if self.verbosity:
# print(f"Request failed (attempt {attempts}/{self.max_retries}): {str(e)}")

# # Add some jitter to avoid request storms on retry
# if attempts < self.max_retries:
# jitter = random.uniform(0.5, 1.5)
# wait_time = min(30, (2 ** attempts) * jitter)
# time.sleep(wait_time)

# # Record failed request if all attempts failed
# if not success:
# print(f"Request {i+1} failed after {self.max_retries} attempts")
if not self.streaming:
self.plot_metrics("response_times", "response_times")
else:
@@ -188,3 +259,4 @@ def run(self):
self.plot_metrics("timebetweentokens", "timebetweentokens")
self.plot_metrics("timebetweentokens_median", "timebetweentokens_median")
self.plot_metrics("timebetweentokens_p95", "timebetweentokens_p95")
self.plot_metrics("totaltokens", "totaltokens")
3 changes: 2 additions & 1 deletion benchmarking/dynamo_bench.py
@@ -234,9 +234,10 @@ def run(self):
if self.verbosity:
print(f"Request {i + 1}/{self.num_requests}")

if i % 20 == 0:
if i % 10 == 0:
# print("[DEBUG] Sleeping for 2 mins to bypass rate limit...")
time.sleep(120)
print("finished sleeping")

if self.streaming:
if provider_name == "vLLM":
17 changes: 8 additions & 9 deletions benchmarking/experiments/compare_providers_streaming.json
@@ -1,23 +1,22 @@
{
"providers": [
"Anthropic",
"Groq",
"Hyperbolic",
"AWSBedrock",
"Azure",
"Cloudflare",
"Google",
"Groq",
"Hyperbolic",
"OpenAI",
"PerplexityAI",
"TogetherAI"
"TogetherAI",
"Anthropic",
"Google",
"PerplexityAI"
],
"models": [
"common-model"
],
"num_requests": 100,
"num_requests": 1,
"input_tokens": 10,
"streaming": true,
"max_output": 100,
"verbose": true,
"backend": true
"verbose": true
}
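These experiment configs still carry scalar input_tokens and max_output fields, while the updated Benchmark constructor now takes prompts and outputs lists plus an exp_dir. A possible adapter is sketched below; benchmark_kwargs_from_config, the import path, and the way a placeholder prompt is built from input_tokens are assumptions for illustration, not code from this PR.

import json

from benchmarking.benchmark_main import Benchmark  # import path assumed

def benchmark_kwargs_from_config(path, providers, exp_dir):
    """Hypothetical adapter: map a scalar-style experiment config onto the
    list-based arguments the updated Benchmark constructor expects."""
    with open(path) as f:
        cfg = json.load(f)
    return dict(
        providers=providers,                       # provider instances built by the caller
        num_requests=cfg["num_requests"],
        models=cfg["models"],
        outputs=[cfg["max_output"]],               # scalar max_output -> one-element outputs list
        prompts=["hello " * cfg["input_tokens"]],  # placeholder prompt of roughly input_tokens tokens
        streaming=cfg.get("streaming", False),
        verbosity=cfg.get("verbose", False),
        exp_dir=exp_dir,
    )

# Usage sketch:
# Benchmark(**benchmark_kwargs_from_config(
#     "benchmarking/experiments/compare_providers_streaming.json", providers, "exp_dir/run1"
# )).run()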
(diff for another experiment config file; file name not shown in this view)
@@ -3,12 +3,11 @@
"AWSBedrock"
],
"models": [
"common-model"
"meta-llama-3-8b-instruct", "mistral-23b-instruct-v0.1", "mistral-48b-instruct-v0.1", "meta-llama-3-70b-instruct", "mistral-124b-instruct-v0.1"
],
"num_requests": 100,
"num_requests": 1,
"input_tokens": 10,
"streaming": true,
"max_output": 100,
"verbose": true,
"backend": true
"verbose": true
}
(diff for another experiment config file; file name not shown in this view)
@@ -4,12 +4,13 @@
],
"models": [
"meta-llama-3.1-8b-instruct",
"meta-llama-3.1-70b-instruct"
"mistral-23b-instruct-v0.1",
"meta-llama-3.3-70b-instruct",
"meta-llama-3.1-405b-instruct"
],
"num_requests": 100,
"num_requests": 1,
"input_tokens": 10,
"streaming": true,
"max_output": 100,
"verbose": true,
"backend": true
"verbose": true
}
(diff for another experiment config file; file name not shown in this view)
@@ -3,16 +3,13 @@
"Cloudflare"
],
"models": [
"google-gemma-2b-it",
"phi-2",
"meta-llama-3.2-3b-instruct",
"meta-llama-3.1-70b-instruct",
"mistral-7b-instruct-v0.1"
"meta-llama-2-7b",
"deepseek-ai-R1-32b",
"meta-llama-3.1-70b-instruct"
],
"num_requests": 100,
"input_tokens": 10,
"streaming": true,
"max_output": 100,
"verbose": true,
"backend": true
"verbose": true
}
(diff for another experiment config file; file name not shown in this view)
@@ -11,6 +11,5 @@
"input_tokens": 10,
"streaming": true,
"max_output": 100,
"verbose": true,
"backend": true
"verbose": true
}
(diff for another experiment config file; file name not shown in this view)
@@ -3,8 +3,6 @@
"TogetherAI"
],
"models": [
"google-gemma-2b-it",
"meta-llama-3.2-3b-instruct",
"mistral-7b-instruct-v0.1",
"meta-llama-3.1-70b-instruct",
"meta-llama-3.1-405b-instruct"
@@ -13,6 +11,5 @@
"input_tokens": 10,
"streaming": true,
"max_output": 100,
"verbose": true,
"backend": true
"verbose": true
}