diff --git a/sibble_scripts/priority_batch_benchmark.py b/sibble_scripts/priority_batch_benchmark.py
new file mode 100644
index 000000000000..add3cbf25b85
--- /dev/null
+++ b/sibble_scripts/priority_batch_benchmark.py
@@ -0,0 +1,156 @@
+from vllm import LLM, SamplingParams
+import time
+from typing import List, Dict
+import heapq
+from dataclasses import dataclass
+from queue import PriorityQueue
+import numpy as np
+
+
+@dataclass(order=True)
+class PrioritizedRequest:
+    priority: float
+    prompt: str = None
+
+
+def create_tree_search_prompts(depth: int = 3, branches: int = 3) -> Dict[str, float]:
+    """Creates a set of prompts simulating tree search with assigned priorities"""
+    prompts_with_priority = {}
+    base_prompt = "Analyze the following solution path:"
+
+    # Create one "golden" high-priority path
+    golden_priority = 0.95  # Very high priority solution
+    golden_path = "Sustainable vertical farming + Distribution optimization + Community engagement"
+    prompts_with_priority[f"{base_prompt} Solution: {golden_path}"] = golden_priority
+
+    # Create other paths with lower priorities
+    solutions = [
+        "Local food production networks",
+        "Technology-enabled distribution",
+        "Policy and infrastructure changes",
+        "Education and training programs",
+    ]
+
+    for level in range(depth):
+        for branch in range(branches):
+            # Deeper levels get potentially higher priorities (simulating getting closer to a solution)
+            base_priority = (level + 1) / depth  # Increases with depth
+            priority = (
+                np.random.beta(2, 5) * base_priority
+            )  # More low priorities than high
+            solution = np.random.choice(solutions)
+            prompt = f"{base_prompt} Level {level}, Branch {branch}: {solution}"
+            prompts_with_priority[prompt] = priority
+
+    return prompts_with_priority
+
+
+def batch_requests_priority(
+    prompts_dict: Dict[str, float], batch_size: int
+) -> List[List[str]]:
+    """Groups requests into batches, prioritizing higher scores"""
+    # Pre-sort all items at once instead of using a priority queue
+    sorted_prompts = sorted(prompts_dict.items(), key=lambda x: x[1], reverse=True)
+    prompts = [prompt for prompt, _ in sorted_prompts]
+
+    # Use the same list comprehension as FIFO for batching
+    return [prompts[i : i + batch_size] for i in range(0, len(prompts), batch_size)]
+
+
+def batch_requests_fifo(
+    prompts_dict: Dict[str, float], batch_size: int
+) -> List[List[str]]:
+    """Groups requests into batches in FIFO order"""
+    prompts = list(prompts_dict.keys())
+    return [prompts[i : i + batch_size] for i in range(0, len(prompts), batch_size)]
+
+
+def generate_outputs(llm, prompts: List[str], sampling_params):
+    formatted_prompts = [f"[INST] {prompt} [/INST]" for prompt in prompts]
+    outputs = llm.generate(formatted_prompts, sampling_params)
+    return [output.outputs[0].text.strip() for output in outputs]
+
+
+def main():
+    llm_config = {
+        "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        "tensor_parallel_size": 1,
+        "max_model_len": 1024,
+        "max_num_seqs": 4,
+        "max_num_batched_tokens": 4096,
+        "gpu_memory_utilization": 0.7,
+    }
+
+    sampling_params = SamplingParams(
+        temperature=0.7,
+        max_tokens=64,
+        stop=["</s>", "[/INST]"],
+    )
+
+    # Create tree search simulation prompts
+    prompts_with_priority = create_tree_search_prompts(depth=3, branches=3)
+    batch_size = 4
+
+    # Initialize model
+    llm = LLM(**llm_config)
+
+    # Test priority-based batching with early stopping
+    print("\n=== Priority-based Search Path ===")
+    priority_batches = batch_requests_priority(prompts_with_priority, batch_size)
+
+    priority_results = []
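+    # Early-stop threshold: any path scoring above 0.9 (the "golden" path is 0.95) ends the priority search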
+    start_time = time.time()
+    found_solution = False
+
+    for batch_idx, batch in enumerate(priority_batches):
+        results = generate_outputs(llm, batch, sampling_params)
+        priority_results.extend(results)
+
+        # Print batch information and results
+        print(f"\nBatch {batch_idx + 1}:")
+        for prompt, result in zip(batch, results):
+            priority = prompts_with_priority[prompt]
+            print(f"\nPriority: {priority:.3f}")
+            print(f"Prompt: {prompt}")
+            print(f"Response: {result[:100]}...")  # Truncate long responses
+
+            if priority > 0.9:  # Found high-priority solution
+                print("\n🎯 Found high-priority solution!")
+                found_solution = True
+                break
+        if found_solution:
+            break
+
+    priority_time = time.time() - start_time
+
+    # Test FIFO batching
+    print("\n=== FIFO Search Path ===")
+    fifo_batches = batch_requests_fifo(prompts_with_priority, batch_size)
+
+    fifo_results = []
+    start_time = time.time()
+    for batch_idx, batch in enumerate(fifo_batches):
+        results = generate_outputs(llm, batch, sampling_params)
+        fifo_results.extend(results)
+
+        # Print batch information and results
+        print(f"\nBatch {batch_idx + 1}:")
+        for prompt, result in zip(batch, results):
+            print(f"\nPriority: {prompts_with_priority[prompt]:.3f}")
+            print(f"Prompt: {prompt}")
+            print(f"Response: {result[:100]}...")
+
+    fifo_time = time.time() - start_time
+
+    # Print final comparison
+    print("\n=== Performance Comparison ===")
+    print(
+        f"Priority-based batching time (with early stopping): {priority_time:.2f} seconds"
+    )
+    print(f"FIFO batching time: {fifo_time:.2f} seconds")
+    print(f"Priority-based outputs generated: {len(priority_results)}")
+    print(f"FIFO outputs generated: {len(fifo_results)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sibble_scripts/shared_prefill.py b/sibble_scripts/shared_prefill.py
new file mode 100644
index 000000000000..5ec77e87cf2b
--- /dev/null
+++ b/sibble_scripts/shared_prefill.py
@@ -0,0 +1,101 @@
+from vllm import LLM, SamplingParams
+import time
+from typing import List
+from util import estimate_shared_prefix_tokens, shared_prefix
+
+
+def generate_outputs(llm, prompts: List[str], sampling_params):
+    formatted_prompts = [f"[INST] {prompt} [/INST]" for prompt in prompts]
+    outputs = llm.generate(formatted_prompts, sampling_params)
+    return [output.outputs[0].text.strip() for output in outputs]
+
+
+def main():
+    # Common LLM configuration
+    llm_config = {
+        "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        "tensor_parallel_size": 1,
+        "max_model_len": 1024,
+        "max_num_seqs": 2,
+        "max_num_batched_tokens": 2048,
+        "use_v2_block_manager": True,
+        "gpu_memory_utilization": 0.7,
+        "enforce_eager": True,
+        "enable_chunked_prefill": False,
+    }
+
+    sampling_params = SamplingParams(
+        temperature=0.7,
+        max_tokens=8,
+        stop=["</s>", "[/INST]"],
+    )
+
+    topics = [
+        "quantum computing",
+        "neural networks",
+        "blockchain",
+        "AI ethics",
+        "cloud systems",
+        "cybersecurity",
+        "data science",
+        "edge computing",
+        "deep learning",
+        "robotics",
+    ] * 10
+
+    prompts = [f"{shared_prefix} {topic}." for topic in topics]
+
+    # Estimate tokens in shared prefix
+    token_count = estimate_shared_prefix_tokens(llm_config["model"], shared_prefix)
+    print(f"\nEstimated tokens in shared prefix: {token_count}")
+
+    # Test shared prefix approach
+    print("\nInitializing model with prefix sharing...")
+    llm = LLM(**llm_config, enable_prefix_caching=True)
+
+    # Warmup run
+    print("Performing warmup run...")
+    _ = generate_outputs(llm, prompts[:2], sampling_params)
+
+    print("Running performance test...")
+    start_time = time.time()
+    shared_results = generate_outputs(llm, prompts, sampling_params)
+    shared_time = time.time() - start_time
+    del llm
+
+    # Test regular approach (prefix sharing disabled)
+    print("\nInitializing model without prefix sharing...")
+    llm_regular = LLM(**llm_config, enable_prefix_caching=False)
+
+    # Warmup run
+    print("Performing warmup run...")
+    _ = generate_outputs(llm_regular, prompts[:2], sampling_params)
+
+    print("Running performance test...")
+    start_time = time.time()
+    regular_results = generate_outputs(llm_regular, prompts, sampling_params)
+    regular_time = time.time() - start_time
+    del llm_regular
+
+    # Print timing comparison
+    print("\n=== Performance Comparison ===")
+    print(f"Shared prefix execution time: {shared_time:.2f} seconds")
+    print(f"Regular execution time: {regular_time:.2f} seconds")
+    print(f"Time saved with prefix sharing: {regular_time - shared_time:.2f} seconds")
+    print(f"Performance improvement: {(regular_time/shared_time - 1)*100:.1f}%")
+
+    # Print sample results (first few only)
+    print("\n=== Sample Generated Results ===")
+    print("\nShared Prefix Results (first 2):")
+    for i, (prompt, result) in enumerate(zip(prompts[:2], shared_results[:2]), 1):
+        print(f"\nPrompt {i}: {prompt}")
+        print(f"Response {i}: {result}")
+
+    print("\nRegular Results (first 2):")
+    for i, (prompt, result) in enumerate(zip(prompts[:2], regular_results[:2]), 1):
+        print(f"\nPrompt {i}: {prompt}")
+        print(f"Response {i}: {result}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sibble_scripts/spec_dec.py b/sibble_scripts/spec_dec.py
new file mode 100644
index 000000000000..37ebbe4e15bf
--- /dev/null
+++ b/sibble_scripts/spec_dec.py
@@ -0,0 +1,53 @@
+import re
+import torch
+from vllm import LLM, SamplingParams
+
+
+prompts = [
+    "Clouds are formed by",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Common LLM configuration
+llm_config = {
+    "model": "facebook/opt-6.7b",
+    "tensor_parallel_size": 1,
+    "max_model_len": 128,
+    "max_num_seqs": 1,
+    "max_num_batched_tokens": 128,
+    "use_v2_block_manager": True,
+}
+
+# Run with speculative decoding
+llm_spec = LLM(
+    **llm_config,
+    speculative_model="facebook/opt-125m",
+    num_speculative_tokens=5,
+)
+
+print("\n=== Speculative Decoding ===")
+outputs_spec = llm_spec.generate(prompts, sampling_params)
+for output in outputs_spec:
+    tokens_per_second = len(output.outputs[0].token_ids) / (
+        output.metrics.finished_time - output.metrics.first_scheduled_time
+    )
+    print(f"Prompt: {output.prompt!r}")
+    print(f"Generated: {output.outputs[0].text!r}")
+    print(f"Tokens per second: {tokens_per_second:.2f}")
+
+# Clean up memory
+del llm_spec
+torch.cuda.empty_cache()
+
+# Run without speculative decoding
+llm_normal = LLM(**llm_config)
+
+print("\n=== Normal Inference ===")
+outputs_normal = llm_normal.generate(prompts, sampling_params)
+for output in outputs_normal:
+    tokens_per_second = len(output.outputs[0].token_ids) / (
+        output.metrics.finished_time - output.metrics.first_scheduled_time
+    )
+    print(f"Prompt: {output.prompt!r}")
+    print(f"Generated: {output.outputs[0].text!r}")
+    print(f"Tokens per second: {tokens_per_second:.2f}")
diff --git a/sibble_scripts/util.py b/sibble_scripts/util.py
new file mode 100644
index 000000000000..943c7daf3b44
--- /dev/null
+++ b/sibble_scripts/util.py
@@ -0,0 +1,54 @@
+from transformers import AutoTokenizer
+import re
+
+
+def estimate_shared_prefix_tokens(model_name: str, shared_prefix: str) -> int:
+    """
+    Estimates the number of tokens in the shared prefix based on the model being used.
+
+    Args:
+        model_name: Name of the model (e.g., "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+        shared_prefix: The shared prefix text to estimate tokens for
+
+    Returns:
+        int: Estimated number of tokens
+    """
+    try:
+        # Load the appropriate tokenizer based on model name
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        # For Llama-style models, we need to account for the chat template
+        if "llama" in model_name.lower():
+            # Add the chat template markers that are added in generate_outputs
+            shared_prefix = f"[INST] {shared_prefix}"
+
+        # Get token count from the input_ids of the single encoded sequence
+        tokens = tokenizer(shared_prefix, return_tensors="pt")
+        token_count = len(tokens["input_ids"][0])
+
+        return token_count
+
+    except Exception as e:
+        print(f"Warning: Could not load tokenizer for {model_name}. Error: {e}")
+        # Fallback to a very rough estimation (1 token ≈ 4 characters)
+        return len(shared_prefix) // 4
+
+
+# Create prompts with very long shared prefix and short variations
+shared_prefix = """You are a distinguished technical expert with decades of experience across multiple domains of technology and innovation. Your analytical approach combines deep theoretical knowledge with practical industry experience, allowing you to evaluate technological developments from both academic and commercial perspectives. You possess a comprehensive understanding of how technologies evolve, scale, and integrate into existing infrastructures, while maintaining a keen awareness of their broader societal implications.
+
+    In your analysis, you consistently examine the intricate relationships between technical capabilities and real-world constraints, considering everything from fundamental scientific principles to practical implementation challenges. You evaluate technologies through multiple lenses: their technical merit and innovation potential, their economic viability and market dynamics, their environmental and social impact, and their alignment with regulatory frameworks and ethical standards.
+
+    Your expertise extends beyond pure technical assessment to encompass strategic considerations such as scalability pathways, security implications, and long-term sustainability. You understand how technologies affect various stakeholders, from individual users to large organizations, and how they influence industry standards and best practices. You're particularly adept at identifying potential risks and opportunities, whether they're technical, operational, or strategic in nature.
+
+    When analyzing technologies, you draw upon your extensive experience with system architectures, performance optimization, quality assurance, and risk management. You consider both immediate practical applications and long-term evolutionary potential, always maintaining a balanced perspective that acknowledges both opportunities and challenges. Your insights are grounded in a deep understanding of market dynamics, regulatory environments, and the complex interplay between innovation and established systems.
+
+    Your analytical methodology is rooted in a systematic approach that begins with thorough research and data collection, followed by rigorous analysis using both quantitative and qualitative methods. You excel at synthesizing complex technical information from multiple sources, identifying key patterns and trends, and drawing meaningful conclusions that can inform strategic decision-making. Your recommendations are always backed by solid evidence and careful consideration of alternative perspectives.
+
+    Throughout your career, you've developed a unique ability to bridge the gap between technical complexity and practical implementation. You understand the challenges of technology adoption across different organizational contexts and scales. Your experience spans both established enterprises and innovative startups, giving you insight into how different organizational cultures and resources affect technology implementation and success.
+
+    You maintain active engagement with the latest developments in multiple technical fields through continuous learning and participation in professional networks. Your knowledge base is constantly evolving, incorporating new research findings, emerging technologies, and evolving best practices. This commitment to ongoing education ensures that your analyses remain current and relevant in a rapidly changing technological landscape.
+
+    Your communication style combines technical precision with clarity and accessibility. You can effectively convey complex technical concepts to diverse audiences, from technical specialists to business stakeholders. You understand the importance of tailoring your analysis to the specific needs and context of different stakeholders while maintaining the technical rigor necessary for informed decision-making.
+
+    Based on your extensive expertise and multifaceted analytical approach, please provide a concise technical summary about"""
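+
+
+if __name__ == "__main__":
+    # Illustrative standalone check, not required by the other scripts: assumes the
+    # same TinyLlama model used in shared_prefill.py and prints the estimated token
+    # count of the shared prefix, so the expected prefix-caching benefit can be gauged.
+    count = estimate_shared_prefix_tokens(
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0", shared_prefix
+    )
+    print(f"Estimated shared prefix tokens: {count}")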