"""
Example of using vLLM with mem0 for high-performance memory operations.

SETUP INSTRUCTIONS:
1. Install vLLM:
   pip install vllm

2. Start vLLM server (in a separate terminal):
   vllm serve microsoft/DialoGPT-small --port 8000

   Wait for the message: "Uvicorn running on http://0.0.0.0:8000"
   (Small model: ~500MB download, much faster!)

3. Verify the server is running:
   curl http://localhost:8000/health

4. Run this example:
   python examples/misc/vllm_example.py

Optional environment variables:
   export VLLM_BASE_URL="http://localhost:8000/v1"
   export VLLM_API_KEY="vllm-api-key"
"""

import os

from mem0 import Memory

# Configuration for vLLM integration.
# NOTE: "model" must match the model passed to `vllm serve` above, and the
# base URL / API key fall back to the optional environment variables
# documented in the module docstring.
config = {
    "llm": {
        "provider": "vllm",
        "config": {
            "model": "microsoft/DialoGPT-small",
            "vllm_base_url": os.environ.get("VLLM_BASE_URL", "http://localhost:8000/v1"),
            "api_key": os.environ.get("VLLM_API_KEY", "vllm-api-key"),
            "temperature": 0.7,
            "max_tokens": 100,
        }
    },
    "embedder": {
        "provider": "openai",
        "config": {
            "model": "text-embedding-3-small"
        }
    },
    "vector_store": {
        "provider": "qdrant",
        "config": {
            "collection_name": "vllm_memories",
            "host": "localhost",
            "port": 6333
        }
    }
}
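
# Optional pre-flight check -- a minimal sketch, not part of the mem0 API.
# It confirms the vLLM server from the setup instructions is reachable before
# Memory.from_config tries to use it, probing the same /health route that
# step 3 of the docstring checks with curl. Uses only the standard library;
# the function name and default URL are this example's own choices.
def vllm_server_is_up(base_url="http://localhost:8000"):
    import urllib.request

    try:
        with urllib.request.urlopen(f"{base_url}/health", timeout=5) as response:
            return response.status == 200
    except OSError:  # covers URLError, timeouts, and refused connections
        return False
# Usage: if not vllm_server_is_up(): raise SystemExit("Start the vLLM server first.")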


def main():
    """
    Demonstrate vLLM integration with mem0.
    """
    print("--> Initializing mem0 with vLLM...")

    # Initialize memory with vLLM as the LLM provider
    memory = Memory.from_config(config)

    print("--> Memory initialized successfully!")

    # Example conversations to store
    conversations = [
        {
            "messages": [
                {"role": "user", "content": "I love playing chess on weekends"},
                {"role": "assistant", "content": "That's great! Chess is an excellent strategic game that helps improve critical thinking."}
            ],
            "user_id": "user_123"
        },
        {
            "messages": [
                {"role": "user", "content": "I'm learning Python programming"},
                {"role": "assistant", "content": "Python is a fantastic language for beginners! What specific areas are you focusing on?"}
            ],
            "user_id": "user_123"
        },
        {
            "messages": [
                {"role": "user", "content": "I prefer working late at night, I'm more productive then"},
                {"role": "assistant", "content": "Many people find they're more creative and focused during nighttime hours. It's important to maintain a consistent schedule that works for you."}
            ],
            "user_id": "user_123"
        }
    ]

    print("\n--> Adding memories using vLLM...")

    # Add memories - now powered by vLLM's high-performance inference
    for i, conversation in enumerate(conversations, 1):
        result = memory.add(
            messages=conversation["messages"],
            user_id=conversation["user_id"]
        )
        print(f"Memory {i} added: {result}")

    print("\n--> Searching memories...")

    # Search memories - vLLM will process the search and memory operations
    search_queries = [
        "What does the user like to do on weekends?",
        "What is the user learning?",
        "When is the user most productive?"
    ]

    for query in search_queries:
        print(f"\nQuery: {query}")
        memories = memory.search(
            query=query,
            user_id="user_123"
        )

        # Recent mem0 releases return {"results": [...]}; older ones return a list
        results = memories["results"] if isinstance(memories, dict) else memories
        for memory_item in results:
            print(f" - {memory_item['memory']}")

    print("\n--> Getting all memories for user...")
    all_memories = memory.get_all(user_id="user_123")
    # Same dict-vs-list normalization as for search results above
    all_results = all_memories["results"] if isinstance(all_memories, dict) else all_memories
    print(f"Total memories stored: {len(all_results)}")

    for memory_item in all_results:
        print(f" - {memory_item['memory']}")

    print("\n--> vLLM integration demo completed successfully!")
    print("\nBenefits of using vLLM:")
    print(" -> Up to 2.7x higher throughput (per the vLLM project's own benchmarks)")
    print(" -> Up to 5x faster time-per-output-token")
    print(" -> Efficient memory usage with PagedAttention")
    print(" -> Simple configuration, same as other mem0 providers")


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"=> Error: {e}")
        print("\nTroubleshooting:")
        print("1. Make sure the vLLM server is running: vllm serve microsoft/DialoGPT-small --port 8000")
        print("2. Check if the model is downloaded and accessible")
        print("3. Verify the base URL and port configuration")
        print("4. Ensure you have the required dependencies installed")