Commit 89499ae

Feature/vllm support (#2981)
1 parent 386d8b8 commit 89499ae

File tree

10 files changed: +430 -1 lines changed

docs/components/llms/config.mdx

Lines changed: 2 additions & 0 deletions

@@ -58,6 +58,7 @@ config = {
 
 m = Memory.from_config(config)
 m.add("Your text here", user_id="user", metadata={"category": "example"})
+
 ```

@@ -76,6 +77,7 @@ const config = {
 const memory = new Memory(config);
 await memory.add("Your text here", { userId: "user123", metadata: { category: "example" } });
 ```
+
 </CodeGroup>

 ## Why is Config Needed?
docs/components/llms/models/vllm.mdx

Lines changed: 109 additions & 0 deletions

@@ -0,0 +1,109 @@
---
title: vLLM
---

<Snippet file="paper-release.mdx" />

[vLLM](https://docs.vllm.ai/) is a high-performance inference engine for large language models, designed to maximize throughput and memory efficiency when serving LLMs locally.

## Prerequisites

1. **Install vLLM**:

```bash
pip install vllm
```

2. **Start the vLLM server**:

```bash
# For testing with a small model
vllm serve microsoft/DialoGPT-medium --port 8000

# For production with a larger model (requires GPU)
vllm serve Qwen/Qwen2.5-32B-Instruct --port 8000
```

## Usage

```python
import os
from mem0 import Memory

os.environ["OPENAI_API_KEY"] = "your-api-key"  # used for the embedding model

config = {
    "llm": {
        "provider": "vllm",
        "config": {
            "model": "Qwen/Qwen2.5-32B-Instruct",
            "vllm_base_url": "http://localhost:8000/v1",
            "temperature": 0.1,
            "max_tokens": 2000,
        }
    }
}

m = Memory.from_config(config)
messages = [
    {"role": "user", "content": "I'm planning to watch a movie tonight. Any recommendations?"},
    {"role": "assistant", "content": "How about thriller movies? They can be quite engaging."},
    {"role": "user", "content": "I'm not a big fan of thrillers, but I love sci-fi movies."},
    {"role": "assistant", "content": "Got it! I'll avoid thrillers and suggest sci-fi movies instead."}
]
m.add(messages, user_id="alice", metadata={"category": "movies"})
```
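After `m.add(...)`, the same `Memory` instance can be queried. A brief follow-up sketch, mirroring the search calls in the bundled `examples/misc/vllm_example.py` (the query text is illustrative):

```python
# Retrieve memories relevant to a query; the vLLM-backed LLM handles the underlying calls.
memories = m.search(query="What movies does alice like?", user_id="alice")
for memory_item in memories:
    print(memory_item["memory"])
```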
## Configuration Parameters

| Parameter       | Description                       | Default                       | Environment Variable |
| --------------- | --------------------------------- | ----------------------------- | -------------------- |
| `model`         | Model name running on vLLM server | `"Qwen/Qwen2.5-32B-Instruct"` | -                    |
| `vllm_base_url` | vLLM server URL                   | `"http://localhost:8000/v1"`  | `VLLM_BASE_URL`      |
| `api_key`       | API key (dummy for local)         | `"vllm-api-key"`              | `VLLM_API_KEY`       |
| `temperature`   | Sampling temperature              | `0.1`                         | -                    |
| `max_tokens`    | Maximum tokens to generate        | `2000`                        | -                    |
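For reference, a config that sets every documented parameter explicitly might look like this (values are the defaults from the table; the `api_key` is a placeholder for a local server):

```python
config = {
    "llm": {
        "provider": "vllm",
        "config": {
            "model": "Qwen/Qwen2.5-32B-Instruct",         # model served by vLLM
            "vllm_base_url": "http://localhost:8000/v1",  # OpenAI-compatible endpoint
            "api_key": "vllm-api-key",                    # dummy key for local use
            "temperature": 0.1,
            "max_tokens": 2000,
        }
    }
}
```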
## Environment Variables

You can set these environment variables instead of specifying them in the config:

```bash
export VLLM_BASE_URL="http://localhost:8000/v1"
export VLLM_API_KEY="your-vllm-api-key"
export OPENAI_API_KEY="your-openai-api-key"  # for embeddings
```
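With those variables exported, the vLLM-specific keys can be left out of the config entirely. A minimal sketch, assuming the environment variables above are set:

```python
from mem0 import Memory

# vllm_base_url and api_key are read from VLLM_BASE_URL / VLLM_API_KEY.
config = {
    "llm": {
        "provider": "vllm",
        "config": {
            "model": "Qwen/Qwen2.5-32B-Instruct",
        }
    }
}

m = Memory.from_config(config)
```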
## Benefits

- **High Performance**: 2-24x faster inference than standard implementations
- **Memory Efficient**: Optimized memory usage with PagedAttention
- **Local Deployment**: Keep your data private and reduce API costs
- **Easy Integration**: Drop-in replacement for other LLM providers
- **Flexible**: Works with any model supported by vLLM
## Troubleshooting

1. **Server not responding**: Make sure the vLLM server is running

```bash
curl http://localhost:8000/health
```

2. **404 errors**: Ensure the base URL uses the correct format

```python
"vllm_base_url": "http://localhost:8000/v1"  # Note the /v1
```

3. **Model not found**: Check that the model name in your config matches the one the server is serving (see the check after this list)

4. **Out of memory**: Try a smaller model or reduce `max_model_len`

```bash
vllm serve Qwen/Qwen2.5-32B-Instruct --max-model-len 4096
```
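One quick way to see which models the server exposes is the OpenAI-compatible models endpoint that vLLM serves (the port assumes the default setup above):

```bash
# Lists the model IDs served by the running vLLM instance; the "model" value
# in your mem0 config must match one of these IDs exactly.
curl http://localhost:8000/v1/models
```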
## Config

All available parameters for the `vllm` config are listed in the [Master List of All Params in Config](../config).

docs/docs.json

Lines changed: 2 additions & 1 deletion

@@ -117,7 +117,8 @@
 "components/llms/models/xAI",
 "components/llms/models/sarvam",
 "components/llms/models/lmstudio",
-"components/llms/models/langchain"
+"components/llms/models/langchain",
+"components/llms/models/vllm"
 ]
 }
 ]

examples/misc/vllm_example.py

Lines changed: 144 additions & 0 deletions

@@ -0,0 +1,144 @@
"""
Example of using vLLM with mem0 for high-performance memory operations.

SETUP INSTRUCTIONS:
1. Install vLLM:
   pip install vllm

2. Start vLLM server (in a separate terminal):
   vllm serve microsoft/DialoGPT-small --port 8000

   Wait for the message: "Uvicorn running on http://0.0.0.0:8000"
   (Small model: ~500MB download, much faster!)

3. Verify server is running:
   curl http://localhost:8000/health

4. Run this example:
   python examples/misc/vllm_example.py

Optional environment variables:
   export VLLM_BASE_URL="http://localhost:8000/v1"
   export VLLM_API_KEY="vllm-api-key"
"""

from mem0 import Memory

# Configuration for vLLM integration
config = {
    "llm": {
        "provider": "vllm",
        "config": {
            "model": "Qwen/Qwen2.5-32B-Instruct",
            "vllm_base_url": "http://localhost:8000/v1",
            "api_key": "vllm-api-key",
            "temperature": 0.7,
            "max_tokens": 100,
        }
    },
    "embedder": {
        "provider": "openai",
        "config": {
            "model": "text-embedding-3-small"
        }
    },
    "vector_store": {
        "provider": "qdrant",
        "config": {
            "collection_name": "vllm_memories",
            "host": "localhost",
            "port": 6333
        }
    }
}


def main():
    """
    Demonstrate vLLM integration with mem0
    """
    print("--> Initializing mem0 with vLLM...")

    # Initialize memory with vLLM
    memory = Memory.from_config(config)

    print("--> Memory initialized successfully!")

    # Example conversations to store
    conversations = [
        {
            "messages": [
                {"role": "user", "content": "I love playing chess on weekends"},
                {"role": "assistant", "content": "That's great! Chess is an excellent strategic game that helps improve critical thinking."}
            ],
            "user_id": "user_123"
        },
        {
            "messages": [
                {"role": "user", "content": "I'm learning Python programming"},
                {"role": "assistant", "content": "Python is a fantastic language for beginners! What specific areas are you focusing on?"}
            ],
            "user_id": "user_123"
        },
        {
            "messages": [
                {"role": "user", "content": "I prefer working late at night, I'm more productive then"},
                {"role": "assistant", "content": "Many people find they're more creative and focused during nighttime hours. It's important to maintain a consistent schedule that works for you."}
            ],
            "user_id": "user_123"
        }
    ]

    print("\n--> Adding memories using vLLM...")

    # Add memories - now powered by vLLM's high-performance inference
    for i, conversation in enumerate(conversations, 1):
        result = memory.add(
            messages=conversation["messages"],
            user_id=conversation["user_id"]
        )
        print(f"Memory {i} added: {result}")

    print("\n🔍 Searching memories...")

    # Search memories - vLLM will process the search and memory operations
    search_queries = [
        "What does the user like to do on weekends?",
        "What is the user learning?",
        "When is the user most productive?"
    ]

    for query in search_queries:
        print(f"\nQuery: {query}")
        memories = memory.search(
            query=query,
            user_id="user_123"
        )

        for memory_item in memories:
            print(f"  - {memory_item['memory']}")

    print("\n--> Getting all memories for user...")
    all_memories = memory.get_all(user_id="user_123")
    print(f"Total memories stored: {len(all_memories)}")

    for memory_item in all_memories:
        print(f"  - {memory_item['memory']}")

    print("\n--> vLLM integration demo completed successfully!")
    print("\nBenefits of using vLLM:")
    print("  -> 2.7x higher throughput compared to standard implementations")
    print("  -> 5x faster time-per-output-token")
    print("  -> Efficient memory usage with PagedAttention")
    print("  -> Simple configuration, same as other providers")


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"=> Error: {e}")
        print("\nTroubleshooting:")
        print("1. Make sure vLLM server is running: vllm serve microsoft/DialoGPT-small --port 8000")
        print("2. Check if the model is downloaded and accessible")
        print("3. Verify the base URL and port configuration")
        print("4. Ensure you have the required dependencies installed")
mem0/configs/llms/base.py

Lines changed: 7 additions & 0 deletions

@@ -44,6 +44,8 @@ def __init__(
         # LM Studio specific
         lmstudio_base_url: Optional[str] = "http://localhost:1234/v1",
         lmstudio_response_format: dict = None,
+        # vLLM specific
+        vllm_base_url: Optional[str] = "http://localhost:8000/v1",
         # AWS Bedrock specific
         aws_access_key_id: Optional[str] = None,
         aws_secret_access_key: Optional[str] = None,
@@ -98,6 +100,8 @@ def __init__(
         :type lmstudio_base_url: Optional[str], optional
         :param lmstudio_response_format: LM Studio response format to be used, defaults to None
         :type lmstudio_response_format: Optional[Dict], optional
+        :param vllm_base_url: vLLM base URL to be used, defaults to "http://localhost:8000/v1"
+        :type vllm_base_url: Optional[str], optional
         """

         self.model = model
@@ -139,6 +143,9 @@ def __init__(
         self.lmstudio_base_url = lmstudio_base_url
         self.lmstudio_response_format = lmstudio_response_format

+        # vLLM specific
+        self.vllm_base_url = vllm_base_url
+
         # AWS Bedrock specific
         self.aws_access_key_id = aws_access_key_id
         self.aws_secret_access_key = aws_secret_access_key
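For illustration, the new parameter can be passed straight to the base config class. A minimal sketch, assuming the class defined in mem0/configs/llms/base.py is the usual `BaseLlmConfig` and keeps the signature shown above:

```python
from mem0.configs.llms.base import BaseLlmConfig

# vllm_base_url now sits alongside the other provider-specific base URLs.
llm_config = BaseLlmConfig(
    model="Qwen/Qwen2.5-32B-Instruct",
    vllm_base_url="http://localhost:8000/v1",
)
print(llm_config.vllm_base_url)  # http://localhost:8000/v1
```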

mem0/llms/configs.py

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ def validate_config(cls, v, values):
             "xai",
             "sarvam",
             "lmstudio",
+            "vllm",
             "langchain",
         ):
             return v
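The practical effect is that "vllm" now passes provider validation. A small sketch, assuming the provider config model exported from mem0/llms/configs.py is named `LlmConfig` (an assumption; the class name is not shown in this diff):

```python
from mem0.llms.configs import LlmConfig  # assumed class name

# "vllm" is now an accepted provider, so this validates instead of raising.
llm_config = LlmConfig(provider="vllm", config={"model": "Qwen/Qwen2.5-32B-Instruct"})
print(llm_config.provider)  # vllm
```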

0 commit comments
