-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp copy.py
More file actions
605 lines (488 loc) · 23.7 KB
/
app copy.py
File metadata and controls
605 lines (488 loc) · 23.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
import os
import re
import uuid
import requests
import pickle
import streamlit as st
from bs4 import BeautifulSoup
import faiss
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from web_scraper import scrape_with_depth_sync
import PyPDF2
import json
# --- Module-level setup (runs once per Streamlit script execution) ---

# Page configuration: must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Agentic RAG Assistant",
    page_icon="🔍",
    layout="wide"
)

# Load environment variables from .env before the OpenAI client is built,
# so OPENAI_API_KEY is visible to os.getenv below.
load_dotenv()


# Initialize OpenAI client.
# cache_resource keeps a single client instance across Streamlit reruns.
@st.cache_resource
def get_openai_client():
    """Return a process-wide OpenAI client using OPENAI_API_KEY from the environment."""
    return OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


client = get_openai_client()

# Initialize FAISS index and document store.
embedding_size = 1536  # dimension of OpenAI text-embedding-3-small vectors
INDEX_FILE = "embeddings.faiss"        # on-disk FAISS index
DOCUMENT_STORE_FILE = "document_store.pkl"  # pickled list of {"text", "source"} dicts
# Load the persisted knowledge base, or start fresh when nothing is on disk.
@st.cache_resource
def load_index_and_documents():
    """Return (faiss_index, document_store), loading from disk when both files exist.

    Cached with st.cache_resource so Streamlit reruns reuse the same objects.
    NOTE(review): loads via pickle — only safe for files this app wrote itself.
    """
    have_saved_state = os.path.exists(INDEX_FILE) and os.path.exists(DOCUMENT_STORE_FILE)
    if not have_saved_state:
        # Nothing persisted yet: empty L2 index and empty metadata list.
        return faiss.IndexFlatL2(embedding_size), []
    with st.spinner("Loading knowledge base..."):
        loaded_index = faiss.read_index(INDEX_FILE)
        with open(DOCUMENT_STORE_FILE, 'rb') as fh:
            loaded_docs = pickle.load(fh)
        return loaded_index, loaded_docs
# Persist the knowledge base to disk.
def save_index_and_documents(index, document_store):
    """Write the FAISS index and pickled document store to disk and report in the sidebar."""
    with st.spinner("Saving knowledge base..."):
        faiss.write_index(index, INDEX_FILE)
        with open(DOCUMENT_STORE_FILE, 'wb') as fh:
            pickle.dump(document_store, fh)
        st.sidebar.success(f"✅ Saved {index.ntotal} vectors and {len(document_store)} documents")
# Pull http(s) URLs out of free text.
def extract_urls(text):
    """Return every http/https URL found in *text*, in order of appearance."""
    return re.findall(r'https?://\S+', text)
# Fetch a URL and reduce it to plain text.
def scrape_url(url):
    """Download *url* and return its visible text, or "" on any failure.

    Errors are reported via st.error rather than raised, so one bad URL
    does not abort a batch.
    """
    try:
        with st.spinner(f"Scraping {url}..."):
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, 'html.parser')
            # Strip non-visible markup before extracting text.
            for hidden in soup(["script", "style"]):
                hidden.extract()
            raw_text = soup.get_text(separator='\n')
            # Clean up: strip lines, split on spaces, drop empties.
            # NOTE(review): splitting on a single space puts each word on its
            # own line — possibly meant to be "  " (the common bs4 recipe); confirm.
            stripped_lines = (ln.strip() for ln in raw_text.splitlines())
            fragments = (piece.strip() for ln in stripped_lines for piece in ln.split(" "))
            return '\n'.join(frag for frag in fragments if frag)
    except Exception as e:
        st.error(f"Error scraping {url}: {e}")
        return ""
# Save scraped content to a markdown file named from a hash of the URL.
def save_to_markdown(content, url):
    """Write *content* to "scraped_<hash>.md" and return the filename.

    If a file for this URL already exists (same session), it is left
    untouched and its name is returned, acting as a simple scrape cache.

    NOTE(review): ``hash()`` is salted per interpreter run (PYTHONHASHSEED),
    so the cache check only works within a single session. The scheme is
    kept as-is because add_to_index/process_user_input derive the same name.
    """
    # An empty URL gets a random one-off name; real URLs get a per-run-stable hash.
    url_hash = uuid.uuid4().hex[:8] if not url else hash(url) % 10000000
    filename = f"scraped_{url_hash}.md"
    # Reuse a file already produced for this URL during this session.
    if os.path.exists(filename):
        return filename
    # (Removed dead code: os.makedirs on dirname of a bare filename was
    # always a no-op on ".".)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"# Content from {url}\n\n")
        f.write(content)
    return filename
# Embed a piece of text via the OpenAI embeddings endpoint.
def get_embedding(text):
    """Return the embedding vector (list of floats) for *text* using text-embedding-3-small."""
    with st.spinner("Generating text embeddings..."):
        result = client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        )
        return result.data[0].embedding
# Split content into chunks, embed each new chunk, and add it to the FAISS index.
def add_to_index(content, url, index, document_store):
    """Chunk *content*, embed unseen chunks, append them to *index*/*document_store*.

    Returns (chunks_added, index, document_store). Also deletes the temporary
    markdown file for *url* once indexing is done, and drives sidebar progress UI.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    chunks = text_splitter.split_text(content)
    chunks_added = 0
    progress_bar = st.sidebar.progress(0)
    chunk_status = st.sidebar.empty()
    for i, chunk in enumerate(chunks):
        # Update progress display for this chunk.
        progress = (i + 1) / len(chunks)
        progress_bar.progress(progress)
        chunk_status.text(f"Processing chunk {i+1}/{len(chunks)}")
        # Skip chunks already stored for this exact (text, source) pair —
        # a linear scan of the whole store per chunk.
        chunk_exists = any(
            doc["text"] == chunk and doc["source"] == url
            for doc in document_store
        )
        if not chunk_exists:
            embedding = get_embedding(chunk)
            # FAISS expects a float32 row vector of shape (1, dim).
            embedding_np = np.array(embedding).astype('float32').reshape(1, -1)
            index.add(embedding_np)
            # Keep metadata parallel to the index: position N here maps to vector N.
            document_store.append({
                "text": chunk,
                "source": url
            })
            chunks_added += 1
    progress_bar.empty()
    chunk_status.empty()
    # Only announce when something new was indexed.
    if chunks_added > 0:
        st.sidebar.success(f"Added {chunks_added} new chunks")
    # Delete the temporary markdown file now that its content is indexed.
    if url.startswith("PDF: "):
        # NOTE(review): main() names PDF files with hash(name + str(size)),
        # but this reconstructs hash(name) only — the two will not match,
        # so PDF temp files are likely never deleted here. Confirm intent.
        file_name = url.replace("PDF: ", "")
        file_hash = hash(file_name) % 10000000
        md_filename = f"pdf_{file_hash}.md"
    else:
        # Mirrors the naming in save_to_markdown (per-run-stable hash()).
        url_hash = hash(url) % 10000000 if url else uuid.uuid4().hex[:8]
        md_filename = f"scraped_{url_hash}.md"
    if os.path.exists(md_filename):
        try:
            os.remove(md_filename)
            st.sidebar.info(f"Removed temporary file {md_filename} after indexing")
        except Exception as e:
            # Best-effort cleanup: failure to delete is only a warning.
            st.sidebar.warning(f"Could not remove temporary file {md_filename}: {e}")
    return chunks_added, index, document_store
# Nearest-neighbour search over the FAISS index.
def search_documents(query, index, document_store, k=3):
    """Embed *query* and return up to *k* hits as dicts with text, source, and L2 score."""
    with st.spinner("Searching knowledge base for relevant information..."):
        query_vec = np.array([get_embedding(query)]).astype('float32')
        distances, indices = index.search(query_vec, k)
        hits = []
        for rank, doc_idx in enumerate(indices[0]):
            # FAISS pads missing results with -1; also guard against stale indices.
            if doc_idx == -1 or doc_idx >= len(document_store):
                continue
            entry = document_store[doc_idx]
            hits.append({
                "text": entry["text"],
                "source": entry["source"],
                "score": float(distances[0][rank])
            })
        return hits
# Format knowledge-base search results for use as tool output.
def search_knowledge_base_tool(query, index, document_store, k=5):
    """
    Search the knowledge base for information relevant to the query.
    Only use this when the user asks for specific information that might be in the knowledge base.
    """
    hits = search_documents(query, index, document_store, k=k)
    if not hits:
        return "No relevant information found in the knowledge base."
    pieces = ["Information from provided sources:\n\n"]
    pieces.extend(
        f"[Source {n}: {hit['source']}]\n{hit['text']}\n\n"
        for n, hit in enumerate(hits, 1)
    )
    return "".join(pieces)
# Process user input with OpenAI function calling (agentic RAG loop).
def process_user_input(user_input, urls, index, document_store):
    """Ingest any *urls* into the knowledge base, then answer *user_input* with RAG.

    Returns (answer_text, results) where results are the retrieved chunks,
    or (None, []) when there was no question to answer.
    """
    # Tool schema the model may call to search the knowledge base.
    tools = [
        {
            "type": "function",
            "function": {
                "name": "search_knowledge_base",
                "description": "Search the knowledge base for information relevant to the user's query.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "The user's question to search for in the knowledge base."
                        }
                    },
                    "required": ["query"],
                    "additionalProperties": False
                },
                "strict": True
            }
        }
    ]
    # --- Ingestion phase: scrape/index any URLs provided ---
    if urls:
        st.sidebar.info(f"Processing {len(urls)} URLs")
        url_progress = st.sidebar.empty()
        for i, url in enumerate(urls):
            url_progress.text(f"Processing URL {i+1}/{len(urls)}: {url}")
            # Same filename scheme as save_to_markdown (per-run-stable hash()).
            url_hash = hash(url) % 10000000
            filename = f"scraped_{url_hash}.md"
            # Skip URLs whose content is already indexed.
            url_in_index = any(doc["source"] == url for doc in document_store)
            if url_in_index:
                st.sidebar.info(f"Content from {url} is already in the knowledge base")
                continue
            # Prefer a previously scraped markdown file over re-fetching.
            if os.path.exists(filename):
                st.sidebar.info(f"Loading existing content for {url}")
                with open(filename, "r", encoding="utf-8") as f:
                    content = f.read()
                chunks_added, index, document_store = add_to_index(content, url, index, document_store)
            else:
                content = scrape_url(url)
                if content:
                    filename = save_to_markdown(content, url)
                    st.sidebar.success(f"Saved content from {url}")
                    chunks_added, index, document_store = add_to_index(content, url, index, document_store)
        url_progress.empty()
        # Persist once after the whole batch rather than per URL.
        save_index_and_documents(index, document_store)
    # --- Answering phase: only when a question was actually asked ---
    if user_input:
        # Heuristic: queries mentioning documents should always hit the KB.
        document_related_keywords = ["resume", "cv", "document", "pdf", "file", "report", "uploaded", "paper",
                                     "summarize", "extract", "analyze", "read"]
        is_document_query = any(keyword in user_input.lower() for keyword in document_related_keywords)
        system_message = (
            "You are a helpful assistant that answers questions based on the knowledge base. "
            "ALWAYS check the knowledge base first before responding to queries about specific documents or content. "
            "If the user asks about documents (like resumes, reports, etc.), ALWAYS search the knowledge base "
            "and only mention uploading if nothing relevant is found. "
            "If you find relevant content, summarize or analyze it as requested."
        )
        messages = [{"role": "system", "content": system_message}]
        messages.append({"role": "user", "content": user_input})
        if is_document_query and index.ntotal > 0:
            # Force a knowledge-base lookup for document-style queries.
            with st.spinner("Analyzing your question about documents..."):
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages,
                    tools=tools,
                    tool_choice="required"  # Force the model to use the function for document queries
                )
        else:
            # Otherwise let the model decide whether retrieval is needed.
            # NOTE(review): tools=None when the index is empty — the OpenAI
            # SDK's default is "omit", so confirm None is accepted here.
            with st.spinner("Processing your question..."):
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages,
                    tools=tools if index.ntotal > 0 else None,
                    tool_choice="auto"
                )
        assistant_message = response.choices[0].message
        if assistant_message.tool_calls:
            # The model requested retrieval: run each search and feed results back.
            messages.append(assistant_message)
            results = []
            for tool_call in assistant_message.tool_calls:
                function_name = tool_call.function.name
                function_args = json.loads(tool_call.function.arguments)
                if function_name == "search_knowledge_base":
                    # Fall back to the raw user input if the model omitted a query.
                    query = function_args.get("query", user_input)
                    results = search_documents(query, index, document_store, k=5)
                    if results:
                        context = "Information from provided sources:\n\n"
                        for i, result in enumerate(results, 1):
                            context += f"[Source {i}: {result['source']}]\n{result['text']}\n\n"
                    else:
                        # Be explicit that nothing was found so the model answers honestly.
                        if is_document_query:
                            context = "No relevant documents found in the knowledge base. Please advise the user to upload the document they're referring to."
                        else:
                            context = "No relevant information found in the knowledge base."
                    # Return the tool output to the model, keyed by the call id.
                    messages.append({
                        "role": "tool",
                        "tool_call_id": tool_call.id,
                        "content": context
                    })
            # Second round-trip: let the model compose the final answer from context.
            with st.spinner("Generating response based on retrieved information..."):
                final_response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages
                )
            return final_response.choices[0].message.content, results
        else:
            # Model answered directly without retrieval.
            return assistant_message.content, []
    # URL-only invocation: nothing to answer.
    return None, []
# Extract text from an uploaded PDF file.
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*, or "" on error.

    Args:
        pdf_file: A binary file-like object (e.g. a Streamlit UploadedFile).

    Returns:
        Page texts joined with trailing newlines; "" if the PDF cannot be read.
    """
    try:
        with st.spinner("Extracting text from PDF..."):
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            pages_text = []
            for page in pdf_reader.pages:
                # extract_text() can return None for image-only/empty pages;
                # the previous `text += ... + "\n"` would raise TypeError then.
                pages_text.append((page.extract_text() or "") + "\n")
            # join instead of repeated += (avoids quadratic string building).
            return "".join(pages_text)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""
# Main Streamlit app: sidebar ingestion controls plus a chat interface.
def main():
    """Render the sidebar (KB stats, URL/PDF ingestion, clear button) and the chat UI."""
    # Initialize chat history once per session.
    if "messages" not in st.session_state:
        st.session_state.messages = []
    # Initialize or load index and document store (cached resource).
    index, document_store = load_index_and_documents()
    # --- Sidebar: knowledge-base stats and ingestion controls ---
    with st.sidebar:
        st.title("Knowledge Base")
        st.metric("Vectors", index.ntotal)
        # List the distinct sources currently indexed.
        if document_store and len(document_store) > 0:
            unique_sources = set(doc["source"] for doc in document_store)
            st.subheader("Sources in Knowledge Base:")
            for source in unique_sources:
                st.write(f"- {source}")
        # URL input section.
        st.subheader("Add Content to Knowledge Base")
        url_input = st.text_area("Enter URLs (one per line)", height=100,
                                 help="Enter URLs to websites you want to analyze")
        # Deep crawler option: follow links found on the given pages.
        deep_crawler = st.checkbox("🕸️ Deep Crawler", help="Crawl links found on the given URLs")
        # Defaults used when the deep crawler is disabled.
        max_depth = 1
        max_urls_per_level = 5
        if deep_crawler:
            col1, col2 = st.columns(2)
            with col1:
                max_depth = st.slider("Crawl Depth", min_value=1, max_value=3, value=1,
                                      help="How many levels of links to follow (higher values will take longer)")
            with col2:
                max_urls_per_level = st.slider("URLs per Level", min_value=3, max_value=10, value=5,
                                               help="Maximum number of URLs to process at each depth level")
        # Extract URLs from the textarea, line by line.
        urls = []
        if url_input:
            url_lines = url_input.split('\n')
            for line in url_lines:
                extracted = extract_urls(line)
                if extracted:
                    urls.extend(extracted)
                elif line.strip().startswith('http'):
                    # Fallback: the whole line itself looks like a URL.
                    urls.append(line.strip())
        # Show what will be processed before the user commits.
        if urls:
            st.write("📋 URLs to process:")
            for url in urls:
                st.write(f"- {url}")
        process_urls = st.button("Process URLs", type="primary", disabled=len(urls) == 0)
        # PDF upload section.
        st.markdown("---")
        st.subheader("Upload PDF Documents")
        uploaded_files = st.file_uploader(
            "Upload PDF files to add to knowledge base",
            accept_multiple_files=True,
            type=['pdf']
        )
        process_pdfs = st.button("Process PDFs", type="primary", disabled=len(uploaded_files or []) == 0)
        # Clear-knowledge-base button: wipes index, store, and temp files.
        st.markdown("---")
        if st.button("🗑️ Clear Knowledge Base", type="secondary"):
            with st.spinner("Clearing knowledge base and associated files..."):
                if os.path.exists(INDEX_FILE):
                    os.remove(INDEX_FILE)
                if os.path.exists(DOCUMENT_STORE_FILE):
                    os.remove(DOCUMENT_STORE_FILE)
                # NOTE(review): this deletes EVERY .md/.pkl/.faiss in the cwd,
                # not just files this app created — confirm that is intended.
                for file in os.listdir():
                    if file.endswith(".md") or file.endswith(".pkl") or file.endswith(".faiss"):
                        os.remove(file)
                # Rebuild empty state and persist it so reload sees a clean KB.
                index = faiss.IndexFlatL2(embedding_size)
                document_store = []
                save_index_and_documents(index, document_store)
                # Drop cached resources/data so the next run reloads from disk.
                st.cache_resource.clear()
                st.cache_data.clear()
                st.success("Knowledge base and all associated files cleared successfully!")
                st.rerun()
    # --- URL ingestion (triggered by the sidebar button) ---
    if process_urls and urls:
        if deep_crawler:
            # Deep crawl: fetch the URLs plus linked pages up to max_depth.
            with st.spinner(f"Deep crawling URLs with depth {max_depth}..."):
                st.info(f"This may take a while for depth {max_depth} with {max_urls_per_level} URLs per level")
                results = scrape_with_depth_sync(urls, max_depth=max_depth, max_urls_per_level=max_urls_per_level)
                # Index every crawled page that yielded content.
                total_processed = 0
                crawler_progress = st.progress(0)
                total_urls = len(results)
                for i, (url, data) in enumerate(results.items()):
                    crawler_progress.progress((i + 1) / total_urls)
                    st.sidebar.info(f"Processing {i+1}/{total_urls}: {url} (depth {data['depth']})")
                    content = data['content']
                    if content:
                        filename = save_to_markdown(content, url)
                        chunks_added, index, document_store = add_to_index(content, url, index, document_store)
                        total_processed += 1
                crawler_progress.empty()
                save_index_and_documents(index, document_store)
                st.success(f"Deep crawler processed {total_processed} URLs with content")
        else:
            # Regular mode: ingest the listed URLs only, no question asked.
            process_user_input("", urls, index, document_store)
            st.success("URLs processed and added to knowledge base!")
    # --- PDF ingestion (triggered by the sidebar button) ---
    if process_pdfs and uploaded_files:
        total_pdfs = len(uploaded_files)
        pdf_progress = st.progress(0)
        pdf_status = st.empty()
        for i, pdf_file in enumerate(uploaded_files):
            progress = (i + 1) / total_pdfs
            pdf_progress.progress(progress)
            pdf_status.text(f"Processing PDF {i+1}/{total_pdfs}: {pdf_file.name}")
            # Temp filename keyed on name+size.
            # NOTE(review): add_to_index's cleanup derives pdf_<hash(name)>.md
            # (no size), so these names will not match — confirm.
            file_hash = hash(pdf_file.name + str(pdf_file.size)) % 10000000
            filename = f"pdf_{file_hash}.md"
            # Skip PDFs whose content is already indexed under this source label.
            pdf_source = f"PDF: {pdf_file.name}"
            pdf_in_index = any(doc["source"] == pdf_source for doc in document_store)
            if pdf_in_index:
                st.info(f"Content from {pdf_file.name} is already in the knowledge base")
                continue
            content = extract_text_from_pdf(pdf_file)
            if content:
                # Save extracted text to a temp markdown file before indexing.
                with open(filename, "w", encoding="utf-8") as f:
                    f.write(f"# Content from PDF: {pdf_file.name}\n\n")
                    f.write(content)
                st.success(f"Saved content from {pdf_file.name}")
                chunks_added, index, document_store = add_to_index(content, pdf_source, index, document_store)
        pdf_progress.empty()
        pdf_status.empty()
        # Persist once after processing all PDFs.
        save_index_and_documents(index, document_store)
        st.success(f"Processed {total_pdfs} PDF files and added to knowledge base!")
    # --- Chat interface ---
    st.title("Agentic RAG Assistant")
    # Replay chat history from session state.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])
    # Chat input at the bottom of the page.
    if question := st.chat_input("Message RAG-Powered AI Assistant..."):
        st.session_state.messages.append({"role": "user", "content": question})
        with st.chat_message("user"):
            st.markdown(question)
        # Get the AI response (no new URLs — ingestion happens via the sidebar).
        with st.chat_message("assistant"):
            response, results = process_user_input(question, [], index, document_store)
            if response:
                st.markdown(response)
        # Append the assistant turn so it survives the next rerun.
        if response:
            st.session_state.messages.append({"role": "assistant", "content": response})


if __name__ == "__main__":
    main()