Fix file-transfer bugs: detect PDF files by their %PDF header instead of relying on the file extension

Zongwei9888 · Zongwei9888 · commit 219bc58c7f1b · 2025-08-31T23:07:51.000+08:00
diff --git a/mcp_agent.config.yaml b/mcp_agent.config.yaml
@@ -96,7 +96,7 @@ mcp:
 openai:
   # Secrets (API keys, etc.) are stored in an mcp_agent.secrets.yaml file which can be gitignored
   #  default_model: "o3-mini"
-  default_model: "anthropic/claude-sonnet-4"
+  default_model: "anthropic/claude-3.5-sonnet"
 
 
 anthropic:
diff --git a/prompts/code_prompts.py b/prompts/code_prompts.py
@@ -63,15 +63,18 @@
 Task: Handle paper according to input type and save to "./deepcode_lab/papers/id/id.md"
 Note: Generate id (id is a number) by counting files in "./deepcode_lab/papers/" directory and increment by 1.
 
+CRITICAL RULE: NEVER use write_file tool to create paper content directly. Always use file-downloader tools for PDF/document conversion.
+
 Processing Rules:
 1. URL Input (input_type = "url"):
    - Use "file-downloader" tool to download paper
    - Extract metadata (title, authors, year)
    - Return saved file path and metadata
 
 2. File Input (input_type = "file"):
-   - Move file to "./deepcode_lab/papers/id/"
-   - Use "file-downloader" tool to convert to .md format
+   - Move file to "./deepcode_lab/papers/id/" using move_file_to tool
+   - The move_file_to tool will automatically convert PDF/documents to .md format
+   - NEVER manually extract content or use write_file - let the conversion tools handle this
    - Return new saved file path and metadata
 
 3. Directory Input (input_type = "directory"):
diff --git a/tools/pdf_downloader.py b/tools/pdf_downloader.py
@@ -103,7 +103,17 @@ async def perform_document_conversion(
     conversion_msg = ""
 
     # 首先尝试使用简单的PDF转换器（对于PDF文件）
-    if file_path.lower().endswith(".pdf") and PYPDF2_AVAILABLE:
+    # Check whether the file is actually a PDF (regardless of its extension)
+    is_pdf_file = False
+    if PYPDF2_AVAILABLE:
+        try:
+            with open(file_path, "rb") as f:
+                header = f.read(8)
+                is_pdf_file = header.startswith(b'%PDF')
+        except Exception:
+            is_pdf_file = file_path.lower().endswith(".pdf")
+    
+    if is_pdf_file and PYPDF2_AVAILABLE:
         try:
             simple_converter = SimplePdfConverter()
             conversion_result = simple_converter.convert_pdf_to_markdown(file_path)
diff --git a/utils/file_processor.py b/utils/file_processor.py
@@ -187,6 +187,12 @@ async def read_file_content(file_path: str) -> str:
             if not os.path.exists(file_path):
                 raise FileNotFoundError(f"File not found: {file_path}")
 
+            # Check if file is actually a PDF by reading the first few bytes
+            with open(file_path, "rb") as f:
+                header = f.read(8)
+                if header.startswith(b'%PDF'):
+                    raise IOError(f"File {file_path} is a PDF file, not a text file. Please convert it to markdown format or use PDF processing tools.")
+
             # Read file content
             # Note: Using async with would be better for large files
             # but for simplicity and compatibility, using regular file reading
@@ -195,6 +201,8 @@ async def read_file_content(file_path: str) -> str:
 
             return content
 
+        except UnicodeDecodeError as e:
+            raise IOError(f"Error reading file {file_path}: File encoding is not UTF-8. Original error: {str(e)}")
         except Exception as e:
             raise IOError(f"Error reading file {file_path}: {str(e)}")
 
diff --git a/workflows/agent_orchestration_engine.py b/workflows/agent_orchestration_engine.py
@@ -661,6 +661,12 @@ async def orchestrate_document_preprocessing_agent(
         # Step 2: Read document content to determine size
         md_path = os.path.join(dir_info["paper_dir"], md_files[0])
         try:
+            # Check if file is actually a PDF by reading the first few bytes
+            with open(md_path, "rb") as f:
+                header = f.read(8)
+                if header.startswith(b'%PDF'):
+                    raise IOError(f"File {md_path} is a PDF file, not a text file. Please convert it to markdown format or use PDF processing tools.")
+            
             with open(md_path, "r", encoding="utf-8") as f:
                 document_content = f.read()
         except Exception as e: