Skip to content

Commit 219bc58

Browse files
committed
fix the bugs for file transfer
1 parent 64266f7 commit 219bc58

File tree

5 files changed

+31
-4
lines changed

5 files changed

+31
-4
lines changed

mcp_agent.config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ mcp:
9696
openai:
9797
# Secrets (API keys, etc.) are stored in an mcp_agent.secrets.yaml file which can be gitignored
9898
# default_model: "o3-mini"
99-
default_model: "anthropic/claude-sonnet-4"
99+
default_model: "anthropic/claude-3.5-sonnet"
100100

101101

102102
anthropic:

prompts/code_prompts.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,15 +63,18 @@
6363
Task: Handle paper according to input type and save to "./deepcode_lab/papers/id/id.md"
6464
Note: Generate id (id is a number) by counting files in "./deepcode_lab/papers/" directory and increment by 1.
6565
66+
CRITICAL RULE: NEVER use write_file tool to create paper content directly. Always use file-downloader tools for PDF/document conversion.
67+
6668
Processing Rules:
6769
1. URL Input (input_type = "url"):
6870
- Use "file-downloader" tool to download paper
6971
- Extract metadata (title, authors, year)
7072
- Return saved file path and metadata
7173
7274
2. File Input (input_type = "file"):
73-
- Move file to "./deepcode_lab/papers/id/"
74-
- Use "file-downloader" tool to convert to .md format
75+
- Move file to "./deepcode_lab/papers/id/" using move_file_to tool
76+
- The move_file_to tool will automatically convert PDF/documents to .md format
77+
- NEVER manually extract content or use write_file - let the conversion tools handle this
7578
- Return new saved file path and metadata
7679
7780
3. Directory Input (input_type = "directory"):

tools/pdf_downloader.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,17 @@ async def perform_document_conversion(
103103
conversion_msg = ""
104104

105105
# First, try the simple PDF converter (for PDF files)
106-
if file_path.lower().endswith(".pdf") and PYPDF2_AVAILABLE:
106+
# Check whether the file is actually a PDF (regardless of its extension)
107+
is_pdf_file = False
108+
if PYPDF2_AVAILABLE:
109+
try:
110+
with open(file_path, "rb") as f:
111+
header = f.read(8)
112+
is_pdf_file = header.startswith(b'%PDF')
113+
except Exception:
114+
is_pdf_file = file_path.lower().endswith(".pdf")
115+
116+
if is_pdf_file and PYPDF2_AVAILABLE:
107117
try:
108118
simple_converter = SimplePdfConverter()
109119
conversion_result = simple_converter.convert_pdf_to_markdown(file_path)

utils/file_processor.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,12 @@ async def read_file_content(file_path: str) -> str:
187187
if not os.path.exists(file_path):
188188
raise FileNotFoundError(f"File not found: {file_path}")
189189

190+
# Check if file is actually a PDF by reading the first few bytes
191+
with open(file_path, "rb") as f:
192+
header = f.read(8)
193+
if header.startswith(b'%PDF'):
194+
raise IOError(f"File {file_path} is a PDF file, not a text file. Please convert it to markdown format or use PDF processing tools.")
195+
190196
# Read file content
191197
# Note: Using async with would be better for large files
192198
# but for simplicity and compatibility, using regular file reading
@@ -195,6 +201,8 @@ async def read_file_content(file_path: str) -> str:
195201

196202
return content
197203

204+
except UnicodeDecodeError as e:
205+
raise IOError(f"Error reading file {file_path}: File encoding is not UTF-8. Original error: {str(e)}")
198206
except Exception as e:
199207
raise IOError(f"Error reading file {file_path}: {str(e)}")
200208

workflows/agent_orchestration_engine.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -661,6 +661,12 @@ async def orchestrate_document_preprocessing_agent(
661661
# Step 2: Read document content to determine size
662662
md_path = os.path.join(dir_info["paper_dir"], md_files[0])
663663
try:
664+
# Check if file is actually a PDF by reading the first few bytes
665+
with open(md_path, "rb") as f:
666+
header = f.read(8)
667+
if header.startswith(b'%PDF'):
668+
raise IOError(f"File {md_path} is a PDF file, not a text file. Please convert it to markdown format or use PDF processing tools.")
669+
664670
with open(md_path, "r", encoding="utf-8") as f:
665671
document_content = f.read()
666672
except Exception as e:

0 commit comments

Comments
 (0)