File tree Expand file tree Collapse file tree 5 files changed +31
-4
lines changed
Expand file tree Collapse file tree 5 files changed +31
-4
lines changed Original file line number Diff line number Diff line change 9696openai :
9797 # Secrets (API keys, etc.) are stored in an mcp_agent.secrets.yaml file which can be gitignored
9898 # default_model: "o3-mini"
99- default_model : " anthropic/claude-sonnet-4 "
99+ default_model : " anthropic/claude-3.5-sonnet "
100100
101101
102102anthropic :
Original file line number Diff line number Diff line change 6363Task: Handle paper according to input type and save to "./deepcode_lab/papers/id/id.md"
6464Note: Generate id (id is a number) by counting files in "./deepcode_lab/papers/" directory and increment by 1.
6565
66+ CRITICAL RULE: NEVER use the write_file tool to create paper content directly. Always use the file-downloader tools for PDF/document conversion.
67+
6668Processing Rules:
67691. URL Input (input_type = "url"):
6870 - Use "file-downloader" tool to download paper
6971 - Extract metadata (title, authors, year)
7072 - Return saved file path and metadata
7173
72742. File Input (input_type = "file"):
73- - Move file to "./deepcode_lab/papers/id/"
74- - Use "file-downloader" tool to convert to .md format
75+ - Move file to "./deepcode_lab/papers/id/" using the move_file_to tool
76+ - The move_file_to tool will automatically convert PDFs and other documents to .md format
77+ - NEVER manually extract content or use write_file; let the conversion tools handle this
7578 - Return new saved file path and metadata
7679
77803. Directory Input (input_type = "directory"):
Original file line number Diff line number Diff line change @@ -103,7 +103,17 @@ async def perform_document_conversion(
103103 conversion_msg = ""
104104
105105 # 首先尝试使用简单的PDF转换器(对于PDF文件)
106- if file_path .lower ().endswith (".pdf" ) and PYPDF2_AVAILABLE :
106+ # 检查文件是否实际为PDF(无论扩展名如何)
107+ is_pdf_file = False
108+ if PYPDF2_AVAILABLE :
109+ try :
110+ with open (file_path , "rb" ) as f :
111+ header = f .read (8 )
112+ is_pdf_file = header .startswith (b'%PDF' )
113+ except Exception :
114+ is_pdf_file = file_path .lower ().endswith (".pdf" )
115+
116+ if is_pdf_file and PYPDF2_AVAILABLE :
107117 try :
108118 simple_converter = SimplePdfConverter ()
109119 conversion_result = simple_converter .convert_pdf_to_markdown (file_path )
Original file line number Diff line number Diff line change @@ -187,6 +187,12 @@ async def read_file_content(file_path: str) -> str:
187187 if not os .path .exists (file_path ):
188188 raise FileNotFoundError (f"File not found: { file_path } " )
189189
190+ # Check if file is actually a PDF by reading the first few bytes
191+ with open (file_path , "rb" ) as f :
192+ header = f .read (8 )
193+ if header .startswith (b'%PDF' ):
194+ raise IOError (f"File { file_path } is a PDF file, not a text file. Please convert it to markdown format or use PDF processing tools." )
195+
190196 # Read file content
191197 # Note: Using async with would be better for large files
192198 # but for simplicity and compatibility, using regular file reading
@@ -195,6 +201,8 @@ async def read_file_content(file_path: str) -> str:
195201
196202 return content
197203
204+ except UnicodeDecodeError as e :
205+ raise IOError (f"Error reading file { file_path } : File encoding is not UTF-8. Original error: { str (e )} " )
198206 except Exception as e :
199207 raise IOError (f"Error reading file { file_path } : { str (e )} " )
200208
Original file line number Diff line number Diff line change @@ -661,6 +661,12 @@ async def orchestrate_document_preprocessing_agent(
661661 # Step 2: Read document content to determine size
662662 md_path = os .path .join (dir_info ["paper_dir" ], md_files [0 ])
663663 try :
664+ # Check if file is actually a PDF by reading the first few bytes
665+ with open (md_path , "rb" ) as f :
666+ header = f .read (8 )
667+ if header .startswith (b'%PDF' ):
668+ raise IOError (f"File { md_path } is a PDF file, not a text file. Please convert it to markdown format or use PDF processing tools." )
669+
664670 with open (md_path , "r" , encoding = "utf-8" ) as f :
665671 document_content = f .read ()
666672 except Exception as e :
You can’t perform that action at this time.
0 commit comments