Skip to content

Commit a3d2cfc

Browse files
committed
Benchmark
1 parent c8c39a8 commit a3d2cfc

File tree

2 files changed

+291
-1
lines changed

2 files changed

+291
-1
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,4 +178,5 @@ cython_debug/
178178
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
179179
# refer to https://docs.cursor.com/context/ignore-files
180180
.cursorignore
181-
.cursorindexingignore
181+
.cursorindexingignore
182+
/benchmark_suite

benchmark.py

Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
import os
2+
import shutil
3+
import subprocess
4+
import tarfile
5+
import zipfile
6+
from pathlib import Path
7+
import sys
8+
9+
# --- Configuration ---
10+
# Fix 1: Define the path to your main analyzer script.
11+
# This assumes the benchmark script is in the same directory as main.py
12+
MAIN_PY = Path("main.py")
13+
14+
BENCHMARK_DIR = Path("benchmark_suite")
15+
HALTING_DIR = BENCHMARK_DIR / "halting"
16+
NON_HALTING_DIR = BENCHMARK_DIR / "non-halting"
17+
COMPLEX_DIR = BENCHMARK_DIR / "complex"
18+
19+
STDLIB_DEST = HALTING_DIR / "stdlib"
20+
PYPI_DEST = HALTING_DIR / "pypi_sources"
21+
SYNTHETIC_DEST = NON_HALTING_DIR / "synthetic"
22+
PARADOXES_DEST = NON_HALTING_DIR / "paradoxes"
23+
24+
# List of PyPI packages to download
25+
PYPI_PACKAGES = [
26+
"requests", "numpy", "pandas", "flask", "django",
27+
"scikit-learn", "matplotlib", "beautifulsoup4", "sqlalchemy", "celery"
28+
]
29+
30+
# Number of synthetic non-halting files to generate
31+
NUM_SYNTHETIC = 50
32+
33+
# Patterns for non-halting code
34+
NON_HALTING_PATTERNS = [
35+
("while_true", "while True:\n pass"),
36+
("unbounded_inc", "x = 0\nwhile x >= 0:\n x += 1"),
37+
("unbounded_dec", "x = 0\nwhile x <= 0:\n x -= 1"),
38+
("simple_recursion", "def f():\n f()\nf()"),
39+
("mutual_recursion", "def f():\n g()\ndef g():\n f()\nf()"),
40+
]
41+
42+
# Path to your project's scripts directory for paradoxes (adjust as needed)
43+
PROJECT_SCRIPTS_DIR = Path("scripts")
44+
45+
# --- Helper Functions ---
46+
def create_directory(path: Path):
    """Make sure ``path`` exists as a directory and report it on stdout.

    Missing parent directories are created too; a directory that is already
    present is left as it is.
    """
    path.mkdir(exist_ok=True, parents=True)
    message = f"Created/Ensured directory: {path}"
    print(message)
49+
50+
def collect_stdlib():
    """Copy the standard library's ``.py`` files into ``STDLIB_DEST``.

    The library location is taken from the directory that contains the
    ``shutil`` module. Files are stored flat; each destination name is the
    file's path relative to the library root with the path components joined
    by ``_`` (``json/decoder.py`` becomes ``json_decoder.py``), so modules
    that share a base name such as ``__init__.py`` no longer overwrite one
    another. Files that cannot be copied are reported and skipped.
    """
    create_directory(STDLIB_DEST)
    stdlib_path = Path(shutil.__file__).parent
    print(f"Found standard library at: {stdlib_path}")

    # Third-party installs are often located beneath the stdlib folder;
    # they are not part of the standard library, so the walk skips them.
    excluded_dirs = {"site-packages", "dist-packages"}

    file_count = 0
    for root, dirs, files in os.walk(stdlib_path):
        # Pruning in place stops os.walk from descending into these folders.
        dirs[:] = [d for d in dirs if d not in excluded_dirs]
        for file in files:
            if not file.endswith(".py"):
                continue
            source = Path(root) / file
            unique_name = "_".join(source.relative_to(stdlib_path).parts)
            try:
                shutil.copy(source, STDLIB_DEST / unique_name)
            except OSError as e:
                print(f"Could not copy {source}: {e}")
            else:
                file_count += 1

    print(f"Successfully copied {file_count} stdlib files.")
68+
69+
def download_and_unpack_pypi():
    """Download the ``PYPI_PACKAGES`` and collect their ``.py`` files.

    Steps:
      1. ``pip download --no-deps`` every package into ``PYPI_DEST``.
      2. Unpack each downloaded archive into ``PYPI_DEST / "unpacked"``.
      3. Copy every ``.py`` file found there flat into ``PYPI_DEST``. The
         destination name is the file's path relative to the unpack folder
         with the components joined by ``_``.
      4. Delete the unpacked archives and the temporary unpack folder.

    Raises:
        subprocess.CalledProcessError: if ``pip download`` exits non-zero.
            pip's captured stderr is printed before the error propagates.
    """
    create_directory(PYPI_DEST)

    # --- Step 1: Download packages using pip ---
    print("Downloading PyPI packages...")
    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "download", "--no-deps",
             "--dest", str(PYPI_DEST), *PYPI_PACKAGES],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        # The output is captured, so without this the reason would be lost.
        print(f"pip download failed:\n{e.stderr}")
        raise

    # --- Step 2: Unpack all archives first ---
    unpacked_dir = PYPI_DEST / "unpacked"
    create_directory(unpacked_dir)

    print("Unpacking archives...")
    archives_to_delete = []
    for archive in PYPI_DEST.iterdir():
        # Only process archive files, ignore directories
        if not archive.is_file():
            continue
        # Path.suffix only yields the last extension (".gz" for "x.tar.gz"),
        # so multi-part extensions are matched against the whole file name.
        lowered_name = archive.name.lower()
        try:
            if lowered_name.endswith((".tar.gz", ".tgz")):
                with tarfile.open(archive, "r:gz") as tar:
                    if hasattr(tarfile, "data_filter"):
                        # Rejects members that would land outside the
                        # destination (absolute paths, "..", unsafe links).
                        tar.extractall(path=unpacked_dir, filter="data")
                    else:
                        tar.extractall(path=unpacked_dir)
            elif lowered_name.endswith((".whl", ".zip")):
                with zipfile.ZipFile(archive, "r") as zip_ref:
                    zip_ref.extractall(path=unpacked_dir)
            else:
                # Not an archive (e.g. a .py file collected by an earlier run).
                continue
            archives_to_delete.append(archive)
        except (tarfile.TarError, zipfile.BadZipFile, EOFError) as e:
            print(f"Warning: Could not unpack {archive.name}: {e}. Skipping.")

    # --- Step 3: Collect all .py files from the unpacked directory ---
    print("Collecting .py files...")
    file_count = 0
    for root, _, files in os.walk(unpacked_dir):
        for file in files:
            if not file.endswith(".py"):
                continue
            source = Path(root) / file
            # The full relative path goes into the name: the parent folder
            # alone is not unique ("tests", "utils", ... occur in many
            # packages) and colliding files would be silently dropped.
            unique_name = "_".join(source.relative_to(unpacked_dir).parts)
            dest = PYPI_DEST / unique_name
            try:
                if not dest.exists():
                    shutil.copy(source, dest)
                    file_count += 1
            except OSError as e:
                print(f"Could not copy {source}: {e}")

    # --- Step 4: Clean up archives and temporary directory *after* all operations are done ---
    print("Cleaning up temporary files...")
    for archive in archives_to_delete:
        try:
            archive.unlink()
        except PermissionError as e:
            print(f"Warning: Could not delete archive {archive.name} immediately: {e}")

    try:
        shutil.rmtree(unpacked_dir)
    except PermissionError as e:
        print(f"Warning: Could not delete temporary directory {unpacked_dir} immediately: {e}")

    print(f"Successfully unpacked and collected {file_count} PyPI .py files.")
129+
130+
131+
def generate_synthetic_non_halting():
    """Write ``NUM_SYNTHETIC`` scripts built from ``NON_HALTING_PATTERNS``.

    The patterns are used round-robin; script number ``i`` is saved in
    ``SYNTHETIC_DEST`` as ``<pattern name>_<i>.py``.
    """
    create_directory(SYNTHETIC_DEST)
    written = 0
    for index in range(NUM_SYNTHETIC):
        # Cycle through the templates in order.
        slot = index % len(NON_HALTING_PATTERNS)
        label, source_text = NON_HALTING_PATTERNS[slot]
        target = SYNTHETIC_DEST / f"{label}_{index}.py"
        with open(target, 'w') as handle:
            handle.write(source_text)
        written += 1
    print(f"Successfully generated {written} synthetic non-halting files.")
141+
142+
def copy_paradoxes_and_classify():
    """Sort the project's own scripts into the benchmark categories.

    Every ``.py`` entry of ``PROJECT_SCRIPTS_DIR`` whose file name is listed
    below is copied to ``HALTING_DIR``, ``COMPLEX_DIR`` or ``PARADOXES_DEST``.
    Entries with unlisted names are ignored. Nothing is copied when the
    scripts directory does not exist.
    """
    for folder in (HALTING_DIR, PARADOXES_DEST, COMPLEX_DIR):
        create_directory(folder)

    if not PROJECT_SCRIPTS_DIR.exists():
        print(f"Warning: Scripts directory {PROJECT_SCRIPTS_DIR} not found. Skipping copy.")
        return

    # Known script base names per category.
    halting_names = (
        "bounded_loop.py",
        "dynamic_input.py",
        "halting.py",
        "self_referential.py",
        "simple_halting.py",
    )
    non_halting_names = (
        "complex_non_halting.py",
        "final_paradox.py",
        "mutating_paradox_A.py",
        "mutating_paradox_A_revised.py",
        "mutating_paradox_final_test.py",
        "non_halting.py",
        "obfuscated_paradox.py",
        "paradox.py",
        "polymorphic_termination_paradox.py",
        "semantic_paradox_A.py",
        "truly_obfuscated_paradox.py",
    )
    complex_names = ("ackermann.py", "collatz_conjecture.py")

    # Later entries win on duplicate keys, which keeps the lookup order
    # halting -> complex -> non-halting.
    destination_for = {
        **dict.fromkeys(non_halting_names, PARADOXES_DEST),
        **dict.fromkeys(complex_names, COMPLEX_DIR),
        **dict.fromkeys(halting_names, HALTING_DIR),
    }

    copied = 0
    for script in PROJECT_SCRIPTS_DIR.iterdir():
        if script.suffix != ".py":
            continue
        target_dir = destination_for.get(script.name)
        if target_dir is None:
            continue
        shutil.copy(script, target_dir / script.name)
        copied += 1

    print(f"Successfully copied and classified {copied} user scripts.")
173+
174+
# Placeholder step: the complex category is filled by hand.
def setup_complex():
    """Ensure ``COMPLEX_DIR`` exists and remind the user to populate it."""
    create_directory(COMPLEX_DIR)
    reminder = "Complex directory created. If you have curated complex cases (e.g., a Turing machine), add them manually."
    print(reminder)
179+
180+
def run_halting_analyzer():
    """Run a patched copy of ``main.py`` on the corpus and parse its output.

    The analyzer source is read from ``MAIN_PY``, its ``scripts_dir``
    assignment is rewritten to point at ``BENCHMARK_DIR``, and the patched
    copy is executed with the current interpreter (10-minute timeout).
    Stdout is scanned for ``[Analyzing]: <path>`` lines, each followed by a
    ``Result: <verdict>`` line.

    Returns:
        dict: script base name -> verdict string. Because only the base name
        is used as key, equally named files from different folders share one
        entry (the last verdict wins). An empty dict is returned when
        ``main.py`` is missing, cannot be patched, times out, or fails
        unexpectedly.
    """
    if not MAIN_PY.exists():
        print(f"Error: main.py not found at {MAIN_PY}. Ensure this script is in the project root.")
        return {}

    # Python source files are UTF-8 by default; don't depend on the locale.
    with open(MAIN_PY, 'r', encoding="utf-8") as f:
        main_code = f.read()

    # This replacement is fragile but works for this specific main.py
    original_assignment = "scripts_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'scripts')"
    if original_assignment not in main_code:
        # Without the patch the analyzer would silently scan another folder
        # and every benchmark file would be scored as "not found".
        print(f"Error: could not locate the scripts_dir assignment in {MAIN_PY}; unable to point the analyzer at {BENCHMARK_DIR}.")
        return {}
    # repr() yields a valid string literal even if the path contains quotes
    # or backslashes.
    patched_assignment = f"scripts_dir = {str(BENCHMARK_DIR.resolve())!r}"
    modified_code = main_code.replace(original_assignment, patched_assignment)

    # The copy is placed beside main.py rather than inside the corpus: the
    # interpreter puts the script's own directory first on sys.path, so any
    # sibling modules main.py imports stay importable, and the copy cannot
    # end up among the files being analyzed.
    # NOTE(review): assumes the directory of main.py is writable - confirm.
    temp_main_path = MAIN_PY.resolve().parent / "_benchmark_temp_main.py"

    analyzing_prefix = "[Analyzing]: "
    result_prefix = "Result: "

    print("Running analyzer on the entire corpus... this may take several minutes.")
    try:
        with open(temp_main_path, 'w', encoding="utf-8") as f:
            f.write(modified_code)

        # Run the modified main script
        process = subprocess.run([sys.executable, str(temp_main_path)], capture_output=True, text=True, timeout=600)  # 10-min timeout

        results = {}
        current_script = None
        for line in process.stdout.splitlines():
            if line.startswith(analyzing_prefix):
                # Extract just the filename from the path
                full_path = line[len(analyzing_prefix):].strip()
                current_script = os.path.basename(full_path)
            elif line.startswith(result_prefix) and current_script:
                results[current_script] = line[len(result_prefix):].strip()
                current_script = None

        if process.returncode != 0:
            print(f"Warning: analyzer exited with status {process.returncode}.")

        if process.stderr:
            print("\n--- Analyzer Errors (stderr) ---")
            print(process.stderr)
            print("------------------------------\n")

        return results
    except subprocess.TimeoutExpired:
        print("ERROR: The benchmark run timed out. The corpus may be too large or a script caused a severe hang.")
        return {}
    except Exception as e:
        # Top-level boundary of the benchmark run: report and fall back to
        # "no results" so the scoring phase can still run.
        print(f"An unexpected error occurred while running the analyzer: {e}")
        return {}
    finally:
        if temp_main_path.exists():
            temp_main_path.unlink()
234+
235+
def calculate_percentage(results):
    """Score analyzer verdicts against the corpus and print a summary.

    Args:
        results: mapping of script file name -> verdict string. Files that
            have no entry are scored as ``"error (not found in output)"``.

    Every ``.py`` file below the three category folders is checked. A verdict
    counts as a success when it is in the set accepted for the folder:
    halting accepts only its expected verdict, non-halting also accepts
    "impossible to determine", and complex accepts any of the three known
    verdicts. Mismatches are printed one per line, followed by the overall
    success rate.
    """
    categories = [
        (HALTING_DIR, "halts"),
        (NON_HALTING_DIR, "does not halt"),
        (COMPLEX_DIR, "impossible to determine"),
    ]
    checked = 0
    passed = 0

    print("\n--- Verifying Benchmark Results ---")
    for category_dir, expected in categories:
        label = category_dir.name
        if label == "halting":
            accepted = (expected,)
        elif label == "non-halting":
            # Correctly says "does not halt" OR safely defers.
            accepted = ("does not halt", "impossible to determine")
        elif label == "complex":
            # Deferring is fine, and so is either definite verdict.
            accepted = ("impossible to determine", "halts", "does not halt")
        else:
            accepted = ()

        for _root, _dirs, files in os.walk(category_dir):
            for file in files:
                if not file.endswith(".py"):
                    continue
                checked += 1
                analyzer_result = results.get(file, "error (not found in output)")
                if analyzer_result in accepted:
                    passed += 1
                else:
                    print(f"MISMATCH in {category_dir.name}: {file} -> Expected '{expected}', Got '{analyzer_result}'")

    if checked > 0:
        percentage = (passed / checked) * 100
        print(f"\n--- Practical Success Rate: {percentage:.2f}% ({passed}/{checked}) ---")
    else:
        print("No files were analyzed.")
271+
272+
# --- Main Execution ---
273+
if __name__ == "__main__":
    # Script entry point: build the corpus, run the analyzer, print the score.
    create_directory(BENCHMARK_DIR)

    print("--- Phase 1: Collecting Corpus ---")
    for build_step in (
        collect_stdlib,
        download_and_unpack_pypi,
        generate_synthetic_non_halting,
        copy_paradoxes_and_classify,
        setup_complex,
    ):
        build_step()

    print("\n--- Phase 2: Running Halting Analyzer ---")
    analysis_results = run_halting_analyzer()

    print("\n--- Phase 3: Calculating Final Score ---")
    calculate_percentage(analysis_results)

    print("\n--- Automation Complete ---")

0 commit comments

Comments
 (0)