From c2de78964fa1487a86af7c5e5b647014910c5d2a Mon Sep 17 00:00:00 2001 From: James Braza Date: Mon, 22 Dec 2025 09:39:58 -0500 Subject: [PATCH 1/2] Created a test to check for invalid configs --- tests/test_configs.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_configs.py b/tests/test_configs.py index 90b9c34c8..a45c576b7 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -1,3 +1,4 @@ +import importlib.resources import os import pathlib from unittest.mock import patch @@ -6,6 +7,7 @@ from pydantic import ValidationError from pytest_subtests import SubTests +import paperqa.configs from paperqa.prompts import citation_prompt from paperqa.settings import ( AgentSettings, @@ -205,3 +207,15 @@ def test_citation_prompt_current_year(): f"Citation prompt should contain '{expected_year_text}' but got:" f" {citation_prompt}" ) + + +def test_validity_of_bundled_configs(subtests: SubTests) -> None: + for config_file in [ + f + for f in importlib.resources.files(paperqa.configs).iterdir() + if f.name.endswith(".json") + ]: + config_name = config_file.name.removesuffix(".json") + with subtests.test(msg=config_name): + settings = get_settings(config_name) + assert isinstance(settings, Settings) From 30540fa3b48c1d57f3d470dec0d3f6d1485ff3f7 Mon Sep 17 00:00:00 2001 From: James Braza Date: Mon, 22 Dec 2025 09:42:21 -0500 Subject: [PATCH 2/2] Updated configs for paper-qa==2025.12.17 as needed --- src/paperqa/configs/clinical_trials.json | 6 ++++-- src/paperqa/configs/contracrow.json | 9 +++++---- src/paperqa/configs/high_quality.json | 6 ++++-- src/paperqa/configs/openreview.json | 6 ++++-- src/paperqa/configs/search_only_clinical_trials.json | 6 ++++-- src/paperqa/configs/tier2_limits.json | 6 ++++-- src/paperqa/configs/tier3_limits.json | 6 ++++-- src/paperqa/configs/tier4_limits.json | 6 ++++-- src/paperqa/configs/tier5_limits.json | 6 ++++-- src/paperqa/configs/wikicrow.json | 9 +++++---- 10 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/paperqa/configs/clinical_trials.json b/src/paperqa/configs/clinical_trials.json index f7235b4b3..bbfa56b2f 100644 --- a/src/paperqa/configs/clinical_trials.json +++ b/src/paperqa/configs/clinical_trials.json @@ -15,7 +15,9 @@ }, "parsing": { "use_doc_details": true, - "chunk_size": 9000, - "overlap": 750 + "reader_config": { + "chunk_chars": 9000, + "overlap": 750 + } } } diff --git a/src/paperqa/configs/contracrow.json b/src/paperqa/configs/contracrow.json index 3985ec5e8..47d58935d 100644 --- a/src/paperqa/configs/contracrow.json +++ b/src/paperqa/configs/contracrow.json @@ -20,13 +20,14 @@ "answer_filter_extra_background": false }, "parsing": { - "chunk_size": 7000, "use_doc_details": true, - "overlap": 250, + "reader_config": { + "chunk_chars": 7000, + "overlap": 250 + }, "citation_prompt": "Provide the citation for the following text in MLA Format. Do not write an introductory sentence. If reporting date accessed, the current year is 2024\n\n{text}\n\nCitation:", "structured_citation_prompt": "Extract the title, authors, and doi as a JSON from this MLA citation. If any field can not be found, return it as null. Use title, authors, and doi as keys, author's value should be a list of authors. {citation}\n\nCitation JSON:", - "disable_doc_valid_check": false, - "chunking_algorithm": "simple_overlap" + "disable_doc_valid_check": false }, "prompts": { "summary": "Summarize the excerpt below to help answer a question.\n\nExcerpt from {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\nDo not directly answer the question, instead summarize to give evidence to help answer the question. Stay detailed; report specific numbers, equations, or direct quotes (marked with quotation marks). Reply \"Not applicable\" if the excerpt is irrelevant. At the end of your response, provide an integer score from 1-10 on a newline indicating relevance to question. Do not explain your score.\n\nRelevant Information Summary ({summary_length}):", diff --git a/src/paperqa/configs/high_quality.json b/src/paperqa/configs/high_quality.json index 1fac7788b..16cc0ff29 100644 --- a/src/paperqa/configs/high_quality.json +++ b/src/paperqa/configs/high_quality.json @@ -6,7 +6,9 @@ }, "parsing": { "use_doc_details": true, - "chunk_size": 7000, - "overlap": 250 + "reader_config": { + "chunk_chars": 7000, + "overlap": 250 + } } } diff --git a/src/paperqa/configs/openreview.json b/src/paperqa/configs/openreview.json index 0bf1bc491..562e7d458 100644 --- a/src/paperqa/configs/openreview.json +++ b/src/paperqa/configs/openreview.json @@ -30,7 +30,9 @@ "return_paper_metadata": false }, "parsing": { - "chunk_size": 3000000, - "use_doc_details": false + "use_doc_details": false, + "reader_config": { + "chunk_chars": 3000000 + } } } diff --git a/src/paperqa/configs/search_only_clinical_trials.json b/src/paperqa/configs/search_only_clinical_trials.json index d6c105b1a..1b04d473a 100644 --- a/src/paperqa/configs/search_only_clinical_trials.json +++ b/src/paperqa/configs/search_only_clinical_trials.json @@ -14,7 +14,9 @@ }, "parsing": { "use_doc_details": true, - "chunk_size": 9000, - "overlap": 750 + "reader_config": { + "chunk_chars": 9000, + "overlap": 750 + } } } diff --git a/src/paperqa/configs/tier2_limits.json b/src/paperqa/configs/tier2_limits.json index 238974c93..7ac9dfbe9 100644 --- a/src/paperqa/configs/tier2_limits.json +++ b/src/paperqa/configs/tier2_limits.json @@ -6,8 +6,10 @@ }, "parsing": { "use_doc_details": true, - "chunk_size": 7000, - "overlap": 250 + "reader_config": { + "chunk_chars": 7000, + "overlap": 250 + } }, "prompts": { "use_json": true diff --git a/src/paperqa/configs/tier3_limits.json b/src/paperqa/configs/tier3_limits.json index 4c21bda97..eaed70ce0 100644 --- a/src/paperqa/configs/tier3_limits.json +++ b/src/paperqa/configs/tier3_limits.json @@ -6,8 +6,10 @@ }, "parsing": { "use_doc_details": true, - "chunk_size": 7000, - "overlap": 250 + "reader_config": { + "chunk_chars": 7000, + "overlap": 250 + } }, "prompts": { "use_json": true diff --git a/src/paperqa/configs/tier4_limits.json b/src/paperqa/configs/tier4_limits.json index ddc6879d3..c4b199aff 100644 --- a/src/paperqa/configs/tier4_limits.json +++ b/src/paperqa/configs/tier4_limits.json @@ -6,8 +6,10 @@ }, "parsing": { "use_doc_details": true, - "chunk_size": 7000, - "overlap": 250 + "reader_config": { + "chunk_chars": 7000, + "overlap": 250 + } }, "prompts": { "use_json": true diff --git a/src/paperqa/configs/tier5_limits.json b/src/paperqa/configs/tier5_limits.json index 2b15880fe..a7ea5566c 100644 --- a/src/paperqa/configs/tier5_limits.json +++ b/src/paperqa/configs/tier5_limits.json @@ -6,8 +6,10 @@ }, "parsing": { "use_doc_details": true, - "chunk_size": 7000, - "overlap": 250 + "reader_config": { + "chunk_chars": 7000, + "overlap": 250 + } }, "prompts": { "use_json": true diff --git a/src/paperqa/configs/wikicrow.json b/src/paperqa/configs/wikicrow.json index d1ba4f753..38b4de4f8 100644 --- a/src/paperqa/configs/wikicrow.json +++ b/src/paperqa/configs/wikicrow.json @@ -20,13 +20,14 @@ "answer_filter_extra_background": false }, "parsing": { - "chunk_size": 7000, "use_doc_details": true, - "overlap": 1750, + "reader_config": { + "chunk_chars": 7000, + "overlap": 1750 + }, "citation_prompt": "Provide the citation for the following text in MLA Format. Do not write an introductory sentence. If reporting date accessed, the current year is 2024\n\n{text}\n\nCitation:", "structured_citation_prompt": "Extract the title, authors, and doi as a JSON from this MLA citation. If any field can not be found, return it as null. Use title, authors, and doi as keys, author's value should be a list of authors. {citation}\n\nCitation JSON:", - "disable_doc_valid_check": false, - "chunking_algorithm": "simple_overlap" + "disable_doc_valid_check": false }, "prompts": { "summary": "Summarize the excerpt below to help answer a question.\n\nExcerpt from {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\nDo not directly answer the question, instead summarize to give evidence to help answer the question. Stay detailed; report specific numbers, equations, or direct quotes (marked with quotation marks). Reply \"Not applicable\" if the excerpt is irrelevant. At the end of your response, provide an integer score from 1-10 on a newline indicating relevance to question. Do not explain your score.\n\nRelevant Information Summary ({summary_length}):",