Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
5675e97
Image reader and image support in `gather_evidence` (#1046)
jamesbraza Aug 6, 2025
d1cde22
Preventing Greek name from crashing `DocDetails` creation (#1048)
jamesbraza Aug 6, 2025
0caa926
Better invalid name logs (#1049)
jamesbraza Aug 6, 2025
f71d023
Multimodal PDF support (#1047)
jamesbraza Aug 7, 2025
d3760c9
Fix paperqa/configs link in README.md (#1051)
chrisranderson Aug 8, 2025
948423f
Fixing all broken links, `pymarkdown` (#1052)
jamesbraza Aug 9, 2025
d34543e
Documenting `DocMetadataTask`/`MetadataProvider` (#1050)
jamesbraza Aug 9, 2025
beb838e
chore(deps): lock file maintenance (#1053)
renovate[bot] Aug 9, 2025
81ea858
chore(deps): update actions/checkout action to v5 (#1058)
renovate[bot] Aug 11, 2025
193feb6
chore(deps): update actions/download-artifact action to v5 (#1059)
renovate[bot] Aug 11, 2025
72b7f26
Moved `mypy` to use `local` hook to unsilence it on missing dependenc…
jamesbraza Aug 11, 2025
6801155
Removed dead `patch` from `test_add_clinical_trials_to_docs` (#1056)
jamesbraza Aug 11, 2025
34ceb70
Restored `UnpaywallProvider` by updating expected response (#1057)
jamesbraza Aug 11, 2025
fab944a
Refreshed `test_crossref_journalquality_fields_filtering` cassette (#…
jamesbraza Aug 11, 2025
4862764
Updating `journal_quality.csv` from script (#1061)
jamesbraza Aug 11, 2025
2d58202
Better lookup failure message in `Settings.from_name` (#1064)
jamesbraza Aug 16, 2025
62afb8c
Lower bibtex logging to debug (#1067)
mskarlin Aug 19, 2025
687ce40
Fix: handle non-UTF-8 input in util.py
dmcgrath19 Aug 20, 2025
7a88d42
Add comments to utf-8 error handling
dmcgrath19 Aug 20, 2025
46e21d6
[pre-commit.ci lite] apply automatic fixes
pre-commit-ci-lite[bot] Aug 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Updating journal_quality.csv from script (#1061)
  • Loading branch information
jamesbraza authored Aug 11, 2025
commit 4862764f137e2eb63fa6865cfeccf2b3cbcf8b5b
79,484 changes: 40,113 additions & 39,371 deletions src/paperqa/clients/client_data/journal_quality.csv

Large diffs are not rendered by default.

152 changes: 147 additions & 5 deletions src/paperqa/clients/journal_quality.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,34 @@
from __future__ import annotations

import asyncio
import csv
import logging
import os
import tempfile
from collections.abc import Awaitable, Callable, Sequence
from pathlib import Path
from typing import Any, ClassVar

import anyio
import httpx
from pydantic import ValidationError
from rich.progress import (
BarColumn,
MofNCompleteColumn,
Progress,
TextColumn,
TimeElapsedColumn,
)

from paperqa.types import DocDetails

from .client_models import JournalQuery, MetadataPostProcessor

logger = logging.getLogger(__name__)

DEFAULT_JOURNAL_QUALITY_CSV_PATH = (
Path(__file__).parent / "client_data" / "journal_quality.csv"
)

# TODO: refresh script for journal quality data

Expand All @@ -25,13 +41,11 @@ class JournalQualityPostProcessor(MetadataPostProcessor[JournalQuery]):
def __init__(self, journal_quality_path: os.PathLike | str | None = None) -> None:
if journal_quality_path is None:
# Construct the path relative to module
self.journal_quality_path = str(
os.path.join(
os.path.dirname(__file__), "client_data", "journal_quality.csv"
)
self.journal_quality_path: str | os.PathLike = (
DEFAULT_JOURNAL_QUALITY_CSV_PATH
)
else:
self.journal_quality_path = str(journal_quality_path)
self.journal_quality_path = journal_quality_path
self.data: dict[str, Any] | None = None

def load_data(self) -> None:
Expand Down Expand Up @@ -72,3 +86,131 @@ def query_creator(self, doc_details: DocDetails, **kwargs) -> JournalQuery | Non
"Must have a valid journal name to query journal quality data."
)
return None


# SEE: https://en.wikipedia.org/wiki/JUFO
# Export endpoint of the JUFO portal (Finnish Publication Forum). The query
# requests only active channels (`isActive=true`), and each repeated `col`
# parameter selects one column to include in the downloaded CSV.
JUFO_PORTAL_DOWNLOAD_QUALITY_URL = (
    "https://jfp.csc.fi/jufoportal_base/api/download?query=&isActive=true&col=Jufo_ID"
    "&col=Name&col=Abbreviation&col=Level&col=ISSNL&col=ISSN1&col=ISSN2&col=ISBN"
    "&col=Other_Title&col=Title_details&col=Continues&col=Continued_by&col=Website"
    "&col=Country&col=country_code&col=Publisher&col=Language&col=lang_code3"
    "&col=lang_code2&col=Year_Start&col=Year_End&col=isScientific&col=isProfessional"
    "&col=isGeneral&col=Type_fi&col=Type_sv&col=Type_en&col=Jufo_History"
)


async def download_file(
    dest_path: str | os.PathLike,
    url: str = JUFO_PORTAL_DOWNLOAD_QUALITY_URL,
    client: httpx.AsyncClient | None = None,
) -> Path:
    """Stream the file at `url` into `dest_path`, showing a progress bar.

    Args:
        dest_path: Destination file path, opened in binary write mode.
        url: Source URL, defaulting to the JUFO portal CSV export.
        client: Optional httpx client to reuse. When omitted, a temporary
            client is created and closed around the download.

    Returns:
        The destination path as a `Path` instance.
    """
    dest_path = Path(dest_path)

    async def _stream_to_disk(http_client: httpx.AsyncClient) -> None:
        bar = Progress(
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TimeElapsedColumn(),
        )
        async with http_client.stream("GET", url, timeout=60) as response:
            response.raise_for_status()
            # A missing or zero Content-Length becomes None, which rich
            # renders as an indeterminate progress bar.
            total_bytes = int(response.headers.get("Content-Length", 0)) or None
            bar_task = bar.add_task(
                f"Downloading {dest_path.name}", total=total_bytes
            )
            with bar:
                async with await anyio.open_file(dest_path, "wb") as out_file:
                    async for piece in response.aiter_bytes(chunk_size=2048):
                        if piece:
                            await out_file.write(piece)
                            bar.update(bar_task, advance=len(piece))

    if client is not None:
        await _stream_to_disk(client)
    else:
        async with httpx.AsyncClient() as temp_client:
            await _stream_to_disk(temp_client)
    return dest_path


async def process_csv(
    file_path: str | os.PathLike,
    override_allowlist: Sequence[tuple[str, int]] | None = (
        ("annual review of pathology", 2),
        ("annual review of pathology: mechanisms of disease", 2),
        ("biochimica et biophysica acta (bba) - bioenergetics", 1),
        ("biochimica et biophysica acta (bba) - biomembranes", 1),
        ("biochimica et biophysica acta (bba) - gene regulatory mechanisms", 1),
        ("biochimica et biophysica acta (bba) - general subjects", 1),
        (
            "biochimica et biophysica acta (bba) - molecular and cell biology of lipids",
            1,
        ),
        ("biochimica et biophysica acta (bba) - molecular basis of disease", 1),
        ("biochimica et biophysica acta (bba) - molecular cell research", 1),
        ("biochimica et biophysica acta (bba) - proteins and proteomics", 1),
        ("biochimica et biophysica acta (bba) - reviews on cancer", 1),
        ("bmc evolutionary biology", 2),
        ("pnas", 3),
        ("proceedings of the national academy of sciences", 3),
    ),
    override_blocklist: Sequence[tuple[str, int]] | None = (("scientific reports", 0),),
    records_callback: Callable[[Sequence[tuple[str, int]]], Awaitable] | None = None,
) -> list[tuple[str, int]]:
    """Parse a JUFO CSV export into sorted (journal name, quality level) records.

    Args:
        file_path: Path to a CSV file with at least `Name` and `Level` columns.
        override_allowlist: Extra (name, level) records to force-include.
        override_blocklist: (name, level) records to drop from the result.
        records_callback: Optional awaitable called with the final records.

    Returns:
        Records sorted by their case-insensitive key, deduplicated on
        (lowercased name, level) with the original name casing preserved.
    """
    async with await anyio.open_file(file_path, encoding="utf-8") as f:
        raw_text = await f.read()

    rows = raw_text.splitlines()
    bar = Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TimeElapsedColumn(),
        MofNCompleteColumn(),
    )
    # Subtract one so the total excludes the header row.
    bar_task = bar.add_task("Processing", total=len(rows) - 1)
    # Keys are case-insensitive, values are case-sensitive
    deduped: dict[tuple[str, int], tuple[str, int]] = {}
    with bar:
        for entry in csv.DictReader(rows):
            # Non-numeric or missing levels fall back to the sentinel value.
            quality = (
                int(entry["Level"])
                if str(entry.get("Level", "")).isdigit()
                else DocDetails.UNDEFINED_JOURNAL_QUALITY
            )
            record = (entry["Name"], quality)
            deduped[record[0].lower(), record[1]] = record
            bar.update(bar_task, advance=1)
    for extra in override_allowlist or []:
        deduped[extra[0].lower(), extra[1]] = extra
    for banned in override_blocklist or []:
        deduped.pop((banned[0].lower(), banned[1]), None)
    ordered = [deduped[key] for key in sorted(deduped)]

    if records_callback is not None:
        await records_callback(ordered)
    return ordered


async def main() -> None:
    """Download the latest JUFO export and rewrite the bundled quality CSV.

    Fetches the portal CSV into a temporary directory, processes it into
    (name, quality) records, then overwrites the packaged CSV at
    `DEFAULT_JOURNAL_QUALITY_CSV_PATH` with lowercased names.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        raw_csv_path = await download_file(
            dest_path=Path(tmpdir) / "journal_quality.csv"
        )
        cleaned_records = await process_csv(raw_csv_path)

    with open(  # noqa: ASYNC230
        DEFAULT_JOURNAL_QUALITY_CSV_PATH, "w", encoding="utf-8"
    ) as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["clean_name", "quality"])
        writer.writerows(
            (name.lower(), quality) for name, quality in cleaned_records
        )


# Script entry point: running this module directly refreshes the bundled
# journal quality CSV.
if __name__ == "__main__":
    asyncio.run(main())

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading