diff --git a/src/paperqa/clients/__init__.py b/src/paperqa/clients/__init__.py
index 7464993dc..64ee54a8a 100644
--- a/src/paperqa/clients/__init__.py
+++ b/src/paperqa/clients/__init__.py
@@ -7,7 +7,7 @@
 import aiohttp
 from lmi.utils import gather_with_concurrency
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 
 from paperqa.types import Doc, DocDetails
@@ -36,21 +36,33 @@
 class DocMetadataTask(BaseModel):
-    """Holder for provider and processor tasks."""
+    """Simple container pairing metadata providers with processors."""
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    providers: Collection[MetadataProvider]
-    processors: Collection[MetadataPostProcessor]
+    providers: Collection[MetadataProvider] = Field(
+        description=(
+            "Metadata providers allotted to this task."
+            " An example would be providers for Crossref and Semantic Scholar."
+        )
+    )
+    processors: Collection[MetadataPostProcessor] = Field(
+        description=(
+            "Metadata post-processors allotted to this task."
+            " An example would be a journal quality filter."
+        )
+    )
 
     def provider_queries(
         self, query: dict
     ) -> list[Coroutine[Any, Any, DocDetails | None]]:
+        """Set up query coroutines for each contained metadata provider."""
         return [p.query(query) for p in self.providers]
 
     def processor_queries(
         self, doc_details: DocDetails, session: aiohttp.ClientSession
     ) -> list[Coroutine[Any, Any, DocDetails]]:
+        """Set up process coroutines for each contained metadata post-processor."""
         return [
             p.process(copy.copy(doc_details), session=session) for p in self.processors
         ]
@@ -78,7 +90,6 @@ def __init__(
                 if nested, will query in order looking for termination criteria after
                 each. Will terminate early if either DocDetails.is_hydration_needed
                 is False OR if all requested fields are present in the DocDetails object.
-
         """
         self._session = session
         self.tasks: list[DocMetadataTask] = []
diff --git a/src/paperqa/clients/client_models.py b/src/paperqa/clients/client_models.py
index 5049f4d09..2c875653b 100644
--- a/src/paperqa/clients/client_models.py
+++ b/src/paperqa/clients/client_models.py
@@ -88,25 +88,28 @@ class JournalQuery(ClientQuery):
 
 
 class MetadataProvider(ABC, Generic[ClientQueryType]):
-    """Provide metadata from a query by any means necessary."""
+    """Provide metadata from a query by any means necessary.
+
+    An example is going from a DOI to full paper metadata using Semantic Scholar.
+    """
 
     async def query(self, query: dict) -> DocDetails | None:
-        return await self._query(self.query_transformer(query))
+        return await self._query(self.query_factory(query))
 
     @abstractmethod
     async def _query(self, query: ClientQueryType) -> DocDetails | None:
-        pass
+        """Run a query against the provider."""
 
     @abstractmethod
-    def query_transformer(self, query: dict) -> ClientQueryType:
-        pass
+    def query_factory(self, query: dict) -> ClientQueryType:
+        """Create a query object from unstructured query data."""
 
 
 class DOIOrTitleBasedProvider(MetadataProvider[DOIQuery | TitleAuthorQuery]):
     async def query(self, query: dict) -> DocDetails | None:
         try:
-            client_query = self.query_transformer(query)
+            client_query = self.query_factory(query)
             return await self._query(client_query)
         # We allow graceful failures, i.e. return "None" for both DOI errors and timeout errors
         # DOINotFoundError means the paper doesn't exist in the source, the timeout is to prevent
@@ -150,7 +153,7 @@ async def _query(self, query: DOIQuery | TitleAuthorQuery) -> DocDetails | None:
             TimeoutError: When the request takes too long on the client side
         """
 
-    def query_transformer(self, query: dict) -> DOIQuery | TitleAuthorQuery:
+    def query_factory(self, query: dict) -> DOIQuery | TitleAuthorQuery:
         try:
             if "doi" in query:
                 return DOIQuery(**query)
@@ -169,7 +172,6 @@ class MetadataPostProcessor(ABC, Generic[ClientQueryType]):
     MetadataPostProcessor should be idempotent and not order-dependent, i.e. all
     MetadataPostProcessor instances should be able to run in parallel.
-
     """
 
     async def process(self, doc_details: DocDetails, **kwargs) -> DocDetails:
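
Below is a minimal, hypothetical sketch (not part of this diff) of what a downstream provider looks like against the renamed hooks: query_factory() builds the structured query object and _query() resolves it. The StaticDOIProvider name, its in-memory constructor, and the exact import paths are assumptions for illustration only.

# Illustration only -- not part of the diff above. Names and import paths are
# assumptions; only MetadataProvider, DOIQuery, and DocDetails come from the PR.
from paperqa.clients.client_models import DOIQuery, MetadataProvider
from paperqa.types import DocDetails


class StaticDOIProvider(MetadataProvider[DOIQuery]):
    """Toy provider resolving DOIs from an in-memory table instead of a web API."""

    def __init__(self, known: dict[str, DocDetails]):
        self._known = known

    def query_factory(self, query: dict) -> DOIQuery:
        # Build the structured query object from the unstructured query dict,
        # mirroring DOIOrTitleBasedProvider.query_factory in this diff.
        return DOIQuery(**query)

    async def _query(self, query: DOIQuery) -> DocDetails | None:
        # Returning None signals the paper wasn't found in this source
        # (assumes DOIQuery exposes a .doi field).
        return self._known.get(query.doi)

A provider like this could then be supplied in a DocMetadataTask's providers collection alongside the existing Crossref and Semantic Scholar clients.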