Skip to content
Merged
Next Next commit
New third-party IO engines
  • Loading branch information
datapythonista committed Jun 12, 2025
commit f33778c9030dda546584d1f7c287a5d91383ca38
149 changes: 149 additions & 0 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@
import warnings
import zipfile

import pkg_resources

from pandas._typing import (
BaseBuffer,
ReadCsvBuffer,
Expand Down Expand Up @@ -90,6 +92,10 @@

from pandas import MultiIndex

# registry of I/O engines. It is populated the first time a non-core
# pandas engine is used
_io_engines = None


@dataclasses.dataclass
class IOArgs:
Expand Down Expand Up @@ -1282,3 +1288,146 @@ def dedup_names(
counts[col] = cur_count + 1

return names


def _engine_func(format_name: str, engine_name: str, is_writer: bool):
    """
    Return the engine function for a given format and operation.

    pandas I/O engines can be registered via entry points. The first time this
    function is called it will register all the entry points of the
    "pandas.io_engine" group and cache them in the global `_io_engines`
    variable.

    Engines are implemented as classes with the `read_<format>` and
    `to_<format>` methods (classmethods) for the formats they wish to provide.
    This function will return the method from the engine and format being
    requested.

    Parameters
    ----------
    format_name : str
        The format such as 'csv', 'parquet', 'json', 'html', etc.
    engine_name : str
        The engine name provided by the user in `engine=<value>`.
    is_writer : bool
        `True` to return the `to_<format>` function, `False` to return the
        `read_<format>` one.

    Raises
    ------
    ValueError
        If no engine is registered under `engine_name`, or the engine does not
        provide the requested reader/writer method.

    Examples
    --------
    An engine is implemented with a class like:

    .. code-block:: python

        class DummyEngine:
            @classmethod
            def read_csv(cls, filepath_or_buffer, **kwargs):
                # the engine signature must match the pandas method signature
                return pd.DataFrame()

    It must be registered as an entry point with the engine name, using the
    ``module:attribute`` entry point value syntax:

    .. code-block:: toml

        [project.entry-points."pandas.io_engine"]
        dummy = "pandas.io.dummy:DummyEngine"

    Then the `read_csv` method of the engine can be retrieved with:

    .. code-block:: python

        func = _engine_func(format_name="csv", engine_name="dummy", is_writer=False)

    This is used internally to dispatch the next pandas call to the engine caller:

    .. code-block:: python

        df = read_csv("myfile.csv", engine="dummy")
    """
    global _io_engines

    if _io_engines is None:
        # `pkg_resources` is deprecated in setuptools; `importlib.metadata` is
        # the stdlib replacement for entry point discovery (the `group` keyword
        # requires Python >= 3.10). Imported lazily so the cost is only paid
        # the first time a third-party engine is requested.
        from importlib.metadata import entry_points

        _io_engines = {
            entry_point.name: entry_point.load()
            for entry_point in entry_points(group="pandas.io_engine")
        }

    try:
        engine_class = _io_engines[engine_name]
    except KeyError as err:
        raise ValueError(
            f"'{engine_name}' is not a known engine. Some engines are only "
            "available after installing the package that provides them."
        ) from err

    func_name = f"to_{format_name}" if is_writer else f"read_{format_name}"
    try:
        return getattr(engine_class, func_name)
    except AttributeError as err:
        raise ValueError(
            f"The engine '{engine_name}' does not provide a '{func_name}' function"
        ) from err


def _extract_io_function_info(func_name):
"""
Return the format and if it's a reader or writer from a function name like read_csv.
"""
op_type, format_name = func_name.split("_", maxsplit=1)
if op_type == "read":
is_writer = False
elif op_type == "to":
is_writer = True
else:
raise ValueError(
"Unable to extract info from the function name '{func_name}'. "
"The expected format is `read_<format> or `to_<format>`."
)

return format_name, is_writer


def allow_third_party_engines(skip_engines: list[str] | None = None):
"""
Decorator to avoid boilerplate code when allowing readers and writers to use
third-party engines.

The decorator will introspect the function to know which format should be obtained,
and to know if it's a reader or a writer. Then it will check if the engine has been
registered, and if it has, it will dispatch the execution to the engine with the
arguments provided by the user.

Parameters
----------
skip_engines : list of str, optional
For engines that are implemented in pandas, we want to skip them for this engine
dispatching system. They should be specified in this parameter.

Examples
--------
The decorator works both with the `skip_engines` parameter, or without:

>>> class DataFrame:
... @allow_third_party_engines(["python", "c", "pyarrow"])
... def read_csv(filepath_or_buffer, **kwargs):
... pass
...
... @allow_third_party_engines
... def read_sas(filepath_or_buffer, **kwargs):
... pass
"""

def decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
if "engine" in kwargs and kwargs["engine"] not in skip_engines:
format_name, is_writer = _extract_io_function_info(func.__name__)
engine_func = _engine_func(
format_name=format_name,
engine_name=kwargs.pop("engine"),
is_writer=is_writer,
)
return engine_func(*args, **kwargs)
else:
return func(*args, **kwargs)

return wrapper

if callable(skip_engines):
return decorator(skip_engines)
return decorator
8 changes: 8 additions & 0 deletions pandas/io/iceberg.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@

from pandas import DataFrame

from pandas.io.common import allow_third_party_engines


@allow_third_party_engines()
def read_iceberg(
table_identifier: str,
catalog_name: str | None = None,
Expand All @@ -18,6 +21,7 @@ def read_iceberg(
snapshot_id: int | None = None,
limit: int | None = None,
scan_properties: dict[str, Any] | None = None,
engine: str | None = None,
) -> DataFrame:
"""
Read an Apache Iceberg table into a pandas DataFrame.
Expand Down Expand Up @@ -52,6 +56,10 @@ def read_iceberg(
scan_properties : dict of {str: obj}, optional
Additional Table properties as a dictionary of string key value pairs to use
for this scan.
engine : str, optional
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the read_* and to_* signatures also have an engine_kwargs: dict[str, Any] | None argument to allow specific engine arguments to be passes per implementation?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very good point. In read_parquet we already have a **kwargs for engine specific arguments. In map, apply... it's a normal engine_kwargs since **kwargs is used in some cases for the udf keyword arguments. I think for IO readers/writers **kwargs as read_parquet does is fine.

I didn't want to add the engine to all connectors in this PR to keep it simpler, but I'm planning to follow up with another PR that adds it, and adds **kwargs for connectors where it's not there already. Surely happy to add both things here if you prefer, just thought it would make reviewing simpler to keep the implementation separate from all the changes to parameters.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if engine-specific kwargs are needed, isn't that a good reason to use engine.read_whatever(path, **kwargs) instead of pd.read_[...]?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good point. Thinking about readers we don't care about I think what you propose is the best choice. And this PR doesn't really prevent that from happening anyway. But for readers we cared enough to include in pandas, I think this new interface offers an advantage. For example, there was some discussion on whether we should move the fastparquet engine out of pandas, Patrick suggested it. I think this interface allows moving the fastparquet engine to the fastparquet package, users with fastparquet installed will still have it available in the same way as it is now, but we can forget about it.

Of course discussions about moving readers out of pandas will have to happen later. But this interface seems quite useful and it's very simple, so in my opinion it's a good deal.

The engine to use. Engines can be installed via third-party packages. For an
updated list of existing pandas I/O engines check the I/O engines section of
our Ecosystem page.

Returns
-------
Expand Down