Finishing docs and tests

pandas-dev · WillAyd · Jul 3, 2025 · Jun 12, 2025 · Jun 12, 2025 · Jun 12, 2025
commit 1ca77c1cce8e67364f5b9f43b3321cd66723354f
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
@@ -489,6 +489,69 @@ registers the default "matplotlib" backend as follows.
 More information on how to implement a third-party plotting backend can be found at
 https://github.com/pandas-dev/pandas/blob/main/pandas/plotting/__init__.py#L1.
 
+.. _extending.io-engines:
+
+IO engines
+-----------
+
+pandas provides several IO connectors such as :func:`read_csv` or :meth:`to_parquet`, and many
+of those support multiple engines. For example, :func:`read_csv` supports the ``python``, ``c``
+and ``pyarrow`` engines, each with its advantages and disadvantages, making each more appropriate
+for certain use cases.
+
+Third-party package developers can implement engines for any of the pandas readers and writers.
+When a ``pandas.read_*`` function or ``DataFrame.to_*`` method is called with an ``engine="<name>"``
+that is not known to pandas, pandas will look into the entry points registered in the group
+``pandas.io_engine`` by the packages in the environment, and will call the corresponding method.
+
+An engine is a simple Python class which implements one or more of the pandas readers and writers
+as class methods:
+
+.. code-block:: python
+
+    class EmptyDataEngine:
+        @classmethod
+        def read_json(cls, path_or_buf=None, **kwargs):
+            return pd.DataFrame()
+
+        @classmethod
+        def to_json(cls, path_or_buf=None, **kwargs):
+            with open(path_or_buf, "w") as f:
+                f.write("")  # write() needs an argument; this engine emits no data
+
+        @classmethod
+        def read_clipboard(cls, sep='\\s+', dtype_backend=None, **kwargs):
+            return pd.DataFrame()
+
+A single engine can support multiple readers and writers. When possible, it is a good practice for
+an engine to provide both a reader and a writer for the supported formats. But it is possible to
+provide just one of them.
+
+The package implementing the engine needs to create an entry point for pandas to be able to discover
+it. This is done in ``pyproject.toml``:
+
+.. code-block:: toml
+
+    [project.entry-points."pandas.io_engine"]
+    empty = "empty_data:EmptyDataEngine"
+
+The first line should always be the same, creating the entry point in the ``pandas.io_engine`` group.
+In the second line, ``empty`` is the name of the engine, and ``empty_data:EmptyDataEngine`` is where
+to find the engine class in the package (``empty_data`` is the module name in this case).
+
+If a user has the example package installed, then it is possible to use:
+
+.. code-block:: python
+
+    pd.read_json("myfile.json", engine="empty")
+
+When pandas detects that no ``empty`` engine exists for the ``read_json`` reader in pandas, it will
+look at the entry points, find the ``EmptyDataEngine`` engine, and call its ``read_json``
+method with the arguments provided by the user (except the ``engine`` parameter).
+
+To avoid conflicts in the names of engines, we keep an "IO engines" section in our
+`Ecosystem page <https://pandas.pydata.org/community/ecosystem.html#io-engines>`_.
+
 .. _extending.pandas_priority:
 
 Arithmetic with 3rd party types

diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -1340,11 +1340,10 @@ def _get_io_engine(name: str):
         for entry_point in entry_points().select(group="pandas.io_engine"):
             package_name = entry_point.dist.metadata["Name"]
             if entry_point.name in _io_engines:
-                _io_engines[entry_point.name]._other_providers.append(package_name)
+                _io_engines[entry_point.name]._packages.append(package_name)
             else:
                 _io_engines[entry_point.name] = entry_point.load()
-                _io_engines[entry_point.name]._provider_name = package_name
-                _io_engines[entry_point.name]._other_providers = []
+                _io_engines[entry_point.name]._packages = [package_name]
 
     try:
         engine = _io_engines[name]
@@ -1354,23 +1353,22 @@ def _get_io_engine(name: str):
             "after installing the package that provides them."
         ) from err
 
-    if engine._other_providers:
+    if len(engine._packages) > 1:
         msg = (
             f"The engine '{name}' has been registered by the package "
-            f"'{engine._provider_name}' and will be used. "
+            f"'{engine._packages[0]}' and will be used. "
         )
-        if len(engine._other_providers):
+        if len(engine._packages) == 2:
             msg += (
-                "The package '{engine._other_providers}' also tried to register "
+                f"The package '{engine._packages[1]}' also tried to register "
                 "the engine, but it couldn't because it was already registered."
             )
         else:
             msg += (
-                "Other packages that tried to register the engine, but they couldn't "
-                "because it was already registered are: "
-                f"{str(engine._other_providers)[1:-1]}."
+                f"The packages {str(engine._packages[1:])[1:-1]} also tried to register"
+                " the engine, but they couldn't because it was already registered."
             )
-        warnings.warn(RuntimeWarning, msg, stacklevel=find_stack_level())
+        warnings.warn(msg, RuntimeWarning, stacklevel=find_stack_level())
 
     return engine
 

diff --git a/pandas/tests/io/test_io_engines.py b/pandas/tests/io/test_io_engines.py
@@ -1,16 +1,57 @@
+from types import SimpleNamespace
+
 import pytest
 
+import pandas._testing as tm
+
 from pandas.io import common
 
 
+class _MockIoEngine:  # stand-in third-party engine with a single reader
+    @classmethod
+    def read_foo(cls, fname):
+        return "third-party"
+
+
 @pytest.fixture
 def patch_engine(monkeypatch):
-    class MockIoEngine:
-        @classmethod
-        def read_foo(cls, fname):
-            return "third-party"
+    monkeypatch.setattr(common, "_get_io_engine", lambda name: _MockIoEngine)
+
+
+@pytest.fixture
+def patch_entry_points(monkeypatch):
+    class MockEntryPoint:
+        name = "myengine"
+        dist = SimpleNamespace(metadata={"Name": "mypackage"})
+
+        @staticmethod
+        def load():
+            return _MockIoEngine
 
-    monkeypatch.setattr(common, "_get_io_engine", lambda name: MockIoEngine)
+    class MockDuplicate1:
+        name = "duplicate"
+        dist = SimpleNamespace(metadata={"Name": "package1"})
+
+        @staticmethod
+        def load():
+            return SimpleNamespace(read_foo=lambda fname: "dup1")
+
+    class MockDuplicate2:  # second claimant of the "duplicate" engine name
+        name = "duplicate"
+        dist = SimpleNamespace(metadata={"Name": "package2"})
+
+        @staticmethod
+        def load():
+            return SimpleNamespace(read_foo=lambda fname: "dup2")
+
+    monkeypatch.setattr(common, "_io_engines", None)
+    monkeypatch.setattr(
+        common,
+        "entry_points",
+        lambda: SimpleNamespace(
+            select=lambda group: [MockEntryPoint, MockDuplicate1, MockDuplicate2]
+        ),
+    )
 
 
 class TestIoEngines:
@@ -46,3 +87,19 @@ def read_bar(fname, engine=None):
         msg = "'third-party' does not provide a 'read_bar'"
         with pytest.raises(ValueError, match=msg):
             read_bar("myfile.foo", engine="third-party")
+
+    def test_correct_io_engine(self, patch_entry_points):
+        result = common._get_io_engine("myengine")  # loaded from MockEntryPoint
+        assert result is _MockIoEngine
+
+    def test_unknown_io_engine(self, patch_entry_points):
+        with pytest.raises(ValueError, match="'unknown' is not a known engine"):
+            common._get_io_engine("unknown")  # no mocked entry point has this name
+
+    def test_duplicate_engine(self, patch_entry_points):
+        with tm.assert_produces_warning(
+            RuntimeWarning,
+            match="'duplicate' has been registered by the package 'package1'",
+        ):
+            result = common._get_io_engine("duplicate")
+        assert result.read_foo("myfile.foo") == "dup1"  # first registration is kept
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
@@ -712,6 +712,18 @@ authors to coordinate on the namespace.
   | [staircase](https://www.staircase.dev/)                              | `sc`       | `Series`, `DataFrame` |
   | [woodwork](https://github.com/alteryx/woodwork)                      | `slice`    | `Series`, `DataFrame` |
 
+## IO engines
+
+Table with the third-party [IO engines](https://pandas.pydata.org/docs/development/extending.html#io-engines)
+available to `read_*` functions and `DataFrame.to_*` methods.
+
+  | Engine name     | Library                                               | Supported formats               |
+  | ----------------|------------------------------------------------------ | ------------------------------- |
+  |                 |                                                       |                                 |
+
+IO engines can be used by specifying the engine when calling a reader or writer
+(e.g. `pd.read_csv("myfile.csv", engine="myengine")`).
+
 ## Development tools
 
 ### [pandas-stubs](https://github.com/VirtusLab/pandas-stubs)