diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index d5cf40f6d..000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,19 +0,0 @@ ---- -name: Codespell - -on: - push: - branches: [master] - pull_request: - branches: [master] - -jobs: - codespell: - name: Check for spelling errors - runs-on: ubuntu-latest - - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: Codespell - uses: codespell-project/actions-codespell@v1 diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 9797e6a9c..0f7967487 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -9,11 +9,16 @@ on: jobs: linux: name: ${{ matrix.PY }}-pytest - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 strategy: fail-fast: false matrix: - PY: ["3.8", "3.9", "3.10", "3.11", "3.12"] + PY: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + - "3.13" env: CIRUN: true @@ -27,7 +32,7 @@ jobs: - name: Setup conda uses: conda-incubator/setup-miniconda@v3 with: - environment-file: ci/environment-py38.yml + environment-file: ci/environment-linux.yml python-version: ${{ matrix.PY }} - name: Run Tests @@ -38,7 +43,7 @@ jobs: win: name: pytest-win - runs-on: windows-2019 + runs-on: windows-2022 env: CIRUN: true @@ -62,7 +67,7 @@ jobs: lint: name: lint - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@main - uses: actions/setup-python@main @@ -70,25 +75,9 @@ jobs: python-version: "3.11" - uses: pre-commit/action@main -# typecheck: -# runs-on: ubuntu-latest -# steps: -# - name: Checkout -# uses: actions/checkout@v4 -# -# - name: Setup conda -# uses: conda-incubator/setup-miniconda@v3 -# with: -# environment-file: ci/environment-typecheck.yml -# -# - name: mypy -# shell: bash -l {0} -# run: | -# mypy fsspec -# downstream: name: downstream - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Checkout @@ -121,11 +110,11 @@ jobs: fsspec_friends: name: ${{ matrix.FRIEND 
}}-pytest - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 strategy: fail-fast: false matrix: - FRIEND: [gcsfs, s3fs] + FRIEND: [s3fs, gcsfs] env: CIRUN: true @@ -150,8 +139,11 @@ jobs: shell: bash -l {0} run: | pip install -e . --no-deps - pip install -e ./${{ matrix.FRIEND }} --no-deps + pip list - name: Test shell: bash -l {0} - run: pytest -v ${{ matrix.FRIEND }} + run: | + cd ${{ matrix.FRIEND }} + pytest -v + cd .. diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 48e3c950d..8afc911e3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ exclude: > repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -13,11 +13,14 @@ repos: - id: check-json - id: check-yaml - repo: https://github.com/astral-sh/ruff-pre-commit - # Ruff version. - rev: v0.4.4 + rev: v0.12.2 hooks: # Run the linter. - - id: ruff + - id: ruff-check args: [ --fix, "--show-fixes"] - id: ruff-format types_or: [python] + - repo: https://github.com/codespell-project/codespell + rev: v2.4.0 + hooks: + - id: codespell diff --git a/README.md b/README.md index 9b19198d3..9b85e00cd 100644 --- a/README.md +++ b/README.md @@ -96,3 +96,11 @@ filesystem_spec repository to setup pre-commit hooks. ``black`` will now be run before you commit, reformatting any changed files. You can format without committing via ``pre-commit run`` or skip these checks with ``git commit --no-verify``. + +## Support + +Work on this repository is supported in part by: + +"Anaconda, Inc. - Advancing AI through open source." 
+ +anaconda logo diff --git a/ci/environment-friends.yml b/ci/environment-friends.yml index 114c85444..2126744af 100644 --- a/ci/environment-friends.yml +++ b/ci/environment-friends.yml @@ -9,7 +9,6 @@ dependencies: - pytest-cov - pytest-mock - pip - - pytest<8 - ujson - requests - decorator @@ -25,5 +24,8 @@ dependencies: - google-api-python-client - httpretty - aiobotocore - - "moto >=4,<5" + - moto - flask + - pip: + - git+https://github.com/fsspec/s3fs + - git+https://github.com/fsspec/gcsfs diff --git a/ci/environment-py38.yml b/ci/environment-linux.yml similarity index 100% rename from ci/environment-py38.yml rename to ci/environment-linux.yml diff --git a/ci/environment-typecheck.yml b/ci/environment-typecheck.yml deleted file mode 100644 index 1b7c482f5..000000000 --- a/ci/environment-typecheck.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: test_env -channels: - - conda-forge -dependencies: - - mypy=1.4.1 - - pyarrow - - python=3.8 - - pip - - pip: - - types-paramiko - - types-requests - - types-tqdm - - types-paramiko - - types-PyYAML - - types-ujson diff --git a/docs/source/api.rst b/docs/source/api.rst index 94c4cdd5b..15c857b77 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -7,17 +7,17 @@ User Functions -------------- .. autosummary:: - fsspec.available_compressions - fsspec.available_protocols - fsspec.filesystem - fsspec.fuse.run - fsspec.generic.rsync - fsspec.get_filesystem_class - fsspec.get_mapper - fsspec.gui.FileSelector - fsspec.open - fsspec.open_files - fsspec.open_local + available_compressions + available_protocols + filesystem + fuse.run + generic.rsync + get_filesystem_class + get_mapper + gui.FileSelector + open + open_files + open_local .. autofunction:: fsspec.available_compressions .. autofunction:: fsspec.available_protocols @@ -36,24 +36,24 @@ Base Classes ------------ .. 
autosummary:: - fsspec.archive.AbstractArchiveFileSystem - fsspec.asyn.AsyncFileSystem - fsspec.callbacks.Callback - fsspec.callbacks.DotPrinterCallback - fsspec.callbacks.NoOpCallback - fsspec.callbacks.TqdmCallback - fsspec.core.BaseCache - fsspec.core.OpenFile - fsspec.core.OpenFiles - fsspec.core.get_fs_token_paths - fsspec.core.url_to_fs - fsspec.dircache.DirCache - fsspec.FSMap - fsspec.generic.GenericFileSystem - fsspec.registry.register_implementation - fsspec.spec.AbstractBufferedFile - fsspec.spec.AbstractFileSystem - fsspec.spec.Transaction + archive.AbstractArchiveFileSystem + asyn.AsyncFileSystem + callbacks.Callback + callbacks.DotPrinterCallback + callbacks.NoOpCallback + callbacks.TqdmCallback + core.BaseCache + core.OpenFile + core.OpenFiles + core.get_fs_token_paths + core.url_to_fs + dircache.DirCache + FSMap + generic.GenericFileSystem + registry.register_implementation + spec.AbstractBufferedFile + spec.AbstractFileSystem + spec.Transaction .. autoclass:: fsspec.archive.AbstractArchiveFileSystem :members: @@ -107,30 +107,31 @@ Built-in Implementations ------------------------ .. 
autosummary:: - fsspec.implementations.arrow.ArrowFSWrapper - fsspec.implementations.arrow.HadoopFileSystem - fsspec.implementations.cached.CachingFileSystem - fsspec.implementations.cached.SimpleCacheFileSystem - fsspec.implementations.cached.WholeFileCacheFileSystem - fsspec.implementations.dask.DaskWorkerFileSystem - fsspec.implementations.data.DataFileSystem - fsspec.implementations.dbfs.DatabricksFileSystem - fsspec.implementations.dirfs.DirFileSystem - fsspec.implementations.ftp.FTPFileSystem - fsspec.implementations.git.GitFileSystem - fsspec.implementations.github.GithubFileSystem - fsspec.implementations.http.HTTPFileSystem - fsspec.implementations.jupyter.JupyterFileSystem - fsspec.implementations.libarchive.LibArchiveFileSystem - fsspec.implementations.local.LocalFileSystem - fsspec.implementations.memory.MemoryFileSystem - fsspec.implementations.reference.ReferenceFileSystem - fsspec.implementations.reference.LazyReferenceMapper - fsspec.implementations.sftp.SFTPFileSystem - fsspec.implementations.smb.SMBFileSystem - fsspec.implementations.tar.TarFileSystem - fsspec.implementations.webhdfs.WebHDFS - fsspec.implementations.zip.ZipFileSystem + implementations.arrow.ArrowFSWrapper + implementations.arrow.HadoopFileSystem + implementations.cached.CachingFileSystem + implementations.cached.SimpleCacheFileSystem + implementations.cached.WholeFileCacheFileSystem + implementations.dask.DaskWorkerFileSystem + implementations.data.DataFileSystem + implementations.dbfs.DatabricksFileSystem + implementations.dirfs.DirFileSystem + implementations.ftp.FTPFileSystem + implementations.gist.GistFileSystem + implementations.git.GitFileSystem + implementations.github.GithubFileSystem + implementations.http.HTTPFileSystem + implementations.jupyter.JupyterFileSystem + implementations.libarchive.LibArchiveFileSystem + implementations.local.LocalFileSystem + implementations.memory.MemoryFileSystem + implementations.reference.ReferenceFileSystem + 
implementations.reference.LazyReferenceMapper + implementations.sftp.SFTPFileSystem + implementations.smb.SMBFileSystem + implementations.tar.TarFileSystem + implementations.webhdfs.WebHDFS + implementations.zip.ZipFileSystem .. autoclass:: fsspec.implementations.arrow.ArrowFSWrapper :members: __init__ @@ -162,6 +163,9 @@ Built-in Implementations .. autoclass:: fsspec.implementations.ftp.FTPFileSystem :members: __init__ +.. autoclass:: fsspec.implementations.gist.GistFileSystem + :members: __init__ + .. autoclass:: fsspec.implementations.git.GitFileSystem :members: __init__ @@ -209,45 +213,82 @@ Built-in Implementations Other Known Implementations --------------------------- -- `abfs`_ for Azure Blob service -- `adl`_ for Azure DataLake storage + +Note that most of these projects are hosted outside of the `fsspec` organisation. Please read their +documentation carefully before using any particular package. + +- `abfs`_ for Azure Blob service, with protocol "abfs://" +- `adl`_ for Azure DataLake storage, with protocol "adl://" - `alluxiofs`_ to access fsspec implemented filesystem with Alluxio distributed cache -- `boxfs`_ for access to Box file storage -- `dropbox`_ for access to dropbox shares +- `boxfs`_ for access to Box file storage, with protocol "box://" +- `csvbase`_ for access to csvbase.com hosted CSV files, with protocol "csvbase://" +- `dropbox`_ for access to dropbox shares, with protocol "dropbox://" - `dvc`_ to access DVC/Git repository as a filesystem -- `gcsfs`_ for Google Cloud Storage +- `fsspec-encrypted`_ for transparent encryption on top of other fsspec filesystems. 
+- `gcsfs`_ for Google Cloud Storage, with protocol "gs://" or "gcs://" - `gdrive`_ to access Google Drive and shares (experimental) +- `git`_ to access Git repositories - `huggingface_hub`_ to access the Hugging Face Hub filesystem, with protocol "hf://" -- `lakefs`_ for lakeFS data lakes -- `ocifs`_ for access to Oracle Cloud Object Storage +- `hdfs-native`_ to access Hadoop filesystem, with protocol "hdfs://" +- `httpfs-sync`_ to access HTTP(s) files in a synchronous manner to offer an alternative to the aiohttp-based implementation. +- `ipfsspec`_ for the InterPlanetary File System (IPFS), with protocol "ipfs://" +- `irods`_ for access to iRODS servers, with protocol "irods://" +- `lakefs`_ for lakeFS data lakes, with protocol "lakefs://" +- `morefs`_ for `OverlayFileSystem`, `DictFileSystem`, and others +- `obstore`_: zero-dependency access to Amazon S3, Google Cloud Storage, and Azure Blob Storage using the underlying Rust `object_store`_ library, with protocols "s3://", "gs://", and "abfs://". 
+- `ocifs`_ for access to Oracle Cloud Object Storage, with protocol "oci://" - `ocilake`_ for OCI Data Lake storage - `ossfs`_ for Alibaba Cloud (Aliyun) Object Storage System (OSS) - `p9fs`_ for 9P (Plan 9 Filesystem Protocol) servers -- `s3fs`_ for Amazon S3 and other compatible stores +- `PyAthena`_ for S3 access to Amazon Athena, with protocol "s3://" or "s3a://" +- `PyDrive2`_ for Google Drive access +- `fsspec-proxy`_ for "pyscript:" URLs via a proxy server +- `s3fs`_ for Amazon S3 and other compatible stores, with protocol "s3://" +- `sshfs`_ for access to SSH servers, with protocol "ssh://" or "sftp://" +- `swiftspec`_ for OpenStack SWIFT, with protocol "swift://" - `tosfs`_ for ByteDance volcano engine Tinder Object Storage (TOS) - `wandbfs`_ to access Wandb run data (experimental) -- `webdav4`_ for WebDAV +- `wandbfsspec`_ to access Weights & Biases (experimental) +- `webdav4`_ for WebDAV, with protocol "webdav://" or "dav://" - `xrootd`_ for xrootd, with protocol "root://" +- `msgraphfs`_ for Microsoft storage (i.e. SharePoint) using the drive API through Microsoft Graph, with protocol "msgd://" .. _abfs: https://github.com/dask/adlfs .. _adl: https://github.com/dask/adlfs .. _alluxiofs: https://github.com/fsspec/alluxiofs .. _boxfs: https://github.com/IBM/boxfs -.. _dropbox: https://github.com/MarineChap/intake_dropbox +.. _csvbase: https://github.com/calpaterson/csvbase-client +.. _dropbox: https://github.com/fsspec/dropboxdrivefs .. _dvc: https://github.com/iterative/dvc +.. _fsspec-encrypted: https://github.com/thevgergroup/fsspec-encrypted +.. _fsspec-proxy: https://github.com/fsspec/fsspec-proxy .. _gcsfs: https://gcsfs.readthedocs.io/en/latest/ .. _gdrive: https://github.com/fsspec/gdrivefs +.. _git: https://github.com/iterative/scmrepo +.. _hdfs-native: https://github.com/Kimahriman/hdfs-native/blob/master/python/hdfs_native/fsspec.py +.. _httpfs-sync: https://github.com/moradology/httpfs-sync .. 
_huggingface_hub: https://huggingface.co/docs/huggingface_hub/main/en/guides/hf_file_system -.. _lakefs: https://github.com/appliedAI-Initiative/lakefs-spec -.. _ocifs: https://pypi.org/project/ocifs +.. _ipfsspec: https://github.com/fsspec/ipfsspec +.. _irods: https://github.com/xwcl/irods_fsspec +.. _lakefs: https://github.com/aai-institute/lakefs-spec +.. _morefs: https://github.com/iterative/morefs +.. _object_store: https://docs.rs/object_store/latest/object_store/ +.. _obstore: https://developmentseed.org/obstore/latest/ +.. _ocifs: https://ocifs.readthedocs.io/en/latest/ .. _ocilake: https://github.com/oracle/ocifs .. _ossfs: https://github.com/fsspec/ossfs .. _p9fs: https://github.com/pbchekin/p9fs-py +.. _PyAthena: https://github.com/laughingman7743/PyAthena +.. _PyDrive2: https://github.com/iterative/PyDrive2 .. _s3fs: https://s3fs.readthedocs.io/en/latest/ +.. _sshfs: https://github.com/fsspec/sshfs +.. _swiftspec: https://github.com/fsspec/swiftspec .. _tosfs: https://tosfs.readthedocs.io/en/latest/ .. _wandbfs: https://github.com/jkulhanek/wandbfs +.. _wandbfsspec: https://github.com/alvarobartt/wandbfsspec .. _webdav4: https://github.com/skshetry/webdav4 .. _xrootd: https://github.com/CoffeaTeam/fsspec-xrootd +.. _msgraphfs: https://github.com/acsone/msgraphfs .. _readbuffering: @@ -255,12 +296,12 @@ Read Buffering -------------- .. autosummary:: - fsspec.caching.BlockCache - fsspec.caching.BytesCache - fsspec.caching.MMapCache - fsspec.caching.ReadAheadCache - fsspec.caching.FirstChunkCache - fsspec.caching.BackgroundBlockCache + caching.BlockCache + caching.BytesCache + caching.MMapCache + caching.ReadAheadCache + caching.FirstChunkCache + caching.BackgroundBlockCache .. autoclass:: fsspec.caching.BlockCache :members: @@ -285,7 +326,7 @@ Utilities .. autosummary:: - fsspec.utils.read_block + utils.read_block .. 
autofunction:: fsspec.utils.read_block diff --git a/docs/source/async.rst b/docs/source/async.rst index 58334b333..dc44df381 100644 --- a/docs/source/async.rst +++ b/docs/source/async.rst @@ -152,3 +152,37 @@ available as the attribute ``.loop``. + +AsyncFileSystemWrapper +---------------------- + +The `AsyncFileSystemWrapper` class is an experimental feature that allows you to convert +a synchronous filesystem into an asynchronous one. This is useful for quickly integrating +synchronous filesystems into workflows that may expect `AsyncFileSystem` instances. + +Basic Usage +~~~~~~~~~~~ + +To use `AsyncFileSystemWrapper`, wrap any synchronous filesystem to work in an asynchronous context. +In this example, the synchronous `LocalFileSystem` is wrapped, creating an `AsyncFileSystem` instance +backed by the normal, synchronous methods of `LocalFileSystem`: + +.. code-block:: python + + import asyncio + import fsspec + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + async def async_copy_file(): + sync_fs = fsspec.filesystem('file') # by-default synchronous, local filesystem + async_fs = AsyncFileSystemWrapper(sync_fs) + return await async_fs._copy('/source/file.txt', '/destination/file.txt') + + asyncio.run(async_copy_file()) + +Limitations +----------- + +This is experimental. Users should not expect this wrapper to magically make things faster. +It is primarily provided to allow usage of synchronous filesystems with interfaces that expect +`AsyncFileSystem` instances. 
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index fea257b44..be950203e 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,171 @@ Changelog ========= +2025.9.0 +-------- + +Enhancements + +- include Last_modified info from HTTP headers to info (#1909) +- add optional semaphore to async-wrapper (#1908) + +Fixes + +- ensure cachingFSs show correct protocol (#1897) +- fix simplecache cat_ranges (#1892) + +Other + +- Style (#1894) + + +2025.7.0 +-------- + +Enhancements + +- only download HTML mime for http listing (#1889) +- add tos:// to registry (#1878) + +Fixes + +- use st_birthtime in localFS, if available (#1883) +- allow cat_* in simplecache (#1881) +- remove deprecated asyncio use (#1862) +- create event loop if it doesn't exist (#1857) + +Other + +- remove references to py38 (#1888) +- ruff updates (#1887, 1864) +- github rate limits in CI (#1879, 1877) +- acknowledge Anaconda support (#1876) +- add obstore to known implementations (#1875) +- add Microsoft storage to known implementations (#1853) +- use builtins zstd for py3.14 (#1874) +- gdrivefs -> gdrive_fsspec (#1858) +- windows version in CI (#1855) +- error message typo (#1854) + + +2025.5.1 +-------- + +Enhancements + +- file system for GitHub gists (#1791) + +Other + +- doc fixes (#1847, 1848) + +2025.5.0 +-------- + +Enhancements + +- add rm in github: (#1839) +- cachingFS performance, don't ls every time (#1833) +- pyscript: protocol and optional entrypoint (#1828) + +Fixes + +- improve cp in generic (#1835) +- fix ls in dbfs (#1834) +- fix parquet tests for pyarrow 20 (#1831) +- fix crosslink for local tempfile (#1829) +- keep permissions in local transaction (#1826) +- signature compatibility in local cp() (#1820) + +Other + +- add "gs" at mention of gcsfs in docs (#1840) +- lint style (#1837) +- glob docstring to reflect reality (#1825) + + +2025.3.2 +-------- + +- drop support for py3.8 + +2025.3.1 +-------- + +Enhancements + +- LFS support in 
github: (#1810) + +Fixes + +- json should be a method of requests shim (#1814) +- don't raise if known_implementation has no given error string (#1804) + +Other + +- rename protocols for sync-http (#1810) + + +2025.3.0 +-------- + +Enhancements + +- add pipe_file to HTTP (#1799, 1801) +- add sync http for pyodide (#1177) +- ls performance for local and detail=False (#1789) + +Fixes + +- dir/info consistency in dirfs (#1798) +- referenceFS async consistency (#1794, 1795) +- CI (#1793) + + +2025.2.0 +-------- + +Enhancements + +- add open() to referenceFS (#1778) + +Fixes + +- don't make async open() in async-wrapper (#1769) +- fix CI following dask-expr upstream change (#1781) +- cope with zarr3 "Buffer" objects in referenceFS (#1784) + +Other + +- use itemgetter in archiveFS (#1764) +- document that newline is included in readline(s) (#1770) +- format/spelling (#1774, 1779, 1780) + +2024.12.0 +--------- + +Enhancements + +- "exclusive" mode for writing (#1762, 1756, 174+) +- "tree" text display of filesystem contents (#1750) +- async wrapper for sync FSs (#1745) +- new known implementation: tosfs (#1739) +- consolidate block fetch requests (#1733) + +Fixes + +- better webHDFS proxies (# +- sync FSs in referenceFS (#1755) +- don't serialize file caches (#1753) +- race condition in local ls() (#1744) +- missing/nan references in parquet (#1738) +- _un_chain kwargs (#1736) +- async _cat_file in referenceFS (#1734) + +Other + +- fallback implementation for _fetch_range (#1732) + 2024.10.0 --------- diff --git a/docs/source/features.rst b/docs/source/features.rst index 907084e0d..949e47107 100644 --- a/docs/source/features.rst +++ b/docs/source/features.rst @@ -408,3 +408,21 @@ tqdm. + + +Exclusive write +--------------- + +Some backends support writing to a file only if it doesn't already exist. 
This may be +implemented for the following methods: +- pipe_file (with argument ``mode=='create'``) +- put_file (with argument ``mode=='create'``) +- open (with argument ``mode="xb"``) +Since some writes will be achieved in blocks, the timing of when the check is done is +not defined - it may be at the start or at the completion of the operation, depending +on the backend. + +If using exclusive mode on a file that does already exist, a ``FileExistsError`` will +be raised. + +This feature is currently included on a trial basis and may change in the future. diff --git a/docs/source/index.rst b/docs/source/index.rst index 8eb6b5091..b642ff833 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -120,6 +120,10 @@ The current list of known implementations can be found as follows api.rst changelog.rst + +These docs pages collect anonymous tracking data using goatcounter, and the +dashboard is available to the public: https://fsspec.goatcounter.com/ . + .. raw:: html