diff --git a/.coveragerc b/.coveragerc index 05b56af..cea0409 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,3 +1,4 @@ [run] branch = True source = dirhash +omit = _version.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..4ab54a1 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +src/dirhash/_version.py export-subst diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..47e7d0d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,57 @@ +name: CI + +on: + push: + branches: + - "master" + pull_request: + branches: + - "**" + workflow_dispatch: + release: + types: [published, edited] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.8" + - uses: pre-commit/action@v3.0.1 + + tests: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + os: [ubuntu-latest, windows-latest] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox tox-gh-actions + - name: Cache tox environments + id: cache-tox + uses: actions/cache@v4 + with: + path: .tox + # setup.py and setup.cfg have versioning info that would impact the + # tox environment. hashFiles only takes a single file path or pattern + # at the moment. + key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} + - name: Test with tox + run: tox + - uses: codecov/codecov-action@v4 + if: matrix.python-version == '3.8' && matrix.os == 'ubuntu-latest' + with: + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..350681a --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,128 @@ +# Based on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/# +name: Publish Python Package + +on: + push: + tags: + - "v[0-9]+.[0-9]+.[0-9]*" + +jobs: + build: + name: Build distribution + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + # NOTE: tags are not present unless triggered by tag push + # - name: Get tags + # run: git fetch --tags origin + # - name: List tags + # run: git tag --list + # TODO: somehow versioneer does not pickup the tag when workflow is not triggered by a + # tag push, getting e.g. (for sister repo scantree) scantree-0+untagged.1.gd74b1d5, + # see: https://github.com/andhus/scantree/actions/runs/7485873305/job/20375116541#step:7:42) + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ + + publish-to-pypi: + name: Publish to PyPI + # TODO we need to make sure the tag matches the version! 
+ if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + needs: + - build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/dirhash + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + github-release: + name: Sign and upload to GitHub Release + needs: + - publish-to-pypi + runs-on: ubuntu-latest + + permissions: + contents: write # IMPORTANT: mandatory for making GitHub Releases + id-token: write # IMPORTANT: mandatory for sigstore + + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Sign the dists with Sigstore + uses: sigstore/gh-action-sigstore-python@v2.1.1 + with: + inputs: >- + ./dist/*.tar.gz + ./dist/*.whl + - name: Create GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + run: >- + gh release create + '${{ github.ref_name }}' + --repo '${{ github.repository }}' + --notes "" + - name: Upload artifact signatures to GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + # Upload to GitHub Release using the `gh` CLI. + # `dist/` contains the built packages, and the + # sigstore-produced signatures and certificates. + run: >- + gh release upload + '${{ github.ref_name }}' dist/** + --repo '${{ github.repository }}' + + publish-to-testpypi: + name: Publish to TestPyPI + if: startsWith(github.ref, 'refs/tags/') # only publish on tag pushes + needs: + - build + runs-on: ubuntu-latest + + environment: + name: testpypi + url: https://test.pypi.org/p/dirhash + + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index 1d58b6e..0000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: Run tests - -on: - push: - branches: - - "master" - pull_request: - branches: - - "*" - workflow_dispatch: - release: - types: [published, edited] - -jobs: - tests: - - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] - - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install tox tox-gh-actions - - name: Cache tox environments - id: cache-tox - uses: actions/cache@v4 - with: - path: .tox - # setup.py and setup.cfg have versioning info that would impact the - # tox environment. hashFiles only takes a single file path or pattern - # at the moment. 
- key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} - - name: Test with tox - run: tox - - uses: codecov/codecov-action@v4 - env: - token: ${{ secrets.CODECOV_TOKEN }} - with: - verbose: true diff --git a/.gitignore b/.gitignore index bd70d88..b4973ea 100644 --- a/.gitignore +++ b/.gitignore @@ -106,6 +106,8 @@ venv.bak/ # Pycharm .idea/ +# VSC +.vscode/ # Project specific -benchmark/test_cases/* \ No newline at end of file +benchmark/test_cases/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..393ce81 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,13 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.1.0 + hooks: + - id: prettier + args: [--prose-wrap=preserve, --print-width=88] + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.7 + hooks: + - id: ruff + args: + - --fix + - id: ruff-format diff --git a/CHANGELOG.md b/CHANGELOG.md index 50a06f1..ca27faa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,27 +6,32 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] + NIL ## [0.2.0] - 2019-04-20 + Complies with [Dirhash Standard](https://github.com/andhus/dirhash) Version [0.1.0](https://github.com/andhus/dirhash/releases/v0.1.0) ### Added + - A first implementation based on the formalized [Dirhash Standard](https://github.com/andhus/dirhash). - This changelog. - Results form a new benchmark run after changes. The `benchmark/run.py` now outputs results files which names include the `dirhash.__version__`. ### Changed -- **Significant breaking changes** from version 0.1.1 - both regarding API and the -underlying method/protocol for computing the hash. This means that **hashes -computed with this version will differ from hashes computed with version < 0.2.0 for -same directory**. -- This dirhash python implementation has moved to here -[github.com/andhus/dirhash-python](https://github.com/andhus/dirhash-python) from -the previous repository -[github.com/andhus/dirhash](https://github.com/andhus/dirhash) -which now contains the formal description of the Dirhash Standard. + +- **Significant breaking changes** from version 0.1.1 - both regarding API and the + underlying method/protocol for computing the hash. This means that **hashes + computed with this version will differ from hashes computed with version < 0.2.0 for + same directory**. +- This dirhash python implementation has moved to here + [github.com/andhus/dirhash-python](https://github.com/andhus/dirhash-python) from + the previous repository + [github.com/andhus/dirhash](https://github.com/andhus/dirhash) + which now contains the formal description of the Dirhash Standard. ### Removed -- All support for the `.dirhashignore` file. This seemed superfluous, please file an -issue if you need this feature. + +- All support for the `.dirhashignore` file. This seemed superfluous, please file an + issue if you need this feature. 
diff --git a/README.md b/README.md index dc763ab..7efd153 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,10 @@ -[![Build Status](https://travis-ci.com/andhus/dirhash-python.svg?branch=master)](https://travis-ci.com/andhus/dirhash-python) [![codecov](https://codecov.io/gh/andhus/dirhash-python/branch/master/graph/badge.svg)](https://codecov.io/gh/andhus/dirhash-python) # dirhash + A lightweight python module and CLI for computing the hash of any directory based on its files' structure and content. + - Supports all hashing algorithms of Python's built-in `hashlib` module. - Glob/wildcard (".gitignore style") path matching for expressive filtering of files to include/exclude. - Multiprocessing for up to [6x speed-up](#performance) @@ -11,18 +12,24 @@ directory based on its files' structure and content. The hash is computed according to the [Dirhash Standard](https://github.com/andhus/dirhash), which is designed to allow for consistent and collision resistant generation/verification of directory hashes across implementations. ## Installation + From PyPI: + ```commandline pip install dirhash ``` + Or directly from source: + ```commandline git clone git@github.com:andhus/dirhash-python.git pip install dirhash/ ``` ## Usage + Python module: + ```python from dirhash import dirhash @@ -31,7 +38,9 @@ dir_md5 = dirhash(dirpath, "md5") pyfiles_md5 = dirhash(dirpath, "md5", match=["*.py"]) no_hidden_sha1 = dirhash(dirpath, "sha1", ignore=[".*", ".*/"]) ``` + CLI: + ```commandline dirhash path/to/directory -a md5 dirhash path/to/directory -a md5 --match "*.py" @@ -39,56 +48,59 @@ dirhash path/to/directory -a sha1 --ignore ".*" ".*/" ``` ## Why? + If you (or your application) need to verify the integrity of a set of files as well -as their name and location, you might find this useful. Use-cases range from -verification of your image classification dataset (before spending GPU-$$$ on +as their name and location, you might find this useful. Use-cases range from +verification of your image classification dataset (before spending GPU-$$$ on training your fancy Deep Learning model) to validation of generated files in regression-testing. -There isn't really a standard way of doing this. There are plenty of recipes out +There isn't really a standard way of doing this. There are plenty of recipes out there (see e.g. these SO-questions for [linux](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents) and [python](https://stackoverflow.com/questions/24937495/how-can-i-calculate-a-hash-for-a-filesystem-directory-using-python)) -but I couldn't find one that is properly tested (there are some gotcha:s to cover!) -and documented with a compelling user interface. `dirhash` was created with this as +but I couldn't find one that is properly tested (there are some gotcha:s to cover!) +and documented with a compelling user interface. `dirhash` was created with this as the goal. -[checksumdir](https://github.com/cakepietoast/checksumdir) is another python +[checksumdir](https://github.com/cakepietoast/checksumdir) is another python module/tool with similar intent (that inspired this project) but it lacks much of the functionality offered here (most notably including file names/structure in the hash) and lacks tests. ## Performance + The python `hashlib` implementation of common hashing algorithms are highly -optimised. `dirhash` mainly parses the file tree, pipes data to `hashlib` and -combines the output. 
Reasonable measures have been taken to minimize the overhead -and for common use-cases, the majority of time is spent reading data from disk +optimised. `dirhash` mainly parses the file tree, pipes data to `hashlib` and +combines the output. Reasonable measures have been taken to minimize the overhead +and for common use-cases, the majority of time is spent reading data from disk and executing `hashlib` code. The main effort to boost performance is support for multiprocessing, where the reading and hashing is parallelized over individual files. -As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/cli.py) +As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/cli.py) with the shell command: -`find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` +`find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` -which is the top answer for the SO-question: +which is the top answer for the SO-question: [Linux: compute a single hash for a given folder & contents?](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents) -Results for two test cases are shown below. Both have 1 GiB of random data: in -"flat_1k_1MB", split into 1k files (1 MiB each) in a flat structure, and in -"nested_32k_32kB", into 32k files (32 KiB each) spread over the 256 leaf directories +Results for two test cases are shown below. Both have 1 GiB of random data: in +"flat_1k_1MB", split into 1k files (1 MiB each) in a flat structure, and in +"nested_32k_32kB", into 32k files (32 KiB each) spread over the 256 leaf directories in a binary tree of depth 8. -Implementation | Test Case | Time (s) | Speed up -------------------- | --------------- | -------: | -------: -shell reference | flat_1k_1MB | 2.29 | -> 1.0 -`dirhash` | flat_1k_1MB | 1.67 | 1.36 -`dirhash`(8 workers)| flat_1k_1MB | 0.48 | **4.73** -shell reference | nested_32k_32kB | 6.82 | -> 1.0 -`dirhash` | nested_32k_32kB | 3.43 | 2.00 -`dirhash`(8 workers)| nested_32k_32kB | 1.14 | **6.00** +| Implementation | Test Case | Time (s) | Speed up | +| -------------------- | --------------- | -------: | -------: | +| shell reference | flat_1k_1MB | 2.29 | -> 1.0 | +| `dirhash` | flat_1k_1MB | 1.67 | 1.36 | +| `dirhash`(8 workers) | flat_1k_1MB | 0.48 | **4.73** | +| shell reference | nested_32k_32kB | 6.82 | -> 1.0 | +| `dirhash` | nested_32k_32kB | 3.43 | 2.00 | +| `dirhash`(8 workers) | nested_32k_32kB | 1.14 | **6.00** | The benchmark was run a MacBook Pro (2018), further details and source code [here](https://github.com/andhus/dirhash-python/tree/master/benchmark). ## Documentation -Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). \ No newline at end of file + +Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). 
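Editor's illustrative note (not part of the patch): the README's Performance section above describes the multiprocessing support, while the Usage examples only show single-process calls. Assuming the `jobs` keyword visible in the `dirhash()` signature later in this patch and the `-j` flag used by `benchmark/run.py`, a parallel call would look roughly like this sketch:

```python
from dirhash import dirhash

# Sketch only: hash the same directory with the default single process and
# with 8 worker processes (the `jobs` keyword mirrors the CLI `-j` flag,
# parallelizing the reading and hashing of individual files).
dir_md5 = dirhash("path/to/directory", "md5")                    # single process
dir_md5_parallel = dirhash("path/to/directory", "md5", jobs=8)   # parallel file hashing
```

Both calls should return the same digest; only the wall-clock time differs, which is what the benchmark table above measures.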
diff --git a/benchmark/README.md b/benchmark/README.md index 52bb5c1..0fb9d61 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -2,32 +2,31 @@ As a reference, the performance of `dirhash` is benchmarked against the shell command: -`find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` +`find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` -(top answer for the SO-question: +(top answer for the SO-question: [Linux: compute a single hash for a given folder & contents?](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents)) Each test case contains 1 GiB of random data, split equally into 8, 1k or 32k files, in a flat or nested (binary tree of depth 8) structure. -For a fair comparison, *the CLI version* of `dirhash` was used (including startup +For a fair comparison, _the CLI version_ of `dirhash` was used (including startup time for loading of python modules etc.). -For full details/reproducibility see/run the `run.py` script for which the output is -found in `results.csv`. These results were generated on a MacBook Pro (2018): +For full details/reproducibility see/run the `run.py` script for which the output is +found in `results.csv`. These results were generated on a MacBook Pro (2018): + - 2,2 GHz Intel Core i7 (`sysctl -n hw.physicalcpu hw.logicalcpu`-> 6, 12) - 16 GB 2400 MHz DDR4 - APPLE SSD AP0512M - - ## Sample results: -Implementation | Test Case | Time (s) | Speed up -------------------- | --------------- | -------: | -------: -shell reference | flat_1k_1MB | 2.29 | -> 1.0 -`dirhash` | flat_1k_1MB | 1.67 | 1.36 -`dirhash`(8 workers)| flat_1k_1MB | 0.48 | **4.73** -shell reference | nested_32k_32kB | 6.82 | -> 1.0 -`dirhash` | nested_32k_32kB | 3.43 | 2.00 -`dirhash`(8 workers)| nested_32k_32kB | 1.14 | **6.00** +| Implementation | Test Case | Time (s) | Speed up | +| -------------------- | --------------- | -------: | -------: | +| shell reference | flat_1k_1MB | 2.29 | -> 1.0 | +| `dirhash` | flat_1k_1MB | 1.67 | 1.36 | +| `dirhash`(8 workers) | flat_1k_1MB | 0.48 | **4.73** | +| shell reference | nested_32k_32kB | 6.82 | -> 1.0 | +| `dirhash` | nested_32k_32kB | 3.43 | 2.00 | +| `dirhash`(8 workers) | nested_32k_32kB | 1.14 | **6.00** | diff --git a/benchmark/results.json b/benchmark/results.json index 62c622b..0304dfd 100644 --- a/benchmark/results.json +++ b/benchmark/results.json @@ -1,402 +1,402 @@ [ - { - "test_case": "flat_8_128MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 2.014, - "t_median": 2.02 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.602, - "t_median": 1.604 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 0.977, - "t_median": 0.98 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.562, - "t_median": 0.569 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.464, - "t_median": 0.473 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 2.263, - "t_median": 2.268 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.662, - "t_median": 1.667 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", 
- "algorithm": "md5", - "workers": 2, - "t_best": 0.978, - "t_median": 0.983 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.57, - "t_median": 0.58 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.476, - "t_median": 0.48 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 6.711, - "t_median": 6.721 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 3.329, - "t_median": 3.354 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 2.067, - "t_median": 2.074 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 1.345, - "t_median": 1.362 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 1.09, - "t_median": 1.094 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 2.296, - "t_median": 2.306 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.713, - "t_median": 1.714 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 0.996, - "t_median": 1.009 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.601, - "t_median": 0.602 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.499, - "t_median": 0.505 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 6.814, - "t_median": 6.818 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 3.376, - "t_median": 3.426 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 2.147, - "t_median": 2.153 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 1.414, - "t_median": 1.416 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 1.137, - "t_median": 1.138 - }, - { - "test_case": "flat_8_128MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.181, - "t_median": 2.196 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.214, - "t_median": 1.225 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.768, - "t_median": 0.774 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.467, - "t_median": 0.474 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.47, - "t_median": 0.477 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.221, - "t_median": 2.229 - }, - { - 
"test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.252, - "t_median": 1.263 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.774, - "t_median": 0.777 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.471, - "t_median": 0.477 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.378, - "t_median": 0.478 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 4.178, - "t_median": 4.224 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.921, - "t_median": 3.008 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 1.888, - "t_median": 1.892 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 1.266, - "t_median": 1.275 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 1.072, - "t_median": 1.079 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.236, - "t_median": 2.26 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.308, - "t_median": 1.314 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.797, - "t_median": 0.8 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.501, - "t_median": 0.509 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.499, - "t_median": 0.503 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 4.383, - "t_median": 4.406 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 3.041, - "t_median": 3.05 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 1.943, - "t_median": 1.965 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 1.329, - "t_median": 1.334 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 1.14, - "t_median": 1.149 - } -] \ No newline at end of file + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 2.014, + "t_median": 2.02 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.602, + "t_median": 1.604 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 0.977, + "t_median": 0.98 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.562, + "t_median": 0.569 + }, + { + "test_case": "flat_8_128MB", + "implementation": 
"dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.464, + "t_median": 0.473 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 2.263, + "t_median": 2.268 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.662, + "t_median": 1.667 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 0.978, + "t_median": 0.983 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.57, + "t_median": 0.58 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.476, + "t_median": 0.48 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 6.711, + "t_median": 6.721 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 3.329, + "t_median": 3.354 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 2.067, + "t_median": 2.074 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 1.345, + "t_median": 1.362 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 1.09, + "t_median": 1.094 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 2.296, + "t_median": 2.306 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.713, + "t_median": 1.714 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 0.996, + "t_median": 1.009 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.601, + "t_median": 0.602 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.499, + "t_median": 0.505 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 6.814, + "t_median": 6.818 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 3.376, + "t_median": 3.426 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 2.147, + "t_median": 2.153 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 1.414, + "t_median": 1.416 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 1.137, + "t_median": 1.138 + }, + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.181, + "t_median": 2.196 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.214, + "t_median": 1.225 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.768, + "t_median": 0.774 + 
}, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.467, + "t_median": 0.474 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.47, + "t_median": 0.477 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.221, + "t_median": 2.229 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.252, + "t_median": 1.263 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.774, + "t_median": 0.777 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.471, + "t_median": 0.477 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.378, + "t_median": 0.478 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.178, + "t_median": 4.224 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.921, + "t_median": 3.008 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 1.888, + "t_median": 1.892 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 1.266, + "t_median": 1.275 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 1.072, + "t_median": 1.079 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.236, + "t_median": 2.26 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.308, + "t_median": 1.314 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.797, + "t_median": 0.8 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.501, + "t_median": 0.509 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.499, + "t_median": 0.503 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.383, + "t_median": 4.406 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 3.041, + "t_median": 3.05 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 1.943, + "t_median": 1.965 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 1.329, + "t_median": 1.334 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 1.14, + "t_median": 1.149 + } +] diff --git a/benchmark/results_v0.2.0.json b/benchmark/results_v0.2.0.json index 71a652b..a707fcf 100644 --- a/benchmark/results_v0.2.0.json +++ b/benchmark/results_v0.2.0.json @@ -1,402 +1,402 @@ [ - { - "test_case": 
"flat_8_128MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 2.079, - "t_median": 2.083 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.734, - "t_median": 1.945 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 0.999, - "t_median": 1.183 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.711, - "t_median": 0.728 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.504, - "t_median": 0.518 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 3.383, - "t_median": 3.679 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.846, - "t_median": 1.921 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 1.137, - "t_median": 1.158 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.74, - "t_median": 0.749 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.53, - "t_median": 0.534 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 13.827, - "t_median": 18.213 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 13.655, - "t_median": 13.808 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 3.276, - "t_median": 3.33 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 2.409, - "t_median": 2.421 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 2.045, - "t_median": 2.086 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 3.284, - "t_median": 3.332 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.717, - "t_median": 1.725 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 1.026, - "t_median": 1.034 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.622, - "t_median": 0.633 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.522, - "t_median": 0.529 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 11.898, - "t_median": 12.125 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 13.858, - "t_median": 14.146 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 2.781, - "t_median": 2.987 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - 
"t_best": 1.894, - "t_median": 1.92 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 1.55, - "t_median": 1.568 - }, - { - "test_case": "flat_8_128MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.042, - "t_median": 2.05 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.338, - "t_median": 1.354 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.79, - "t_median": 0.794 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.583, - "t_median": 0.593 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.483, - "t_median": 0.487 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.118, - "t_median": 2.129 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.39, - "t_median": 1.531 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.925, - "t_median": 0.932 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.614, - "t_median": 0.629 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.511, - "t_median": 0.52 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 10.551, - "t_median": 10.97 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 4.663, - "t_median": 4.76 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 3.108, - "t_median": 3.235 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 2.342, - "t_median": 2.361 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 2.071, - "t_median": 2.094 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.11, - "t_median": 2.159 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.436, - "t_median": 1.47 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.925, - "t_median": 0.937 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.627, - "t_median": 0.643 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.516, - "t_median": 0.527 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 3.982, - "t_median": 7.147 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 4.114, - "t_median": 4.156 - }, - { - "test_case": "nested_32k_32kB", - 
"implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 2.598, - "t_median": 2.616 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 1.809, - "t_median": 1.831 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 1.552, - "t_median": 1.58 - } -] \ No newline at end of file + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 2.079, + "t_median": 2.083 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.734, + "t_median": 1.945 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 0.999, + "t_median": 1.183 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.711, + "t_median": 0.728 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.504, + "t_median": 0.518 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 3.383, + "t_median": 3.679 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.846, + "t_median": 1.921 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 1.137, + "t_median": 1.158 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.74, + "t_median": 0.749 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.53, + "t_median": 0.534 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 13.827, + "t_median": 18.213 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 13.655, + "t_median": 13.808 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 3.276, + "t_median": 3.33 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 2.409, + "t_median": 2.421 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 2.045, + "t_median": 2.086 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 3.284, + "t_median": 3.332 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.717, + "t_median": 1.725 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 1.026, + "t_median": 1.034 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.622, + "t_median": 0.633 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.522, + "t_median": 0.529 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 
1, + "t_best": 11.898, + "t_median": 12.125 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 13.858, + "t_median": 14.146 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 2.781, + "t_median": 2.987 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 1.894, + "t_median": 1.92 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 1.55, + "t_median": 1.568 + }, + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.042, + "t_median": 2.05 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.338, + "t_median": 1.354 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.79, + "t_median": 0.794 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.583, + "t_median": 0.593 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.483, + "t_median": 0.487 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.118, + "t_median": 2.129 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.39, + "t_median": 1.531 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.925, + "t_median": 0.932 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.614, + "t_median": 0.629 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.511, + "t_median": 0.52 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 10.551, + "t_median": 10.97 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.663, + "t_median": 4.76 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 3.108, + "t_median": 3.235 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 2.342, + "t_median": 2.361 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 2.071, + "t_median": 2.094 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.11, + "t_median": 2.159 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.436, + "t_median": 1.47 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.925, + "t_median": 0.937 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.627, + "t_median": 0.643 + }, + { + "test_case": "nested_1k_1MB", + 
"implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.516, + "t_median": 0.527 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 3.982, + "t_median": 7.147 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.114, + "t_median": 4.156 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 2.598, + "t_median": 2.616 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 1.809, + "t_median": 1.831 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 1.552, + "t_median": 1.58 + } +] diff --git a/benchmark/run.py b/benchmark/run.py index f930b2e..712aa9f 100644 --- a/benchmark/run.py +++ b/benchmark/run.py @@ -1,24 +1,18 @@ -from __future__ import print_function, division - import json import os import subprocess - -from statistics import median, mean +from statistics import median from dirhash import __version__ - -BENCHMARK_ROOT = os.path.abspath( - os.path.join(__file__, os.pardir) -) +BENCHMARK_ROOT = os.path.abspath(os.path.join(__file__, os.pardir)) TEST_CASES = { - 'flat_8_128MB': {'depth': 0, 'num_files': 2**3, 'file_size': 2**27}, - 'flat_1k_1MB': {'depth': 0, 'num_files': 2**10, 'file_size': 2**20}, - 'flat_32k_32kB': {'depth': 0, 'num_files': 2**15, 'file_size': 2**15}, - 'nested_1k_1MB': {'depth': 8, 'num_files': 2**10, 'file_size': 2**20}, - 'nested_32k_32kB': {'depth': 8, 'num_files': 2**15, 'file_size': 2**15}, + "flat_8_128MB": {"depth": 0, "num_files": 2**3, "file_size": 2**27}, + "flat_1k_1MB": {"depth": 0, "num_files": 2**10, "file_size": 2**20}, + "flat_32k_32kB": {"depth": 0, "num_files": 2**15, "file_size": 2**15}, + "nested_1k_1MB": {"depth": 8, "num_files": 2**10, "file_size": 2**20}, + "nested_32k_32kB": {"depth": 8, "num_files": 2**15, "file_size": 2**15}, } @@ -33,36 +27,32 @@ def int_chunks(x, n): def write_file_tree(dirpath, depth, num_files, file_size, branch_factor=2): - assert num_files >= branch_factor ** depth + assert num_files >= branch_factor**depth os.mkdir(dirpath) if depth == 0: fill = len(str(num_files)) for i in range(num_files): - filepath = os.path.join(dirpath, 'f_' + str(i).rjust(fill, '0')) - with open(filepath, 'wb') as f: + filepath = os.path.join(dirpath, "f_" + str(i).rjust(fill, "0")) + with open(filepath, "wb") as f: f.write(os.urandom(file_size)) else: fill = len(str(branch_factor)) for i, num_files_branch in enumerate(int_chunks(num_files, branch_factor)): - dirpath_branch = os.path.join(dirpath, 'd_' + str(i).rjust(fill, '0')) + dirpath_branch = os.path.join(dirpath, "d_" + str(i).rjust(fill, "0")) write_file_tree( - dirpath_branch, - depth - 1, - num_files_branch, - file_size, - branch_factor + dirpath_branch, depth - 1, num_files_branch, file_size, branch_factor ) def require_test_cases(): - test_cases_root = os.path.join(BENCHMARK_ROOT, 'test_cases') + test_cases_root = os.path.join(BENCHMARK_ROOT, "test_cases") if not os.path.exists(test_cases_root): os.mkdir(test_cases_root) test_case_paths = [] for name, kwargs in TEST_CASES.items(): test_case_path = os.path.join(test_cases_root, name) if not os.path.exists(test_case_path): - print('creating test case: {}: {}'.format(name, kwargs)) + print(f"creating test case: {name}: {kwargs}") 
write_file_tree(test_case_path, **kwargs) test_case_paths.append(test_case_path) @@ -70,56 +60,48 @@ def require_test_cases(): def time_shell(cmd, runs=1, repetitions=1, setup=None): - time_cmd = "time for i in {{1..{rep}}}; do {cmd}; done".format( - cmd=cmd, - rep=repetitions - ) + time_cmd = f"time for i in {{1..{repetitions}}}; do {cmd}; done" if setup is not None: - time_cmd = "{}; {}".format(setup, time_cmd) + time_cmd = f"{setup}; {time_cmd}" realtimes = [] - for i in range(runs): + for _run in range(runs): process = subprocess.run( - time_cmd, - capture_output=True, - text=True, - shell=True, - check=True + time_cmd, capture_output=True, text=True, shell=True, check=True ) - output_lines = process.stderr.split('\n') + output_lines = process.stderr.split("\n") try: t_real, t_user, t_sys = output_lines[-4:-1] - assert t_real.startswith('real') - t_str = t_real.split('\t')[1] - min_str, sec_str = t_str.split('m') + assert t_real.startswith("real") + t_str = t_real.split("\t")[1] + min_str, sec_str = t_str.split("m") sec = 60 * int(min_str) + float(sec_str[:-1]) sec_per_rep = sec / repetitions - except: + except Exception as exc: raise RuntimeError( - 'Failed to parse `time` stderr output: {}'.format(process.stderr) - ) + f"Failed to parse `time` stderr output: {process.stderr}" + ) from exc realtimes.append(sec_per_rep) return realtimes def get_reference_shell_cmd(dirpath, algorithm): - if algorithm == 'md5': + if algorithm == "md5": pass - elif algorithm.startswith('sha'): + elif algorithm.startswith("sha"): version = int(algorithm[3:]) - algorithm = 'shasum -a {}'.format(version) + algorithm = f"shasum -a {version}" else: - raise ValueError('only md5 and sha supported') + raise ValueError("only md5 and sha supported") - return 'find {dir} -type f -print0 | sort -z | xargs -0 {alg} | {alg}'.format( - dir=dirpath, - alg=algorithm + return ( + f"find {dirpath} -type f -print0 | sort -z | xargs -0 {algorithm} | {algorithm}" ) def get_dirhash_shell_cmd(dirpath, algorithm, workers=1): - return 'dirhash {} -a {} -j {}'.format(dirpath, algorithm, workers) + return f"dirhash {dirpath} -a {algorithm} -j {workers}" def benchmark(dirpath, algorithm, **kwargs): @@ -129,12 +111,12 @@ def benchmark(dirpath, algorithm, **kwargs): cmd = get_reference_shell_cmd(dirpath, algorithm) realtimes = time_shell(cmd=cmd, **kwargs) res = { - 'test_case': test_case, - 'implementation': 'shell reference', - 'algorithm': algorithm, - 'workers': 1, - 't_best': min(realtimes), - 't_median': median(realtimes), + "test_case": test_case, + "implementation": "shell reference", + "algorithm": algorithm, + "workers": 1, + "t_best": min(realtimes), + "t_median": median(realtimes), } print(res) print(realtimes) @@ -144,12 +126,12 @@ def benchmark(dirpath, algorithm, **kwargs): cmd = get_dirhash_shell_cmd(dirpath, algorithm, workers) realtimes = time_shell(cmd=cmd, **kwargs) res = { - 'test_case': test_case, - 'implementation': 'dirhash', - 'algorithm': algorithm, - 'workers': workers, - 't_best': min(realtimes), - 't_median': median(realtimes), + "test_case": test_case, + "implementation": "dirhash", + "algorithm": algorithm, + "workers": workers, + "t_best": min(realtimes), + "t_median": median(realtimes), } print(res) print(realtimes) @@ -158,40 +140,49 @@ def benchmark(dirpath, algorithm, **kwargs): return result -if __name__ == '__main__': +if __name__ == "__main__": test_cases = require_test_cases() results = [] - for alg in ['md5', 'sha1']: + for alg in ["md5", "sha1"]: for test_case in test_cases: result = 
benchmark(test_case, algorithm=alg, runs=5, repetitions=1) results.extend(result) - result_fname = 'results_v{}'.format(__version__) + result_fname = f"results_v{__version__}" - with open(os.path.join(BENCHMARK_ROOT, result_fname + '.json'), 'w') as f: - json.dump(results, f, indent=4) + with open(os.path.join(BENCHMARK_ROOT, result_fname + ".json"), "w") as f: + json.dump(results, f, indent=2) try: import pandas as pd + df = pd.DataFrame(results) - df = df[['test_case', 'implementation', 'algorithm', 'workers', - 't_best', 't_median']] - for (tc, alg), subdf in df.groupby(['test_case', 'algorithm']): - t_ref = subdf[ - subdf['implementation'] == 'shell reference' - ]['t_median'].values[0] - speed_up = t_ref / subdf['t_median'] - df.loc[speed_up.index, 'speed-up (median)'] = speed_up + df = df[ + [ + "test_case", + "implementation", + "algorithm", + "workers", + "t_best", + "t_median", + ] + ] + for (_tc, _alg), subdf in df.groupby(["test_case", "algorithm"]): + t_ref = subdf[subdf["implementation"] == "shell reference"][ + "t_median" + ].values[0] + speed_up = t_ref / subdf["t_median"] + df.loc[speed_up.index, "speed-up (median)"] = speed_up print(df) - df_hd = df[df['implementation'] == 'dirhash'] - df_hd_1w = df_hd[df_hd['workers'] == 1] - df_hd_8w = df_hd[df_hd['workers'] == 8] - mean_speedup_1w = df_hd_1w.mean()['speed-up (median)'] - mean_speedup_8w = df_hd_8w.mean()['speed-up (median)'] - print('\nAverage speedup (single process): {}'.format(mean_speedup_1w)) + df_hd = df[df["implementation"] == "dirhash"] + df_hd_1w = df_hd[df_hd["workers"] == 1] + df_hd_8w = df_hd[df_hd["workers"] == 8] + mean_speedup_1w = df_hd_1w.mean()["speed-up (median)"] + mean_speedup_8w = df_hd_8w.mean()["speed-up (median)"] + print(f"\nAverage speedup (single process): {mean_speedup_1w}") print(df_hd_1w) - print('\nAverage speedup multiprocess (8 workers): {}'.format(mean_speedup_8w)) + print(f"\nAverage speedup multiprocess (8 workers): {mean_speedup_8w}") print(df_hd_8w) - df.to_csv(os.path.join(BENCHMARK_ROOT, result_fname + '.csv')) + df.to_csv(os.path.join(BENCHMARK_ROOT, result_fname + ".csv")) except ImportError: pass diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..1dde27a --- /dev/null +++ b/codecov.yml @@ -0,0 +1,6 @@ +coverage: + status: + project: + default: + target: 100% # the required coverage value + threshold: 5% # the leniency in hitting the target diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a032c1b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,20 @@ +[build-system] +requires = ["setuptools", "versioneer==0.29"] +build-backend = "setuptools.build_meta" + +[tool.ruff] +target-version = "py38" + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade +] + +[tool.ruff.lint.isort] +known-local-folder = ["dirhash"] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..4c05ed2 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[versioneer] +VCS = git +style = pep440 +versionfile_source = src/dirhash/_version.py +versionfile_build = dirhash/_version.py +tag_prefix = v +parentdir_prefix = dirhash- diff --git a/setup.py b/setup.py index 7132af5..708f049 100644 --- a/setup.py +++ b/setup.py @@ -1,37 +1,36 @@ -import io import os -from setuptools import setup, find_packages -PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) +import versioneer +from setuptools import 
find_packages, setup -version = {} -with io.open(os.path.join(PROJECT_ROOT, "src", "dirhash", "version.py")) as fp: - exec(fp.read(), version) +PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) -DESCRIPTION = 'Python module and CLI for hashing of file system directories.' +DESCRIPTION = "Python module and CLI for hashing of file system directories." try: - with io.open(os.path.join(PROJECT_ROOT, 'README.md'), encoding='utf-8') as f: - long_description = '\n' + f.read() -except IOError: + with open(os.path.join(PROJECT_ROOT, "README.md"), encoding="utf-8") as f: + long_description = "\n" + f.read() +except OSError: long_description = DESCRIPTION setup( - name='dirhash', - version=version['__version__'], + name="dirhash", + version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), description=DESCRIPTION, long_description=long_description, long_description_content_type="text/markdown", - url='https://github.com/andhus/dirhash-python', + url="https://github.com/andhus/dirhash-python", author="Anders Huss", author_email="andhus@kth.se", - license='MIT', - install_requires=['scantree>=0.0.2', 'pathspec<0.10.0'], - packages=find_packages('src'), - package_dir={'': 'src'}, + license="MIT", + python_requires=">=3.8", + install_requires=["scantree>=0.0.4"], + packages=find_packages("src"), + package_dir={"": "src"}, include_package_data=True, entry_points={ - 'console_scripts': ['dirhash=dirhash.cli:main'], + "console_scripts": ["dirhash=dirhash.cli:main"], }, - tests_require=['pytest', 'pytest-cov'] + tests_require=["pre-commit", "pytest", "pytest-cov"], ) diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index f24f698..0e49b64 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -1,36 +1,31 @@ #!/usr/bin/env python -"""dirhash - a python library (and CLI) for hashing of file system directories. -""" -from __future__ import print_function, division +"""dirhash - a python library (and CLI) for hashing of file system directories.""" -import os import hashlib - +import os from functools import partial from multiprocessing import Pool -from scantree import ( - scantree, - RecursionFilter, - CyclicLinkedDir, -) +from scantree import CyclicLinkedDir, RecursionFilter, scantree + +from . import _version -from dirhash.version import __version__ +__version__ = _version.get_versions()["version"] __all__ = [ - '__version__', - 'algorithms_guaranteed', - 'algorithms_available', - 'dirhash', - 'dirhash_impl', - 'included_paths', - 'Filter', - 'get_match_patterns', - 'Protocol' + "__version__", + "algorithms_guaranteed", + "algorithms_available", + "dirhash", + "dirhash_impl", + "included_paths", + "Filter", + "get_match_patterns", + "Protocol", ] -algorithms_guaranteed = {'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'} +algorithms_guaranteed = {"md5", "sha1", "sha224", "sha256", "sha384", "sha512"} algorithms_available = hashlib.algorithms_available @@ -42,10 +37,10 @@ def dirhash( linked_dirs=True, linked_files=True, empty_dirs=False, - entry_properties=('name', 'data'), + entry_properties=("name", "data"), allow_cyclic_links=False, chunk_size=2**20, - jobs=1 + jobs=1, ): """Computes the hash of a directory based on its structure and content. 
@@ -150,11 +145,10 @@ def dirhash( match_patterns=get_match_patterns(match=match, ignore=ignore), linked_dirs=linked_dirs, linked_files=linked_files, - empty_dirs=empty_dirs + empty_dirs=empty_dirs, ) protocol = Protocol( - entry_properties=entry_properties, - allow_cyclic_links=allow_cyclic_links + entry_properties=entry_properties, allow_cyclic_links=allow_cyclic_links ) return dirhash_impl( directory=directory, @@ -162,17 +156,12 @@ def dirhash( filter_=filter_, protocol=protocol, chunk_size=chunk_size, - jobs=jobs + jobs=jobs, ) def dirhash_impl( - directory, - algorithm, - filter_=None, - protocol=None, - chunk_size=2**20, - jobs=1 + directory, algorithm, filter_=None, protocol=None, chunk_size=2**20, jobs=1 ): """Computes the hash of a directory based on its structure and content. @@ -214,25 +203,26 @@ def dirhash_impl( See https://github.com/andhus/dirhash/README.md for a formal description of how the returned hash value is computed. """ + def get_instance(value, cls_, argname): if isinstance(value, cls_): return value if value is None: return cls_() - raise TypeError('{} must be an instance of {} or None'.format(argname, cls_)) + raise TypeError(f"{argname} must be an instance of {cls_} or None") - filter_ = get_instance(filter_, Filter, 'filter_') - protocol = get_instance(protocol, Protocol, 'protocol') + filter_ = get_instance(filter_, Filter, "filter_") + protocol = get_instance(protocol, Protocol, "protocol") hasher_factory = _get_hasher_factory(algorithm) def dir_apply(dir_node): if not filter_.empty_dirs: - if dir_node.path.relative == '' and dir_node.empty: + if dir_node.path.relative == "" and dir_node.empty: # only check if root node is empty (other empty dirs are filter # before `dir_apply` with `filter_.empty_dirs=False`) - raise ValueError('{}: Nothing to hash'.format(directory)) + raise ValueError(f"{directory}: Nothing to hash") descriptor = protocol.get_descriptor(dir_node) - _dirhash = hasher_factory(descriptor.encode('utf-8')).hexdigest() + _dirhash = hasher_factory(descriptor.encode("utf-8")).hexdigest() return dir_node.path, _dirhash @@ -241,10 +231,7 @@ def dir_apply(dir_node): def file_apply(path): return path, _get_filehash( - path.real, - hasher_factory, - chunk_size=chunk_size, - cache=cache + path.real, hasher_factory, chunk_size=chunk_size, cache=cache ) _, dirhash_ = scantree( @@ -256,7 +243,7 @@ def file_apply(path): allow_cyclic_links=protocol.allow_cyclic_links, cache_file_apply=False, include_empty=filter_.empty_dirs, - jobs=1 + jobs=1, ) else: # multiprocessing real_paths = set() @@ -273,18 +260,16 @@ def extract_real_paths(path): allow_cyclic_links=protocol.allow_cyclic_links, cache_file_apply=False, include_empty=filter_.empty_dirs, - jobs=1 + jobs=1, ) real_paths = list(real_paths) # hash files in parallel file_hashes = _parmap( partial( - _get_filehash, - hasher_factory=hasher_factory, - chunk_size=chunk_size + _get_filehash, hasher_factory=hasher_factory, chunk_size=chunk_size ), real_paths, - jobs=jobs + jobs=jobs, ) # prepare the mapping with precomputed file hashes real_path_to_hash = dict(zip(real_paths, file_hashes)) @@ -323,7 +308,7 @@ def included_paths( match_patterns=get_match_patterns(match=match, ignore=ignore), linked_dirs=linked_dirs, linked_files=linked_files, - empty_dirs=empty_dirs + empty_dirs=empty_dirs, ) protocol = Protocol(allow_cyclic_links=allow_cyclic_links) @@ -332,11 +317,11 @@ def included_paths( recursion_filter=filter_, follow_links=True, allow_cyclic_links=protocol.allow_cyclic_links, - 
include_empty=filter_.empty_dirs + include_empty=filter_.empty_dirs, ).leafpaths() return [ - path.relative if path.is_file() else os.path.join(path.relative, '.') + path.relative if path.is_file() else os.path.join(path.relative, ".") for path in leafpaths ] @@ -363,17 +348,12 @@ class Filter(RecursionFilter): that *matches provided matching criteria*. Default `False`, i.e. empty directories are ignored (as is done in git version control). """ + def __init__( - self, - match_patterns=None, - linked_dirs=True, - linked_files=True, - empty_dirs=False + self, match_patterns=None, linked_dirs=True, linked_files=True, empty_dirs=False ): - super(Filter, self).__init__( - linked_dirs=linked_dirs, - linked_files=linked_files, - match=match_patterns + super().__init__( + linked_dirs=linked_dirs, linked_files=linked_files, match=match_patterns ) self.empty_dirs = empty_dirs @@ -399,23 +379,23 @@ def get_match_patterns( ignore_hidden: bool - If `True` ignore hidden files and directories. Short for `ignore=['.*', '.*/']` Default `False`. """ - match = ['*'] if match is None else list(match) + match = ["*"] if match is None else list(match) ignore = [] if ignore is None else list(ignore) ignore_extensions = [] if ignore_extensions is None else list(ignore_extensions) if ignore_hidden: - ignore.extend(['.*', '.*/']) + ignore.extend([".*", ".*/"]) for ext in ignore_extensions: - if not ext.startswith('.'): - ext = '.' + ext - ext = '*' + ext + if not ext.startswith("."): + ext = "." + ext + ext = "*" + ext ignore.append(ext) - match_spec = match + ['!' + ign for ign in ignore] + match_spec = match + ["!" + ign for ign in ignore] def deduplicate(items): - items_set = set([]) + items_set = set() dd_items = [] for item in items: if item not in items_set: @@ -427,7 +407,7 @@ def deduplicate(items): return deduplicate(match_spec) -class Protocol(object): +class Protocol: """Specifications of which file and directory properties to consider when computing the `dirhash` value. @@ -462,33 +442,30 @@ class Protocol(object): dirhash value for directory causing the cyclic link is replaced with the hash function hexdigest of the relative path from the link to the target. 
""" - class EntryProperties(object): - NAME = 'name' - DATA = 'data' - IS_LINK = 'is_link' + + class EntryProperties: + NAME = "name" + DATA = "data" + IS_LINK = "is_link" options = {NAME, DATA, IS_LINK} - _DIRHASH = 'dirhash' + _DIRHASH = "dirhash" - _entry_property_separator = '\000' - _entry_descriptor_separator = '\000\000' + _entry_property_separator = "\000" + _entry_descriptor_separator = "\000\000" - def __init__( - self, - entry_properties=('name', 'data'), - allow_cyclic_links=False - ): + def __init__(self, entry_properties=("name", "data"), allow_cyclic_links=False): entry_properties = set(entry_properties) if not entry_properties.issubset(self.EntryProperties.options): raise ValueError( - 'entry properties {} not supported'.format( - entry_properties - self.EntryProperties.options) + f"entry properties {entry_properties - self.EntryProperties.options} " + "not supported" ) if not ( - self.EntryProperties.NAME in entry_properties or - self.EntryProperties.DATA in entry_properties + self.EntryProperties.NAME in entry_properties + or self.EntryProperties.DATA in entry_properties ): raise ValueError( - 'at least one of entry properties `name` and `data` must be used' + "at least one of entry properties `name` and `data` must be used" ) self.entry_properties = entry_properties self._include_name = self.EntryProperties.NAME in entry_properties @@ -497,8 +474,7 @@ def __init__( if not isinstance(allow_cyclic_links, bool): raise ValueError( - 'allow_cyclic_link must be a boolean, ' - 'got {}'.format(allow_cyclic_links) + f"allow_cyclic_link must be a boolean, got {allow_cyclic_links}" ) self.allow_cyclic_links = allow_cyclic_links @@ -508,18 +484,14 @@ def get_descriptor(self, dir_node): entries = dir_node.directories + dir_node.files entry_descriptors = [ - self._get_entry_descriptor( - self._get_entry_properties(path, entry_hash) - ) for path, entry_hash in entries + self._get_entry_descriptor(self._get_entry_properties(path, entry_hash)) + for path, entry_hash in entries ] return self._entry_descriptor_separator.join(sorted(entry_descriptors)) @classmethod def _get_entry_descriptor(cls, entry_properties): - entry_strings = [ - '{}:{}'.format(name, value) - for name, value in entry_properties - ] + entry_strings = [f"{name}:{value}" for name, value in entry_properties] return cls._entry_property_separator.join(sorted(entry_strings)) def _get_entry_properties(self, path, entry_hash): @@ -542,8 +514,8 @@ def _get_cyclic_linked_dir_descriptor(self, dir_node): path_to_target = os.path.relpath( # the extra '.' is needed if link back to root, because # an empty path ('') is not supported by os.path.relpath - os.path.join('.', target_relpath), - os.path.join('.', relpath) + os.path.join(".", target_relpath), + os.path.join(".", relpath), ) # TODO normalize posix! 
return path_to_target @@ -561,15 +533,14 @@ def _get_hasher_factory(algorithm): return partial(hashlib.new, algorithm) try: # bypass algorithm if already a hasher factory - hasher = algorithm(b'') - hasher.update(b'') + hasher = algorithm(b"") + hasher.update(b"") hasher.hexdigest() return algorithm - except: + except: # noqa: E722 pass - raise ValueError( - '`algorithm` must be one of: {}`'.format(algorithms_available)) + raise ValueError(f"`algorithm` must be one of: {algorithms_available}`") def _parmap(func, iterable, jobs=1): @@ -613,8 +584,8 @@ def _get_filehash(filepath, hasher_factory, chunk_size, cache=None): return filehash hasher = hasher_factory() - with open(filepath, 'rb') as f: - for chunk in iter(lambda: f.read(chunk_size), b''): + with open(filepath, "rb") as f: + for chunk in iter(lambda: f.read(chunk_size), b""): hasher.update(chunk) return hasher.hexdigest() diff --git a/src/dirhash/_version.py b/src/dirhash/_version.py new file mode 100644 index 0000000..db747a1 --- /dev/null +++ b/src/dirhash/_version.py @@ -0,0 +1,717 @@ +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. +# Generated by versioneer-0.29 +# https://github.com/python-versioneer/python-versioneer + +# ruff: noqa + +"""Git implementation of _version.py.""" + +import errno +import functools +import os +import re +import subprocess +import sys +from typing import Any, Callable, Dict, List, Optional, Tuple + + +def get_keywords() -> Dict[str, str]: + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + VCS: str + style: str + tag_prefix: str + parentdir_prefix: str + versionfile_source: str + verbose: bool + + +def get_config() -> VersioneerConfig: + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "dirhash-" + cfg.versionfile_source = "src/dirhash/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} + + +def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator + """Create decorator to mark a method as the handler of a VCS.""" + + def decorate(f: Callable) -> Callable: + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + + return decorate + + +def run_command( + commands: List[str], + args: List[str], + cwd: Optional[str] = None, + verbose: bool = False, + hide_stderr: bool = False, + env: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[str], Optional[int]]: + """Call the given command(s).""" + assert isinstance(commands, list) + process = None + + popen_kwargs: Dict[str, Any] = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: + try: + dispcmd = str([command] + args) + # remember shell=False, so use git.cmd on windows, not just git + process = subprocess.Popen( + [command] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, + ) + break + except OSError as e: + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print(f"unable to find command, tried {commands}") + return None, None + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, process.returncode + return stdout, process.returncode + + +def versions_from_parentdir( + parentdir_prefix: str, + root: str, + verbose: bool, +) -> Dict[str, Any]: + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. 
We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords: Dict[str, str] = {} + try: + with open(versionfile_abs) as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords( + keywords: Dict[str, str], + tag_prefix: str, + verbose: bool, +) -> Dict[str, Any]: + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". 
+ tags = {r for r in refs if re.search(r"\d", r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix) :] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r"\d", r): + continue + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs( + tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command +) -> Dict[str, Any]: + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + f"{tag_prefix}[[:digit:]]*", + ], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces: Dict[str, Any] = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. 
+ branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. + branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparsable. Maybe git-describe is misbehaving? + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ( + f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" + ) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix) :] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces: Dict[str, Any]) -> str: + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces: Dict[str, Any]) -> str: + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces: Dict[str, Any]) -> str: + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). + + Exceptions: + 1: no tags. 
0[.dev0]+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: + """Split pep440 version string at the post-release segment. + + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces: Dict[str, Any]) -> str: + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + if pieces["distance"]: + # update the post release segment + tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + rendered = tag_version + if post_version is not None: + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) + else: + rendered += ".post0.dev%d" % (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces: Dict[str, Any]) -> str: + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces: Dict[str, Any]) -> str: + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces: Dict[str, Any]) -> str: + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces: Dict[str, Any]) -> str: + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } + + +def get_versions() -> Dict[str, Any]: + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for _ in cfg.versionfile_source.split("/"): + root = os.path.dirname(root) + except NameError: + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/src/dirhash/cli.py b/src/dirhash/cli.py index 89f8308..ae34de7 100644 --- a/src/dirhash/cli.py +++ b/src/dirhash/cli.py @@ -1,10 +1,8 @@ #!/usr/bin/env python -"""Get hash for the content and/or structure of a directory. -""" -from __future__ import print_function +"""Get hash for the content and/or structure of a directory.""" -import sys import argparse +import sys import dirhash @@ -12,169 +10,172 @@ def main(): try: kwargs = get_kwargs(sys.argv[1:]) - if kwargs.pop('list'): + if kwargs.pop("list"): # kwargs below have no effect when listing - for k in ['algorithm', 'chunk_size', 'jobs', 'entry_properties']: + for k in ["algorithm", "chunk_size", "jobs", "entry_properties"]: kwargs.pop(k) for leafpath in dirhash.included_paths(**kwargs): print(leafpath) else: print(dirhash.dirhash(**kwargs)) except Exception as e: # pragma: no cover (not picked up by coverage) - sys.stderr.write('dirhash: {}\n'.format(e)) + sys.stderr.write(f"dirhash: {e}\n") sys.exit(1) def get_kwargs(args): - parser = argparse.ArgumentParser( - description='Determine the hash for a directory.' - ) - parser.add_argument( - '-v', '--version', - action='version', - version='dirhash {}'.format(dirhash.__version__) - ) + parser = argparse.ArgumentParser(description="Determine the hash for a directory.") parser.add_argument( - 'directory', - help='Directory to hash.' + "-v", + "--version", + action="version", + version=f"dirhash {dirhash.__version__}", ) + parser.add_argument("directory", help="Directory to hash.") parser.add_argument( - '-a', '--algorithm', + "-a", + "--algorithm", choices=dirhash.algorithms_available, - default='md5', + default="md5", help=( - 'Hashing algorithm to use, by default "md5". Always available: {}. ' - 'Additionally available on current platform: {}. Note that the same ' - 'algorithm may appear multiple times in this set under different names ' - '(thanks to OpenSSL) ' - '[https://docs.python.org/2/library/hashlib.html]'.format( - sorted(dirhash.algorithms_guaranteed), - sorted(dirhash.algorithms_available - dirhash.algorithms_guaranteed) - ) + "Hashing algorithm to use, by default 'md5'. " + f"Always available: {sorted(dirhash.algorithms_guaranteed)}. " + f"Additionally available on current platform: " + f"{sorted(dirhash.algorithms_available - dirhash.algorithms_guaranteed)}. " + "Note that the same algorithm may appear multiple times in this set " + "under different names (thanks to OpenSSL) " + "[https://docs.python.org/2/library/hashlib.html]." ), - metavar='' + metavar="", ) filter_options = parser.add_argument_group( - title='Filtering options', + title="Filtering options", description=( - 'Specify what files and directories to include. All files and ' - 'directories (including symbolic links) are included by default. 
The ' - '--match/--ignore arguments allows for selection using glob/wildcard ' + "Specify what files and directories to include. All files and " + "directories (including symbolic links) are included by default. The " + "--match/--ignore arguments allows for selection using glob/wildcard " '(".gitignore style") path matching. Paths relative to the root ' - '`directory` (i.e. excluding the name of the root directory itself) are ' - 'matched against the provided patterns. For example, to only include ' + "`directory` (i.e. excluding the name of the root directory itself) are " + "matched against the provided patterns. For example, to only include " 'python source files, use: `dirhash path/to/dir -m "*.py"` or to ' - 'exclude hidden files and directories use: ' + "exclude hidden files and directories use: " '`dirhash path/to.dir -i ".*" ".*/"` which is short for ' '`dirhash path/to.dir -m "*" "!.*" "!.*/"`. By adding the --list ' - 'argument, all included paths, for the given filtering arguments, are ' - 'returned instead of the hash value. For further details see ' - 'https://github.com/andhus/dirhash/README.md#filtering' - ) + "argument, all included paths, for the given filtering arguments, are " + "returned instead of the hash value. For further details see " + "https://github.com/andhus/dirhash/README.md#filtering" + ), ) filter_options.add_argument( - '-m', '--match', - nargs='+', - default=['*'], + "-m", + "--match", + nargs="+", + default=["*"], help=( - 'One or several patterns for paths to include. NOTE: patterns ' + "One or several patterns for paths to include. NOTE: patterns " 'with an asterisk must be in quotes ("*") or the asterisk ' - 'preceded by an escape character (`*).' + "preceded by an escape character (`*)." ), - metavar='' + metavar="", ) filter_options.add_argument( - '-i', '--ignore', - nargs='+', + "-i", + "--ignore", + nargs="+", default=None, help=( - 'One or several patterns for paths to exclude. NOTE: patterns ' + "One or several patterns for paths to exclude. NOTE: patterns " 'with an asterisk must be in quotes ("*") or the asterisk ' - 'preceded by an escape character (`*).' + "preceded by an escape character (`*)." ), - metavar='' + metavar="", ) filter_options.add_argument( - '--empty-dirs', - action='store_true', + "--empty-dirs", + action="store_true", default=False, - help='Include empty directories (containing no files that meet the matching ' - 'criteria and no non-empty sub directories).' + help="Include empty directories (containing no files that meet the matching " + "criteria and no non-empty sub directories).", ) filter_options.add_argument( - '--no-linked-dirs', - dest='linked_dirs', - action='store_false', - help='Do not include symbolic links to other directories.' + "--no-linked-dirs", + dest="linked_dirs", + action="store_false", + help="Do not include symbolic links to other directories.", ) filter_options.add_argument( - '--no-linked-files', - dest='linked_files', - action='store_false', - help='Do not include symbolic links to files.' + "--no-linked-files", + dest="linked_files", + action="store_false", + help="Do not include symbolic links to files.", ) parser.set_defaults(linked_dirs=True, linked_files=True) protocol_options = parser.add_argument_group( - title='Protocol options', + title="Protocol options", description=( - 'Specify what properties of files and directories to include and ' - 'whether to allow cyclic links. 
For further details see ' - 'https://github.com/andhus/dirhash/DIRHASH_STANDARD.md#protocol' - ) + "Specify what properties of files and directories to include and " + "whether to allow cyclic links. For further details see " + "https://github.com/andhus/dirhash/DIRHASH_STANDARD.md#protocol" + ), ) protocol_options.add_argument( - '-p', '--properties', - nargs='+', - dest='entry_properties', - default=['data', 'name'], + "-p", + "--properties", + nargs="+", + dest="entry_properties", + default=["data", "name"], help=( - 'List of file/directory properties to include in the hash. Available ' - 'properties are: {} and at least one of name and data must be ' - 'included. Default is [data name] which means that both the name/paths' - ' and content (actual data) of files and directories will be included' - ).format(list(dirhash.Protocol.EntryProperties.options)), - metavar='' + "List of file/directory properties to include in the hash. Available " + f"properties are: {list(dirhash.Protocol.EntryProperties.options)} and at " + "least one of name and data must be included. Default is [data name] which " + "means that both the name/paths and content (actual data) of files and " + "directories will be included" + ), + metavar="", ) protocol_options.add_argument( - '-c', '--allow-cyclic-links', + "-c", + "--allow-cyclic-links", default=False, - action='store_true', + action="store_true", help=( - 'Allow presence of cyclic links (by hashing the relative path to the ' - 'target directory).' - ) + "Allow presence of cyclic links (by hashing the relative path to the " + "target directory)." + ), ) implementation_options = parser.add_argument_group( - title='Implementation options', - description='' + title="Implementation options", description="" ) implementation_options.add_argument( - '-s', '--chunk-size', + "-s", + "--chunk-size", default=2**20, type=int, - help='The chunk size (in bytes) for reading of files.' + help="The chunk size (in bytes) for reading of files.", ) implementation_options.add_argument( - '-j', '--jobs', + "-j", + "--jobs", type=int, default=1, # TODO make default number of cores? - help='Number of jobs (parallel processes) to use.' + help="Number of jobs (parallel processes) to use.", ) - special_options = parser.add_argument_group(title='Special options') + special_options = parser.add_argument_group(title="Special options") special_options.add_argument( - '-l', '--list', - action='store_true', + "-l", + "--list", + action="store_true", default=False, - help='List the file paths that will be taken into account, given the ' - 'provided filtering options.' 
+ help="List the file paths that will be taken into account, given the " + "provided filtering options.", ) return vars(parser.parse_args(args)) -if __name__ == '__main__': # pragma: no cover +if __name__ == "__main__": # pragma: no cover main() diff --git a/src/dirhash/version.py b/src/dirhash/version.py deleted file mode 100644 index fc79d63..0000000 --- a/src/dirhash/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '0.2.1' diff --git a/tests/test_cli.py b/tests/test_cli.py index 3886fb9..ef34ecd 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,24 +1,24 @@ -from __future__ import print_function, division - import os -import sys import shlex import subprocess - -import dirhash +import sys import pytest +import dirhash console_script = os.path.join( os.path.dirname(sys.executable), - 'dirhash' + "dirhash.exe" if os.name == "nt" else "dirhash", ) +if not os.path.isfile(console_script): + print(os.listdir(os.path.dirname(sys.executable))) + raise FileNotFoundError(f"Could not find console script at {console_script}.") +if not os.access(console_script, os.X_OK): + raise PermissionError(f"Console script at {console_script} is not executable.") def dirhash_run(argstring, add_env=None): - assert os.path.isfile(console_script) - assert os.access(console_script, os.X_OK) if add_env: env = os.environ.copy() env.update(add_env) @@ -28,15 +28,16 @@ def dirhash_run(argstring, add_env=None): [console_script] + shlex.split(argstring), stdout=subprocess.PIPE, stderr=subprocess.PIPE, - env=env + text=True, + env=env, ) output, error = process.communicate() # in python3 output and error are `bytes` as opposed to `str` in python2 if isinstance(output, bytes): - output = output.decode('utf-8') + output = output.decode("utf-8") if isinstance(error, bytes): - error = error.decode('utf-8') + error = error.decode("utf-8") return output, error, process.returncode @@ -54,155 +55,128 @@ def create_default_tree(tmpdir): |__file.ext1 |__file.ext2 """ - dotdir = tmpdir.mkdir('.dir') - dotdir.join('file').write('file in hidden sub-directory') - tmpdir.join(".file").write('hidden file') - dir = tmpdir.mkdir('dir') - dir.join('file').write('file in sub-directory') - tmpdir.mkdir('empty') - tmpdir.join("file").write('file') - tmpdir.join("file.ext1").write('file with extension .ext1') - tmpdir.join("file.ext2").write('file with extension .ext2') + dotdir = tmpdir.mkdir(".dir") + dotdir.join("file").write("file in hidden sub-directory") + tmpdir.join(".file").write("hidden file") + dir = tmpdir.mkdir("dir") + dir.join("file").write("file in sub-directory") + tmpdir.mkdir("empty") + tmpdir.join("file").write("file") + tmpdir.join("file.ext1").write("file with extension .ext1") + tmpdir.join("file.ext2").write("file with extension .ext2") + + +def osp(path: str) -> str: + """Normalize path for OS.""" + if os.name == "nt": # pragma: no cover + return path.replace("/", "\\") + return path -class TestCLI(object): +class TestCLI: @pytest.mark.parametrize( - 'argstring, non_default_kwargs', + "argstring, non_default_kwargs", [ - ( - '. -a md5', - {} - ), - ( - '.. -a md5', - {'directory': '..'} - ), - ( - 'target-dir -a md5', - {'directory': 'target-dir'} - ), - ( - '. -a sha256', - {'algorithm': 'sha256'} - ), + (". -a md5", {}), + (".. -a md5", {"directory": ".."}), + ("target-dir -a md5", {"directory": "target-dir"}), + (". -a sha256", {"algorithm": "sha256"}), # Filtering options - ( - '. -a md5 -m "*" "!.*"', - {'match': ['*', '!.*']} - ), + ('. -a md5 -m "*" "!.*"', {"match": ["*", "!.*"]}), ( '. 
-a md5 --match "d1/*" "d2/*" --ignore "*.txt"', - {'match': ['d1/*', 'd2/*'], 'ignore': ['*.txt']} - ), - ( - '. -a md5 --empty-dirs', - {'empty_dirs': True} - ), - ( - '. -a md5 --no-linked-dirs', - {'linked_dirs': False} - ), - ( - '. -a md5 --no-linked-files', - {'linked_files': False} + {"match": ["d1/*", "d2/*"], "ignore": ["*.txt"]}, ), + (". -a md5 --empty-dirs", {"empty_dirs": True}), + (". -a md5 --no-linked-dirs", {"linked_dirs": False}), + (". -a md5 --no-linked-files", {"linked_files": False}), # Protocol options - ( - '. -a md5 --allow-cyclic-links', - {'allow_cyclic_links': True} - - ), - ( - '. -a md5 --properties name', - {'entry_properties': ['name']} - - ), - ( - '. -a md5 --properties name data', - {'entry_properties': ['name', 'data']} - - ), + (". -a md5 --allow-cyclic-links", {"allow_cyclic_links": True}), + (". -a md5 --properties name", {"entry_properties": ["name"]}), + (". -a md5 --properties name data", {"entry_properties": ["name", "data"]}), # Implementation - ( - '. -a md5 -j 10', - {'jobs': 10} - ), - ( - '. -a md5 -s 32000', - {'chunk_size': 32000} - ), - ] + (". -a md5 -j 10", {"jobs": 10}), + (". -a md5 -s 32000", {"chunk_size": 32000}), + ], ) def test_get_kwargs(self, argstring, non_default_kwargs): from dirhash.cli import get_kwargs + kwargs_expected = { - 'list': False, - 'directory': '.', - 'algorithm': 'md5', - 'match': ['*'], - 'ignore': None, - 'empty_dirs': False, - 'linked_dirs': True, - 'linked_files': True, - 'entry_properties': ['data', 'name'], - 'allow_cyclic_links': False, - 'chunk_size': 2 ** 20, - 'jobs': 1 + "list": False, + "directory": ".", + "algorithm": "md5", + "match": ["*"], + "ignore": None, + "empty_dirs": False, + "linked_dirs": True, + "linked_files": True, + "entry_properties": ["data", "name"], + "allow_cyclic_links": False, + "chunk_size": 2**20, + "jobs": 1, } kwargs_expected.update(non_default_kwargs) kwargs = get_kwargs(shlex.split(argstring)) assert kwargs == kwargs_expected @pytest.mark.parametrize( - 'description, argstrings, output', + "description, argstrings, output", [ - ('ARGS WITHOUT EFFECT WHEN LISTING', - ['. -l', - '. --list', - '. -a md5 --list', - '. -a sha256 --list', - '. --properties name --list', - '. --jobs 2 --list', - '. --chunk-size 2 --list'], - ('.dir/file\n' - '.file\n' - 'dir/file\n' - 'file\n' - 'file.ext1\n' - 'file.ext2\n')), - ('IGNORE EXTENSION', - ['. -i "*.ext1" --list', - '. --ignore "*.ext1" --list', - '. -m "*" "!*.ext1" --list', - '. --match "*" "!*.ext1" --list'], - ('.dir/file\n' - '.file\n' - 'dir/file\n' - 'file\n' - 'file.ext2\n')), - ('IGNORE MULTIPLE EXTENSIONS', - ['. -i "*.ext1" "*.ext2" --list', - '. -i "*.ext*" --list'], - ('.dir/file\n' - '.file\n' - 'dir/file\n' - 'file\n')), - ('IGNORE HIDDEN', - ['. -i ".*" ".*/" --list'], - ('dir/file\n' - 'file\n' - 'file.ext1\n' - 'file.ext2\n')), - ('INCLUDE EMPTY', - ['. --empty-dirs --list'], - ('.dir/file\n' - '.file\n' - 'dir/file\n' - 'empty/.\n' - 'file\n' - 'file.ext1\n' - 'file.ext2\n')), - ] + ( + "ARGS WITHOUT EFFECT WHEN LISTING", + [ + ". -l", + ". --list", + ". -a md5 --list", + ". -a sha256 --list", + ". --properties name --list", + ". --jobs 2 --list", + ". --chunk-size 2 --list", + ], + ( + ".dir/file\n" + ".file\n" + "dir/file\n" + "file\n" + "file.ext1\n" + "file.ext2\n" + ), + ), + ( + "IGNORE EXTENSION", + [ + '. -i "*.ext1" --list', + '. --ignore "*.ext1" --list', + '. -m "*" "!*.ext1" --list', + '. 
--match "*" "!*.ext1" --list', + ], + (".dir/file\n" ".file\n" "dir/file\n" "file\n" "file.ext2\n"), + ), + ( + "IGNORE MULTIPLE EXTENSIONS", + ['. -i "*.ext1" "*.ext2" --list', '. -i "*.ext*" --list'], + (".dir/file\n" ".file\n" "dir/file\n" "file\n"), + ), + ( + "IGNORE HIDDEN", + ['. -i ".*" ".*/" --list'], + ("dir/file\n" "file\n" "file.ext1\n" "file.ext2\n"), + ), + ( + "INCLUDE EMPTY", + [". --empty-dirs --list"], + ( + ".dir/file\n" + ".file\n" + "dir/file\n" + "empty/.\n" + "file\n" + "file.ext1\n" + "file.ext2\n" + ), + ), + ], ) def test_list(self, description, argstrings, output, tmpdir): create_default_tree(tmpdir) @@ -210,24 +184,31 @@ def test_list(self, description, argstrings, output, tmpdir): for argstring in argstrings: o, error, returncode = dirhash_run(argstring) assert returncode == 0 - assert error == '' - assert o == output + assert error == "" + assert o == osp(output) @pytest.mark.parametrize( - 'argstring, kwargs, expected_hashes', + "argstring, kwargs, expected_hashes", [ - ('. -a md5', - {'algorithm': 'md5'}, - ['594c48dde0776b03eddeeb0232190be7', - 'd8ab965636d48e407b73b9dbba4cb928', - '050e7bc9ffcb09c15186c04e0f8026df'] - ), - ('. -a sha256', - {'algorithm': 'sha256'}, - ['23a04964149889e932ba3348fe22442f4f6a3b3fec616a386a70579ee857ab7b', - '7b76bac43e963f9561f37b96b92d7a174094bff230c6efbf1d8bf650e8b40b7a', - '7156da2b2e5a2926eb4b72e65f389343cb6aca0578f0aedcd6f7457abd67d8f5']), - ] + ( + ". -a md5", + {"algorithm": "md5"}, + [ + "594c48dde0776b03eddeeb0232190be7", + "d8ab965636d48e407b73b9dbba4cb928", + "050e7bc9ffcb09c15186c04e0f8026df", + ], + ), + ( + ". -a sha256", + {"algorithm": "sha256"}, + [ + "23a04964149889e932ba3348fe22442f4f6a3b3fec616a386a70579ee857ab7b", + "7b76bac43e963f9561f37b96b92d7a174094bff230c6efbf1d8bf650e8b40b7a", + "7156da2b2e5a2926eb4b72e65f389343cb6aca0578f0aedcd6f7457abd67d8f5", + ], + ), + ], ) def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): # verify same result from cmdline and library + regression test of actual @@ -235,28 +216,28 @@ def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): create_default_tree(tmpdir) with tmpdir.as_cwd(): for add_argstring, add_kwargs, expected_hash in zip( - ['', ' -p data', ' -p name'], + ["", " -p data", " -p name"], [ {}, - {'entry_properties': ['data']}, - {'entry_properties': ['name']}, + {"entry_properties": ["data"]}, + {"entry_properties": ["name"]}, ], - expected_hashes + expected_hashes, ): # run CLI full_argstring = argstring + add_argstring cli_out, error, returncode = dirhash_run(full_argstring) - assert error == '' + assert error == "" assert returncode == 0 - assert cli_out[-1] == '\n' + assert cli_out[-1] == "\n" cli_hash = cli_out[:-1] # run CLI multiproc - full_argstring_mp = argstring + add_argstring + ' --jobs 2' + full_argstring_mp = argstring + add_argstring + " --jobs 2" cli_out_mp, error_mp, returncode_mp = dirhash_run(full_argstring_mp) - assert error_mp == '' + assert error_mp == "" assert returncode_mp == 0 - assert cli_out_mp[-1] == '\n' + assert cli_out_mp[-1] == "\n" cli_hash_mp = cli_out_mp[:-1] # run lib function @@ -268,6 +249,6 @@ def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): def test_error_bad_argument(self, tmpdir): with tmpdir.as_cwd(): - o, error, returncode = dirhash_run('. --chunk-size not_an_int') + o, error, returncode = dirhash_run(". 
--chunk-size not_an_int") assert returncode > 0 - assert error != '' + assert error != "" diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index f082392..68df656 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -1,38 +1,46 @@ -from __future__ import print_function, division - +import hashlib import os import shutil -import hashlib import tempfile from time import sleep, time import pytest +from scantree import SymlinkRecursionError from dirhash import ( + Filter, + Protocol, _get_hasher_factory, - get_match_patterns, - included_paths, - dirhash, + _parmap, algorithms_available, algorithms_guaranteed, - Protocol, - _parmap, - Filter, - dirhash_impl + dirhash, + dirhash_impl, + get_match_patterns, + included_paths, ) -from scantree import SymlinkRecursionError -class TestGetHasherFactory(object): +def osp(path: str) -> str: + """Normalize path for OS.""" + if os.name == "nt": # pragma: no cover + return path.replace("/", "\\") + return path + +def map_osp(paths): + return [osp(path) for path in paths] + + +class TestGetHasherFactory: def test_get_guaranteed(self): algorithm_and_hasher_factory = [ - ('md5', hashlib.md5), - ('sha1', hashlib.sha1), - ('sha224', hashlib.sha224), - ('sha256', hashlib.sha256), - ('sha384', hashlib.sha384), - ('sha512', hashlib.sha512) + ("md5", hashlib.md5), + ("sha1", hashlib.sha1), + ("sha224", hashlib.sha224), + ("sha256", hashlib.sha256), + ("sha384", hashlib.sha384), + ("sha512", hashlib.sha512), ] assert algorithms_guaranteed == {a for a, _ in algorithm_and_hasher_factory} for algorithm, expected_hasher_factory in algorithm_and_hasher_factory: @@ -45,30 +53,28 @@ def test_get_available(self): try: hasher = hasher_factory() except ValueError as exc: - # Some "available" algorithms are not necessarily available (fails for e.g. - # 'ripemd160' in github actions for python 3.8). See: - # https://stackoverflow.com/questions/72409563/unsupported-hash-type-ripemd160-with-hashlib-in-python # noqa + # Some "available" algorithms are not necessarily available + # (fails for e.g. 'ripemd160' in github actions for python 3.8). 
+ # See: https://stackoverflow.com/questions/72409563/unsupported-hash-type-ripemd160-with-hashlib-in-python # noqa: E501 print(f"Failed to create hasher for {algorithm}: {exc}") assert exc.args[0] == f"unsupported hash type {algorithm}" hasher = None - + if hasher is not None: - assert hasattr(hasher, 'update') - assert hasattr(hasher, 'hexdigest') + assert hasattr(hasher, "update") + assert hasattr(hasher, "hexdigest") def test_not_available(self): with pytest.raises(ValueError): - _get_hasher_factory('not available') + _get_hasher_factory("not available") def test_bypass_hasher_factory(self): - # test standard hasher hasher_factory = _get_hasher_factory(hashlib.sha256) assert hasher_factory is hashlib.sha256 # test raise on custom hasher with bad interface - class IncompleteMockHasher(object): - + class IncompleteMockHasher: def __init__(self, *args, **kwargs): pass @@ -80,68 +86,65 @@ def update(self, *args, **kwargs): # test custom hasher with ok interface class MockHasher(IncompleteMockHasher): - def hexdigest(self): - return '' + return "" hasher_factory = _get_hasher_factory(MockHasher) assert hasher_factory is MockHasher -class TestGetMatchPatterns(object): - +class TestGetMatchPatterns: def test_default_match_all(self): ms = get_match_patterns() - assert ms == ['*'] + assert ms == ["*"] def test_only_match(self): - ms = get_match_patterns(match=['a*', 'b*']) - assert ms == ['a*', 'b*'] + ms = get_match_patterns(match=["a*", "b*"]) + assert ms == ["a*", "b*"] def test_only_ignore(self): - ms = get_match_patterns(ignore=['a*', 'b*']) - assert ms == ['*', '!a*', '!b*'] + ms = get_match_patterns(ignore=["a*", "b*"]) + assert ms == ["*", "!a*", "!b*"] def test_match_and_ignore(self): - ms = get_match_patterns(match=['a*'], ignore=['*.ext']) - assert ms == ['a*', '!*.ext'] + ms = get_match_patterns(match=["a*"], ignore=["*.ext"]) + assert ms == ["a*", "!*.ext"] def test_ignore_hidden(self): ms = get_match_patterns(ignore_hidden=True) - assert ms == ['*', '!.*', '!.*/'] + assert ms == ["*", "!.*", "!.*/"] # should not duplicate if present in (general) ignore - ms = get_match_patterns(ignore=['.*'], ignore_hidden=True) - assert ms == ['*', '!.*', '!.*/'] + ms = get_match_patterns(ignore=[".*"], ignore_hidden=True) + assert ms == ["*", "!.*", "!.*/"] - ms = get_match_patterns(ignore=['.*/'], ignore_hidden=True) - assert ms == ['*', '!.*/', '!.*'] + ms = get_match_patterns(ignore=[".*/"], ignore_hidden=True) + assert ms == ["*", "!.*/", "!.*"] - ms = get_match_patterns(ignore=['.*', '.*/'], ignore_hidden=True) - assert ms == ['*', '!.*', '!.*/'] + ms = get_match_patterns(ignore=[".*", ".*/"], ignore_hidden=True) + assert ms == ["*", "!.*", "!.*/"] def test_ignore_extensions(self): - ms = get_match_patterns(ignore_extensions=['.ext']) - assert ms == ['*', '!*.ext'] + ms = get_match_patterns(ignore_extensions=[".ext"]) + assert ms == ["*", "!*.ext"] # automatically adds '.' 
- ms = get_match_patterns(ignore_extensions=['ext']) - assert ms == ['*', '!*.ext'] + ms = get_match_patterns(ignore_extensions=["ext"]) + assert ms == ["*", "!*.ext"] # mixed also works - ms = get_match_patterns(ignore_extensions=['ext1', '.ext2']) - assert ms == ['*', '!*.ext1', '!*.ext2'] + ms = get_match_patterns(ignore_extensions=["ext1", ".ext2"]) + assert ms == ["*", "!*.ext1", "!*.ext2"] # should not duplicate if present in (general) ignore - ms = get_match_patterns(ignore=['*.ext'], ignore_extensions=['.ext']) - assert ms == ['*', '!*.ext'] + ms = get_match_patterns(ignore=["*.ext"], ignore_extensions=[".ext"]) + assert ms == ["*", "!*.ext"] - ms = get_match_patterns(ignore=['*.ext'], ignore_extensions=['ext']) - assert ms == ['*', '!*.ext'] + ms = get_match_patterns(ignore=["*.ext"], ignore_extensions=["ext"]) + assert ms == ["*", "!*.ext"] -class TempDirTest(object): - +class TempDirTest: def setup_method(self): self.dir = tempfile.mkdtemp() @@ -150,13 +153,13 @@ def teardown_method(self): shutil.rmtree(self.dir) def path_to(self, relpath): - return os.path.join(self.dir, relpath) + return os.path.join(self.dir, osp(relpath)) def mkdirs(self, dirpath): os.makedirs(self.path_to(dirpath)) def mkfile(self, relpath, content=None): - with open(self.path_to(relpath), 'w') as f: + with open(self.path_to(relpath), "w") as f: if content: f.write(content) @@ -173,546 +176,524 @@ class TestGetIncludedPaths(TempDirTest): # Integration tests with `pathspec` for basic use cases. def test_basic(self): - self.mkdirs('root/d1/d11') - self.mkdirs('root/d2') + self.mkdirs("root/d1/d11") + self.mkdirs("root/d2") - self.mkfile('root/f1') - self.mkfile('root/d1/f1') - self.mkfile('root/d1/d11/f1') - self.mkfile('root/d2/f1') + self.mkfile("root/f1") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/d11/f1") + self.mkfile("root/d2/f1") - expected_filepaths = ['d1/d11/f1', 'd1/f1', 'd2/f1', 'f1'] - filepaths = included_paths(self.path_to('root')) + expected_filepaths = map_osp(["d1/d11/f1", "d1/f1", "d2/f1", "f1"]) + filepaths = included_paths(self.path_to("root")) assert filepaths == expected_filepaths # end with '/' or not should not matter - filepaths = included_paths(self.path_to('root/')) + filepaths = included_paths(self.path_to("root/")) assert filepaths == expected_filepaths def test_not_a_directory(self): - self.mkdirs('root') - self.mkfile('root/f1') + self.mkdirs("root") + self.mkfile("root/f1") # does not exist with pytest.raises(ValueError): - included_paths(self.path_to('wrong_root')) + included_paths(self.path_to("wrong_root")) with pytest.raises(ValueError): - included_paths(self.path_to('root/f1')) + included_paths(self.path_to("root/f1")) def test_symlinked_file(self): - self.mkdirs('root') - self.mkfile('root/f1') - self.mkfile('linked_file') - self.symlink('linked_file', 'root/f2') + self.mkdirs("root") + self.mkfile("root/f1") + self.mkfile("linked_file") + self.symlink("linked_file", "root/f2") - filepaths = included_paths( - self.path_to('root'), - linked_files=True - ) - assert filepaths == ['f1', 'f2'] + filepaths = included_paths(self.path_to("root"), linked_files=True) + assert filepaths == ["f1", "f2"] - filepaths = included_paths( - self.path_to('root'), - linked_files=False - ) - assert filepaths == ['f1'] + filepaths = included_paths(self.path_to("root"), linked_files=False) + assert filepaths == ["f1"] # default is 'linked_files': True - filepaths = included_paths(self.path_to('root'), ) - assert filepaths == ['f1', 'f2'] + filepaths = included_paths( + 
self.path_to("root"), + ) + assert filepaths == ["f1", "f2"] def test_symlinked_dir(self): - self.mkdirs('root') - self.mkfile('root/f1') - self.mkdirs('linked_dir') - self.mkfile('linked_dir/f1') - self.mkfile('linked_dir/f2') - self.symlink('linked_dir', 'root/d1') + self.mkdirs("root") + self.mkfile("root/f1") + self.mkdirs("linked_dir") + self.mkfile("linked_dir/f1") + self.mkfile("linked_dir/f2") + self.symlink("linked_dir", "root/d1") - filepaths = included_paths( - self.path_to('root'), - linked_dirs=False - ) - assert filepaths == ['f1'] + filepaths = included_paths(self.path_to("root"), linked_dirs=False) + assert filepaths == ["f1"] - filepaths = included_paths( - self.path_to('root'), - linked_dirs=True - ) - assert filepaths == ['d1/f1', 'd1/f2', 'f1'] + filepaths = included_paths(self.path_to("root"), linked_dirs=True) + assert filepaths == map_osp(["d1/f1", "d1/f2", "f1"]) # default is 'linked_dirs': True - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['d1/f1', 'd1/f2', 'f1'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == map_osp(["d1/f1", "d1/f2", "f1"]) def test_cyclic_link(self): - self.mkdirs('root/d1') - self.symlink('root', 'root/d1/link_back') + self.mkdirs("root/d1") + self.symlink("root", "root/d1/link_back") with pytest.raises(SymlinkRecursionError) as exc_info: - included_paths( - self.path_to('root'), - allow_cyclic_links=False - ) - assert exc_info.value.real_path == os.path.realpath(self.path_to('root')) - assert exc_info.value.first_path == self.path_to('root/') - assert exc_info.value.second_path == self.path_to('root/d1/link_back') - assert str(exc_info.value).startswith('Symlink recursion:') + included_paths(self.path_to("root"), allow_cyclic_links=False) + assert exc_info.value.real_path == os.path.realpath(self.path_to("root")) + assert exc_info.value.first_path == self.path_to("root/") + assert exc_info.value.second_path == self.path_to("root/d1/link_back") + assert str(exc_info.value).startswith("Symlink recursion:") - filepaths = included_paths( - self.path_to('root'), - allow_cyclic_links=True - ) - assert filepaths == ['d1/link_back/.'] + filepaths = included_paths(self.path_to("root"), allow_cyclic_links=True) + assert filepaths == map_osp(["d1/link_back/."]) # default is 'allow_cyclic_links': False with pytest.raises(SymlinkRecursionError): - filepaths = included_paths(self.path_to('root')) + filepaths = included_paths(self.path_to("root")) + + def test_ignore_hidden(self): + self.mkdirs("root/d1") + self.mkdirs("root/.d2") + + self.mkfile("root/f1") + self.mkfile("root/.f2") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/.f2") + self.mkfile("root/.d2/f1") + + # no ignore + filepaths = included_paths(self.path_to("root")) + assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) + + # with ignore + filepaths = included_paths(self.path_to("root"), match=["*", "!.*"]) + assert filepaths == map_osp(["d1/f1", "f1"]) - def test_ignore_hidden_files(self): - self.mkdirs('root/d1') - self.mkdirs('root/.d2') + def test_ignore_hidden_files_only(self): + self.mkdirs("root/d1") + self.mkdirs("root/.d2") - self.mkfile('root/f1') - self.mkfile('root/.f2') - self.mkfile('root/d1/f1') - self.mkfile('root/d1/.f2') - self.mkfile('root/.d2/f1') + self.mkfile("root/f1") + self.mkfile("root/.f2") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/.f2") + self.mkfile("root/.d2/f1") # no ignore - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 
'd1/f1', 'f1'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) # with ignore filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*'] + self.path_to("root"), match=["**/*", "!**/.*", "**/.*/*", "!**/.*/.*"] ) - assert filepaths == ['.d2/f1', 'd1/f1', 'f1'] + assert filepaths == map_osp([".d2/f1", "d1/f1", "f1"]) - def test_exclude_hidden_dirs(self): - self.mkdirs('root/d1') - self.mkdirs('root/.d2') - self.mkdirs('root/d1/.d1') + def test_ignore_hidden_explicitly_recursive(self): + self.mkdirs("root/d1") + self.mkdirs("root/.d2") - self.mkfile('root/f1') - self.mkfile('root/.f2') - self.mkfile('root/d1/f1') - self.mkfile('root/d1/.f2') - self.mkfile('root/.d2/f1') + self.mkfile("root/f1") + self.mkfile("root/.f2") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/.f2") + self.mkfile("root/.d2/f1") # no ignore - filepaths = included_paths(self.path_to('root'), empty_dirs=True) - assert filepaths == ['.d2/f1', '.f2', 'd1/.d1/.', 'd1/.f2', 'd1/f1', 'f1'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) # with ignore - filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*/'] + filepaths = included_paths(self.path_to("root"), match=["*", "!**/.*"]) + assert filepaths == map_osp(["d1/f1", "f1"]) + + def test_exclude_hidden_dirs(self): + self.mkdirs("root/d1") + self.mkdirs("root/.d2") + self.mkdirs("root/d1/.d1") + + self.mkfile("root/f1") + self.mkfile("root/.f2") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/.f2") + self.mkfile("root/.d2/f1") + + # no ignore + filepaths = included_paths(self.path_to("root"), empty_dirs=True) + assert filepaths == map_osp( + [".d2/f1", ".f2", "d1/.d1/.", "d1/.f2", "d1/f1", "f1"] ) - assert filepaths == ['.f2', 'd1/.f2', 'd1/f1', 'f1'] + + # with ignore + filepaths = included_paths(self.path_to("root"), match=["*", "!.*/"]) + assert filepaths == map_osp([".f2", "d1/.f2", "d1/f1", "f1"]) def test_exclude_hidden_dirs_and_files(self): - self.mkdirs('root/d1') - self.mkdirs('root/.d2') + self.mkdirs("root/d1") + self.mkdirs("root/.d2") - self.mkfile('root/f1') - self.mkfile('root/.f2') - self.mkfile('root/d1/f1') - self.mkfile('root/d1/.f2') - self.mkfile('root/.d2/f1') + self.mkfile("root/f1") + self.mkfile("root/.f2") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/.f2") + self.mkfile("root/.d2/f1") # no ignore - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) # using ignore - filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*/', '!.*'] - ) - assert filepaths == ['d1/f1', 'f1'] + filepaths = included_paths(self.path_to("root"), match=["*", "!.*/", "!.*"]) + assert filepaths == map_osp(["d1/f1", "f1"]) def test_exclude_extensions(self): - self.mkdirs('root/d1') - - self.mkfile('root/f') - self.mkfile('root/f.txt') - self.mkfile('root/f.skip1') - self.mkfile('root/fskip1') - self.mkfile('root/f.skip2') - self.mkfile('root/f.skip1.txt') - self.mkfile('root/f.skip1.skip2') - self.mkfile('root/f.skip1skip2') - self.mkfile('root/d1/f.txt') - self.mkfile('root/d1/f.skip1') + self.mkdirs("root/d1") + + self.mkfile("root/f") + self.mkfile("root/f.txt") + self.mkfile("root/f.skip1") + self.mkfile("root/fskip1") + self.mkfile("root/f.skip2") + self.mkfile("root/f.skip1.txt") + 
self.mkfile("root/f.skip1.skip2") + self.mkfile("root/f.skip1skip2") + self.mkfile("root/d1/f.txt") + self.mkfile("root/d1/f.skip1") filepaths = included_paths( - self.path_to('root'), - match=['*', '!*.skip1', '!*.skip2'] + self.path_to("root"), match=["*", "!*.skip1", "!*.skip2"] + ) + assert filepaths == map_osp( + [ + "d1/f.txt", + "f", + "f.skip1.txt", + "f.skip1skip2", + "f.txt", + "fskip1", + ] ) - assert filepaths == [ - 'd1/f.txt', 'f', 'f.skip1.txt', 'f.skip1skip2', 'f.txt', 'fskip1'] def test_empty_dirs_include_vs_exclude(self): - self.mkdirs('root/d1') - self.mkdirs('root/d2') - self.mkdirs('root/d3/d31') - self.mkdirs('root/d4/d41') + self.mkdirs("root/d1") + self.mkdirs("root/d2") + self.mkdirs("root/d3/d31") + self.mkdirs("root/d4/d41") - self.mkfile('root/d1/f') - self.mkfile('root/d3/d31/f') + self.mkfile("root/d1/f") + self.mkfile("root/d3/d31/f") - filepaths = included_paths( - self.path_to('root'), - empty_dirs=False - ) - assert filepaths == ['d1/f', 'd3/d31/f'] + filepaths = included_paths(self.path_to("root"), empty_dirs=False) + assert filepaths == map_osp(["d1/f", "d3/d31/f"]) # `include_empty=False` is default - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['d1/f', 'd3/d31/f'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == map_osp(["d1/f", "d3/d31/f"]) - filepaths = included_paths( - self.path_to('root'), - empty_dirs=True - ) - assert filepaths == ['d1/f', 'd2/.', 'd3/d31/f', 'd4/d41/.'] + filepaths = included_paths(self.path_to("root"), empty_dirs=True) + assert filepaths == map_osp(["d1/f", "d2/.", "d3/d31/f", "d4/d41/."]) def test_empty_dirs_because_of_filter_include_vs_exclude(self): - self.mkdirs('root/d1') - self.mkdirs('root/d2') + self.mkdirs("root/d1") + self.mkdirs("root/d2") - self.mkfile('root/d1/f') - self.mkfile('root/d2/.f') + self.mkfile("root/d1/f") + self.mkfile("root/d2/.f") filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*'], - empty_dirs=False + self.path_to("root"), match=["*", "!.*"], empty_dirs=False ) - assert filepaths == ['d1/f'] + assert filepaths == map_osp(["d1/f"]) # `include_empty=False` is default filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*'], + self.path_to("root"), + match=["*", "!.*"], ) - assert filepaths == ['d1/f'] + assert filepaths == map_osp(["d1/f"]) filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*'], - empty_dirs=True + self.path_to("root"), match=["*", "!.*"], empty_dirs=True ) - assert filepaths == ['d1/f', 'd2/.'] + assert filepaths == map_osp(["d1/f", "d2/."]) def test_empty_dir_inclusion_not_affected_by_match(self): - self.mkdirs('root/d1') - self.mkdirs('root/.d2') + self.mkdirs("root/d1") + self.mkdirs("root/.d2") # NOTE that empty dirs are not excluded by match_patterns: filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*'], - empty_dirs=True + self.path_to("root"), match=["*", "!.*"], empty_dirs=True ) - assert filepaths == ['.d2/.', 'd1/.'] + assert filepaths == map_osp([".d2/.", "d1/."]) filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*/'], - empty_dirs=True + self.path_to("root"), match=["*", "!.*/"], empty_dirs=True ) - assert filepaths == ['.d2/.', 'd1/.'] + assert filepaths == map_osp([".d2/.", "d1/."]) filepaths = included_paths( - self.path_to('root'), - match=['*', '!d1'], - empty_dirs=True + self.path_to("root"), match=["*", "!d1"], empty_dirs=True ) - assert filepaths == ['.d2/.', 'd1/.'] + assert filepaths == map_osp([".d2/.", "d1/."]) def 
dirhash_mp_comp(*args, **kwargs): res = dirhash(*args, **kwargs) - res_mp = dirhash(jobs=2, *args, **kwargs) + res_mp = dirhash(*args, **{**kwargs, "jobs": 2}) assert res == res_mp return res class TestDirhash(TempDirTest): - def test_guaranteed_algorithms(self): - self.mkdirs('root/d1/d11') - self.mkdirs('root/d2') - self.mkfile('root/f1', 'a') - self.mkfile('root/d1/f1', 'b') - self.mkfile('root/d1/d11/f1', 'c') - self.mkfile('root/d2/f1', 'd') + self.mkdirs("root/d1/d11") + self.mkdirs("root/d2") + self.mkfile("root/f1", "a") + self.mkfile("root/d1/f1", "b") + self.mkfile("root/d1/d11/f1", "c") + self.mkfile("root/d2/f1", "d") for algorithm, expected_hash in [ - ('md5', '3c631c7f5771468a2187494f802fad8f'), - ('sha1', '992aa2d00d2ed94f0c19eff7f151f5c6a7e0cc41'), - ('sha224', '18013e1df933d5781b2eddb94aceeb7ab689643f1df24060fb478999'), - ('sha256', 'ef7e95269fbc0e3478ad31fddd1c7d08' - '907d189c61725332e8a2fd14448fe175'), - ('sha384', '64ef4360c172bc68250f9326ea231cd1' - '46a7fa1afe9d386cee0cae0e9f1b4ad2' - '1df050d1df436cff792bbe81d6698026'), - ('sha512', '7854226eb0278bc136056998890a8399' - 'f85ca383f7c54665026358d28b5dc716' - '0ec654d2bcebf5d60974f82ed820600d' - '8e807ea53d57578d076ec1c82f501208') + ("md5", "3c631c7f5771468a2187494f802fad8f"), + ("sha1", "992aa2d00d2ed94f0c19eff7f151f5c6a7e0cc41"), + ("sha224", "18013e1df933d5781b2eddb94aceeb7ab689643f1df24060fb478999"), + ( + "sha256", + "ef7e95269fbc0e3478ad31fddd1c7d08" "907d189c61725332e8a2fd14448fe175", + ), + ( + "sha384", + "64ef4360c172bc68250f9326ea231cd1" + "46a7fa1afe9d386cee0cae0e9f1b4ad2" + "1df050d1df436cff792bbe81d6698026", + ), + ( + "sha512", + "7854226eb0278bc136056998890a8399" + "f85ca383f7c54665026358d28b5dc716" + "0ec654d2bcebf5d60974f82ed820600d" + "8e807ea53d57578d076ec1c82f501208", + ), ]: - hash_value = dirhash_mp_comp(self.path_to('root'), algorithm) + hash_value = dirhash_mp_comp(self.path_to("root"), algorithm) assert hash_value == expected_hash def test_recursive_descriptor(self): - self.mkdirs('root/d1') - self.mkdirs('root/d2') - self.mkfile('root/f1', 'a') - self.mkfile('root/d1/f12', 'b') - - f1_desc = 'data:a\000name:f1' - f12_desc = 'data:b\000name:f12' - d1_desc = 'dirhash:{}\000name:d1'.format(f12_desc) - d2_desc = 'dirhash:\000name:d2' - - empty_dirs_false_expected = '\000\000'.join([f1_desc, d1_desc]) - empty_dirs_true_expected = '\000\000'.join([f1_desc, d2_desc, d1_desc]) - - empty_dirs_false = dirhash( - self.path_to('root'), - algorithm=IdentityHasher - ) + self.mkdirs("root/d1") + self.mkdirs("root/d2") + self.mkfile("root/f1", "a") + self.mkfile("root/d1/f12", "b") + + f1_desc = "data:a\000name:f1" + f12_desc = "data:b\000name:f12" + d1_desc = f"dirhash:{f12_desc}\000name:d1" + d2_desc = "dirhash:\000name:d2" + + empty_dirs_false_expected = "\000\000".join([f1_desc, d1_desc]) + empty_dirs_true_expected = "\000\000".join([f1_desc, d2_desc, d1_desc]) + + empty_dirs_false = dirhash(self.path_to("root"), algorithm=IdentityHasher) assert empty_dirs_false == empty_dirs_false_expected empty_dirs_true = dirhash( - self.path_to('root'), - algorithm=IdentityHasher, - empty_dirs=True + self.path_to("root"), algorithm=IdentityHasher, empty_dirs=True ) assert empty_dirs_true == empty_dirs_true_expected def test_symlinked_file(self): - self.mkdirs('root1') - self.mkfile('root1/f1', 'a') - self.mkfile('linked_file', 'b') - self.symlink('linked_file', 'root1/f2') + self.mkdirs("root1") + self.mkfile("root1/f1", "a") + self.mkfile("linked_file", "b") + self.symlink("linked_file", "root1/f2") - 
self.mkdirs('root2') - self.mkfile('root2/f1', 'a') - self.mkfile('root2/f2', 'b') + self.mkdirs("root2") + self.mkfile("root2/f1", "a") + self.mkfile("root2/f2", "b") root1_linked_files_true = dirhash_mp_comp( - self.path_to('root1'), algorithm='md5' + self.path_to("root1"), algorithm="md5" ) root1_linked_files_false = dirhash_mp_comp( - self.path_to('root1'), algorithm='md5', - linked_files=False + self.path_to("root1"), algorithm="md5", linked_files=False ) - root2 = dirhash_mp_comp( - self.path_to('root2'), algorithm='md5' - ) + root2 = dirhash_mp_comp(self.path_to("root2"), algorithm="md5") assert root1_linked_files_false != root1_linked_files_true assert root1_linked_files_true == root2 def test_symlinked_dir(self): - self.mkdirs('root1') - self.mkfile('root1/f1', 'a') - self.mkdirs('linked_dir') - self.mkfile('linked_dir/f1', 'b') - self.mkfile('linked_dir/f2', 'c') - self.symlink('linked_dir', 'root1/d1') - - self.mkdirs('root2') - self.mkfile('root2/f1', 'a') - self.mkdirs('root2/d1') - self.mkfile('root2/d1/f1', 'b') - self.mkfile('root2/d1/f2', 'c') + self.mkdirs("root1") + self.mkfile("root1/f1", "a") + self.mkdirs("linked_dir") + self.mkfile("linked_dir/f1", "b") + self.mkfile("linked_dir/f2", "c") + self.symlink("linked_dir", "root1/d1") + + self.mkdirs("root2") + self.mkfile("root2/f1", "a") + self.mkdirs("root2/d1") + self.mkfile("root2/d1/f1", "b") + self.mkfile("root2/d1/f2", "c") root1_linked_dirs_true = dirhash_mp_comp( - self.path_to('root1'), - algorithm='md5', - linked_dirs=True + self.path_to("root1"), algorithm="md5", linked_dirs=True ) root1_linked_dirs_false = dirhash_mp_comp( - self.path_to('root1'), - algorithm='md5', - linked_dirs=False - ) - root2 = dirhash_mp_comp( - self.path_to('root2'), algorithm='md5' + self.path_to("root1"), algorithm="md5", linked_dirs=False ) + root2 = dirhash_mp_comp(self.path_to("root2"), algorithm="md5") assert root1_linked_dirs_false != root1_linked_dirs_true assert root1_linked_dirs_true == root2 def test_cache_used_for_symlinks(self): - - self.mkdirs('root/dir') - self.mkfile('root/file', '< one chunk content') + self.mkdirs("root/dir") + self.mkfile("root/file", "< one chunk content") for i in range(10): - self.symlink('root/file', 'root/link_{}'.format(i)) + self.symlink("root/file", f"root/link_{i}") for i in range(10): - self.symlink('root/file', 'root/dir/link_{}'.format(i)) + self.symlink("root/file", f"root/dir/link_{i}") start = time() - dirhash(self.path_to('root'), algorithm=SlowHasher) + dirhash(self.path_to("root"), algorithm=SlowHasher) end = time() elapsed = end - start assert elapsed < SlowHasher.wait_time * 2 def test_raise_on_empty_root_without_include_empty(self): - self.mkdirs('root') + self.mkdirs("root") with pytest.raises(ValueError): - dirhash_mp_comp(self.path_to('root'), 'sha256') + dirhash_mp_comp(self.path_to("root"), "sha256") def test_empty_root_include_empty(self): - self.mkdirs('root') - dirhash_ = dirhash_mp_comp( - self.path_to('root'), - 'sha256', - empty_dirs=True - ) - expected_dirhash = hashlib.sha256(''.encode('utf-8')).hexdigest() + self.mkdirs("root") + dirhash_ = dirhash_mp_comp(self.path_to("root"), "sha256", empty_dirs=True) + expected_dirhash = hashlib.sha256(b"").hexdigest() assert dirhash_ == expected_dirhash def test_include_empty(self): - self.mkdirs('root/d1') - self.mkdirs('root/d2') - self.mkfile('root/d1/f') - - args = (self.path_to('root'), 'sha256') - dirhash_ = dirhash_mp_comp( - *args, - empty_dirs=False - ) - dirhash_empty = dirhash_mp_comp( - *args, - empty_dirs=True - ) 
+ self.mkdirs("root/d1") + self.mkdirs("root/d2") + self.mkfile("root/d1/f") + + args = (self.path_to("root"), "sha256") + dirhash_ = dirhash_mp_comp(*args, empty_dirs=False) + dirhash_empty = dirhash_mp_comp(*args, empty_dirs=True) assert dirhash_ != dirhash_empty def test_chunksize(self): - self.mkdirs('root') - self.mkfile('root/numbers.txt', str(range(1000))) + self.mkdirs("root") + self.mkfile("root/numbers.txt", str(range(1000))) - hash_value = dirhash_mp_comp(self.path_to('root'), 'sha256') + hash_value = dirhash_mp_comp(self.path_to("root"), "sha256") for chunk_size in [2**4, 2**8, 2**16]: - assert dirhash_mp_comp( - self.path_to('root'), - 'sha256', - chunk_size=chunk_size - ) == hash_value + assert ( + dirhash_mp_comp(self.path_to("root"), "sha256", chunk_size=chunk_size) + == hash_value + ) def test_data_only(self): - self.mkdirs('root1') - self.mkfile('root1/a.txt', 'abc') - self.mkfile('root1/b.txt', 'def') - self.mkdirs('root2') - self.mkfile('root2/a.txt', 'abc') - self.mkfile('root2/c.txt', 'def') - - hash1 = dirhash_mp_comp(self.path_to('root1'), 'sha256') - hash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256') + self.mkdirs("root1") + self.mkfile("root1/a.txt", "abc") + self.mkfile("root1/b.txt", "def") + self.mkdirs("root2") + self.mkfile("root2/a.txt", "abc") + self.mkfile("root2/c.txt", "def") + + hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256") + hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256") assert hash1 != hash2 # with entry hash remains the same as long as order of files is the # same [dhash1, dhash2] = [ - dirhash_mp_comp( - self.path_to(root), - 'sha256', - entry_properties=['data'] - ) for root in ['root1', 'root2'] + dirhash_mp_comp(self.path_to(root), "sha256", entry_properties=["data"]) + for root in ["root1", "root2"] ] assert dhash1 == dhash2 def test_name_only(self): - self.mkdirs('root1') - self.mkfile('root1/a.txt', 'abc') - self.mkfile('root1/b.txt', 'def') - self.mkdirs('root2') - self.mkfile('root2/a.txt', 'abc') - self.mkfile('root2/b.txt', '___') - - hash1 = dirhash_mp_comp(self.path_to('root1'), 'sha256') - hash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256') + self.mkdirs("root1") + self.mkfile("root1/a.txt", "abc") + self.mkfile("root1/b.txt", "def") + self.mkdirs("root2") + self.mkfile("root2/a.txt", "abc") + self.mkfile("root2/b.txt", "___") + + hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256") + hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256") assert hash1 != hash2 [dhash1, dhash2] = [ - dirhash_mp_comp( - self.path_to(root), - 'sha256', - entry_properties=['name'] - ) for root in ['root1', 'root2'] + dirhash_mp_comp(self.path_to(root), "sha256", entry_properties=["name"]) + for root in ["root1", "root2"] ] assert dhash1 == dhash2 def test_is_link_property(self): - self.mkdirs('root1') - self.mkfile('root1/a.txt', 'abc') - self.mkfile('root1/b.txt', 'def') - self.mkdirs('root2') - self.mkfile('b_target', 'def') - self.mkfile('root2/a.txt', 'abc') - self.symlink('b_target', 'root2/b.txt') - - hash1 = dirhash_mp_comp(self.path_to('root1'), 'sha256') - hash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256') + self.mkdirs("root1") + self.mkfile("root1/a.txt", "abc") + self.mkfile("root1/b.txt", "def") + self.mkdirs("root2") + self.mkfile("b_target", "def") + self.mkfile("root2/a.txt", "abc") + self.symlink("b_target", "root2/b.txt") + + hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256") + hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256") assert hash1 == hash2 for entry_properties in [ - 
['name', 'data', 'is_link'], - ['name', 'is_link'], - ['data', 'is_link'], + ["name", "data", "is_link"], + ["name", "is_link"], + ["data", "is_link"], ]: [hash1, hash2] = [ dirhash_mp_comp( - self.path_to(root), - 'sha256', - entry_properties=entry_properties - ) for root in ['root1', 'root2'] + self.path_to(root), "sha256", entry_properties=entry_properties + ) + for root in ["root1", "root2"] ] assert hash1 != hash2 def test_raise_on_not_at_least_one_of_name_and_data(self): - self.mkdirs('root1') - self.mkfile('root1/a.txt', 'abc') - dirhash_mp_comp(self.path_to('root1'), 'sha256') # check ok + self.mkdirs("root1") + self.mkfile("root1/a.txt", "abc") + dirhash_mp_comp(self.path_to("root1"), "sha256") # check ok with pytest.raises(ValueError): - dirhash_mp_comp( - self.path_to('root1'), - 'sha256', - entry_properties=[] - ) + dirhash_mp_comp(self.path_to("root1"), "sha256", entry_properties=[]) with pytest.raises(ValueError): dirhash_mp_comp( - self.path_to('root1'), - 'sha256', - entry_properties=['is_link'] + self.path_to("root1"), "sha256", entry_properties=["is_link"] ) + @pytest.mark.skipif( + os.name == "nt", + reason="TODO: not getting expected speedup on Windows.", + # TODO: see https://github.com/andhus/scantree/issues/25 + ) def test_multiproc_speedup(self): - - self.mkdirs('root/dir') + self.mkdirs("root/dir") num_files = 10 for i in range(num_files): - self.mkfile('root/file_{}'.format(i), '< one chunk content') + self.mkfile(f"root/file_{i}", "< one chunk content") expected_min_elapsed_sequential = SlowHasher.wait_time * num_files start = time() - dirhash(self.path_to('root'), algorithm=SlowHasher) + dirhash(self.path_to("root"), algorithm=SlowHasher) end = time() elapsed_sequential = end - start assert elapsed_sequential > expected_min_elapsed_sequential start = time() - dirhash(self.path_to('root'), algorithm=SlowHasher, jobs=num_files) + dirhash(self.path_to("root"), algorithm=SlowHasher, jobs=num_files) end = time() elapsed_muliproc = end - start assert elapsed_muliproc < 0.9 * expected_min_elapsed_sequential @@ -722,11 +703,11 @@ def test_cache_by_real_path_speedup(self, tmpdir): num_links = 10 # reference run without links - root1 = tmpdir.join('root1') + root1 = tmpdir.join("root1") root1.ensure(dir=True) for i in range(num_links): - file_i = root1.join('file_{}'.format(i)) - file_i.write('< one chunk content', ensure=True) + file_i = root1.join(f"file_{i}") + file_i.write("< one chunk content", ensure=True) wait_time = SlowHasher.wait_time expected_min_elapsed_no_links = wait_time * num_links @@ -738,12 +719,12 @@ def test_cache_by_real_path_speedup(self, tmpdir): overhead = elapsed_no_links - expected_min_elapsed_no_links # all links to same file - root2 = tmpdir.join('root2') + root2 = tmpdir.join("root2") root2.ensure(dir=True) - target_file = tmpdir.join('target_file') + target_file = tmpdir.join("target_file") target_file.ensure() for i in range(num_links): - root2.join('link_{}'.format(i)).mksymlinkto(target_file) + os.symlink(target_file, root2.join(f"link_{i}")) overhead_margin_factor = 1.5 expected_max_elapsed_with_links = overhead * overhead_margin_factor + wait_time @@ -755,16 +736,16 @@ def test_cache_by_real_path_speedup(self, tmpdir): assert elapsed_with_links < expected_max_elapsed_with_links def test_cache_together_with_multiprocess_speedup(self, tmpdir): - target_file_names = ['target_file_1', 'target_file_2'] + target_file_names = ["target_file_1", "target_file_2"] num_links_per_file = 10 num_links = num_links_per_file * 
len(target_file_names) # reference run without links - root1 = tmpdir.join('root1') + root1 = tmpdir.join("root1") root1.ensure(dir=True) for i in range(num_links): - file_i = root1.join('file_{}'.format(i)) - file_i.write('< one chunk content', ensure=True) + file_i = root1.join(f"file_{i}") + file_i.write("< one chunk content", ensure=True) jobs = 2 wait_time = SlowHasher.wait_time @@ -776,16 +757,18 @@ def test_cache_together_with_multiprocess_speedup(self, tmpdir): assert elapsed_no_links > expected_min_elapsed_no_links overhead = elapsed_no_links - expected_min_elapsed_no_links - root2 = tmpdir.join('root2') + root2 = tmpdir.join("root2") root2.ensure(dir=True) for i, target_file_name in enumerate(target_file_names): target_file = tmpdir.join(target_file_name) - target_file.write('< one chunk content', ensure=True) + target_file.write("< one chunk content", ensure=True) for j in range(num_links_per_file): - root2.join('link_{}_{}'.format(i, j)).mksymlinkto(target_file) + os.symlink(target_file, root2.join(f"link_{i}_{j}")) overhead_margin_factor = 1.5 - expected_max_elapsed_with_links = overhead * overhead_margin_factor + wait_time * 2 + expected_max_elapsed_with_links = ( + overhead * overhead_margin_factor + wait_time * 2 + ) assert expected_max_elapsed_with_links < expected_min_elapsed_no_links start = time() dirhash(root2, algorithm=SlowHasher, jobs=jobs) @@ -794,84 +777,74 @@ def test_cache_together_with_multiprocess_speedup(self, tmpdir): assert elapsed_mp_with_links < expected_max_elapsed_with_links def test_hash_cyclic_link_to_root(self): - self.mkdirs('root/d1') - self.symlink('root', 'root/d1/link_back') - dirhash( - self.path_to('root'), - 'sha256', - allow_cyclic_links=True - ) + self.mkdirs("root/d1") + self.symlink("root", "root/d1/link_back") + dirhash(self.path_to("root"), "sha256", allow_cyclic_links=True) def test_hash_cyclic_link(self): - self.mkdirs('root/d1/d2') - self.symlink('root/d1', 'root/d1/d2/link_back') - dirhash( - self.path_to('root'), - 'sha256', - allow_cyclic_links=True - ) + self.mkdirs("root/d1/d2") + self.symlink("root/d1", "root/d1/d2/link_back") + dirhash(self.path_to("root"), "sha256", allow_cyclic_links=True) def test_pass_filtering_instance(self): - self.mkdirs('root') - self.mkfile('root/f1', '') - dirhash_impl(self.path_to('root'), 'sha256', filter_=Filter()) + self.mkdirs("root") + self.mkfile("root/f1", "") + dirhash_impl(self.path_to("root"), "sha256", filter_=Filter()) def test_pass_protocol_instance(self): - self.mkdirs('root') - self.mkfile('root/f1', '') - dirhash_impl(self.path_to('root'), 'sha256', protocol=Protocol()) + self.mkdirs("root") + self.mkfile("root/f1", "") + dirhash_impl(self.path_to("root"), "sha256", protocol=Protocol()) def test_raise_on_wrong_type(self): - self.mkdirs('root') - self.mkfile('root/f1', '') + self.mkdirs("root") + self.mkfile("root/f1", "") with pytest.raises(TypeError): - dirhash_impl(self.path_to('root'), 'sha256', filter_='') + dirhash_impl(self.path_to("root"), "sha256", filter_="") with pytest.raises(TypeError): - dirhash_impl(self.path_to('root'), 'sha256', protocol='') + dirhash_impl(self.path_to("root"), "sha256", protocol="") -class SlowHasher(object): +class SlowHasher: wait_time = 0.25 def __init__(self, *args, **kwargs): pass def update(self, data): - if data != b'': + if data != b"": sleep(self.wait_time) def hexdigest(self): - return '' + return "" -class IdentityHasher(object): - - def __init__(self, initial_data=b''): - self.datas = [initial_data.decode('utf-8')] +class IdentityHasher: 
+    def __init__(self, initial_data=b""):
+        self.datas = [initial_data.decode("utf-8")]
     def update(self, data):
-        self.datas.append(data.decode('utf-8'))
+        self.datas.append(data.decode("utf-8"))
     def hexdigest(self):
-        return ''.join(self.datas)
-
+        return "".join(self.datas)
-class TestProtocol(object):
+class TestProtocol:
     def test_raise_for_invalid_entry_properties(self):
         with pytest.raises(ValueError):
-            Protocol(entry_properties=['not-valid'])
+            Protocol(entry_properties=["not-valid"])
     def test_raise_for_invalid_allow_cyclic_links(self):
         with pytest.raises(ValueError):
-            Protocol(allow_cyclic_links='not-valid')
+            Protocol(allow_cyclic_links="not-valid")
 def mock_func(x):
     return x * 2
-@pytest.mark.parametrize('jobs', [1, 2, 4])
+@pytest.mark.parametrize("jobs", [1, 2, 4])
 def test_parmap(jobs):
     inputs = [1, 2, 3, 4]
     assert _parmap(mock_func, inputs, jobs=jobs) == [2, 4, 6, 8]
diff --git a/tox.ini b/tox.ini
index afa3d43..168724a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,12 +1,17 @@
 [tox]
-envlist = py{38,39,310,311,312}
+envlist = pre-commit,py{38,39,310,311,312}
 [testenv]
 deps =
     pytest
     pytest-cov
 commands =
-    pytest --cov=scantree --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc tests/
+    pytest --cov=dirhash --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc {posargs:tests}
+
+[testenv:pre-commit]
+skip_install = true
+deps = pre-commit
+commands = pre-commit run --all-files --show-diff-on-failure
 [gh-actions]
 python =