From 3d230022301d46cf8a1d4191b1f8d8b843c9758e Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Sat, 18 Apr 2020 14:01:15 +0200 Subject: [PATCH 01/51] update references according to separation of standard and python implementation --- DIRHASH_STANDARD.md | 212 ---------------------------------------- README.md | 14 +-- src/dirhash/__init__.py | 2 +- src/dirhash/cli.py | 2 +- 4 files changed, 9 insertions(+), 221 deletions(-) delete mode 100644 DIRHASH_STANDARD.md diff --git a/DIRHASH_STANDARD.md b/DIRHASH_STANDARD.md deleted file mode 100644 index e5e7d1c..0000000 --- a/DIRHASH_STANDARD.md +++ /dev/null @@ -1,212 +0,0 @@ -# The Dirhash Standard - -[https://github.com/andhus/dirhash/DIRHASH_STANDARD.md](https://github.com/andhus/dirhash/DIRHASH_STANDARD.md -) - -VERSION: 0.1.0 - - -## Table of Contents - -- [Introduction](#introduction) -- [Hash Function](#hash-function) -- [Filtering](#filtering) -- [Protocol](#protocol) -- [Error Conditions](#error-conditions) -- [The `DIRSUM` Object](#the-dirsum-object) -- [Extending the Dirhash Standard](#extending-the-dirhash-standard) -- [Contribute](#contribute) -- [Appendix](#appendix) - -## Introduction -The Dirhash Standard describes a formal procedure for computing a single hash value, the `DIRHASH`, of a filesystem directory. - ---- - -*TL;DR*: Data and metadata of files and subdirectories are hashed recursively by a standard hash function. The Dirhash Standard defines options for filtering which files to include, which data and metadata to consider for the included files, and a protocol for how any piece of information is fed to the hash function. A fixed set of options, including the hash function, yields a deterministic hash value for any directory under the Dirhash Standard. - ---- - -The standard is designed with the following objectives: -- *Platform and filesystem agnostic*. -- *Easy to understand and implement* in any language. -- *Extendable* - instead of trying to cover all possible use cases. -- *Same collision resistance* as the underlying hashing algorithm. - -The process of computing the `DIRHASH` relies on the concepts below. Each concept is configurable by one or several options, which *affect the obtained hash value* (except for "Implementation" options): -1. **Hash Function**: The underlying hash function used to map any data to a fixed length hash value. -2. **Filtering**: The process of selecting what *entries* (subdirectories and files) within the directory to include. -3. **Protocol**: Defines which data and metadata to include for each entry and precisely how it is fed to the hash function for a reproducible and collision resistant result. -4. **Implementation**: Additional, implementation-specific, aspects of how the `DIRHASH` is programmatically obtained. - -Since the configuration options for 1-3 above affect the obtained `DIRHASH`, they must be represented in a [`DIRHASH` based checksum](#the-dirsum-object) of a directory. Consequently, the naming and interpretation of these options must be standardized to allow for communication and verification of such checksums. -It is not required that a given package/module implementing the Dirhash Standard strictly follows the option naming (e.g. to adhere to language-specific conventions), as long as the options are properly translated to the standard options. - -Implementation options are naturally not covered by the Dirhash Standard as these *should not affect the obtained* `DIRHASH`. Such options should be clearly distinguished from those in 1-3.
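To make the *TL;DR* above concrete, here is a minimal Python sketch of the recursive structure the standard describes. This is an illustration only, not the normative algorithm: filtering, symlink handling and the error conditions specified in the sections below are omitted, and the entry properties are fixed to `name` and `data`:

```python
import hashlib
import os


def dirhash_sketch(path, algorithm="md5"):
    """Illustrative sketch of the recursive DIRHASH computation (non-normative)."""
    entry_descriptors = []
    for name in os.listdir(path):
        full = os.path.join(path, name)
        if os.path.isdir(full):
            # The recursive part: a subdirectory is summarized by its own DIRHASH.
            properties = {"dirhash": dirhash_sketch(full, algorithm), "name": name}
        else:
            with open(full, "rb") as f:
                properties = {
                    "data": hashlib.new(algorithm, f.read()).hexdigest(),
                    "name": name,
                }
        # Properties are joined by one null character, entries by two,
        # both in sorted order (see the Protocol section below).
        entry_descriptors.append(
            "\000".join(sorted(f"{k}:{v}" for k, v in properties.items()))
        )
    dir_descriptor = "\000\000".join(sorted(entry_descriptors))
    return hashlib.new(algorithm, dir_descriptor.encode("utf-8")).hexdigest()
```

With the hash function and all options fixed, the result is deterministic for a given directory tree.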
- - - -## Hash Function -All data and metadata which forms the basis for the `DIRHASH` is hashed by a standard hash function. The Dirhash Standard defines precisely how data is fed to the hash function. The hash function is specified using the single option `algorithm`. The following algorithms are covered by the Dirhash Standard: [`"md5"`](https://www.ietf.org/rfc/rfc1321.txt), [`"sha1"`](https://tools.ietf.org/html/rfc3174), [`"sha224"`, `"sha256"`, `"sha384"`, `"sha512"`](https://tools.ietf.org/html/rfc6234), but the set can naturally be extended. - - -### Hash Function Options -Name | Type | Default | Description ----- | ---------------- | ------------------- | ----------- -algorithm | String | (no default) | The standard hashing algorithm (function) to use. - - - - -## Filtering -Filtering governs what files and subdirectories to include. This is done by matching of file paths as well as handling of symbolic links and empty directories. - - -### Filtering Options -Name | Type | Default | Description ----- | ---------------- | ------------------- | ----------- -match_patterns | Array of Strings | `["*"]` (match all) | [Glob/Wildcard matching](https://en.wikipedia.org/wiki/Glob_(programming)). The path *relative to the Dirhash root* is matched against the provided patterns. The path must match at least one of the "match patterns" (*not* starting with `!`) and none of the "ignore patterns" (starting with `!`). -linked_dirs | Boolean | `true` | Include (i.e. follow) symbolic links to directories. -linked_files | Boolean | `true` | Include symbolic links to files. -empty_dirs | Boolean | `false` | Include empty directories. A directory is considered empty if it contains no files or directories to include *given the Filtering Options*. - - - - -## Protocol -The `DIRHASH` of a directory is obtained by taking the underlying hash function's hexdigest of a `DIR-DESCRIPTOR` string. The `DIR-DESCRIPTOR` is composed by concatenation of an ordered sequence of `ENTRY-DESCRIPTOR`s, separated by two [null characters](https://en.wikipedia.org/wiki/Null_character): - -```<entry-descriptor-1>\000\000<entry-descriptor-2>[...]```, - -in Python: - -```python -# entry_descriptors: List[str] -dir_descriptor = '\000\000'.join(sorted(entry_descriptors)) -``` - -A *directory entry* is either a subdirectory, a file or a symbolic link. Other file types (named pipe, socket, device file, door) are excluded in the core version of the Dirhash Standard. No distinction is made between files and "hard links" to files. - -The `ENTRY-DESCRIPTOR` is composed by concatenation of an ordered sequence of entry properties separated by a single [null character](https://en.wikipedia.org/wiki/Null_character). Each property is represented by its name and value separated by a colon `:`: - -```<property-1-name>:<property-1-value>\000[...]<property-n-name>:<property-n-value>```, - -in Python: - -```python -# entry_properties: Dict[str, str] -entry_property_strings = [f'{k}:{v}' for k, v in entry_properties.items()] -entry_descriptor = '\000'.join(sorted(entry_property_strings)) -``` -The null character is the only character not allowed in property names or values under the Dirhash Standard, to maintain [collision resistance](https://en.wikipedia.org/wiki/Collision_resistance). - - -### Entry Properties -An entry property refers to data or metadata of a directory entry. - -Name | Value | Inclusion | Comment/Rationale ------ | ----- | --------- | ----------------- -dirhash | The `DIRHASH` of a subdirectory or the target directory in case of a symbolic link, except for [cyclic links](#cyclic-links). | Always included for directories.
*Not applicable to files*. | This is the recursive part of the Dirhash Protocol; the content of each subdirectory is summarized by its `DIRHASH`. -data | Hash function hexdigest of the binary data of the file, or the file linked to, if a symlink. | Optional, but one of `name` and `data` must always be included. *Not applicable to directories*. | For the typical use case, the data should affect the hash. Without it, only the paths to files and subdirectories are hashed. -name | The name of the entry (the name of the link itself if a symlink, *not* the entry linked to). | Optional, but one of `name` and `data` must always be included. | For the typical use case, the entry name should affect the hash, so that data and other metadata are tied to the name and, in turn, to the entry's relative path to the Dirhash root (which follows from the recursive nature of the Dirhash Standard). -is_link | Whether the entry is a symlink, one of `"true"` or `"false"`. | Optional. | For the typical use case, it does *not* matter if a file or directory is linked or not - the file tree is "perceived the same" for many applications. If it matters, this property can be included. - - -### Cyclic Links -Symbolically linked directories can create cycles in the otherwise acyclic graph representing the file tree. If not handled properly, this leads to infinite recursion when traversing the file tree (this is e.g. the case for Python's built-in [`os.walk(directory, followlinks=True)`](https://stackoverflow.com/questions/36977259/avoiding-infinite-recursion-with-os-walk/36977580)). Moreover, it breaks the recursive definition of the Dirhash Protocol, which offers two alternative solutions for this special case, specified by the option `allow_cyclic_links`. - -#### `allow_cyclic_links: false` -The first option is to consider cyclic links an [error condition](#error-conditions) and raise an appropriate exception when detected (preferably before reaching the recursion limit of the language of implementation!). - -#### `allow_cyclic_links: true` -The other option is to replace the dirhash value for the cyclic link with the hash function hexdigest of the relative path from the link to the target. The path is normalized according to the Unix standard (with a forward slash `/` separating directories) and without a leading or trailing slash. - -Sometimes multiple links form cycles together. Without loss of generality, cyclic links are defined as the *first occurrence of a link to a directory that has already been visited on the current branch of recursion*. The real path (or inode and device ID) of visited directories, together with the path relative to the Dirhash root, must typically be cached during traversal of the file tree to identify and resolve cyclic links. For further details, see these [examples](#cyclic-links-examples). - - -### Protocol Options -Name | Type | Default | Description ----- | ---------------- | ------------------- | ----------- -entry_properties | Array of Strings | `["name", "data"]` | Which Directory Entry properties to consider. NOTE that `type` is a mandatory property and should not be provided. -allow_cyclic_links | Boolean | `false` | Whether or not to allow the presence of [cyclic links](#cyclic-links). - -The Dirhash Protocol is designed so that the same hash should not be obtained with different Protocol Options. Consequently, when the same hash is obtained, one can be sure that the same Protocol Options were used.
The options must still be provided when comparing checksums, but this removes the risk of false positives (confirmation of the checksum) due to the wrong options being used. - - - -## Error Conditions -**Directory Not Accessible**: An inaccessible (sub)directory results in an error unless excluded by the `match_patterns` filtering option. - -**File Not Accessible**: An inaccessible file results in an error if the entry property `data` is used, unless excluded by the `match_patterns` filtering option. - -**Cyclic Symbolic Links**: Presence of cyclic links, with `allow_cyclic_links` set to `false`. - -**Directory Empty**: No (non-empty) directory entries to hash in the Dirhash root directory, given provided Filtering Options and `empty_dirs` set to `false`. - - - -## The `DIRSUM` Object -Checksums based on the `DIRHASH` must contain the additional configuration options to be properly validated (as was discussed in the [Introduction](#introduction)). For this purpose, the Dirhash Standard provides the `DIRSUM` object which contains the DIRHASH value as well as the necessary information for verification. Its structure is laid out in JSON below, with properties according to [Hash Function Options](#hash-function-options), [Filtering Options](#filtering-options) and [Protocol Options](#protocol-options), as well as a version property (as stated at the top of this document) indicating which version of this standard the DIRHASH computation complies with. -```json -{ - "dirhash": "...", - "algorithm": "...", - "filtering": { - "match_patterns": ["*"], - "linked_dirs": true, - "linked_files": true, - "empty_dirs": false}, - "protocol": { - "entry_properties": ["name", "data"], - "allow_cyclic_links": false}, - "version": "0.1.0" -} -``` -When saved to file, the recommended extension is `.dirsum.json`. - - -## Extending the Dirhash Standard -The Dirhash Standard can be extended by introducing additional Filtering Options and/or entry properties. A few possible examples below: -- **Permission Properties**: These are hard to make platform-independent (they differ between Windows and Unix). The best option is probably to let go of platform independence here. Possible properties could be `permission-[owner|group|user]` or `permission-me` for permissions of the current -process. -- **Owning User/Group** -- **Last Modified/Opened Properties** - - -## Contribute -If you find a bug, inconsistency or weakness in the Dirhash Standard, or find that the documentation or the Standard itself can be simplified without loss of generality, please file an issue at [https://github.com/andhus/dirhash](https://github.com/andhus/dirhash). - - -If you have a use case that is not covered, it can hopefully be supported by an extension of the Standard. Please file an issue or make a PR if you think that it can benefit others. - -## Appendix - -### Cyclic Links: Examples - -In the example below there are cycles on all branches `A/B`, `A/C` and `D`. -``` -root/ -|__A/ -| |__B/ -| | |__toA@ -> .. -| |__C/ -| |__toA@ -> .. -|__D/ - |__toB@ -> ../A/B -``` -In this case, the value of the dirhash property for the symlinks `A/B/toA`, `A/C/toA` and `D/toB/toA/B/toA` is replaced by the hash of `".."`. Note that for the third branch, the presence of cyclic links can be *detected* already at `D/toB/toA/B` (since `B` is already visited) but it is for `D/toB/toA/B/toA` that the replacement happens.
This reflects the fact that it is the `toA` that's *causing* the cycle, not `D/toB` or `D/toB/toA/B` (which is not even a link), and at `D/toB/toA/` the cycle is not yet detected. - -Below is another example where multiple links are involved in forming cycles, as well as links whose absolute path is external to the Dirhash root. In this case the cyclic links and relative paths to hash are: `root/A/toB/toA` (`"../A"`), `root/B/toA/toB` (`"../B"`) and `root/C/toD/toC` (`"../.."`). - -``` -/path/to/root/ - |__A/ - | |__toB@ -> ../B - |__B/ - | |__toA@ -> /path/to/root/A - |__C/ - |__toD@ -> /path/to/D - -/path/to/D/ - |__toC@ -> /path/to/root/C -``` diff --git a/README.md b/README.md index f8ff3b5..ca6b06a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -[![Build Status](https://travis-ci.com/andhus/dirhash.svg?branch=master)](https://travis-ci.com/andhus/dirhash) -[![codecov](https://codecov.io/gh/andhus/dirhash/branch/master/graph/badge.svg)](https://codecov.io/gh/andhus/dirhash) +[![Build Status](https://travis-ci.com/andhus/dirhash-python.svg?branch=master)](https://travis-ci.com/andhus/dirhash-python) +[![codecov](https://codecov.io/gh/andhus/dirhash-python/branch/master/graph/badge.svg)](https://codecov.io/gh/andhus/dirhash-python) # dirhash A lightweight python module and tool for computing the hash of any @@ -9,7 +9,7 @@ directory based on its files' structure and content. include/exclude. - Multiprocessing for up to [6x speed-up](#performance) -The hash is computed according to the [Dirhash Standard](https://github.com/andhus/dirhash/DIRHASH_STANDARD.md), which is designed to allow for consistent and collision resistant generation/verification of directory hashes across implementations. +The hash is computed according to the [Dirhash Standard](https://github.com/andhus/dirhash), which is designed to allow for consistent and collision resistant generation/verification of directory hashes across implementations. ## Installation From PyPI: ```commandline pip install dirhash ``` Or directly from source: ```commandline -git clone git@github.com:andhus/dirhash.git +git clone git@github.com:andhus/dirhash-standard.git pip install dirhash/ ``` @@ -68,7 +68,7 @@ and executing `hashlib` code. The main effort to boost performance is support for multiprocessing, where the reading and hashing is parallelized over individual files. -As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash/blob/master/dirhash/cli.py) +As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash/dirhash-python/cli.py) with the shell command: `find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` @@ -89,7 +89,7 @@ shell reference | nested_32k_32kB | 6.82 | -> 1.0 `dirhash` | nested_32k_32kB | 3.43 | 2.00 `dirhash`(8 workers)| nested_32k_32kB | 1.14 | **6.00** -The benchmark was run a MacBook Pro (2018), further details and source code [here](https://github.com/andhus/dirhash/tree/master/benchmark). +The benchmark was run on a MacBook Pro (2018), further details and source code [here](https://github.com/andhus/dirhash-python/benchmark). ## Documentation -Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash/blob/master/dirhash/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash/DIRHASH_STANDARD.md).
\ No newline at end of file +Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash/dirhash-python/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). \ No newline at end of file diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 5df9bd7..599609e 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -112,7 +112,7 @@ def dirhash( `protocol={'allow_cyclic_links': True}`. # References - See https://github.com/andhus/dirhash/DIRHASH_STANDARD.md for a formal + See https://github.com/andhus/dirhash/README.md for a formal description of how the returned hash value is computed. """ diff --git a/src/dirhash/cli.py b/src/dirhash/cli.py index 0b994ae..faaf54d 100644 --- a/src/dirhash/cli.py +++ b/src/dirhash/cli.py @@ -69,7 +69,7 @@ def get_kwargs(args): '`dirhash path/to.dir -m "*" "!.*" "!.*/"`. By adding the --list ' 'argument, all included paths, for the given filtering arguments, are ' 'returned instead of the hash value. For further details see ' - 'https://github.com/andhus/dirhash/DIRHASH_STANDARD.md#filtering' + 'https://github.com/andhus/dirhash/README.md#filtering' ) ) filter_options.add_argument( From a4c267e4d825c3cae9fed33ee0e0f966379524ab Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Sat, 18 Apr 2020 14:06:46 +0200 Subject: [PATCH 02/51] update repo ref and bump version in setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 4ba2cbe..242919a 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ import os from setuptools import setup, find_packages -VERSION = '0.1.1' +VERSION = '0.2.0' DESCRIPTION = 'Python module and CLI for hashing of file system directories.' @@ -19,7 +19,7 @@ description=DESCRIPTION, long_description=long_description, long_description_content_type="text/markdown", - url='https://github.com/andhus/dirhash', + url='https://github.com/andhus/dirhash-python', author="Anders Huss", author_email="andhus@kth.se", license='MIT', From a1ba7c1017cabe1cfc09971a3ea46d931de1cdb3 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Sun, 19 Apr 2020 12:55:16 +0200 Subject: [PATCH 03/51] draft of changelog and update of benchmark/run.py --- CHANGELOG.md | 25 +++++++++++++++++++++++++ benchmark/run.py | 8 ++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..88a7784 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,25 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] +NIL + +## [0.2.0] - 2020-04-18 +Complies with [Dirhash Standard](https://github.com/andhus/dirhash) Version [0.1.0](https://github.com/andhus/dirhash/releases/0.1.0) + +### Added +- A first implementation based on the formalized [Dirhash Standard](https://github.com/andhus/dirhash). +- This changelog. +- Results from a new benchmark run after the changes. The `benchmark/run.py` now outputs result files whose names include the `dirhash.__version__`.
### Changed +- **Significant breaking changes** from version 0.1.1 - both regarding API and the +underlying method/protocol for computing the hash. This means that **hashes +computed with this version will differ from hashes computed with version < 0.2.0 for +the same directory**. +- The dirhash python implementation has moved here +[github.com/andhus/dirhash](https://github.com/andhus/dirhash) from the previous re \ No newline at end of file diff --git a/benchmark/run.py b/benchmark/run.py index 0d9a10d..f930b2e 100644 --- a/benchmark/run.py +++ b/benchmark/run.py @@ -6,6 +6,8 @@ from statistics import median, mean +from dirhash import __version__ + BENCHMARK_ROOT = os.path.abspath( os.path.join(__file__, os.pardir) @@ -164,7 +166,9 @@ def benchmark(dirpath, algorithm, **kwargs): result = benchmark(test_case, algorithm=alg, runs=5, repetitions=1) results.extend(result) - with open(os.path.join(BENCHMARK_ROOT, 'results.json'), 'w') as f: + result_fname = 'results_v{}'.format(__version__) + + with open(os.path.join(BENCHMARK_ROOT, result_fname + '.json'), 'w') as f: json.dump(results, f, indent=4) try: @@ -188,6 +192,6 @@ def benchmark(dirpath, algorithm, **kwargs): print(df_hd_1w) print('\nAverage speedup multiprocess (8 workers): {}'.format(mean_speedup_8w)) print(df_hd_8w) - df.to_csv(os.path.join(BENCHMARK_ROOT, 'results.csv')) + df.to_csv(os.path.join(BENCHMARK_ROOT, result_fname + '.csv')) except ImportError: pass From 39c47853d55792e5d68f4073494c30591338e577 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Sun, 19 Apr 2020 13:03:16 +0200 Subject: [PATCH 04/51] fixes in changelog --- CHANGELOG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88a7784..dcb417a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,5 +21,8 @@ Complies with [Dirhash Standard](https://github.com/andhus/dirhash) Version [0.1 underlying method/protocol for computing the hash. This means that **hashes computed with this version will differ from hashes computed with version < 0.2.0 for the same directory**. -- The dirhash python implementation has moved here -[github.com/andhus/dirhash](https://github.com/andhus/dirhash) from the previous re \ No newline at end of file +- This dirhash python implementation has moved here +[github.com/andhus/dirhash-python](https://github.com/andhus/dirhash-python) from +the previous repository +[github.com/andhus/dirhash](https://github.com/andhus/dirhash) +which now contains the formal description of the Dirhash Standard.
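The next patch flattens the keyword arguments of `dirhash` and introduces `dirhash_impl` for passing explicit `Filter` and `Protocol` instances. As orientation, a usage sketch of the API after that change (the directory path and patterns below are placeholders, and keyword defaults are assumed as per the docstrings):

```python
from dirhash import Filter, Protocol, dirhash, dirhash_impl, get_match_patterns

# Filtering and protocol options become flat keyword arguments ...
checksum = dirhash("path/to/directory", "sha256", match=["*.py"], empty_dirs=True)

# ... while dirhash_impl accepts explicit Filter/Protocol instances, so that
# custom implementations of the two classes can be supplied.
filter_ = Filter(match=get_match_patterns(match=["*.py"]), empty_dirs=True)
protocol = Protocol(entry_properties=["name", "data"], allow_cyclic_links=False)
checksum = dirhash_impl(
    "path/to/directory", "sha256", filter_=filter_, protocol=protocol
)
```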
From 9806bfbd102cc4ec639624cca9ddd7c0c2269ac2 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Sun, 19 Apr 2020 14:30:59 +0200 Subject: [PATCH 05/51] flatten args to dirhash, add dirhash_impl for passing filter_ and protocol implementations --- src/dirhash/__init__.py | 80 +++++++++++++++++++++-------- src/dirhash/cli.py | 54 ++++++++++---------- tests/test_cli.py | 46 ++++++++--------- tests/test_dirhash.py | 109 ++++++++++++++++++---------------------- 4 files changed, 156 insertions(+), 133 deletions(-) diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 599609e..934ad0e 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -28,6 +28,7 @@ 'algorithms_guaranteed', 'algorithms_available', 'dirhash', + 'dirhash_impl', 'included_paths', 'Filter', 'get_match_patterns', @@ -43,7 +44,40 @@ def dirhash( directory, algorithm, - filtering=None, + match=("*",), + ignore=None, + linked_dirs=True, + linked_files=True, + empty_dirs=False, + entry_properties=('name', 'data'), + allow_cyclic_links=False, + chunk_size=2**20, + jobs=1 +): + filter_ = Filter( + match=get_match_patterns(match=match, ignore=ignore), + linked_dirs=linked_dirs, + linked_files=linked_files, + empty_dirs=empty_dirs + ) + protocol = Protocol( + entry_properties=entry_properties, + allow_cyclic_links=allow_cyclic_links + ) + return dirhash_impl( + directory=directory, + algorithm=algorithm, + filter_=filter_, + protocol=protocol, + chunk_size=chunk_size, + jobs=jobs + ) + + +def dirhash_impl( + directory, + algorithm, + filter_=None, protocol=None, chunk_size=2**20, jobs=1 @@ -56,7 +90,7 @@ def dirhash( `dirhash.algorithms_available` for the available options. It is also possible to provide a callable object that returns an instance implementing the `hashlib._hashlib.HASH` interface. - filtering: Optional[Union[dirhash.Filter, Dict[str, str]]] - An instance of + filter_: Optional[Union[dirhash.Filter, Dict[str, str]]] - An instance of dirhash.Filter or a dictionary of keyword arguments for the same. Determines what paths within the `directory` to include when computing the hash value. Default `None`, which means that all files and @@ -115,9 +149,15 @@ def dirhash( See https://github.com/andhus/dirhash/README.md for a formal description of how the returned hash value is computed. """ - - filter_ = _get_instance('filtering', filtering, Filter) - protocol = _get_instance('protocol', protocol, Protocol) + def get_instance(value, cls_, argname): + if isinstance(value, cls_): + return value + if value is None: + return cls_() + raise TypeError('{} must be an instance of {} or None'.format(argname, cls_)) + + filter_ = get_instance(filter_, Filter, 'filter_') + protocol = get_instance(protocol, Protocol, 'protocol') hasher_factory = _get_hasher_factory(algorithm) def dir_apply(dir_node): @@ -194,8 +234,12 @@ def file_apply(path): def included_paths( directory, - filtering=None, - protocol=None + match=("*",), + ignore=None, + linked_dirs=True, + linked_files=True, + empty_dirs=False, + allow_cyclic_links=False, ): """Inspect what paths are included for the corresponding arguments to the `dirhash.dirhash` function. @@ -209,8 +253,13 @@ def included_paths( List[str] - A sorted list of the paths that would be included when computing the hash of `directory` using `dirhash.dirhash` and the same arguments. 
""" - protocol = _get_instance('protocol', protocol, Protocol) - filter_ = _get_instance('filtering', filtering, Filter) + filter_ = Filter( + match=get_match_patterns(match=match, ignore=ignore), + linked_dirs=linked_dirs, + linked_files=linked_files, + empty_dirs=empty_dirs + ) + protocol = Protocol(allow_cyclic_links=allow_cyclic_links) leafpaths = scantree( directory, @@ -472,19 +521,6 @@ def _parmap(func, iterable, jobs=1): return results -def _get_instance(argname, instance_or_kwargs, cls): - if instance_or_kwargs is None: - return cls() - if isinstance(instance_or_kwargs, dict): - return cls(**instance_or_kwargs) - if isinstance(instance_or_kwargs, cls): - return instance_or_kwargs - raise TypeError( - 'argument {argname} must be an instance of, or kwargs for, ' - '{cls}'.format(argname=argname, cls=cls) - ) - - def _get_filehash(filepath, hasher_factory, chunk_size, cache=None): """Compute the hash of the given filepath. diff --git a/src/dirhash/cli.py b/src/dirhash/cli.py index faaf54d..365fee3 100644 --- a/src/dirhash/cli.py +++ b/src/dirhash/cli.py @@ -14,7 +14,7 @@ def main(): kwargs = get_kwargs(sys.argv[1:]) if kwargs.pop('list'): # kwargs below have no effect when listing - for k in ['algorithm', 'chunk_size', 'jobs']: + for k in ['algorithm', 'chunk_size', 'jobs', 'entry_properties']: kwargs.pop(k) for leafpath in dirhash.included_paths(**kwargs): print(leafpath) @@ -75,7 +75,7 @@ def get_kwargs(args): filter_options.add_argument( '-m', '--match', nargs='+', - default='*', + default=['*'], help=( 'String of match-patterns, separated by blank space. NOTE: patterns ' 'with an asterisk must be in quotes ("*") or the asterisk ' @@ -126,10 +126,12 @@ def get_kwargs(args): protocol_options.add_argument( '-p', '--properties', nargs='+', + dest='entry_properties', + default=['data', 'name'], help=( 'List of file/directory properties to include in the hash. Available ' 'properties are: {} and at least one of name and data must be ' - 'included. Default is [name data] which means that both the name/paths' + 'included. Default is [data name] which means that both the name/paths' ' and content (actual data) of files and directories will be included' ).format(list(dirhash.Protocol.EntryProperties.options)), metavar='' @@ -170,29 +172,29 @@ def get_kwargs(args): 'provided filtering options.' 
) - return preprocess_kwargs(vars(parser.parse_args(args))) - - -def preprocess_kwargs(kwargs): - match_kwargs = {} - for kwarg in ['match', 'ignore']: - match_kwargs[kwarg] = kwargs.pop(kwarg) - match_patterns = dirhash.get_match_patterns(**match_kwargs) - - filtering_kwargs = { - 'match': match_patterns, - 'linked_dirs': kwargs.pop('linked_dirs'), - 'linked_files': kwargs.pop('linked_files'), - 'empty_dirs': kwargs.pop('empty_dirs'), - } - protocol_kwargs = { - 'allow_cyclic_links': kwargs.pop('allow_cyclic_links'), - 'entry_properties': kwargs.pop('properties') or ["data", "name"] - } - kwargs['filtering'] = filtering_kwargs - kwargs['protocol'] = protocol_kwargs - - return kwargs + return vars(parser.parse_args(args)) + + +# def preprocess_kwargs(kwargs): +# match_kwargs = {} +# for kwarg in ['match', 'ignore']: +# match_kwargs[kwarg] = kwargs.pop(kwarg) +# match_patterns = dirhash.get_match_patterns(**match_kwargs) +# +# filtering_kwargs = { +# 'match': match_patterns, +# 'linked_dirs': kwargs.pop('linked_dirs'), +# 'linked_files': kwargs.pop('linked_files'), +# 'empty_dirs': kwargs.pop('empty_dirs'), +# } +# protocol_kwargs = { +# 'allow_cyclic_links': kwargs.pop('allow_cyclic_links'), +# 'entry_properties': kwargs.pop('properties') or ["data", "name"] +# } +# kwargs['filtering'] = filtering_kwargs +# kwargs['protocol'] = protocol_kwargs +# +# return kwargs if __name__ == '__main__': # pragma: no cover diff --git a/tests/test_cli.py b/tests/test_cli.py index e78754b..3886fb9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -88,38 +88,38 @@ class TestCLI(object): # Filtering options ( '. -a md5 -m "*" "!.*"', - {'filtering': {'match': ['*', '!.*']}} + {'match': ['*', '!.*']} ), ( '. -a md5 --match "d1/*" "d2/*" --ignore "*.txt"', - {'filtering': {'match': ['d1/*', 'd2/*', '!*.txt']}} + {'match': ['d1/*', 'd2/*'], 'ignore': ['*.txt']} ), ( '. -a md5 --empty-dirs', - {'filtering': {'empty_dirs': True}} + {'empty_dirs': True} ), ( '. -a md5 --no-linked-dirs', - {'filtering': {'linked_dirs': False}} + {'linked_dirs': False} ), ( '. -a md5 --no-linked-files', - {'filtering': {'linked_files': False}} + {'linked_files': False} ), # Protocol options ( '. -a md5 --allow-cyclic-links', - {'protocol': {'allow_cyclic_links': True}} + {'allow_cyclic_links': True} ), ( '. -a md5 --properties name', - {'protocol': {'entry_properties': ['name']}} + {'entry_properties': ['name']} ), ( '. 
-a md5 --properties name data', - {'protocol': {'entry_properties': ['name', 'data']}} + {'entry_properties': ['name', 'data']} ), # Implementation @@ -135,29 +135,23 @@ class TestCLI(object): ) def test_get_kwargs(self, argstring, non_default_kwargs): from dirhash.cli import get_kwargs - filter_kwargs = { + kwargs_expected = { + 'list': False, + 'directory': '.', + 'algorithm': 'md5', 'match': ['*'], + 'ignore': None, 'empty_dirs': False, 'linked_dirs': True, - 'linked_files': True - } - protocol_kwargs = { + 'linked_files': True, 'entry_properties': ['data', 'name'], - 'allow_cyclic_links': False - } - filter_kwargs.update(non_default_kwargs.pop('filtering', {})) - protocol_kwargs.update(non_default_kwargs.pop('protocol', {})) - kwargs = { - 'list': False, - 'directory': '.', - 'algorithm': 'md5', - 'filtering': filter_kwargs, - 'protocol': protocol_kwargs, + 'allow_cyclic_links': False, 'chunk_size': 2 ** 20, 'jobs': 1 } - kwargs.update(non_default_kwargs) - assert kwargs == get_kwargs(shlex.split(argstring)) + kwargs_expected.update(non_default_kwargs) + kwargs = get_kwargs(shlex.split(argstring)) + assert kwargs == kwargs_expected @pytest.mark.parametrize( 'description, argstrings, output', @@ -244,8 +238,8 @@ def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): ['', ' -p data', ' -p name'], [ {}, - {'protocol': {'entry_properties': ['data']}}, - {'protocol': {'entry_properties': ['name']}}, + {'entry_properties': ['data']}, + {'entry_properties': ['name']}, ], expected_hashes ): diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index c6617be..fa000b1 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -15,7 +15,7 @@ dirhash, algorithms_available, algorithms_guaranteed, - Protocol, _parmap, Filter) + Protocol, _parmap, Filter, dirhash_impl) from scantree import SymlinkRecursionError @@ -192,13 +192,13 @@ def test_symlinked_file(self): filepaths = included_paths( self.path_to('root'), - filtering={'linked_files': True} + linked_files=True ) assert filepaths == ['f1', 'f2'] filepaths = included_paths( self.path_to('root'), - filtering={'linked_files': False} + linked_files=False ) assert filepaths == ['f1'] @@ -216,13 +216,13 @@ def test_symlinked_dir(self): filepaths = included_paths( self.path_to('root'), - filtering={'linked_dirs': False} + linked_dirs=False ) assert filepaths == ['f1'] filepaths = included_paths( self.path_to('root'), - filtering={'linked_dirs': True} + linked_dirs=True ) assert filepaths == ['d1/f1', 'd1/f2', 'f1'] @@ -236,7 +236,7 @@ def test_cyclic_link(self): with pytest.raises(SymlinkRecursionError) as exc_info: included_paths( self.path_to('root'), - protocol={'allow_cyclic_links': False} + allow_cyclic_links=False ) assert exc_info.value.real_path == os.path.realpath(self.path_to('root')) assert exc_info.value.first_path == self.path_to('root/') @@ -245,7 +245,7 @@ def test_cyclic_link(self): filepaths = included_paths( self.path_to('root'), - protocol={'allow_cyclic_links': True} + allow_cyclic_links=True ) assert filepaths == ['d1/link_back/.'] @@ -270,7 +270,7 @@ def test_ignore_hidden_files(self): # with ignore filepaths = included_paths( self.path_to('root'), - filtering={'match': ['*', '!.*']} + match=['*', '!.*'] ) assert filepaths == ['.d2/f1', 'd1/f1', 'f1'] @@ -291,7 +291,7 @@ def test_exclude_hidden_dirs(self): # with ignore filepaths = included_paths( self.path_to('root'), - filtering={'match': ['*', '!.*/']} + match=['*', '!.*/'] ) assert filepaths == ['.f2', 'd1/.f2', 'd1/f1', 'f1'] @@ -312,7 +312,7 @@ 
def test_exclude_hidden_dirs_and_files(self): # using ignore filepaths = included_paths( self.path_to('root'), - filtering={'match': ['*', '!.*/', '!.*']} + match=['*', '!.*/', '!.*'] ) assert filepaths == ['d1/f1', 'f1'] @@ -332,7 +332,7 @@ def test_exclude_extensions(self): filepaths = included_paths( self.path_to('root'), - filtering={'match': ['*', '!*.skip1', '!*.skip2']} + match=['*', '!*.skip1', '!*.skip2'] ) assert filepaths == [ 'd1/f.txt', 'f', 'f.skip1.txt', 'f.skip1skip2', 'f.txt', 'fskip1'] @@ -348,7 +348,7 @@ def test_empty_dirs_include_vs_exclude(self): filepaths = included_paths( self.path_to('root'), - filtering={'empty_dirs': False} + empty_dirs=False ) assert filepaths == ['d1/f', 'd3/d31/f'] @@ -358,7 +358,7 @@ def test_empty_dirs_include_vs_exclude(self): filepaths = included_paths( self.path_to('root'), - filtering={'empty_dirs': True} + empty_dirs=True ) assert filepaths == ['d1/f', 'd2/.', 'd3/d31/f', 'd4/d41/.'] @@ -371,26 +371,22 @@ def test_empty_dirs_because_of_filter_include_vs_exclude(self): filepaths = included_paths( self.path_to('root'), - filtering={ - 'match': ['*', '!.*'], - 'empty_dirs': False - } + match=['*', '!.*'], + empty_dirs=False ) assert filepaths == ['d1/f'] # `include_empty=False` is default filepaths = included_paths( self.path_to('root'), - filtering={'match': ['*', '!.*']}, + match=['*', '!.*'], ) assert filepaths == ['d1/f'] filepaths = included_paths( self.path_to('root'), - filtering={ - 'match': ['*', '!.*'], - 'empty_dirs': True - } + match=['*', '!.*'], + empty_dirs=True ) assert filepaths == ['d1/f', 'd2/.'] @@ -402,28 +398,22 @@ def test_empty_dir_inclusion_not_affected_by_match(self): filepaths = included_paths( self.path_to('root'), - filtering={ - 'match': ['*', '!.*'], - 'empty_dirs': True - } + match=['*', '!.*'], + empty_dirs=True ) assert filepaths == ['.d2/.', 'd1/.'] filepaths = included_paths( self.path_to('root'), - filtering={ - 'match': ['*', '!.*/'], - 'empty_dirs': True - } + match=['*', '!.*/'], + empty_dirs=True ) assert filepaths == ['.d2/.', 'd1/.'] filepaths = included_paths( self.path_to('root'), - filtering={ - 'match': ['*', '!d1'], - 'empty_dirs': True - } + match=['*', '!d1'], + empty_dirs=True ) assert filepaths == ['.d2/.', 'd1/.'] @@ -485,7 +475,7 @@ def test_recursive_descriptor(self): empty_dirs_true = dirhash( self.path_to('root'), algorithm=IdentityHasher, - filtering={'empty_dirs': True} + empty_dirs=True ) assert empty_dirs_true == empty_dirs_true_expected @@ -504,7 +494,7 @@ def test_symlinked_file(self): ) root1_linked_files_false = dirhash_mp_comp( self.path_to('root1'), algorithm='md5', - filtering={'linked_files': False} + linked_files=False ) root2 = dirhash_mp_comp( @@ -531,12 +521,12 @@ def test_symlinked_dir(self): root1_linked_dirs_true = dirhash_mp_comp( self.path_to('root1'), algorithm='md5', - filtering={'linked_dirs': True} + linked_dirs=True ) root1_linked_dirs_false = dirhash_mp_comp( self.path_to('root1'), algorithm='md5', - filtering={'linked_dirs': False} + linked_dirs=False ) root2 = dirhash_mp_comp( self.path_to('root2'), algorithm='md5' @@ -569,7 +559,7 @@ def test_empty_root_include_empty(self): dirhash_ = dirhash_mp_comp( self.path_to('root'), 'sha256', - filtering={'empty_dirs': True} + empty_dirs=True ) expected_dirhash = hashlib.sha256(''.encode('utf-8')).hexdigest() assert dirhash_ == expected_dirhash @@ -582,10 +572,11 @@ def test_include_empty(self): args = (self.path_to('root'), 'sha256') dirhash_ = dirhash_mp_comp( *args, - filtering={'empty_dirs': False}) + 
empty_dirs=False + ) dirhash_empty = dirhash_mp_comp( *args, - filtering={'empty_dirs': True} + empty_dirs=True ) assert dirhash_ != dirhash_empty @@ -619,7 +610,7 @@ def test_data_only(self): dirhash_mp_comp( self.path_to(root), 'sha256', - protocol={'entry_properties': ['data']} + entry_properties=['data'] ) for root in ['root1', 'root2'] ] assert dhash1 == dhash2 @@ -640,7 +631,7 @@ def test_name_only(self): dirhash_mp_comp( self.path_to(root), 'sha256', - protocol={'entry_properties': ['name']} + entry_properties=['name'] ) for root in ['root1', 'root2'] ] assert dhash1 == dhash2 @@ -667,7 +658,7 @@ def test_is_link_property(self): dirhash_mp_comp( self.path_to(root), 'sha256', - protocol={'entry_properties': entry_properties} + entry_properties=entry_properties ) for root in ['root1', 'root2'] ] assert hash1 != hash2 @@ -680,14 +671,14 @@ def test_raise_on_not_at_least_one_of_name_and_data(self): dirhash_mp_comp( self.path_to('root1'), 'sha256', - protocol={'entry_properties': []} + entry_properties=[] ) with pytest.raises(ValueError): dirhash_mp_comp( self.path_to('root1'), 'sha256', - protocol={'entry_properties': ['is_link']} + entry_properties=['is_link'] ) def test_multiproc_speedup(self): @@ -793,7 +784,7 @@ def test_hash_cyclic_link_to_root(self): dirhash( self.path_to('root'), 'sha256', - protocol={'allow_cyclic_links': True} + allow_cyclic_links=True ) def test_hash_cyclic_link(self): @@ -802,26 +793,26 @@ def test_hash_cyclic_link(self): dirhash( self.path_to('root'), 'sha256', - protocol={'allow_cyclic_links': True} + allow_cyclic_links=True ) - def test_pass_filtering_instance(self): - self.mkdirs('root') - self.mkfile('root/f1', '') - dirhash(self.path_to('root'), 'sha256', filtering=Filter()) - - def test_pass_protocol_instance(self): - self.mkdirs('root') - self.mkfile('root/f1', '') - dirhash(self.path_to('root'), 'sha256', protocol=Protocol()) + # def test_pass_filtering_instance(self): + # self.mkdirs('root') + # self.mkfile('root/f1', '') + # dirhash(self.path_to('root'), 'sha256', filtering=Filter()) + # + # def test_pass_protocol_instance(self): + # self.mkdirs('root') + # self.mkfile('root/f1', '') + # dirhash(self.path_to('root'), 'sha256', protocol=Protocol()) def test_raise_on_wrong_type(self): self.mkdirs('root') self.mkfile('root/f1', '') with pytest.raises(TypeError): - dirhash(self.path_to('root'), 'sha256', filtering='') + dirhash_impl(self.path_to('root'), 'sha256', filter_='') with pytest.raises(TypeError): - dirhash(self.path_to('root'), 'sha256', protocol='') + dirhash_impl(self.path_to('root'), 'sha256', protocol='') class SlowHasher(object): From 53ed900ed9b737afe7e944860815c0cecc356b2e Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 20 Apr 2020 06:15:35 +0200 Subject: [PATCH 06/51] update docs according flat args --- src/dirhash/__init__.py | 175 +++++++++++++++++++++++++++------------- tests/test_dirhash.py | 5 +- 2 files changed, 123 insertions(+), 57 deletions(-) diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 934ad0e..6fd00d5 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -1,12 +1,5 @@ #!/usr/bin/env python -"""dirhash - a python module (and CLI) for hashing of file system directories. - -Provides the the following public functions and classes: -- `dirhash` -- `included_paths` -- `Filter` -- `get_match_patterns` -- `Protocol` +"""dirhash - a python library (and CLI) for hashing of file system directories. 
""" from __future__ import print_function, division @@ -54,6 +47,105 @@ def dirhash( chunk_size=2**20, jobs=1 ): + """Computes the hash of a directory based on its structure and content. + + # Arguments + directory: Union[str, pathlib.Path] - Path to the directory to hash. + algorithm: str - The name of the hashing algorithm to use. See + `dirhash.algorithms_available` for the available options. + match: Iterable[str] - An iterable of glob/wildcard match-patterns for paths + to include when computing the hash. Default is ["*"] which means that all + files and directories are matched. To e.g. only include python source + files, use: `match=["*.py"]`. See "Path Selection and Filtering" section + below for further details. + ignore: Optional[Iterable[str]] - An iterable of glob/wildcard match-patterns + for paths to ignore when computing the hash. Default `None` (no ignore + patterns). To e.g. exclude hidden files and directories use: + `ignore=[".*/", ".*"]`. See "Path Selection and Filtering" section below + for further details. + linked_dirs: bool - If `True` (default), follow symbolic links to other + *directories* and include these and their content in the hash + computation. + linked_files: bool - If `True` (default), include symbolic linked files in + the hash computation. + empty_dirs: bool - If `True`, include empty directories when computing the + hash. A directory is considered empty if it does not contain any files + that *matches provided matching criteria*. Default `False`, i.e. empty + directories are ignored (as is done in git version control). + entry_properties: Iterable[str] - A set (i.e. order does not matter) of the + file/directory properties to consider when computing the hash. Supported + properties are {"name", "data", "is_link"} where at least one of + "name" and "data" must be included. Default is ["name", "data"] which + means that the content (actual data) as well as the path relative to the + root `directory` of files will affect the hash value. See "Entry + Properties Interpretation" section below for further details.  + allow_cyclic_links: bool - If `False` (default) a `SymlinkRecursionError` is + raised on presence of cyclic symbolic links. If set to `True` the the + dirhash value for directory causing the cyclic link is replaced with the + hash function hexdigest of the relative path from the link to the target. + chunk_size: int - The number of bytes to read in one go from files while + being hashed. A too small size will slow down the processing and a larger + size consumes more working memory. Default 2**20 byte = 1 MiB. + jobs: int - The number of processes to use when computing the hash. + Default `1`, which means that a single (the main) process is used. NOTE + that using multiprocessing can significantly speed-up execution, see + `https://github.com/andhus/dirhash-python/benchmark` for further + details. + + # Returns + str - The hash/checksum as a string of the hexadecimal digits (the result of + `hexdigest` method of the hashlib._hashlib.HASH object corresponding to the + provided `algorithm`). + + # Raises + TypeError/ValueError: For incorrectly provided arguments. + SymlinkRecursionError: In case the `directory` contains symbolic links that + lead to (infinite) recursion and `allow_cyclic_links=False` (default). + + # Path Selection and Filtering + Provided glob/wildcard (".gitignore style") match-patterns determine what + paths within the `directory` to include when computing the hash value. 
Paths + *relative to the root `directory` (i.e. excluding the name of the root + directory itself) are matched against the patterns. + The `match` argument represents what should be *included* - as opposed + to `ignore` patterns for which matches are *excluded*. Using `ignore` is + just short for adding the same patterns to the `match` argument with the + prefix "!", i.e. the calls below are equivalent: + `dirhash(..., match=['*', '!<pattern>'])` + `dirhash(..., ignore=['<pattern>'])` + To validate which paths are included, call `dirhash.included_paths` with + the same values for the arguments: `match`, `ignore`, `linked_dirs`, + `linked_files` and `empty_dirs` to get a list of all paths that will be + included when computing the hash by this function. + + # Entry Properties Interpretation + - ["name", "data"] (Default) - The name as well as data is included. Due to + the recursive nature of the dirhash computation, "name" implies that the + path relative to the root `directory` of each file/directory affects the + computed hash value. + - ["data"] - Compute the hash only based on the data of files - + *not* their names or the names of their parent directories. NOTE that + the tree structure in which files are organized under the `directory` + root still influences the computed hash. As long as all files have + the same content and are organised the same way in relation to all + other files in the Directed Acyclic Graph representing the file-tree, + the hash will remain the same (but the "name of nodes" does not + matter). This option can e.g. be used to verify that data is + unchanged after renaming files (change extensions etc.). + - ["name"] - Compute the hash only based on the name and location of + files in the file tree under the `directory` root. This option can + e.g. be used to check if any files have been added/moved/removed, + ignoring the content of each file. + - "is_link" - if this option is added to any of the cases above the + hash value is also affected by whether a file or directory is a + symbolic link or not. NOTE: with this property added, the hash + will be different than without it even if there are no symbolic links + in the directory. + + # References + See https://github.com/andhus/dirhash/README.md for a formal + description of how the returned hash value is computed. + """ filter_ = Filter( match=get_match_patterns(match=match, ignore=ignore), linked_dirs=linked_dirs, linked_files=linked_files, empty_dirs=empty_dirs ) protocol = Protocol( entry_properties=entry_properties, allow_cyclic_links=allow_cyclic_links ) return dirhash_impl( directory=directory, algorithm=algorithm, filter_=filter_, protocol=protocol, chunk_size=chunk_size, jobs=jobs ) @@ -84,45 +176,20 @@ def dirhash_impl( ): """Computes the hash of a directory based on its structure and content. + In contrast to `dirhash.dirhash`, this function accepts custom implementations of + the `dirhash.Filter` and `dirhash.Protocol` classes. + # Arguments directory: Union[str, pathlib.Path] - Path to the directory to hash. algorithm: str - The name of the hashing algorithm to use. See `dirhash.algorithms_available` for the available options. It is also possible to provide a callable object that returns an instance implementing the `hashlib._hashlib.HASH` interface. - filter_: Optional[Union[dirhash.Filter, Dict[str, str]]] - An instance of - dirhash.Filter or a dictionary of keyword arguments for the same. - Determines what paths within the `directory` to include when computing - the hash value. Default `None`, which means that all files and - directories are included *except for empty directories*. - The `dirhash.Filter` supports glob/wildcard (".gitignore style") path - matching by the `match` argument. Paths *relative to the root `directory` - (i.e.
excluding the name of the root directory itself) are matched - against the provided patterns. For example, to include all files, - except for hidden ones use: `filtering={'match': ['*', '!.*']}` - (or the equivalent `filtering=Filter(match=['*', '!.*'])`). For - inspection and verification, you can pass the `filtering` argument to - `dirhash.included_paths` to get a list of all paths that would be - included when computing the hash value. - For further options and details, see `dirhash.Filter`. - protocol: Optional[Union[dirhash.Protocol, Dict[str, str]]] - An instance of - dirhash.Protocol or a dictionary of keyword arguments for the same. - Determines (mainly) what properties of files and directories to consider - when computing the hash value. Default `None`, which means that both the - name and content (actual data) of files and directories will be included. - To only hash the "file structure", as in the name of files and - directories and their location relative to the root `directory`, use: - `protocol={'entry_properties': ['name']}`. Contrary, to only hash the - data and ignoring the name of directories and files use - `protocol={'entry_properties': ['data']}`. NOTE that the tree structure - in which files are organized under the root `directory` still influences - the computed hash with this option. As longs as all files have the same - content and are organised the same way in relation to all other files in - the Directed Acyclic Graph representing the file tree, the hash will - remain the same (but the "name of nodes" does not matter). This option - can e.g. be used to verify that that data is unchanged after renaming - files (change extensions etc.). - For further options and details, see `dirhash.Protocol`. + filter_: dirhash.Filter - Determines what files and directories to include + when computing the hash. See docs of `dirhash.Filter` for further + details. + protocol: dirhash.Protocol - Determines (mainly) what properties of files and + directories to consider when computing the hash value. chunk_size: int - The number of bytes to read in one go from files while being hashed. A too small size will slow down the processing and a larger size consumes more working memory. Default 2**20 byte = 1 MiB. @@ -140,10 +207,8 @@ def dirhash_impl( # Raises TypeError/ValueError: For incorrectly provided arguments. SymlinkRecursionError: In case the `directory` contains symbolic links that - lead to (infinite) recursion and `protocol=None` (default) or - `protocol={'allow_cyclic_links': False}`. - To be able to hash directories with cyclic links use - `protocol={'allow_cyclic_links': True}`. + lead to (infinite) recursion and the protocol option `allow_cyclic_links` + is `False`. # References See https://github.com/andhus/dirhash/README.md for a formal @@ -246,12 +311,13 @@ def included_paths( # Arguments: This function accepts the following subset of the function `dirhash.dirhash` - arguments: `directory`, `filtering` and `protocol`, with the same meaning. - See docs of `dirhash.dirhash` for further details. + arguments: `directory`, `match`, `ignore`, `linked_dirs`, `linked_files`, + `empty_dirs` and `allow_cyclic_links`, *with the same interpretation*. See + docs of `dirhash.dirhash` for further details. # Returns List[str] - A sorted list of the paths that would be included when computing - the hash of `directory` using `dirhash.dirhash` and the same arguments. + the hash of the `directory` using `dirhash.dirhash` and the same arguments. 
""" filter_ = Filter( match=get_match_patterns(match=match, ignore=ignore), @@ -276,12 +342,11 @@ def included_paths( class Filter(RecursionFilter): - """Specification of what files and directories to include for the `dirhash` computation. # Arguments - match: Optional[List[str]] - A list of glob/wildcard (".gitignore style") + match: Iterable[str] - An iterable of glob/wildcard (".gitignore style") match patterns for selection of which files and directories to include. Paths *relative to the root `directory` (i.e. excluding the name of the root directory itself) are matched against the provided patterns. For @@ -297,9 +362,6 @@ class Filter(RecursionFilter): hash. A directory is considered empty if it does not contain any files that *matches provided matching criteria*. Default `False`, i.e. empty directories are ignored (as is done in git version control). - - NOTE: To inspection/verify which paths are included, pass an instance of this - class to `dirhash.included_paths`. """ def __init__( self, @@ -370,10 +432,13 @@ class Protocol(object): computing the `dirhash` value. # Arguments - entry_properties: List[str] - A combination of the supported properties + entry_properties: Iterable[str] - A combination of the supported properties {"name", "data", "is_link"} where at least one of "name" and "data" is included. Interpretation: - - ["name", "data"] (Default) - The name as well as data is included. + - ["name", "data"] (Default) - The name as well as data is included. Due + to the recursive nature of the dirhash computation, "name" implies + that the path relative to the root `directory` of each file/directory + affects the computed hash value. - ["data"] - Compute the hash only based on the data of files - *not* their names or the names of their parent directories. 
NOTE that the tree structure in which files are organized under the `directory` root still influences the computed hash. As long as all files have the same content and are organised the same way in relation to all other files in the Directed Acyclic Graph representing the file tree, the hash will remain the same (but the "name of nodes" does not matter). This option can e.g. be used to verify that data is unchanged after renaming files (change extensions etc.). - ["name"] - Compute the hash only based on the name and location of files in the file tree under the `directory` root. This option can e.g. be used to check if any files have been added/moved/removed, ignoring the content of each file. - "is_link" - if this option is added to any of the cases above the hash value is also affected by whether a file or directory is a symbolic link or not. NOTE: with this property added, the hash will be different than without it even if there are no symbolic links in the directory. diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index fa000b1..d8b97f6 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -277,6 +277,7 @@ def test_exclude_hidden_dirs(self): self.mkdirs('root/d1') self.mkdirs('root/.d2') + self.mkdirs('root/d1/.d1') self.mkfile('root/f1') self.mkfile('root/.f2') @@ -285,8 +286,8 @@ def test_exclude_hidden_dirs(self): self.mkfile('root/.d2/f1') # no ignore - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] + filepaths = included_paths(self.path_to('root'), empty_dirs=True) + assert filepaths == ['.d2/f1', '.f2', 'd1/.d1/.', 'd1/.f2', 'd1/f1', 'f1'] # with ignore filepaths = included_paths( From 1b500c11274a4f03b7beb7a4b5b6262c6248ebcc Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 20 Apr 2020 06:20:00 +0200 Subject: [PATCH 07/51] update README --- README.md | 6 +++--- src/dirhash/__init__.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ca6b06a..17e4108 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ pip install dirhash ``` Or directly from source: ```commandline -git clone git@github.com:andhus/dirhash-standard.git +git clone git@github.com:andhus/dirhash-python.git pip install dirhash/ ``` @@ -29,8 +29,8 @@ from dirhash import dirhash dirpath = "path/to/directory" dir_md5 = dirhash(dirpath, "md5") -pyfiles_md5 = dirhash(dirpath, "md5", filtering={"match": ["*.py"]}) -no_hidden_sha1 = dirhash(dirpath, "sha1", filtering={"match": ["!.*", "!.*/"]}) +pyfiles_md5 = dirhash(dirpath, "md5", match=["*.py"]) +no_hidden_sha1 = dirhash(dirpath, "sha1", ignore=[".*", ".*/"]) ``` CLI: ```commandline diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 6fd00d5..323e3f9 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -178,7 +178,7 @@ def dirhash_impl( In contrast to `dirhash.dirhash`, this function accepts custom implementations of the `dirhash.Filter` and `dirhash.Protocol` classes. - + # Arguments From 429f0b98779638937e0a959f41ef383e31be888b Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 20 Apr 2020 13:20:45 +0200 Subject: [PATCH 08/51] minor cleanup --- README.md | 9 ++++----- src/dirhash/__init__.py | 12 ++++++------ src/dirhash/cli.py | 35 +++++++---------------------------- 3 files changed, 17 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 17e4108..e9c938b 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,10 @@ [![codecov](https://codecov.io/gh/andhus/dirhash-python/branch/master/graph/badge.svg)](https://codecov.io/gh/andhus/dirhash-python) # dirhash -A lightweight python module and tool for computing the hash of any +A lightweight python module and CLI for computing the hash of any directory based on its files' structure and content. -- Supports any hashing algorithm of Python's built-in `hashlib` module -- `.gitignore` style "wildmatch" patterns for expressive filtering of files to include/exclude. +- Supports all hashing algorithms of Python's built-in `hashlib` module. +- Glob/wildcard (".gitignore style") path matching for expressive filtering of files to include/exclude.
- Multiprocessing for up to [6x speed-up](#performance) The hash is computed according to the [Dirhash Standard](https://github.com/andhus/dirhash), which is designed to allow for consistent and collision resistant generation/verification of directory hashes across implementations. @@ -68,7 +67,7 @@ and executing `hashlib` code. The main effort to boost performance is support for multiprocessing, where the reading and hashing is parallelized over individual files. -As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash/dirhash-python/cli.py) +As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/cli.py) with the shell command: `find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5`
diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 323e3f9..056336f 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -105,14 +105,14 @@ def dirhash( # Path Selection and Filtering Provided glob/wildcard (".gitignore style") match-patterns determine what paths within the `directory` to include when computing the hash value. Paths - *relative to the root `directory` (i.e. excluding the name of the directory - itself) are matched against the patterns. + *relative to the root `directory`* (i.e. excluding the name of the root + directory itself) are matched against the patterns. The `match` argument represent what should be *included* - as opposed - to `ignore` patterns for which matches are *excluded*. Using `ignore` is + to the `ignore` argument for which matches are *excluded*. Using `ignore` is just short for adding the same patterns to the `match` argument with the prefix "!", i.e. the calls bellow are equivalent: - `dirhash(..., match=['*', '!<pattern>'])` - `dirhash(..., ignore=['<pattern>'])` + `dirhash(..., match=["*", "!<pattern>"])` + `dirhash(..., ignore=["<pattern>"])` To validate which paths are included, call `dirhash.included_paths` with the same values for the arguments: `match`, `ignore`, `linked_dirs`, `linked_files` and `empty_dirs` to get a list of all paths that will be @@ -348,7 +348,7 @@ class Filter(RecursionFilter): # Arguments match: Iterable[str] - An iterable of glob/wildcard (".gitignore style") match patterns for selection of which files and directories to include. - Paths *relative to the root `directory` (i.e. excluding the name of the + Paths *relative to the root `directory`* (i.e. excluding the name of the root directory itself) are matched against the provided patterns. For example, to include all files, except for hidden ones use: `match=['*', '!.*']` Default `None` which is equivalent to `['*']`,
diff --git a/src/dirhash/cli.py b/src/dirhash/cli.py index 365fee3..06e4044 100644 --- a/src/dirhash/cli.py +++ b/src/dirhash/cli.py @@ -43,10 +43,11 @@ def get_kwargs(args): choices=dirhash.algorithms_available, default='md5', help=( - 'Hashing algorithm to use. Always available: {}. Additionally available ' - 'on current platform: {}. Note that the same algorithm may appear ' - 'multiple times in this set under different names (thanks to ' - 'OpenSSL) [https://docs.python.org/2/library/hashlib.html]'.format( + 'Hashing algorithm to use, by default "md5". Always available: {}. ' 'Additionally available on current platform: {}. 
Note that the same ' + 'algorithm may appear multiple times in this set under different names ' + '(thanks to OpenSSL) ' + '[https://docs.python.org/2/library/hashlib.html]'.format( sorted(dirhash.algorithms_guaranteed), sorted(dirhash.algorithms_available - dirhash.algorithms_guaranteed) ) @@ -77,7 +78,7 @@ def get_kwargs(args): nargs='+', default=['*'], help=( - 'String of match-patterns, separated by blank space. NOTE: patterns ' + 'One or several patterns for paths to include. NOTE: patterns ' 'with an asterisk must be in quotes ("*") or the asterisk ' 'preceded by an escape character (\*).' ), @@ -88,7 +89,7 @@ def get_kwargs(args): nargs='+', default=None, help=( - 'String of ignore-patterns, separated by blank space. NOTE: patterns ' + 'One or several patterns for paths to exclude. NOTE: patterns ' 'with an asterisk must be in quotes ("*") or the asterisk ' 'preceded by an escape character (\*).' ), @@ -175,27 +176,5 @@ def get_kwargs(args): return vars(parser.parse_args(args)) -# def preprocess_kwargs(kwargs): -# match_kwargs = {} -# for kwarg in ['match', 'ignore']: -# match_kwargs[kwarg] = kwargs.pop(kwarg) -# match_patterns = dirhash.get_match_patterns(**match_kwargs) -# -# filtering_kwargs = { -# 'match': match_patterns, -# 'linked_dirs': kwargs.pop('linked_dirs'), -# 'linked_files': kwargs.pop('linked_files'), -# 'empty_dirs': kwargs.pop('empty_dirs'), -# } -# protocol_kwargs = { -# 'allow_cyclic_links': kwargs.pop('allow_cyclic_links'), -# 'entry_properties': kwargs.pop('properties') or ["data", "name"] -# } -# kwargs['filtering'] = filtering_kwargs -# kwargs['protocol'] = protocol_kwargs -# -# return kwargs - - if __name__ == '__main__': # pragma: no cover main() From e15a37a7d7989e1bc28f5a6f4723ce176e30f8d6 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 20 Apr 2020 13:30:25 +0200 Subject: [PATCH 09/51] add results from new benchmark runs --- benchmark/results_v0.2.0.csv | 51 +++++ benchmark/results_v0.2.0.json | 402 ++++++++++++++++++++++++++++++++++ 2 files changed, 453 insertions(+) create mode 100644 benchmark/results_v0.2.0.csv create mode 100644 benchmark/results_v0.2.0.json diff --git a/benchmark/results_v0.2.0.csv b/benchmark/results_v0.2.0.csv new file mode 100644 index 0000000..0e783dc --- /dev/null +++ b/benchmark/results_v0.2.0.csv @@ -0,0 +1,51 @@ +,test_case,implementation,algorithm,workers,t_best,t_median,speed-up (median) +0,flat_8_128MB,shell reference,md5,1,2.079,2.083,1.0 +1,flat_8_128MB,dirhash_impl,md5,1,1.734,1.945,1.0709511568123393 +2,flat_8_128MB,dirhash_impl,md5,2,0.999,1.183,1.760777683854607 +3,flat_8_128MB,dirhash_impl,md5,4,0.711,0.728,2.8612637362637368 +4,flat_8_128MB,dirhash_impl,md5,8,0.504,0.518,4.021235521235521 +5,flat_1k_1MB,shell reference,md5,1,3.383,3.679,1.0 +6,flat_1k_1MB,dirhash_impl,md5,1,1.846,1.921,1.9151483602290473 +7,flat_1k_1MB,dirhash_impl,md5,2,1.137,1.158,3.1770293609671847 +8,flat_1k_1MB,dirhash_impl,md5,4,0.74,0.749,4.911882510013351 +9,flat_1k_1MB,dirhash_impl,md5,8,0.53,0.534,6.889513108614231 +10,flat_32k_32kB,shell reference,md5,1,13.827,18.213,1.0 +11,flat_32k_32kB,dirhash_impl,md5,1,13.655,13.808,1.3190179606025494 +12,flat_32k_32kB,dirhash_impl,md5,2,3.276,3.33,5.469369369369369 +13,flat_32k_32kB,dirhash_impl,md5,4,2.409,2.421,7.522924411400249 +14,flat_32k_32kB,dirhash_impl,md5,8,2.045,2.086,8.731064237775648 +15,nested_1k_1MB,shell reference,md5,1,3.284,3.332,1.0 +16,nested_1k_1MB,dirhash_impl,md5,1,1.717,1.725,1.9315942028985504 +17,nested_1k_1MB,dirhash_impl,md5,2,1.026,1.034,3.222437137330754 
+18,nested_1k_1MB,dirhash_impl,md5,4,0.622,0.633,5.263823064770932 +19,nested_1k_1MB,dirhash_impl,md5,8,0.522,0.529,6.29867674858223 +20,nested_32k_32kB,shell reference,md5,1,11.898,12.125,1.0 +21,nested_32k_32kB,dirhash_impl,md5,1,13.858,14.146,0.8571327583769263 +22,nested_32k_32kB,dirhash_impl,md5,2,2.781,2.987,4.059256779377302 +23,nested_32k_32kB,dirhash_impl,md5,4,1.894,1.92,6.315104166666667 +24,nested_32k_32kB,dirhash_impl,md5,8,1.55,1.568,7.732780612244897 +25,flat_8_128MB,shell reference,sha1,1,2.042,2.05,1.0 +26,flat_8_128MB,dirhash_impl,sha1,1,1.338,1.354,1.5140324963072376 +27,flat_8_128MB,dirhash_impl,sha1,2,0.79,0.794,2.5818639798488663 +28,flat_8_128MB,dirhash_impl,sha1,4,0.583,0.593,3.456998313659359 +29,flat_8_128MB,dirhash_impl,sha1,8,0.483,0.487,4.209445585215605 +30,flat_1k_1MB,shell reference,sha1,1,2.118,2.129,1.0 +31,flat_1k_1MB,dirhash_impl,sha1,1,1.39,1.531,1.3905943827563685 +32,flat_1k_1MB,dirhash_impl,sha1,2,0.925,0.932,2.2843347639484977 +33,flat_1k_1MB,dirhash_impl,sha1,4,0.614,0.629,3.384737678855326 +34,flat_1k_1MB,dirhash_impl,sha1,8,0.511,0.52,4.094230769230769 +35,flat_32k_32kB,shell reference,sha1,1,10.551,10.97,1.0 +36,flat_32k_32kB,dirhash_impl,sha1,1,4.663,4.76,2.304621848739496 +37,flat_32k_32kB,dirhash_impl,sha1,2,3.108,3.235,3.3910355486862445 +38,flat_32k_32kB,dirhash_impl,sha1,4,2.342,2.361,4.6463362981787375 +39,flat_32k_32kB,dirhash_impl,sha1,8,2.071,2.094,5.2387774594078325 +40,nested_1k_1MB,shell reference,sha1,1,2.11,2.159,1.0 +41,nested_1k_1MB,dirhash_impl,sha1,1,1.436,1.47,1.4687074829931972 +42,nested_1k_1MB,dirhash_impl,sha1,2,0.925,0.937,2.3041622198505864 +43,nested_1k_1MB,dirhash_impl,sha1,4,0.627,0.643,3.357698289269051 +44,nested_1k_1MB,dirhash_impl,sha1,8,0.516,0.527,4.096774193548386 +45,nested_32k_32kB,shell reference,sha1,1,3.982,7.147,1.0 +46,nested_32k_32kB,dirhash_impl,sha1,1,4.114,4.156,1.7196823869104911 +47,nested_32k_32kB,dirhash_impl,sha1,2,2.598,2.616,2.7320336391437308 +48,nested_32k_32kB,dirhash_impl,sha1,4,1.809,1.831,3.9033315128345167 +49,nested_32k_32kB,dirhash_impl,sha1,8,1.552,1.58,4.523417721518987 diff --git a/benchmark/results_v0.2.0.json b/benchmark/results_v0.2.0.json new file mode 100644 index 0000000..71a652b --- /dev/null +++ b/benchmark/results_v0.2.0.json @@ -0,0 +1,402 @@ +[ + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 2.079, + "t_median": 2.083 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.734, + "t_median": 1.945 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 0.999, + "t_median": 1.183 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.711, + "t_median": 0.728 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.504, + "t_median": 0.518 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 3.383, + "t_median": 3.679 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.846, + "t_median": 1.921 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 1.137, + "t_median": 1.158 + }, + { + "test_case": "flat_1k_1MB", + 
"implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.74, + "t_median": 0.749 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.53, + "t_median": 0.534 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 13.827, + "t_median": 18.213 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 13.655, + "t_median": 13.808 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 3.276, + "t_median": 3.33 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 2.409, + "t_median": 2.421 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 2.045, + "t_median": 2.086 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 3.284, + "t_median": 3.332 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.717, + "t_median": 1.725 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 1.026, + "t_median": 1.034 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.622, + "t_median": 0.633 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.522, + "t_median": 0.529 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 11.898, + "t_median": 12.125 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 13.858, + "t_median": 14.146 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 2.781, + "t_median": 2.987 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 1.894, + "t_median": 1.92 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 1.55, + "t_median": 1.568 + }, + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.042, + "t_median": 2.05 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.338, + "t_median": 1.354 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.79, + "t_median": 0.794 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.583, + "t_median": 0.593 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.483, + "t_median": 0.487 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.118, + "t_median": 2.129 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 
1.39, + "t_median": 1.531 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.925, + "t_median": 0.932 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.614, + "t_median": 0.629 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.511, + "t_median": 0.52 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 10.551, + "t_median": 10.97 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.663, + "t_median": 4.76 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 3.108, + "t_median": 3.235 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 2.342, + "t_median": 2.361 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 2.071, + "t_median": 2.094 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.11, + "t_median": 2.159 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.436, + "t_median": 1.47 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.925, + "t_median": 0.937 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.627, + "t_median": 0.643 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.516, + "t_median": 0.527 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 3.982, + "t_median": 7.147 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.114, + "t_median": 4.156 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 2.598, + "t_median": 2.616 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 1.809, + "t_median": 1.831 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 1.552, + "t_median": 1.58 + } +] \ No newline at end of file From 2d821b3800b193e85eaf5bb673b0a80e2bc6a6fe Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 20 Apr 2020 13:49:44 +0200 Subject: [PATCH 10/51] add back removed tests --- tests/test_dirhash.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index d8b97f6..77f2409 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -797,15 +797,15 @@ def test_hash_cyclic_link(self): allow_cyclic_links=True ) - # def test_pass_filtering_instance(self): - # self.mkdirs('root') - # self.mkfile('root/f1', '') - # dirhash(self.path_to('root'), 'sha256', filtering=Filter()) - # - # def test_pass_protocol_instance(self): - # self.mkdirs('root') - # self.mkfile('root/f1', '') - # 
dirhash(self.path_to('root'), 'sha256', protocol=Protocol()) + def test_pass_filtering_instance(self): + self.mkdirs('root') + self.mkfile('root/f1', '') + dirhash_impl(self.path_to('root'), 'sha256', filter_=Filter()) + + def test_pass_protocol_instance(self): + self.mkdirs('root') + self.mkfile('root/f1', '') + dirhash_impl(self.path_to('root'), 'sha256', protocol=Protocol()) def test_raise_on_wrong_type(self): self.mkdirs('root') From 1f2643b27f10b216c73d0875f7e0a281a9fdb744 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 20 Apr 2020 13:50:41 +0200 Subject: [PATCH 11/51] update changelog --- CHANGELOG.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dcb417a..50a06f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] NIL -## [0.2.0] - 2019-04-18 -Complies with [Dirhash Standard](https://github.com/andhus/dirhash) Version [0.1.0](https://github.com/andhus/dirhash/releases/0.1.0) +## [0.2.0] - 2019-04-20 +Complies with [Dirhash Standard](https://github.com/andhus/dirhash) Version [0.1.0](https://github.com/andhus/dirhash/releases/v0.1.0) ### Added - A first implementation based on the formalized [Dirhash Standard](https://github.com/andhus/dirhash). @@ -26,3 +26,7 @@ same directory**. the previous repository [github.com/andhus/dirhash](https://github.com/andhus/dirhash) which now contains the formal description of the Dirhash Standard. + +### Removed +- All support for the `.dirhashignore` file. This seemed superfluous, please file an +issue if you need this feature. From aa4cd7fccb1efc8ce00afbe6de47900728e689f5 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 20 Apr 2020 13:57:07 +0200 Subject: [PATCH 12/51] rename Filter arg match -> match_patterns to reflect Dirhash Standard --- src/dirhash/__init__.py | 8 ++++---- tests/test_dirhash.py | 6 +++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 056336f..cee8bd6 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -147,7 +147,7 @@ def dirhash( description of how the returned hash value is computed. """ filter_ = Filter( - match=get_match_patterns(match=match, ignore=ignore), + match_patterns=get_match_patterns(match=match, ignore=ignore), linked_dirs=linked_dirs, linked_files=linked_files, empty_dirs=empty_dirs @@ -320,7 +320,7 @@ def included_paths( the hash of the `directory` using `dirhash.dirhash` and the same arguments. 
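    For illustration, a sketch with hypothetical paths and patterns:

        from dirhash import included_paths

        # Lists the relative paths that a `dirhash` call with the same
        # arguments would include in the hash:
        included_paths("path/to/directory", match=["*.py"])
        # -> e.g. ["setup.py", "src/pkg/module.py"]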
""" filter_ = Filter( - match=get_match_patterns(match=match, ignore=ignore), + match_patterns=get_match_patterns(match=match, ignore=ignore), linked_dirs=linked_dirs, linked_files=linked_files, empty_dirs=empty_dirs @@ -365,7 +365,7 @@ class Filter(RecursionFilter): """ def __init__( self, - match=None, + match_patterns=None, linked_dirs=True, linked_files=True, empty_dirs=False @@ -373,7 +373,7 @@ def __init__( super(Filter, self).__init__( linked_dirs=linked_dirs, linked_files=linked_files, - match=match + match=match_patterns ) self.empty_dirs = empty_dirs diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index 77f2409..0111d78 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -15,7 +15,11 @@ dirhash, algorithms_available, algorithms_guaranteed, - Protocol, _parmap, Filter, dirhash_impl) + Protocol, + _parmap, + Filter, + dirhash_impl +) from scantree import SymlinkRecursionError From 054670271a8f693c7a3baa6d811cf338e2d5b5be Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Sun, 26 Apr 2020 13:36:23 +0200 Subject: [PATCH 13/51] add missing 'tree/master' in README links for in repo refs --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e9c938b..f006ade 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ and executing `hashlib` code. The main effort to boost performance is support for multiprocessing, where the reading and hashing is parallelized over individual files. -As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/cli.py) +As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/tree/master/cli.py) with the shell command: `find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` @@ -88,7 +88,7 @@ shell reference | nested_32k_32kB | 6.82 | -> 1.0 `dirhash` | nested_32k_32kB | 3.43 | 2.00 `dirhash`(8 workers)| nested_32k_32kB | 1.14 | **6.00** -The benchmark was run a MacBook Pro (2018), further details and source code [here](https://github.com/andhus/dirhash-python/benchmark). +The benchmark was run a MacBook Pro (2018), further details and source code [here](https://github.com/andhus/dirhash-python/tree/master/benchmark). ## Documentation -Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash/dirhash-python/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). \ No newline at end of file +Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash/dirhash-python/tree/master/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). \ No newline at end of file From ca5b744fa671c10a5dbaad9bbf505436830624bd Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Sun, 26 Apr 2020 13:40:52 +0200 Subject: [PATCH 14/51] try again fix README links for in repo refs --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f006ade..dc763ab 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ and executing `hashlib` code. The main effort to boost performance is support for multiprocessing, where the reading and hashing is parallelized over individual files. 
-As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/tree/master/cli.py) +As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/cli.py) with the shell command: `find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` @@ -91,4 +91,4 @@ shell reference | nested_32k_32kB | 6.82 | -> 1.0 The benchmark was run a MacBook Pro (2018), further details and source code [here](https://github.com/andhus/dirhash-python/tree/master/benchmark). ## Documentation -Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash/dirhash-python/tree/master/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). \ No newline at end of file +Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). \ No newline at end of file
From ede9939af1cf6979e213e4666baf971ed8f31846 Mon Sep 17 00:00:00 2001 From: JonathanArns Date: Mon, 27 Jul 2020 13:15:21 -0400 Subject: [PATCH 15/51] separate version into its own module to avoid calling pkg_resources.require --- setup.py | 9 ++++++--- src/dirhash/__init__.py | 3 ++- src/dirhash/version.py | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 src/dirhash/version.py
diff --git a/setup.py b/setup.py index 242919a..207052b 100644 --- a/setup.py +++ b/setup.py @@ -2,11 +2,14 @@ import os from setuptools import setup, find_packages -VERSION = '0.2.0' +PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) + +version = {} +with io.open(os.path.join(PROJECT_ROOT, "src", "dirhash", "version.py") as fp: + exec(fp.read(), version) DESCRIPTION = 'Python module and CLI for hashing of file system directories.'
-PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) try: with io.open(os.path.join(PROJECT_ROOT, 'README.md'), encoding='utf-8') as f: long_description = '\n' + f.read() @@ -15,7 +18,7 @@ setup( name='dirhash', - version=VERSION, + version=version['__version__'], description=DESCRIPTION, long_description=long_description, long_description_content_type="text/markdown",
diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index cee8bd6..807eec3 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -16,6 +16,8 @@ CyclicLinkedDir, ) +from dirhash.version import __version__ + __all__ = [ '__version__', 'algorithms_guaranteed', @@ -28,7 +30,6 @@ 'Protocol' ] -__version__ = pkg_resources.require("dirhash")[0].version algorithms_guaranteed = {'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'} algorithms_available = hashlib.algorithms_available
diff --git a/src/dirhash/version.py b/src/dirhash/version.py new file mode 100644 index 0000000..7fd229a --- /dev/null +++ b/src/dirhash/version.py @@ -0,0 +1 @@ +__version__ = '0.2.0'
From 33b506965400511255548d01db007549dfa340a1 Mon Sep 17 00:00:00 2001 From: JonathanArns Date: Wed, 29 Jul 2020 13:55:14 -0400 Subject: [PATCH 16/51] fix typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 207052b..6d16240 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) version = {} -with io.open(os.path.join(PROJECT_ROOT, "src", "dirhash", "version.py") as fp: +with io.open(os.path.join(PROJECT_ROOT, "src", "dirhash", "version.py")) as fp: exec(fp.read(), version) DESCRIPTION = 'Python module and CLI for hashing of file system directories.'
From 347dc276f03c6fa9501d4414320cd7c64265502f Mon Sep 17 00:00:00 2001 From: "Keller Fabian Rudolf (CC-AD/EYC3)" Date: Thu, 20 Aug 2020 13:50:50 +0200 Subject: [PATCH 17/51] fix broken character --- src/dirhash/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index cee8bd6..b8520f5 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -78,7 +78,7 @@ def dirhash( "name" and "data" must be included. Default is ["name", "data"] which means that the content (actual data) as well as the path relative to the root `directory` of files will affect the hash value. See "Entry - Properties Interpretation" section below for further details.  + Properties Interpretation" section below for further details. allow_cyclic_links: bool - If `False` (default) a `SymlinkRecursionError` is raised on presence of cyclic symbolic links. If set to `True` the the dirhash value for directory causing the cyclic link is replaced with the
From 37c89746520a8fa9961f657e40bfc7bd77c6be25 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Wed, 26 Aug 2020 21:42:43 +0200 Subject: [PATCH 18/51] bump version -> 0.2.1 --- src/dirhash/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dirhash/version.py b/src/dirhash/version.py index 7fd229a..fc79d63 100644 --- a/src/dirhash/version.py +++ b/src/dirhash/version.py @@ -1 +1 @@ -__version__ = '0.2.0' +__version__ = '0.2.1'
From fdf0415860d04fcefe072a4677826d624c6e1349 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 30 Mar 2022 23:28:48 +0300 Subject: [PATCH 19/51] remove unused "import pkg_resources" line 1. It breaks the code if `setuptools` package is not installed and 2.
It is also unused, since you're getting the version directly from a separate file: ``` from dirhash.version import __version__ ``` --- src/dirhash/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 161768c..f24f698 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -5,7 +5,6 @@ import os import hashlib -import pkg_resources from functools import partial from multiprocessing import Pool From 7d4d75b92bcca791dd4486637e0cf15a239fc18d Mon Sep 17 00:00:00 2001 From: Scott K Logan Date: Thu, 17 Nov 2022 13:42:09 -0800 Subject: [PATCH 20/51] Allow passing positional arguments through tox to pytest --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index a819805..31da9a6 100644 --- a/tox.ini +++ b/tox.ini @@ -5,5 +5,5 @@ envlist = py27,py37 deps = pytest-cov commands = - py.test --cov-report=xml --cov-config=.coveragerc --cov=dirhash tests/ + py.test --cov-report=xml --cov-config=.coveragerc --cov=dirhash tests/ {posargs} coverage report From c2f4f53da789aba2f105b892bd66921df8cfc4a7 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 15 Jan 2024 22:54:23 +0100 Subject: [PATCH 21/51] removes travis, adds github workflow for running tox based tests --- .github/workflows/test.yml | 48 ++++++++++++++++++++++++++++++++++++++ .travis.yml | 23 ------------------ tox.ini | 14 ++++++++--- 3 files changed, 59 insertions(+), 26 deletions(-) create mode 100644 .github/workflows/test.yml delete mode 100644 .travis.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..436c7d8 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,48 @@ +name: Run tests + +on: + push: + branches: + - "master" + pull_request: + branches: + - "*" + workflow_dispatch: + release: + types: [published, edited] + +jobs: + tests: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox tox-gh-actions + - name: Cache tox environments + id: cache-tox + uses: actions/cache@v1 + with: + path: .tox + # setup.py and setup.cfg have versioning info that would impact the + # tox environment. hashFiles only takes a single file path or pattern + # at the moment. 
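+          # As a workaround, setup.py and setup.cfg are hashed separately
+          # and both digests are combined in the cache key below.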
+ key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} + - name: Test with tox + run: tox + - uses: codecov/codecov-action@v3 + env: + token: ${{ secrets.CODECOV_TOKEN }} + with: + verbose: true diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 82dceb6..0000000 --- a/.travis.yml +++ /dev/null @@ -1,23 +0,0 @@ -language: python -dist: xenial - -install: - - pip install tox codecov - -matrix: - include: - - python: 2.7 - env: TOXENV=py27 - - python: 3.7 - env: TOXENV=py37 - -script: - - tox - -after_success: - - codecov - -notifications: - email: - on_success: never - on_failure: always diff --git a/tox.ini b/tox.ini index a819805..afa3d43 100644 --- a/tox.ini +++ b/tox.ini @@ -1,9 +1,17 @@ [tox] -envlist = py27,py37 +envlist = py{38,39,310,311,312} [testenv] deps = + pytest pytest-cov commands = - py.test --cov-report=xml --cov-config=.coveragerc --cov=dirhash tests/ - coverage report + pytest --cov=scantree --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc tests/ + +[gh-actions] +python = + 3.8: py38 + 3.9: py39 + 3.10: py310 + 3.11: py311 + 3.12: py312 From c96bd0c837d403cc379b5b1984daff6a354f2b03 Mon Sep 17 00:00:00 2001 From: Filip Richtarik Date: Wed, 3 Apr 2024 14:52:32 +0200 Subject: [PATCH 22/51] fixes --- .github/workflows/test.yml | 4 ++-- setup.py | 2 +- src/dirhash/cli.py | 4 ++-- tests/test_dirhash.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 436c7d8..1d58b6e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,7 +32,7 @@ jobs: python -m pip install tox tox-gh-actions - name: Cache tox environments id: cache-tox - uses: actions/cache@v1 + uses: actions/cache@v4 with: path: .tox # setup.py and setup.cfg have versioning info that would impact the @@ -41,7 +41,7 @@ jobs: key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} - name: Test with tox run: tox - - uses: codecov/codecov-action@v3 + - uses: codecov/codecov-action@v4 env: token: ${{ secrets.CODECOV_TOKEN }} with: diff --git a/setup.py b/setup.py index 6d16240..29493d7 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ author="Anders Huss", author_email="andhus@kth.se", license='MIT', - install_requires=['scantree>=0.0.1'], + install_requires=['scantree>=0.0.2', 'pathspec<=0.10.0'], packages=find_packages('src'), package_dir={'': 'src'}, include_package_data=True, diff --git a/src/dirhash/cli.py b/src/dirhash/cli.py index 06e4044..89f8308 100644 --- a/src/dirhash/cli.py +++ b/src/dirhash/cli.py @@ -80,7 +80,7 @@ def get_kwargs(args): help=( 'One or several patterns for paths to include. NOTE: patterns ' 'with an asterisk must be in quotes ("*") or the asterisk ' - 'preceded by an escape character (\*).' + 'preceded by an escape character (`*).' ), metavar='' ) @@ -91,7 +91,7 @@ def get_kwargs(args): help=( 'One or several patterns for paths to exclude. NOTE: patterns ' 'with an asterisk must be in quotes ("*") or the asterisk ' - 'preceded by an escape character (\*).' + 'preceded by an escape character (`*).' 
), metavar='' ) diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index 0111d78..1d70523 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -132,7 +132,7 @@ def test_ignore_extensions(self): class TempDirTest(object): - def setup(self): + def setup_method(self): self.dir = tempfile.mkdtemp() def tear_down(self): From c44d935ff77d5bfab0f234d5cc78730eb8847b5c Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 8 Apr 2024 22:13:59 +0200 Subject: [PATCH 23/51] allows available algorithm to be missing --- tests/test_dirhash.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index 1d70523..6316961 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -42,9 +42,19 @@ def test_get_guaranteed(self): def test_get_available(self): for algorithm in algorithms_available: hasher_factory = _get_hasher_factory(algorithm) - hasher = hasher_factory() - assert hasattr(hasher, 'update') - assert hasattr(hasher, 'hexdigest') + try: + hasher = hasher_factory() + except ValueError as exc: + # Some "available" algorithms are not necessarily available (fails for e.g. + # 'ripemd160' in github actions for python 3.8). See: + # https://stackoverflow.com/questions/72409563/unsupported-hash-type-ripemd160-with-hashlib-in-python # noqa + print(f"Failed to create hasher for {algorithm}: {exc}") + assert exc.args[0] == f"unsupported hash type {algorithm}" + hasher = None + + if hasher is not None: + assert hasattr(hasher, 'update') + assert hasattr(hasher, 'hexdigest') def test_not_available(self): with pytest.raises(ValueError): From 1a33f31e6de4d736476fe1d71740f836c0576113 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 8 Apr 2024 22:40:24 +0200 Subject: [PATCH 24/51] cleans up and increases margin (wait time) in multiprocessing tests --- tests/test_dirhash.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index 6316961..f082392 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -145,7 +145,7 @@ class TempDirTest(object): def setup_method(self): self.dir = tempfile.mkdtemp() - def tear_down(self): + def teardown_method(self): if os.path.exists(self.dir): shutil.rmtree(self.dir) @@ -703,19 +703,19 @@ def test_multiproc_speedup(self): for i in range(num_files): self.mkfile('root/file_{}'.format(i), '< one chunk content') - expected_min_elapsed = SlowHasher.wait_time * num_files + expected_min_elapsed_sequential = SlowHasher.wait_time * num_files start = time() dirhash(self.path_to('root'), algorithm=SlowHasher) end = time() elapsed_sequential = end - start - assert elapsed_sequential > expected_min_elapsed + assert elapsed_sequential > expected_min_elapsed_sequential start = time() dirhash(self.path_to('root'), algorithm=SlowHasher, jobs=num_files) end = time() elapsed_muliproc = end - start - assert elapsed_muliproc < expected_min_elapsed + assert elapsed_muliproc < 0.9 * expected_min_elapsed_sequential # just check "any speedup", the overhead varies (and is high on Travis) def test_cache_by_real_path_speedup(self, tmpdir): @@ -729,13 +729,13 @@ def test_cache_by_real_path_speedup(self, tmpdir): file_i.write('< one chunk content', ensure=True) wait_time = SlowHasher.wait_time - expected_min_elapsed = wait_time * num_links + expected_min_elapsed_no_links = wait_time * num_links start = time() dirhash(root1, algorithm=SlowHasher) end = time() - elapsed_sequential = end - 
start - assert elapsed_sequential > expected_min_elapsed - overhead = elapsed_sequential - expected_min_elapsed + elapsed_no_links = end - start + assert elapsed_no_links > expected_min_elapsed_no_links + overhead = elapsed_no_links - expected_min_elapsed_no_links # all links to same file root2 = tmpdir.join('root2') @@ -746,13 +746,13 @@ def test_cache_by_real_path_speedup(self, tmpdir): root2.join('link_{}'.format(i)).mksymlinkto(target_file) overhead_margin_factor = 1.5 - expected_max_elapsed = overhead * overhead_margin_factor + wait_time - assert expected_max_elapsed < expected_min_elapsed + expected_max_elapsed_with_links = overhead * overhead_margin_factor + wait_time + assert expected_max_elapsed_with_links < expected_min_elapsed_no_links start = time() dirhash(root2, algorithm=SlowHasher) end = time() - elapsed_cache = end - start - assert elapsed_cache < expected_max_elapsed + elapsed_with_links = end - start + assert elapsed_with_links < expected_max_elapsed_with_links def test_cache_together_with_multiprocess_speedup(self, tmpdir): target_file_names = ['target_file_1', 'target_file_2'] @@ -768,13 +768,13 @@ def test_cache_together_with_multiprocess_speedup(self, tmpdir): jobs = 2 wait_time = SlowHasher.wait_time - expected_min_elapsed = wait_time * num_links / jobs + expected_min_elapsed_no_links = wait_time * num_links / jobs start = time() dirhash(root1, algorithm=SlowHasher, jobs=jobs) end = time() - elapsed_sequential = end - start - assert elapsed_sequential > expected_min_elapsed - overhead = elapsed_sequential - expected_min_elapsed + elapsed_no_links = end - start + assert elapsed_no_links > expected_min_elapsed_no_links + overhead = elapsed_no_links - expected_min_elapsed_no_links root2 = tmpdir.join('root2') root2.ensure(dir=True) @@ -785,13 +785,13 @@ def test_cache_together_with_multiprocess_speedup(self, tmpdir): root2.join('link_{}_{}'.format(i, j)).mksymlinkto(target_file) overhead_margin_factor = 1.5 - expected_max_elapsed = overhead * overhead_margin_factor + wait_time * 2 - assert expected_max_elapsed < expected_min_elapsed + expected_max_elapsed_with_links = overhead * overhead_margin_factor + wait_time * 2 + assert expected_max_elapsed_with_links < expected_min_elapsed_no_links start = time() dirhash(root2, algorithm=SlowHasher, jobs=jobs) end = time() - elapsed_mp_cache = end - start - assert elapsed_mp_cache < expected_max_elapsed + elapsed_mp_with_links = end - start + assert elapsed_mp_with_links < expected_max_elapsed_with_links def test_hash_cyclic_link_to_root(self): self.mkdirs('root/d1') @@ -831,7 +831,7 @@ def test_raise_on_wrong_type(self): class SlowHasher(object): - wait_time = 0.05 + wait_time = 0.25 def __init__(self, *args, **kwargs): pass From 1cd629bee4b56cbc6806e9a815fe087ab8957e18 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Mon, 8 Apr 2024 23:01:07 +0200 Subject: [PATCH 25/51] clarify pathspec limit --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 29493d7..7132af5 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ author="Anders Huss", author_email="andhus@kth.se", license='MIT', - install_requires=['scantree>=0.0.2', 'pathspec<=0.10.0'], + install_requires=['scantree>=0.0.2', 'pathspec<0.10.0'], packages=find_packages('src'), package_dir={'': 'src'}, include_package_data=True, From 9e432e3190056c43a9dacaa894f25c40e11ca4af Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 9 Apr 2024 17:42:57 +0200 Subject: [PATCH 26/51] removes travis badge --- README.md | 1 - 1 file 
changed, 1 deletion(-) diff --git a/README.md b/README.md index dc763ab..613e5b6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ -[![Build Status](https://travis-ci.com/andhus/dirhash-python.svg?branch=master)](https://travis-ci.com/andhus/dirhash-python) [![codecov](https://codecov.io/gh/andhus/dirhash-python/branch/master/graph/badge.svg)](https://codecov.io/gh/andhus/dirhash-python) # dirhash From 8f7a961f9d9964aa96bea6057aa1c134d054a751 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 9 Apr 2024 17:50:18 +0200 Subject: [PATCH 27/51] adds codecov.yml --- .github/workflows/test.yml | 2 +- codecov.yml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 codecov.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1d58b6e..06d7679 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,7 +41,7 @@ jobs: key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} - name: Test with tox run: tox - - uses: codecov/codecov-action@v4 + - uses: codecov/codecov-action@v3 env: token: ${{ secrets.CODECOV_TOKEN }} with: diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..1dde27a --- /dev/null +++ b/codecov.yml @@ -0,0 +1,6 @@ +coverage: + status: + project: + default: + target: 100% # the required coverage value + threshold: 5% # the leniency in hitting the target From 05877434e4ce4455f3e8de8f1901fbb0ca9ef318 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 9 Apr 2024 17:54:02 +0200 Subject: [PATCH 28/51] try update --cov to correct repo --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index cd10b68..efbaf71 100644 --- a/tox.ini +++ b/tox.ini @@ -6,7 +6,7 @@ deps = pytest pytest-cov commands = - pytest --cov=dirhash --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc tests/ {posargs} + pytest --cov=dirhash-python --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc tests/ {posargs} [gh-actions] python = From 25a59de4fabd5170eed0aeab6dc1939ecc1fc517 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 9 Apr 2024 17:58:05 +0200 Subject: [PATCH 29/51] revert try update --cov to correct repo --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index efbaf71..cd10b68 100644 --- a/tox.ini +++ b/tox.ini @@ -6,7 +6,7 @@ deps = pytest pytest-cov commands = - pytest --cov=dirhash-python --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc tests/ {posargs} + pytest --cov=dirhash --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc tests/ {posargs} [gh-actions] python = From 4dabf11d4753f3088f1a4b5db53d136c3454fd35 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 9 Apr 2024 22:00:49 +0200 Subject: [PATCH 30/51] fix token --- .github/workflows/test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 06d7679..cf893b4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -42,7 +42,6 @@ jobs: - name: Test with tox run: tox - uses: codecov/codecov-action@v3 - env: - token: ${{ secrets.CODECOV_TOKEN }} with: + token: ${{ secrets.CODECOV_TOKEN }} verbose: true From bd4890d3d572c30a16f4feaf70d7fb3fa18a6a8c Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 9 Apr 2024 22:36:17 +0200 Subject: [PATCH 31/51] adds versioneer --- .coveragerc | 1 + .gitattributes | 1 + pyproject.toml | 3 + setup.cfg 
| 7 + setup.py | 8 +- src/dirhash/__init__.py | 3 +- src/dirhash/_version.py | 683 ++++++++++++++++++++++++++++++++++++++++ src/dirhash/version.py | 1 - 8 files changed, 700 insertions(+), 7 deletions(-) create mode 100644 .gitattributes create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 src/dirhash/_version.py delete mode 100644 src/dirhash/version.py diff --git a/.coveragerc b/.coveragerc index 05b56af..cea0409 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,3 +1,4 @@ [run] branch = True source = dirhash +omit = _version.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..4ab54a1 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +src/dirhash/_version.py export-subst diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..31e196f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools", "versioneer==0.29"] +build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..31b58fe --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[versioneer] +VCS = git +style = pep440 +versionfile_source = src/dirhash/_version.py +versionfile_build = dirhash/_version.py +tag_prefix = +parentdir_prefix = dirhash- diff --git a/setup.py b/setup.py index 7132af5..bb98d2b 100644 --- a/setup.py +++ b/setup.py @@ -2,11 +2,9 @@ import os from setuptools import setup, find_packages -PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) +import versioneer -version = {} -with io.open(os.path.join(PROJECT_ROOT, "src", "dirhash", "version.py")) as fp: - exec(fp.read(), version) +PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) DESCRIPTION = 'Python module and CLI for hashing of file system directories.' @@ -18,7 +16,7 @@ setup( name='dirhash', - version=version['__version__'], + version=versioneer.get_version(), description=DESCRIPTION, long_description=long_description, long_description_content_type="text/markdown", diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index f24f698..2b0d8b5 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -15,7 +15,8 @@ CyclicLinkedDir, ) -from dirhash.version import __version__ +from . import _version +__version__ = _version.get_versions()['version'] __all__ = [ '__version__', diff --git a/src/dirhash/_version.py b/src/dirhash/_version.py new file mode 100644 index 0000000..36c109b --- /dev/null +++ b/src/dirhash/_version.py @@ -0,0 +1,683 @@ + +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. +# Generated by versioneer-0.29 +# https://github.com/python-versioneer/python-versioneer + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys +from typing import Any, Callable, Dict, List, Optional, Tuple +import functools + + +def get_keywords() -> Dict[str, str]: + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
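+    # (This substitution is enabled by the "export-subst" attribute set for
+    # this file in .gitattributes; in a regular checkout the "$Format:...$"
+    # placeholders below remain unexpanded.)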
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + VCS: str + style: str + tag_prefix: str + parentdir_prefix: str + versionfile_source: str + verbose: bool + + +def get_config() -> VersioneerConfig: + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "" + cfg.parentdir_prefix = "dirhash-" + cfg.versionfile_source = "src/dirhash/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} + + +def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f: Callable) -> Callable: + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command( + commands: List[str], + args: List[str], + cwd: Optional[str] = None, + verbose: bool = False, + hide_stderr: bool = False, + env: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[str], Optional[int]]: + """Call the given command(s).""" + assert isinstance(commands, list) + process = None + + popen_kwargs: Dict[str, Any] = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: + try: + dispcmd = str([command] + args) + # remember shell=False, so use git.cmd on windows, not just git + process = subprocess.Popen([command] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None), **popen_kwargs) + break + except OSError as e: + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, process.returncode + return stdout, process.returncode + + +def versions_from_parentdir( + parentdir_prefix: str, + root: str, + verbose: bool, +) -> Dict[str, Any]: + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. 
We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords: Dict[str, str] = {} + try: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords( + keywords: Dict[str, str], + tag_prefix: str, + verbose: bool, +) -> Dict[str, Any]: + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". 
+ tags = {r for r in refs if re.search(r'\d', r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r'\d', r): + continue + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs( + tag_prefix: str, + root: str, + verbose: bool, + runner: Callable = run_command +) -> Dict[str, Any]: + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=not verbose) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner(GITS, [ + "describe", "--tags", "--dirty", "--always", "--long", + "--match", f"{tag_prefix}[[:digit:]]*" + ], cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces: Dict[str, Any] = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], + cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. 
+ branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. + branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparsable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces: Dict[str, Any]) -> str: + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces: Dict[str, Any]) -> str: + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces: Dict[str, Any]) -> str: + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). 
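+
+    For example (hypothetical pieces): closest-tag "1.0", distance 2,
+    short "abc1234", clean tree, on branch "feature/x" renders as
+    "1.0.dev0+2.gabc1234".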
+ + Exceptions: + 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: + """Split pep440 version string at the post-release segment. + + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces: Dict[str, Any]) -> str: + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + if pieces["distance"]: + # update the post release segment + tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + rendered = tag_version + if post_version is not None: + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) + else: + rendered += ".post0.dev%d" % (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces: Dict[str, Any]) -> str: + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces: Dict[str, Any]) -> str: + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+    return rendered
+
+
+def render_git_describe(pieces: Dict[str, Any]) -> str:
+    """TAG[-DISTANCE-gHEX][-dirty].
+
+    Like 'git describe --tags --dirty --always'.
+
+    Exceptions:
+    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"]:
+            rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        rendered = pieces["short"]
+    if pieces["dirty"]:
+        rendered += "-dirty"
+    return rendered
+
+
+def render_git_describe_long(pieces: Dict[str, Any]) -> str:
+    """TAG-DISTANCE-gHEX[-dirty].
+
+    Like 'git describe --tags --dirty --always --long'.
+    The distance/hash is unconditional.
+
+    Exceptions:
+    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        rendered = pieces["short"]
+    if pieces["dirty"]:
+        rendered += "-dirty"
+    return rendered
+
+
+def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
+    """Render the given version pieces into the requested style."""
+    if pieces["error"]:
+        return {"version": "unknown",
+                "full-revisionid": pieces.get("long"),
+                "dirty": None,
+                "error": pieces["error"],
+                "date": None}
+
+    if not style or style == "default":
+        style = "pep440"  # the default
+
+    if style == "pep440":
+        rendered = render_pep440(pieces)
+    elif style == "pep440-branch":
+        rendered = render_pep440_branch(pieces)
+    elif style == "pep440-pre":
+        rendered = render_pep440_pre(pieces)
+    elif style == "pep440-post":
+        rendered = render_pep440_post(pieces)
+    elif style == "pep440-post-branch":
+        rendered = render_pep440_post_branch(pieces)
+    elif style == "pep440-old":
+        rendered = render_pep440_old(pieces)
+    elif style == "git-describe":
+        rendered = render_git_describe(pieces)
+    elif style == "git-describe-long":
+        rendered = render_git_describe_long(pieces)
+    else:
+        raise ValueError("unknown style '%s'" % style)
+
+    return {"version": rendered, "full-revisionid": pieces["long"],
+            "dirty": pieces["dirty"], "error": None,
+            "date": pieces.get("date")}
+
+
+def get_versions() -> Dict[str, Any]:
+    """Get version information or return default if unable to do so."""
+    # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
+    # __file__, we can work backwards from there to the root. Some
+    # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
+    # case we can only use expanded keywords.
+
+    cfg = get_config()
+    verbose = cfg.verbose
+
+    try:
+        return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
+                                          verbose)
+    except NotThisMethod:
+        pass
+
+    try:
+        root = os.path.realpath(__file__)
+        # versionfile_source is the relative path from the top of the source
+        # tree (where the .git directory might live) to this file. Invert
+        # this to find the root from __file__.
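+        # E.g. with versionfile_source "src/dirhash/_version.py" (three
+        # components), the loop below strips three levels:
+        #   .../src/dirhash/_version.py -> .../src/dirhash -> .../src -> root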
+ for _ in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/src/dirhash/version.py b/src/dirhash/version.py deleted file mode 100644 index fc79d63..0000000 --- a/src/dirhash/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '0.2.1' From 0b8aea3185f9c612e8c7e551500d10c6d554ac97 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 9 Apr 2024 22:49:23 +0200 Subject: [PATCH 32/51] adds publish.yml based on scantree setup --- .github/workflows/publish.yml | 128 ++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..d4b35ed --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,128 @@ +# Based on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/# +name: Publish Python Package + +on: + push: + tags: + - 'v[0-9]+.[0-9]+.[0-9]*' + +jobs: + build: + name: Build distribution + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + # NOTE: tags are not present unless triggered by tag push + # - name: Get tags + # run: git fetch --tags origin + # - name: List tags + # run: git tag --list + # TODO: somehow versioneer does not pickup the tag when workflow is not triggered by a + # tag push, getting e.g. (for sister repo scantree) scantree-0+untagged.1.gd74b1d5, + # see: https://github.com/andhus/scantree/actions/runs/7485873305/job/20375116541#step:7:42) + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.x" + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v3 + with: + name: python-package-distributions + path: dist/ + + publish-to-pypi: + name: Publish to PyPI + # TODO we need to make sure the tag matches the version! 
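+    # One possible guard (hypothetical, not part of this workflow): fail the
+    # job unless the built version matches the pushed tag, e.g.
+    #   test "v$(python3 setup.py --version)" = "${{ github.ref_name }}"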
+ if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + needs: + - build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/dirhash + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v3 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + github-release: + name: Sign and upload to GitHub Release + needs: + - publish-to-pypi + runs-on: ubuntu-latest + + permissions: + contents: write # IMPORTANT: mandatory for making GitHub Releases + id-token: write # IMPORTANT: mandatory for sigstore + + steps: + - name: Download all the dists + uses: actions/download-artifact@v3 + with: + name: python-package-distributions + path: dist/ + - name: Sign the dists with Sigstore + uses: sigstore/gh-action-sigstore-python@v1.2.3 + with: + inputs: >- + ./dist/*.tar.gz + ./dist/*.whl + - name: Create GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + run: >- + gh release create + '${{ github.ref_name }}' + --repo '${{ github.repository }}' + --notes "" + - name: Upload artifact signatures to GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + # Upload to GitHub Release using the `gh` CLI. + # `dist/` contains the built packages, and the + # sigstore-produced signatures and certificates. + run: >- + gh release upload + '${{ github.ref_name }}' dist/** + --repo '${{ github.repository }}' + + publish-to-testpypi: + name: Publish to TestPyPI + if: startsWith(github.ref, 'refs/tags/') # only publish on tag pushes + needs: + - build + runs-on: ubuntu-latest + + environment: + name: testpypi + url: https://test.pypi.org/p/dirhash + + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v3 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ From 89f7241a1bd8eb8ce9af7ce3b29a1778080c0c07 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 9 Apr 2024 22:52:58 +0200 Subject: [PATCH 33/51] adds back v prefix --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 31b58fe..4c05ed2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,5 +3,5 @@ VCS = git style = pep440 versionfile_source = src/dirhash/_version.py versionfile_build = dirhash/_version.py -tag_prefix = +tag_prefix = v parentdir_prefix = dirhash- From fc7b60628e08646084105d59eae0b1afed5cc53e Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 9 Apr 2024 23:29:50 +0200 Subject: [PATCH 34/51] reinits versioneer and adds missing cmdclass to setup.py --- setup.py | 1 + src/dirhash/_version.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bb98d2b..5461af4 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,7 @@ setup( name='dirhash', version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), description=DESCRIPTION, long_description=long_description, long_description_content_type="text/markdown", diff --git a/src/dirhash/_version.py b/src/dirhash/_version.py index 36c109b..d5278f4 100644 --- a/src/dirhash/_version.py +++ b/src/dirhash/_version.py @@ -51,7 +51,7 @@ def get_config() -> VersioneerConfig: cfg = VersioneerConfig() cfg.VCS = "git" 
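+    # Release tags now carry a "v" prefix (see tag_prefix below), so a tag
+    # "v0.2.1" resolves to version "0.2.1".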
cfg.style = "pep440" - cfg.tag_prefix = "" + cfg.tag_prefix = "v" cfg.parentdir_prefix = "dirhash-" cfg.versionfile_source = "src/dirhash/_version.py" cfg.verbose = False From 611a15046c3f02d32b9dee492a187dbda9f79936 Mon Sep 17 00:00:00 2001 From: Filip Richtarik Date: Wed, 10 Apr 2024 16:51:01 +0200 Subject: [PATCH 35/51] init --- .github/workflows/publish.yml | 12 +++++----- .github/workflows/test.yml | 2 +- setup.py | 2 +- tests/test_dirhash.py | 45 ++++++++++++++++++++++++++++++----- 4 files changed, 47 insertions(+), 14 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d4b35ed..edb34d3 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -22,7 +22,7 @@ jobs: # tag push, getting e.g. (for sister repo scantree) scantree-0+untagged.1.gd74b1d5, # see: https://github.com/andhus/scantree/actions/runs/7485873305/job/20375116541#step:7:42) - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.x" - name: Install pypa/build @@ -34,7 +34,7 @@ jobs: - name: Build a binary wheel and a source tarball run: python3 -m build - name: Store the distribution packages - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: python-package-distributions path: dist/ @@ -54,7 +54,7 @@ jobs: steps: - name: Download all the dists - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: python-package-distributions path: dist/ @@ -73,12 +73,12 @@ jobs: steps: - name: Download all the dists - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: python-package-distributions path: dist/ - name: Sign the dists with Sigstore - uses: sigstore/gh-action-sigstore-python@v1.2.3 + uses: sigstore/gh-action-sigstore-python@v2.1.1 with: inputs: >- ./dist/*.tar.gz @@ -118,7 +118,7 @@ jobs: steps: - name: Download all the dists - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: python-package-distributions path: dist/ diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index cf893b4..9859117 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,7 +41,7 @@ jobs: key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} - name: Test with tox run: tox - - uses: codecov/codecov-action@v3 + - uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} verbose: true diff --git a/setup.py b/setup.py index 5461af4..cda3381 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ author="Anders Huss", author_email="andhus@kth.se", license='MIT', - install_requires=['scantree>=0.0.2', 'pathspec<0.10.0'], + install_requires=['scantree>=0.0.2'], packages=find_packages('src'), package_dir={'': 'src'}, include_package_data=True, diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index f082392..336bf38 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -51,7 +51,7 @@ def test_get_available(self): print(f"Failed to create hasher for {algorithm}: {exc}") assert exc.args[0] == f"unsupported hash type {algorithm}" hasher = None - + if hasher is not None: assert hasattr(hasher, 'update') assert hasattr(hasher, 'hexdigest') @@ -267,7 +267,7 @@ def test_cyclic_link(self): with pytest.raises(SymlinkRecursionError): filepaths = included_paths(self.path_to('root')) - def test_ignore_hidden_files(self): + def test_ignore_hidden(self): self.mkdirs('root/d1') 
self.mkdirs('root/.d2') @@ -282,12 +282,45 @@ def test_ignore_hidden_files(self): assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] # with ignore - filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*'] - ) + filepaths = included_paths(self.path_to('root'), match=['*', '!.*']) + assert filepaths == ['d1/f1', 'f1'] + + def test_ignore_hidden_files_only(self): + self.mkdirs('root/d1') + self.mkdirs('root/.d2') + + self.mkfile('root/f1') + self.mkfile('root/.f2') + self.mkfile('root/d1/f1') + self.mkfile('root/d1/.f2') + self.mkfile('root/.d2/f1') + + # no ignore + filepaths = included_paths(self.path_to('root')) + assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] + + # with ignore + filepaths = included_paths(self.path_to('root'), match=['**/*', '!**/.*', '**/.*/*', '!**/.*/.*']) assert filepaths == ['.d2/f1', 'd1/f1', 'f1'] + def test_ignore_hidden_explicitly_recursive(self): + self.mkdirs('root/d1') + self.mkdirs('root/.d2') + + self.mkfile('root/f1') + self.mkfile('root/.f2') + self.mkfile('root/d1/f1') + self.mkfile('root/d1/.f2') + self.mkfile('root/.d2/f1') + + # no ignore + filepaths = included_paths(self.path_to('root')) + assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] + + # with ignore + filepaths = included_paths(self.path_to('root'), match=['*', '!**/.*']) + assert filepaths == ['d1/f1', 'f1'] + def test_exclude_hidden_dirs(self): self.mkdirs('root/d1') self.mkdirs('root/.d2') From a99c0545bf29c16e5e1351218c4f7c36ed74fb00 Mon Sep 17 00:00:00 2001 From: Filip Richtarik Date: Thu, 11 Apr 2024 10:03:07 +0200 Subject: [PATCH 36/51] formatting init --- .github/workflows/publish.yml | 162 +++---- .github/workflows/test.yml | 102 +++- .gitignore | 4 +- .pre-commit-config.yaml | 26 ++ CHANGELOG.md | 27 +- README.md | 61 ++- benchmark/README.md | 29 +- benchmark/results.json | 802 ++++++++++++++++---------------- benchmark/results_v0.2.0.json | 802 ++++++++++++++++---------------- benchmark/run.py | 151 +++--- setup.py | 27 +- src/dirhash/__init__.py | 177 +++---- src/dirhash/_version.py | 163 ++++--- src/dirhash/cli.py | 178 +++---- tests/test_cli.py | 301 ++++++------ tests/test_dirhash.py | 847 ++++++++++++++++------------------ tox.ini | 2 + 17 files changed, 1929 insertions(+), 1932 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index edb34d3..350681a 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -4,7 +4,7 @@ name: Publish Python Package on: push: tags: - - 'v[0-9]+.[0-9]+.[0-9]*' + - "v[0-9]+.[0-9]+.[0-9]*" jobs: build: @@ -12,101 +12,101 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - # NOTE: tags are not present unless triggered by tag push - # - name: Get tags - # run: git fetch --tags origin - # - name: List tags - # run: git tag --list - # TODO: somehow versioneer does not pickup the tag when workflow is not triggered by a - # tag push, getting e.g. 
(for sister repo scantree) scantree-0+untagged.1.gd74b1d5, - # see: https://github.com/andhus/scantree/actions/runs/7485873305/job/20375116541#step:7:42) - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.x" - - name: Install pypa/build - run: >- - python3 -m - pip install - build - --user - - name: Build a binary wheel and a source tarball - run: python3 -m build - - name: Store the distribution packages - uses: actions/upload-artifact@v4 - with: - name: python-package-distributions - path: dist/ + - uses: actions/checkout@v4 + # NOTE: tags are not present unless triggered by tag push + # - name: Get tags + # run: git fetch --tags origin + # - name: List tags + # run: git tag --list + # TODO: somehow versioneer does not pickup the tag when workflow is not triggered by a + # tag push, getting e.g. (for sister repo scantree) scantree-0+untagged.1.gd74b1d5, + # see: https://github.com/andhus/scantree/actions/runs/7485873305/job/20375116541#step:7:42) + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ publish-to-pypi: name: Publish to PyPI # TODO we need to make sure the tag matches the version! - if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes needs: - - build + - build runs-on: ubuntu-latest environment: name: pypi url: https://pypi.org/p/dirhash permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing + id-token: write # IMPORTANT: mandatory for trusted publishing steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution 📦 to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 github-release: name: Sign and upload to GitHub Release needs: - - publish-to-pypi + - publish-to-pypi runs-on: ubuntu-latest permissions: - contents: write # IMPORTANT: mandatory for making GitHub Releases - id-token: write # IMPORTANT: mandatory for sigstore + contents: write # IMPORTANT: mandatory for making GitHub Releases + id-token: write # IMPORTANT: mandatory for sigstore steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Sign the dists with Sigstore - uses: sigstore/gh-action-sigstore-python@v2.1.1 - with: - inputs: >- - ./dist/*.tar.gz - ./dist/*.whl - - name: Create GitHub Release - env: - GITHUB_TOKEN: ${{ github.token }} - run: >- - gh release create - '${{ github.ref_name }}' - --repo '${{ github.repository }}' - --notes "" - - name: Upload artifact signatures to GitHub Release - env: - GITHUB_TOKEN: ${{ github.token }} - # Upload to GitHub Release using the `gh` CLI. - # `dist/` contains the built packages, and the - # sigstore-produced signatures and certificates. 
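+        # After the signing step, dist/ holds the built archives (e.g. a
+        # hypothetical dirhash-0.2.2.tar.gz and matching .whl) plus their
+        # signature files, all matched by the dist/** glob below.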
- run: >- - gh release upload - '${{ github.ref_name }}' dist/** - --repo '${{ github.repository }}' + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Sign the dists with Sigstore + uses: sigstore/gh-action-sigstore-python@v2.1.1 + with: + inputs: >- + ./dist/*.tar.gz + ./dist/*.whl + - name: Create GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + run: >- + gh release create + '${{ github.ref_name }}' + --repo '${{ github.repository }}' + --notes "" + - name: Upload artifact signatures to GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + # Upload to GitHub Release using the `gh` CLI. + # `dist/` contains the built packages, and the + # sigstore-produced signatures and certificates. + run: >- + gh release upload + '${{ github.ref_name }}' dist/** + --repo '${{ github.repository }}' publish-to-testpypi: name: Publish to TestPyPI - if: startsWith(github.ref, 'refs/tags/') # only publish on tag pushes + if: startsWith(github.ref, 'refs/tags/') # only publish on tag pushes needs: - - build + - build runs-on: ubuntu-latest environment: @@ -114,15 +114,15 @@ jobs: url: https://test.pypi.org/p/dirhash permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing + id-token: write # IMPORTANT: mandatory for trusted publishing steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution 📦 to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - repository-url: https://test.pypi.org/legacy/ + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9859117..fd81a08 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,7 +13,6 @@ on: jobs: tests: - runs-on: ubuntu-latest strategy: fail-fast: false @@ -21,27 +20,80 @@ jobs: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install tox tox-gh-actions - - name: Cache tox environments - id: cache-tox - uses: actions/cache@v4 - with: - path: .tox - # setup.py and setup.cfg have versioning info that would impact the - # tox environment. hashFiles only takes a single file path or pattern - # at the moment. - key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} - - name: Test with tox - run: tox - - uses: codecov/codecov-action@v4 - with: - token: ${{ secrets.CODECOV_TOKEN }} - verbose: true + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox tox-gh-actions + - name: Cache tox environments + id: cache-tox + uses: actions/cache@v4 + with: + path: .tox + # setup.py and setup.cfg have versioning info that would impact the + # tox environment. 
hashFiles only takes a single file path or pattern + # at the moment. + key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} + - name: Test with tox + run: tox + - uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true + + # Always run this last as it can push new changes and actions will not rerun. + pre-commit: + needs: [tests] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.8" + + - name: Install PreCommit + run: pip install pre-commit + + - uses: actions/cache@v4 + with: + path: ~/.cache/pre-commit + key: ${{ runner.os }}-pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} + restore-keys: | + ${{ runner.os }}-pre-commit- + + - name: PreCommit + id: pre-commit + run: | + if pre-commit run --show-diff-on-failure --color=always --all-files; then + echo "failed=0" >> $GITHUB_OUTPUT + else + echo "failed=1" >> $GITHUB_OUTPUT + fi + if [ -n "$(git status -s)" ]; then + echo "dirty=1" >> $GITHUB_OUTPUT + else + echo "dirty=0" >> $GITHUB_OUTPUT + fi + + # Run a second time to verify that everything has indeed been fixed. + - name: PreCommit verify + if: steps.pre-commit.outputs.failed == 1 + run: | + pre-commit run --show-diff-on-failure --color=always --all-files + + - name: Commit and Push to Pull Request + if: steps.pre-commit.outputs.dirty == 1 + run: | + git add . + git status + git commit -m "✨ ⭐ Automated commit has been added to your pull request to fix formatting! ⭐ ✨" + git push origin ${{ github.head_ref }} diff --git a/.gitignore b/.gitignore index bd70d88..b4973ea 100644 --- a/.gitignore +++ b/.gitignore @@ -106,6 +106,8 @@ venv.bak/ # Pycharm .idea/ +# VSC +.vscode/ # Project specific -benchmark/test_cases/* \ No newline at end of file +benchmark/test_cases/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..5da4da3 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: + - repo: https://github.com/psf/black + rev: 24.3.0 + hooks: + - id: black + args: ["--target-version", "py38"] + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + args: ["--profile", "black"] + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.1.0 + hooks: + - id: prettier + args: [--prose-wrap=preserve, --print-width=90] + - repo: https://github.com/pycqa/flake8 + rev: 7.0.0 + hooks: + - id: flake8 + args: ["--max-line-length=90", "--extend-ignore=E203,W503"] + - repo: https://github.com/asottile/pyupgrade + rev: v3.15.2 + hooks: + - id: pyupgrade + args: ["--py38-plus"] diff --git a/CHANGELOG.md b/CHANGELOG.md index 50a06f1..ca27faa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,27 +6,32 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] + NIL ## [0.2.0] - 2019-04-20 + Complies with [Dirhash Standard](https://github.com/andhus/dirhash) Version [0.1.0](https://github.com/andhus/dirhash/releases/v0.1.0) ### Added + - A first implementation based on the formalized [Dirhash Standard](https://github.com/andhus/dirhash). - This changelog. - Results form a new benchmark run after changes. 
The `benchmark/run.py` now outputs results files which names include the `dirhash.__version__`. ### Changed -- **Significant breaking changes** from version 0.1.1 - both regarding API and the -underlying method/protocol for computing the hash. This means that **hashes -computed with this version will differ from hashes computed with version < 0.2.0 for -same directory**. -- This dirhash python implementation has moved to here -[github.com/andhus/dirhash-python](https://github.com/andhus/dirhash-python) from -the previous repository -[github.com/andhus/dirhash](https://github.com/andhus/dirhash) -which now contains the formal description of the Dirhash Standard. + +- **Significant breaking changes** from version 0.1.1 - both regarding API and the + underlying method/protocol for computing the hash. This means that **hashes + computed with this version will differ from hashes computed with version < 0.2.0 for + same directory**. +- This dirhash python implementation has moved to here + [github.com/andhus/dirhash-python](https://github.com/andhus/dirhash-python) from + the previous repository + [github.com/andhus/dirhash](https://github.com/andhus/dirhash) + which now contains the formal description of the Dirhash Standard. ### Removed -- All support for the `.dirhashignore` file. This seemed superfluous, please file an -issue if you need this feature. + +- All support for the `.dirhashignore` file. This seemed superfluous, please file an + issue if you need this feature. diff --git a/README.md b/README.md index 613e5b6..7efd153 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ [![codecov](https://codecov.io/gh/andhus/dirhash-python/branch/master/graph/badge.svg)](https://codecov.io/gh/andhus/dirhash-python) # dirhash + A lightweight python module and CLI for computing the hash of any directory based on its files' structure and content. + - Supports all hashing algorithms of Python's built-in `hashlib` module. - Glob/wildcard (".gitignore style") path matching for expressive filtering of files to include/exclude. - Multiprocessing for up to [6x speed-up](#performance) @@ -10,18 +12,24 @@ directory based on its files' structure and content. The hash is computed according to the [Dirhash Standard](https://github.com/andhus/dirhash), which is designed to allow for consistent and collision resistant generation/verification of directory hashes across implementations. ## Installation + From PyPI: + ```commandline pip install dirhash ``` + Or directly from source: + ```commandline git clone git@github.com:andhus/dirhash-python.git pip install dirhash/ ``` ## Usage + Python module: + ```python from dirhash import dirhash @@ -30,7 +38,9 @@ dir_md5 = dirhash(dirpath, "md5") pyfiles_md5 = dirhash(dirpath, "md5", match=["*.py"]) no_hidden_sha1 = dirhash(dirpath, "sha1", ignore=[".*", ".*/"]) ``` + CLI: + ```commandline dirhash path/to/directory -a md5 dirhash path/to/directory -a md5 --match "*.py" @@ -38,56 +48,59 @@ dirhash path/to/directory -a sha1 --ignore ".*" ".*/" ``` ## Why? + If you (or your application) need to verify the integrity of a set of files as well -as their name and location, you might find this useful. Use-cases range from -verification of your image classification dataset (before spending GPU-$$$ on +as their name and location, you might find this useful. 
Use-cases range from +verification of your image classification dataset (before spending GPU-$$$ on training your fancy Deep Learning model) to validation of generated files in regression-testing. -There isn't really a standard way of doing this. There are plenty of recipes out +There isn't really a standard way of doing this. There are plenty of recipes out there (see e.g. these SO-questions for [linux](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents) and [python](https://stackoverflow.com/questions/24937495/how-can-i-calculate-a-hash-for-a-filesystem-directory-using-python)) -but I couldn't find one that is properly tested (there are some gotcha:s to cover!) -and documented with a compelling user interface. `dirhash` was created with this as +but I couldn't find one that is properly tested (there are some gotcha:s to cover!) +and documented with a compelling user interface. `dirhash` was created with this as the goal. -[checksumdir](https://github.com/cakepietoast/checksumdir) is another python +[checksumdir](https://github.com/cakepietoast/checksumdir) is another python module/tool with similar intent (that inspired this project) but it lacks much of the functionality offered here (most notably including file names/structure in the hash) and lacks tests. ## Performance + The python `hashlib` implementation of common hashing algorithms are highly -optimised. `dirhash` mainly parses the file tree, pipes data to `hashlib` and -combines the output. Reasonable measures have been taken to minimize the overhead -and for common use-cases, the majority of time is spent reading data from disk +optimised. `dirhash` mainly parses the file tree, pipes data to `hashlib` and +combines the output. Reasonable measures have been taken to minimize the overhead +and for common use-cases, the majority of time is spent reading data from disk and executing `hashlib` code. The main effort to boost performance is support for multiprocessing, where the reading and hashing is parallelized over individual files. -As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/cli.py) +As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/cli.py) with the shell command: -`find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` +`find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` -which is the top answer for the SO-question: +which is the top answer for the SO-question: [Linux: compute a single hash for a given folder & contents?](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents) -Results for two test cases are shown below. Both have 1 GiB of random data: in -"flat_1k_1MB", split into 1k files (1 MiB each) in a flat structure, and in -"nested_32k_32kB", into 32k files (32 KiB each) spread over the 256 leaf directories +Results for two test cases are shown below. Both have 1 GiB of random data: in +"flat_1k_1MB", split into 1k files (1 MiB each) in a flat structure, and in +"nested_32k_32kB", into 32k files (32 KiB each) spread over the 256 leaf directories in a binary tree of depth 8. 
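+The multi-worker rows in the table below come from parallel hashing over
+individual files. As a sketch (assuming the `-j`/`--jobs` flag of this CLI
+version), they can be reproduced with:
+
+```commandline
+dirhash path/to/folder -a md5 -j 8
+```
+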
-Implementation | Test Case | Time (s) | Speed up -------------------- | --------------- | -------: | -------: -shell reference | flat_1k_1MB | 2.29 | -> 1.0 -`dirhash` | flat_1k_1MB | 1.67 | 1.36 -`dirhash`(8 workers)| flat_1k_1MB | 0.48 | **4.73** -shell reference | nested_32k_32kB | 6.82 | -> 1.0 -`dirhash` | nested_32k_32kB | 3.43 | 2.00 -`dirhash`(8 workers)| nested_32k_32kB | 1.14 | **6.00** +| Implementation | Test Case | Time (s) | Speed up | +| -------------------- | --------------- | -------: | -------: | +| shell reference | flat_1k_1MB | 2.29 | -> 1.0 | +| `dirhash` | flat_1k_1MB | 1.67 | 1.36 | +| `dirhash`(8 workers) | flat_1k_1MB | 0.48 | **4.73** | +| shell reference | nested_32k_32kB | 6.82 | -> 1.0 | +| `dirhash` | nested_32k_32kB | 3.43 | 2.00 | +| `dirhash`(8 workers) | nested_32k_32kB | 1.14 | **6.00** | The benchmark was run a MacBook Pro (2018), further details and source code [here](https://github.com/andhus/dirhash-python/tree/master/benchmark). ## Documentation -Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). \ No newline at end of file + +Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). diff --git a/benchmark/README.md b/benchmark/README.md index 52bb5c1..0fb9d61 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -2,32 +2,31 @@ As a reference, the performance of `dirhash` is benchmarked against the shell command: -`find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` +`find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` -(top answer for the SO-question: +(top answer for the SO-question: [Linux: compute a single hash for a given folder & contents?](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents)) Each test case contains 1 GiB of random data, split equally into 8, 1k or 32k files, in a flat or nested (binary tree of depth 8) structure. -For a fair comparison, *the CLI version* of `dirhash` was used (including startup +For a fair comparison, _the CLI version_ of `dirhash` was used (including startup time for loading of python modules etc.). -For full details/reproducibility see/run the `run.py` script for which the output is -found in `results.csv`. These results were generated on a MacBook Pro (2018): +For full details/reproducibility see/run the `run.py` script for which the output is +found in `results.csv`. 
These results were generated on a MacBook Pro (2018): + - 2,2 GHz Intel Core i7 (`sysctl -n hw.physicalcpu hw.logicalcpu`-> 6, 12) - 16 GB 2400 MHz DDR4 - APPLE SSD AP0512M - - ## Sample results: -Implementation | Test Case | Time (s) | Speed up -------------------- | --------------- | -------: | -------: -shell reference | flat_1k_1MB | 2.29 | -> 1.0 -`dirhash` | flat_1k_1MB | 1.67 | 1.36 -`dirhash`(8 workers)| flat_1k_1MB | 0.48 | **4.73** -shell reference | nested_32k_32kB | 6.82 | -> 1.0 -`dirhash` | nested_32k_32kB | 3.43 | 2.00 -`dirhash`(8 workers)| nested_32k_32kB | 1.14 | **6.00** +| Implementation | Test Case | Time (s) | Speed up | +| -------------------- | --------------- | -------: | -------: | +| shell reference | flat_1k_1MB | 2.29 | -> 1.0 | +| `dirhash` | flat_1k_1MB | 1.67 | 1.36 | +| `dirhash`(8 workers) | flat_1k_1MB | 0.48 | **4.73** | +| shell reference | nested_32k_32kB | 6.82 | -> 1.0 | +| `dirhash` | nested_32k_32kB | 3.43 | 2.00 | +| `dirhash`(8 workers) | nested_32k_32kB | 1.14 | **6.00** | diff --git a/benchmark/results.json b/benchmark/results.json index 62c622b..0304dfd 100644 --- a/benchmark/results.json +++ b/benchmark/results.json @@ -1,402 +1,402 @@ [ - { - "test_case": "flat_8_128MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 2.014, - "t_median": 2.02 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.602, - "t_median": 1.604 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 0.977, - "t_median": 0.98 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.562, - "t_median": 0.569 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.464, - "t_median": 0.473 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 2.263, - "t_median": 2.268 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.662, - "t_median": 1.667 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 0.978, - "t_median": 0.983 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.57, - "t_median": 0.58 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.476, - "t_median": 0.48 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 6.711, - "t_median": 6.721 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 3.329, - "t_median": 3.354 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 2.067, - "t_median": 2.074 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 1.345, - "t_median": 1.362 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 1.09, - "t_median": 1.094 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, 
- "t_best": 2.296, - "t_median": 2.306 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.713, - "t_median": 1.714 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 0.996, - "t_median": 1.009 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.601, - "t_median": 0.602 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.499, - "t_median": 0.505 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 6.814, - "t_median": 6.818 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 3.376, - "t_median": 3.426 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 2.147, - "t_median": 2.153 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 1.414, - "t_median": 1.416 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 1.137, - "t_median": 1.138 - }, - { - "test_case": "flat_8_128MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.181, - "t_median": 2.196 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.214, - "t_median": 1.225 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.768, - "t_median": 0.774 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.467, - "t_median": 0.474 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.47, - "t_median": 0.477 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.221, - "t_median": 2.229 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.252, - "t_median": 1.263 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.774, - "t_median": 0.777 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.471, - "t_median": 0.477 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.378, - "t_median": 0.478 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 4.178, - "t_median": 4.224 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.921, - "t_median": 3.008 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 1.888, - "t_median": 1.892 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 1.266, - "t_median": 1.275 - }, - { - "test_case": "flat_32k_32kB", - 
"implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 1.072, - "t_median": 1.079 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.236, - "t_median": 2.26 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.308, - "t_median": 1.314 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.797, - "t_median": 0.8 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.501, - "t_median": 0.509 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.499, - "t_median": 0.503 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 4.383, - "t_median": 4.406 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 3.041, - "t_median": 3.05 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 1.943, - "t_median": 1.965 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 1.329, - "t_median": 1.334 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 1.14, - "t_median": 1.149 - } -] \ No newline at end of file + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 2.014, + "t_median": 2.02 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.602, + "t_median": 1.604 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 0.977, + "t_median": 0.98 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.562, + "t_median": 0.569 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.464, + "t_median": 0.473 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 2.263, + "t_median": 2.268 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.662, + "t_median": 1.667 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 0.978, + "t_median": 0.983 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.57, + "t_median": 0.58 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.476, + "t_median": 0.48 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 6.711, + "t_median": 6.721 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 3.329, + "t_median": 3.354 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + 
"workers": 2, + "t_best": 2.067, + "t_median": 2.074 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 1.345, + "t_median": 1.362 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 1.09, + "t_median": 1.094 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 2.296, + "t_median": 2.306 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.713, + "t_median": 1.714 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 0.996, + "t_median": 1.009 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.601, + "t_median": 0.602 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.499, + "t_median": 0.505 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 6.814, + "t_median": 6.818 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 3.376, + "t_median": 3.426 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 2.147, + "t_median": 2.153 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 1.414, + "t_median": 1.416 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 1.137, + "t_median": 1.138 + }, + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.181, + "t_median": 2.196 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.214, + "t_median": 1.225 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.768, + "t_median": 0.774 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.467, + "t_median": 0.474 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.47, + "t_median": 0.477 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.221, + "t_median": 2.229 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.252, + "t_median": 1.263 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.774, + "t_median": 0.777 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.471, + "t_median": 0.477 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.378, + "t_median": 0.478 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.178, + "t_median": 4.224 + }, + { + "test_case": 
"flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.921, + "t_median": 3.008 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 1.888, + "t_median": 1.892 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 1.266, + "t_median": 1.275 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 1.072, + "t_median": 1.079 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.236, + "t_median": 2.26 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.308, + "t_median": 1.314 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.797, + "t_median": 0.8 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.501, + "t_median": 0.509 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.499, + "t_median": 0.503 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.383, + "t_median": 4.406 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 3.041, + "t_median": 3.05 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 1.943, + "t_median": 1.965 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 1.329, + "t_median": 1.334 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 1.14, + "t_median": 1.149 + } +] diff --git a/benchmark/results_v0.2.0.json b/benchmark/results_v0.2.0.json index 71a652b..a707fcf 100644 --- a/benchmark/results_v0.2.0.json +++ b/benchmark/results_v0.2.0.json @@ -1,402 +1,402 @@ [ - { - "test_case": "flat_8_128MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 2.079, - "t_median": 2.083 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.734, - "t_median": 1.945 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 0.999, - "t_median": 1.183 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.711, - "t_median": 0.728 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.504, - "t_median": 0.518 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 3.383, - "t_median": 3.679 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.846, - "t_median": 1.921 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 1.137, - "t_median": 1.158 - }, - { - "test_case": "flat_1k_1MB", - "implementation": 
"dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.74, - "t_median": 0.749 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.53, - "t_median": 0.534 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 13.827, - "t_median": 18.213 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 13.655, - "t_median": 13.808 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 3.276, - "t_median": 3.33 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 2.409, - "t_median": 2.421 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 2.045, - "t_median": 2.086 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 3.284, - "t_median": 3.332 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 1.717, - "t_median": 1.725 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 1.026, - "t_median": 1.034 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 0.622, - "t_median": 0.633 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 0.522, - "t_median": 0.529 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "shell reference", - "algorithm": "md5", - "workers": 1, - "t_best": 11.898, - "t_median": 12.125 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 1, - "t_best": 13.858, - "t_median": 14.146 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 2, - "t_best": 2.781, - "t_median": 2.987 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 4, - "t_best": 1.894, - "t_median": 1.92 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "md5", - "workers": 8, - "t_best": 1.55, - "t_median": 1.568 - }, - { - "test_case": "flat_8_128MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.042, - "t_median": 2.05 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.338, - "t_median": 1.354 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.79, - "t_median": 0.794 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.583, - "t_median": 0.593 - }, - { - "test_case": "flat_8_128MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.483, - "t_median": 0.487 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.118, - "t_median": 2.129 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.39, - "t_median": 
1.531 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.925, - "t_median": 0.932 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.614, - "t_median": 0.629 - }, - { - "test_case": "flat_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.511, - "t_median": 0.52 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 10.551, - "t_median": 10.97 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 4.663, - "t_median": 4.76 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 3.108, - "t_median": 3.235 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 2.342, - "t_median": 2.361 - }, - { - "test_case": "flat_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 2.071, - "t_median": 2.094 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 2.11, - "t_median": 2.159 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 1.436, - "t_median": 1.47 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 0.925, - "t_median": 0.937 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 0.627, - "t_median": 0.643 - }, - { - "test_case": "nested_1k_1MB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 0.516, - "t_median": 0.527 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "shell reference", - "algorithm": "sha1", - "workers": 1, - "t_best": 3.982, - "t_median": 7.147 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 1, - "t_best": 4.114, - "t_median": 4.156 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 2, - "t_best": 2.598, - "t_median": 2.616 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 4, - "t_best": 1.809, - "t_median": 1.831 - }, - { - "test_case": "nested_32k_32kB", - "implementation": "dirhash", - "algorithm": "sha1", - "workers": 8, - "t_best": 1.552, - "t_median": 1.58 - } -] \ No newline at end of file + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 2.079, + "t_median": 2.083 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.734, + "t_median": 1.945 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 0.999, + "t_median": 1.183 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.711, + "t_median": 0.728 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.504, + "t_median": 0.518 + }, + { + "test_case": "flat_1k_1MB", + 
"implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 3.383, + "t_median": 3.679 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.846, + "t_median": 1.921 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 1.137, + "t_median": 1.158 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.74, + "t_median": 0.749 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.53, + "t_median": 0.534 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 13.827, + "t_median": 18.213 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 13.655, + "t_median": 13.808 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 3.276, + "t_median": 3.33 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 2.409, + "t_median": 2.421 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 2.045, + "t_median": 2.086 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 3.284, + "t_median": 3.332 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.717, + "t_median": 1.725 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 1.026, + "t_median": 1.034 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.622, + "t_median": 0.633 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.522, + "t_median": 0.529 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 11.898, + "t_median": 12.125 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 13.858, + "t_median": 14.146 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 2.781, + "t_median": 2.987 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 1.894, + "t_median": 1.92 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 1.55, + "t_median": 1.568 + }, + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.042, + "t_median": 2.05 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.338, + "t_median": 1.354 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.79, + "t_median": 0.794 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 
0.583, + "t_median": 0.593 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.483, + "t_median": 0.487 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.118, + "t_median": 2.129 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.39, + "t_median": 1.531 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.925, + "t_median": 0.932 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.614, + "t_median": 0.629 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.511, + "t_median": 0.52 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 10.551, + "t_median": 10.97 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.663, + "t_median": 4.76 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 3.108, + "t_median": 3.235 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 2.342, + "t_median": 2.361 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 2.071, + "t_median": 2.094 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.11, + "t_median": 2.159 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.436, + "t_median": 1.47 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.925, + "t_median": 0.937 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.627, + "t_median": 0.643 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.516, + "t_median": 0.527 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 3.982, + "t_median": 7.147 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.114, + "t_median": 4.156 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 2.598, + "t_median": 2.616 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 1.809, + "t_median": 1.831 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 1.552, + "t_median": 1.58 + } +] diff --git a/benchmark/run.py b/benchmark/run.py index f930b2e..d5d9a25 100644 --- a/benchmark/run.py +++ b/benchmark/run.py @@ -1,24 +1,18 @@ -from __future__ import print_function, division - import json import os import subprocess - -from statistics import median, mean +from statistics import median from dirhash import __version__ - 
-BENCHMARK_ROOT = os.path.abspath( - os.path.join(__file__, os.pardir) -) +BENCHMARK_ROOT = os.path.abspath(os.path.join(__file__, os.pardir)) TEST_CASES = { - 'flat_8_128MB': {'depth': 0, 'num_files': 2**3, 'file_size': 2**27}, - 'flat_1k_1MB': {'depth': 0, 'num_files': 2**10, 'file_size': 2**20}, - 'flat_32k_32kB': {'depth': 0, 'num_files': 2**15, 'file_size': 2**15}, - 'nested_1k_1MB': {'depth': 8, 'num_files': 2**10, 'file_size': 2**20}, - 'nested_32k_32kB': {'depth': 8, 'num_files': 2**15, 'file_size': 2**15}, + "flat_8_128MB": {"depth": 0, "num_files": 2**3, "file_size": 2**27}, + "flat_1k_1MB": {"depth": 0, "num_files": 2**10, "file_size": 2**20}, + "flat_32k_32kB": {"depth": 0, "num_files": 2**15, "file_size": 2**15}, + "nested_1k_1MB": {"depth": 8, "num_files": 2**10, "file_size": 2**20}, + "nested_32k_32kB": {"depth": 8, "num_files": 2**15, "file_size": 2**15}, } @@ -33,36 +27,32 @@ def int_chunks(x, n): def write_file_tree(dirpath, depth, num_files, file_size, branch_factor=2): - assert num_files >= branch_factor ** depth + assert num_files >= branch_factor**depth os.mkdir(dirpath) if depth == 0: fill = len(str(num_files)) for i in range(num_files): - filepath = os.path.join(dirpath, 'f_' + str(i).rjust(fill, '0')) - with open(filepath, 'wb') as f: + filepath = os.path.join(dirpath, "f_" + str(i).rjust(fill, "0")) + with open(filepath, "wb") as f: f.write(os.urandom(file_size)) else: fill = len(str(branch_factor)) for i, num_files_branch in enumerate(int_chunks(num_files, branch_factor)): - dirpath_branch = os.path.join(dirpath, 'd_' + str(i).rjust(fill, '0')) + dirpath_branch = os.path.join(dirpath, "d_" + str(i).rjust(fill, "0")) write_file_tree( - dirpath_branch, - depth - 1, - num_files_branch, - file_size, - branch_factor + dirpath_branch, depth - 1, num_files_branch, file_size, branch_factor ) def require_test_cases(): - test_cases_root = os.path.join(BENCHMARK_ROOT, 'test_cases') + test_cases_root = os.path.join(BENCHMARK_ROOT, "test_cases") if not os.path.exists(test_cases_root): os.mkdir(test_cases_root) test_case_paths = [] for name, kwargs in TEST_CASES.items(): test_case_path = os.path.join(test_cases_root, name) if not os.path.exists(test_case_path): - print('creating test case: {}: {}'.format(name, kwargs)) + print(f"creating test case: {name}: {kwargs}") write_file_tree(test_case_path, **kwargs) test_case_paths.append(test_case_path) @@ -71,32 +61,27 @@ def require_test_cases(): def time_shell(cmd, runs=1, repetitions=1, setup=None): time_cmd = "time for i in {{1..{rep}}}; do {cmd}; done".format( - cmd=cmd, - rep=repetitions + cmd=cmd, rep=repetitions ) if setup is not None: - time_cmd = "{}; {}".format(setup, time_cmd) + time_cmd = f"{setup}; {time_cmd}" realtimes = [] for i in range(runs): process = subprocess.run( - time_cmd, - capture_output=True, - text=True, - shell=True, - check=True + time_cmd, capture_output=True, text=True, shell=True, check=True ) - output_lines = process.stderr.split('\n') + output_lines = process.stderr.split("\n") try: t_real, t_user, t_sys = output_lines[-4:-1] - assert t_real.startswith('real') - t_str = t_real.split('\t')[1] - min_str, sec_str = t_str.split('m') + assert t_real.startswith("real") + t_str = t_real.split("\t")[1] + min_str, sec_str = t_str.split("m") sec = 60 * int(min_str) + float(sec_str[:-1]) sec_per_rep = sec / repetitions - except: + except: # noqa: E722 raise RuntimeError( - 'Failed to parse `time` stderr output: {}'.format(process.stderr) + f"Failed to parse `time` stderr output: {process.stderr}" ) 
realtimes.append(sec_per_rep) @@ -104,22 +89,21 @@ def time_shell(cmd, runs=1, repetitions=1, setup=None): def get_reference_shell_cmd(dirpath, algorithm): - if algorithm == 'md5': + if algorithm == "md5": pass - elif algorithm.startswith('sha'): + elif algorithm.startswith("sha"): version = int(algorithm[3:]) - algorithm = 'shasum -a {}'.format(version) + algorithm = f"shasum -a {version}" else: - raise ValueError('only md5 and sha supported') + raise ValueError("only md5 and sha supported") - return 'find {dir} -type f -print0 | sort -z | xargs -0 {alg} | {alg}'.format( - dir=dirpath, - alg=algorithm + return "find {dir} -type f -print0 | sort -z | xargs -0 {alg} | {alg}".format( + dir=dirpath, alg=algorithm ) def get_dirhash_shell_cmd(dirpath, algorithm, workers=1): - return 'dirhash {} -a {} -j {}'.format(dirpath, algorithm, workers) + return f"dirhash {dirpath} -a {algorithm} -j {workers}" def benchmark(dirpath, algorithm, **kwargs): @@ -129,12 +113,12 @@ def benchmark(dirpath, algorithm, **kwargs): cmd = get_reference_shell_cmd(dirpath, algorithm) realtimes = time_shell(cmd=cmd, **kwargs) res = { - 'test_case': test_case, - 'implementation': 'shell reference', - 'algorithm': algorithm, - 'workers': 1, - 't_best': min(realtimes), - 't_median': median(realtimes), + "test_case": test_case, + "implementation": "shell reference", + "algorithm": algorithm, + "workers": 1, + "t_best": min(realtimes), + "t_median": median(realtimes), } print(res) print(realtimes) @@ -144,12 +128,12 @@ def benchmark(dirpath, algorithm, **kwargs): cmd = get_dirhash_shell_cmd(dirpath, algorithm, workers) realtimes = time_shell(cmd=cmd, **kwargs) res = { - 'test_case': test_case, - 'implementation': 'dirhash', - 'algorithm': algorithm, - 'workers': workers, - 't_best': min(realtimes), - 't_median': median(realtimes), + "test_case": test_case, + "implementation": "dirhash", + "algorithm": algorithm, + "workers": workers, + "t_best": min(realtimes), + "t_median": median(realtimes), } print(res) print(realtimes) @@ -158,40 +142,49 @@ def benchmark(dirpath, algorithm, **kwargs): return result -if __name__ == '__main__': +if __name__ == "__main__": test_cases = require_test_cases() results = [] - for alg in ['md5', 'sha1']: + for alg in ["md5", "sha1"]: for test_case in test_cases: result = benchmark(test_case, algorithm=alg, runs=5, repetitions=1) results.extend(result) - result_fname = 'results_v{}'.format(__version__) + result_fname = f"results_v{__version__}" - with open(os.path.join(BENCHMARK_ROOT, result_fname + '.json'), 'w') as f: - json.dump(results, f, indent=4) + with open(os.path.join(BENCHMARK_ROOT, result_fname + ".json"), "w") as f: + json.dump(results, f, indent=2) try: import pandas as pd + df = pd.DataFrame(results) - df = df[['test_case', 'implementation', 'algorithm', 'workers', - 't_best', 't_median']] - for (tc, alg), subdf in df.groupby(['test_case', 'algorithm']): - t_ref = subdf[ - subdf['implementation'] == 'shell reference' - ]['t_median'].values[0] - speed_up = t_ref / subdf['t_median'] - df.loc[speed_up.index, 'speed-up (median)'] = speed_up + df = df[ + [ + "test_case", + "implementation", + "algorithm", + "workers", + "t_best", + "t_median", + ] + ] + for (tc, alg), subdf in df.groupby(["test_case", "algorithm"]): + t_ref = subdf[subdf["implementation"] == "shell reference"][ + "t_median" + ].values[0] + speed_up = t_ref / subdf["t_median"] + df.loc[speed_up.index, "speed-up (median)"] = speed_up print(df) - df_hd = df[df['implementation'] == 'dirhash'] - df_hd_1w = 
df_hd[df_hd['workers'] == 1] - df_hd_8w = df_hd[df_hd['workers'] == 8] - mean_speedup_1w = df_hd_1w.mean()['speed-up (median)'] - mean_speedup_8w = df_hd_8w.mean()['speed-up (median)'] - print('\nAverage speedup (single process): {}'.format(mean_speedup_1w)) + df_hd = df[df["implementation"] == "dirhash"] + df_hd_1w = df_hd[df_hd["workers"] == 1] + df_hd_8w = df_hd[df_hd["workers"] == 8] + mean_speedup_1w = df_hd_1w.mean()["speed-up (median)"] + mean_speedup_8w = df_hd_8w.mean()["speed-up (median)"] + print(f"\nAverage speedup (single process): {mean_speedup_1w}") print(df_hd_1w) - print('\nAverage speedup multiprocess (8 workers): {}'.format(mean_speedup_8w)) + print(f"\nAverage speedup multiprocess (8 workers): {mean_speedup_8w}") print(df_hd_8w) - df.to_csv(os.path.join(BENCHMARK_ROOT, result_fname + '.csv')) + df.to_csv(os.path.join(BENCHMARK_ROOT, result_fname + ".csv")) except ImportError: pass diff --git a/setup.py b/setup.py index cda3381..14e3cb9 100644 --- a/setup.py +++ b/setup.py @@ -1,36 +1,35 @@ -import io import os -from setuptools import setup, find_packages import versioneer +from setuptools import find_packages, setup PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) -DESCRIPTION = 'Python module and CLI for hashing of file system directories.' +DESCRIPTION = "Python module and CLI for hashing of file system directories." try: - with io.open(os.path.join(PROJECT_ROOT, 'README.md'), encoding='utf-8') as f: - long_description = '\n' + f.read() -except IOError: + with open(os.path.join(PROJECT_ROOT, "README.md"), encoding="utf-8") as f: + long_description = "\n" + f.read() +except OSError: long_description = DESCRIPTION setup( - name='dirhash', + name="dirhash", version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), description=DESCRIPTION, long_description=long_description, long_description_content_type="text/markdown", - url='https://github.com/andhus/dirhash-python', + url="https://github.com/andhus/dirhash-python", author="Anders Huss", author_email="andhus@kth.se", - license='MIT', - install_requires=['scantree>=0.0.2'], - packages=find_packages('src'), - package_dir={'': 'src'}, + license="MIT", + install_requires=["scantree"], + packages=find_packages("src"), + package_dir={"": "src"}, include_package_data=True, entry_points={ - 'console_scripts': ['dirhash=dirhash.cli:main'], + "console_scripts": ["dirhash=dirhash.cli:main"], }, - tests_require=['pytest', 'pytest-cov'] + tests_require=["pre-commit", "pytest", "pytest-cov"], ) diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 2b0d8b5..f0d54b7 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -1,37 +1,32 @@ #!/usr/bin/env python """dirhash - a python library (and CLI) for hashing of file system directories. """ -from __future__ import print_function, division -import os import hashlib - +import os from functools import partial from multiprocessing import Pool -from scantree import ( - scantree, - RecursionFilter, - CyclicLinkedDir, -) +from scantree import CyclicLinkedDir, RecursionFilter, scantree from . 
import _version -__version__ = _version.get_versions()['version'] + +__version__ = _version.get_versions()["version"] __all__ = [ - '__version__', - 'algorithms_guaranteed', - 'algorithms_available', - 'dirhash', - 'dirhash_impl', - 'included_paths', - 'Filter', - 'get_match_patterns', - 'Protocol' + "__version__", + "algorithms_guaranteed", + "algorithms_available", + "dirhash", + "dirhash_impl", + "included_paths", + "Filter", + "get_match_patterns", + "Protocol", ] -algorithms_guaranteed = {'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'} +algorithms_guaranteed = {"md5", "sha1", "sha224", "sha256", "sha384", "sha512"} algorithms_available = hashlib.algorithms_available @@ -43,10 +38,10 @@ def dirhash( linked_dirs=True, linked_files=True, empty_dirs=False, - entry_properties=('name', 'data'), + entry_properties=("name", "data"), allow_cyclic_links=False, chunk_size=2**20, - jobs=1 + jobs=1, ): """Computes the hash of a directory based on its structure and content. @@ -151,11 +146,10 @@ def dirhash( match_patterns=get_match_patterns(match=match, ignore=ignore), linked_dirs=linked_dirs, linked_files=linked_files, - empty_dirs=empty_dirs + empty_dirs=empty_dirs, ) protocol = Protocol( - entry_properties=entry_properties, - allow_cyclic_links=allow_cyclic_links + entry_properties=entry_properties, allow_cyclic_links=allow_cyclic_links ) return dirhash_impl( directory=directory, @@ -163,17 +157,12 @@ def dirhash( filter_=filter_, protocol=protocol, chunk_size=chunk_size, - jobs=jobs + jobs=jobs, ) def dirhash_impl( - directory, - algorithm, - filter_=None, - protocol=None, - chunk_size=2**20, - jobs=1 + directory, algorithm, filter_=None, protocol=None, chunk_size=2**20, jobs=1 ): """Computes the hash of a directory based on its structure and content. @@ -215,25 +204,26 @@ def dirhash_impl( See https://github.com/andhus/dirhash/README.md for a formal description of how the returned hash value is computed. 
""" + def get_instance(value, cls_, argname): if isinstance(value, cls_): return value if value is None: return cls_() - raise TypeError('{} must be an instance of {} or None'.format(argname, cls_)) + raise TypeError(f"{argname} must be an instance of {cls_} or None") - filter_ = get_instance(filter_, Filter, 'filter_') - protocol = get_instance(protocol, Protocol, 'protocol') + filter_ = get_instance(filter_, Filter, "filter_") + protocol = get_instance(protocol, Protocol, "protocol") hasher_factory = _get_hasher_factory(algorithm) def dir_apply(dir_node): if not filter_.empty_dirs: - if dir_node.path.relative == '' and dir_node.empty: + if dir_node.path.relative == "" and dir_node.empty: # only check if root node is empty (other empty dirs are filter # before `dir_apply` with `filter_.empty_dirs=False`) - raise ValueError('{}: Nothing to hash'.format(directory)) + raise ValueError(f"{directory}: Nothing to hash") descriptor = protocol.get_descriptor(dir_node) - _dirhash = hasher_factory(descriptor.encode('utf-8')).hexdigest() + _dirhash = hasher_factory(descriptor.encode("utf-8")).hexdigest() return dir_node.path, _dirhash @@ -242,10 +232,7 @@ def dir_apply(dir_node): def file_apply(path): return path, _get_filehash( - path.real, - hasher_factory, - chunk_size=chunk_size, - cache=cache + path.real, hasher_factory, chunk_size=chunk_size, cache=cache ) _, dirhash_ = scantree( @@ -257,7 +244,7 @@ def file_apply(path): allow_cyclic_links=protocol.allow_cyclic_links, cache_file_apply=False, include_empty=filter_.empty_dirs, - jobs=1 + jobs=1, ) else: # multiprocessing real_paths = set() @@ -274,18 +261,16 @@ def extract_real_paths(path): allow_cyclic_links=protocol.allow_cyclic_links, cache_file_apply=False, include_empty=filter_.empty_dirs, - jobs=1 + jobs=1, ) real_paths = list(real_paths) # hash files in parallel file_hashes = _parmap( partial( - _get_filehash, - hasher_factory=hasher_factory, - chunk_size=chunk_size + _get_filehash, hasher_factory=hasher_factory, chunk_size=chunk_size ), real_paths, - jobs=jobs + jobs=jobs, ) # prepare the mapping with precomputed file hashes real_path_to_hash = dict(zip(real_paths, file_hashes)) @@ -324,7 +309,7 @@ def included_paths( match_patterns=get_match_patterns(match=match, ignore=ignore), linked_dirs=linked_dirs, linked_files=linked_files, - empty_dirs=empty_dirs + empty_dirs=empty_dirs, ) protocol = Protocol(allow_cyclic_links=allow_cyclic_links) @@ -333,11 +318,11 @@ def included_paths( recursion_filter=filter_, follow_links=True, allow_cyclic_links=protocol.allow_cyclic_links, - include_empty=filter_.empty_dirs + include_empty=filter_.empty_dirs, ).leafpaths() return [ - path.relative if path.is_file() else os.path.join(path.relative, '.') + path.relative if path.is_file() else os.path.join(path.relative, ".") for path in leafpaths ] @@ -364,17 +349,12 @@ class Filter(RecursionFilter): that *matches provided matching criteria*. Default `False`, i.e. empty directories are ignored (as is done in git version control). 
""" + def __init__( - self, - match_patterns=None, - linked_dirs=True, - linked_files=True, - empty_dirs=False + self, match_patterns=None, linked_dirs=True, linked_files=True, empty_dirs=False ): - super(Filter, self).__init__( - linked_dirs=linked_dirs, - linked_files=linked_files, - match=match_patterns + super().__init__( + linked_dirs=linked_dirs, linked_files=linked_files, match=match_patterns ) self.empty_dirs = empty_dirs @@ -400,23 +380,23 @@ def get_match_patterns( ignore_hidden: bool - If `True` ignore hidden files and directories. Short for `ignore=['.*', '.*/']` Default `False`. """ - match = ['*'] if match is None else list(match) + match = ["*"] if match is None else list(match) ignore = [] if ignore is None else list(ignore) ignore_extensions = [] if ignore_extensions is None else list(ignore_extensions) if ignore_hidden: - ignore.extend(['.*', '.*/']) + ignore.extend([".*", ".*/"]) for ext in ignore_extensions: - if not ext.startswith('.'): - ext = '.' + ext - ext = '*' + ext + if not ext.startswith("."): + ext = "." + ext + ext = "*" + ext ignore.append(ext) - match_spec = match + ['!' + ign for ign in ignore] + match_spec = match + ["!" + ign for ign in ignore] def deduplicate(items): - items_set = set([]) + items_set = set() dd_items = [] for item in items: if item not in items_set: @@ -428,7 +408,7 @@ def deduplicate(items): return deduplicate(match_spec) -class Protocol(object): +class Protocol: """Specifications of which file and directory properties to consider when computing the `dirhash` value. @@ -463,33 +443,31 @@ class Protocol(object): dirhash value for directory causing the cyclic link is replaced with the hash function hexdigest of the relative path from the link to the target. """ - class EntryProperties(object): - NAME = 'name' - DATA = 'data' - IS_LINK = 'is_link' + + class EntryProperties: + NAME = "name" + DATA = "data" + IS_LINK = "is_link" options = {NAME, DATA, IS_LINK} - _DIRHASH = 'dirhash' + _DIRHASH = "dirhash" - _entry_property_separator = '\000' - _entry_descriptor_separator = '\000\000' + _entry_property_separator = "\000" + _entry_descriptor_separator = "\000\000" - def __init__( - self, - entry_properties=('name', 'data'), - allow_cyclic_links=False - ): + def __init__(self, entry_properties=("name", "data"), allow_cyclic_links=False): entry_properties = set(entry_properties) if not entry_properties.issubset(self.EntryProperties.options): raise ValueError( - 'entry properties {} not supported'.format( - entry_properties - self.EntryProperties.options) + "entry properties {} not supported".format( + entry_properties - self.EntryProperties.options + ) ) if not ( - self.EntryProperties.NAME in entry_properties or - self.EntryProperties.DATA in entry_properties + self.EntryProperties.NAME in entry_properties + or self.EntryProperties.DATA in entry_properties ): raise ValueError( - 'at least one of entry properties `name` and `data` must be used' + "at least one of entry properties `name` and `data` must be used" ) self.entry_properties = entry_properties self._include_name = self.EntryProperties.NAME in entry_properties @@ -498,8 +476,8 @@ def __init__( if not isinstance(allow_cyclic_links, bool): raise ValueError( - 'allow_cyclic_link must be a boolean, ' - 'got {}'.format(allow_cyclic_links) + "allow_cyclic_link must be a boolean, " + "got {}".format(allow_cyclic_links) ) self.allow_cyclic_links = allow_cyclic_links @@ -509,18 +487,14 @@ def get_descriptor(self, dir_node): entries = dir_node.directories + dir_node.files entry_descriptors = [ - 
self._get_entry_descriptor( - self._get_entry_properties(path, entry_hash) - ) for path, entry_hash in entries + self._get_entry_descriptor(self._get_entry_properties(path, entry_hash)) + for path, entry_hash in entries ] return self._entry_descriptor_separator.join(sorted(entry_descriptors)) @classmethod def _get_entry_descriptor(cls, entry_properties): - entry_strings = [ - '{}:{}'.format(name, value) - for name, value in entry_properties - ] + entry_strings = [f"{name}:{value}" for name, value in entry_properties] return cls._entry_property_separator.join(sorted(entry_strings)) def _get_entry_properties(self, path, entry_hash): @@ -543,8 +517,8 @@ def _get_cyclic_linked_dir_descriptor(self, dir_node): path_to_target = os.path.relpath( # the extra '.' is needed if link back to root, because # an empty path ('') is not supported by os.path.relpath - os.path.join('.', target_relpath), - os.path.join('.', relpath) + os.path.join(".", target_relpath), + os.path.join(".", relpath), ) # TODO normalize posix! return path_to_target @@ -562,15 +536,14 @@ def _get_hasher_factory(algorithm): return partial(hashlib.new, algorithm) try: # bypass algorithm if already a hasher factory - hasher = algorithm(b'') - hasher.update(b'') + hasher = algorithm(b"") + hasher.update(b"") hasher.hexdigest() return algorithm - except: + except: # noqa: E722 pass - raise ValueError( - '`algorithm` must be one of: {}`'.format(algorithms_available)) + raise ValueError(f"`algorithm` must be one of: {algorithms_available}`") def _parmap(func, iterable, jobs=1): @@ -614,8 +587,8 @@ def _get_filehash(filepath, hasher_factory, chunk_size, cache=None): return filehash hasher = hasher_factory() - with open(filepath, 'rb') as f: - for chunk in iter(lambda: f.read(chunk_size), b''): + with open(filepath, "rb") as f: + for chunk in iter(lambda: f.read(chunk_size), b""): hasher.update(chunk) return hasher.hexdigest() diff --git a/src/dirhash/_version.py b/src/dirhash/_version.py index d5278f4..1166e3d 100644 --- a/src/dirhash/_version.py +++ b/src/dirhash/_version.py @@ -1,4 +1,3 @@ - # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). 
Distribution tarballs (built by setup.py sdist) and build @@ -12,12 +11,12 @@ """Git implementation of _version.py.""" import errno +import functools import os import re import subprocess import sys from typing import Any, Callable, Dict, List, Optional, Tuple -import functools def get_keywords() -> Dict[str, str]: @@ -68,12 +67,14 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate @@ -100,10 +101,14 @@ def run_command( try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen([command] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None), **popen_kwargs) + process = subprocess.Popen( + [command] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, + ) break except OSError as e: if e.errno == errno.ENOENT: @@ -114,7 +119,7 @@ def run_command( return None, None else: if verbose: - print("unable to find command, tried %s" % (commands,)) + print(f"unable to find command, tried {commands}") return None, None stdout = process.communicate()[0].strip().decode() if process.returncode != 0: @@ -141,15 +146,21 @@ def versions_from_parentdir( for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -162,7 +173,7 @@ def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: # _version.py. keywords: Dict[str, str] = {} try: - with open(versionfile_abs, "r") as fobj: + with open(versionfile_abs) as fobj: for line in fobj: if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) @@ -212,7 +223,7 @@ def git_versions_from_keywords( # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -221,7 +232,7 @@ def git_versions_from_keywords( # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". 
- tags = {r for r in refs if re.search(r'\d', r)} + tags = {r for r in refs if re.search(r"\d", r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -229,32 +240,36 @@ def git_versions_from_keywords( for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') - if not re.match(r'\d', r): + if not re.match(r"\d", r): continue if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs( - tag_prefix: str, - root: str, - verbose: bool, - runner: Callable = run_command + tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command ) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. @@ -273,8 +288,7 @@ def git_pieces_from_vcs( env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=not verbose) + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -282,10 +296,19 @@ def git_pieces_from_vcs( # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner(GITS, [ - "describe", "--tags", "--dirty", "--always", "--long", - "--match", f"{tag_prefix}[[:digit:]]*" - ], cwd=root) + describe_out, rc = runner( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + f"{tag_prefix}[[:digit:]]*", + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -300,8 +323,7 @@ def git_pieces_from_vcs( pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], - cwd=root) + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") @@ -341,17 +363,16 @@ def git_pieces_from_vcs( dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -360,10 +381,12 @@ def git_pieces_from_vcs( if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -412,8 +435,7 @@ def render_pep440(pieces: Dict[str, Any]) -> str: rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -442,8 +464,7 @@ def render_pep440_branch(pieces: Dict[str, Any]) -> str: rendered = "0" if pieces["branch"] != "master": rendered += ".dev0" - rendered += "+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -604,11 +625,13 @@ def render_git_describe_long(pieces: Dict[str, Any]) -> str: def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -632,9 +655,13 @@ def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } def get_versions() -> Dict[str, Any]: @@ -648,8 +675,7 @@ def get_versions() -> Dict[str, Any]: verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass @@ -658,13 +684,16 @@ def get_versions() -> Dict[str, Any]: # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
- for _ in cfg.versionfile_source.split('/'): + for _ in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -678,6 +707,10 @@ def get_versions() -> Dict[str, Any]: except NotThisMethod: pass - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/src/dirhash/cli.py b/src/dirhash/cli.py index 89f8308..8354d49 100644 --- a/src/dirhash/cli.py +++ b/src/dirhash/cli.py @@ -1,10 +1,9 @@ #!/usr/bin/env python """Get hash for the content and/or structure of a directory. """ -from __future__ import print_function -import sys import argparse +import sys import dirhash @@ -12,169 +11,172 @@ def main(): try: kwargs = get_kwargs(sys.argv[1:]) - if kwargs.pop('list'): + if kwargs.pop("list"): # kwargs below have no effect when listing - for k in ['algorithm', 'chunk_size', 'jobs', 'entry_properties']: + for k in ["algorithm", "chunk_size", "jobs", "entry_properties"]: kwargs.pop(k) for leafpath in dirhash.included_paths(**kwargs): print(leafpath) else: print(dirhash.dirhash(**kwargs)) except Exception as e: # pragma: no cover (not picked up by coverage) - sys.stderr.write('dirhash: {}\n'.format(e)) + sys.stderr.write(f"dirhash: {e}\n") sys.exit(1) def get_kwargs(args): - parser = argparse.ArgumentParser( - description='Determine the hash for a directory.' - ) - parser.add_argument( - '-v', '--version', - action='version', - version='dirhash {}'.format(dirhash.__version__) - ) + parser = argparse.ArgumentParser(description="Determine the hash for a directory.") parser.add_argument( - 'directory', - help='Directory to hash.' + "-v", + "--version", + action="version", + version=f"dirhash {dirhash.__version__}", ) + parser.add_argument("directory", help="Directory to hash.") parser.add_argument( - '-a', '--algorithm', + "-a", + "--algorithm", choices=dirhash.algorithms_available, - default='md5', + default="md5", help=( 'Hashing algorithm to use, by default "md5". Always available: {}. ' - 'Additionally available on current platform: {}. Note that the same ' - 'algorithm may appear multiple times in this set under different names ' - '(thanks to OpenSSL) ' - '[https://docs.python.org/2/library/hashlib.html]'.format( + "Additionally available on current platform: {}. Note that the same " + "algorithm may appear multiple times in this set under different names " + "(thanks to OpenSSL) " + "[https://docs.python.org/2/library/hashlib.html]".format( sorted(dirhash.algorithms_guaranteed), - sorted(dirhash.algorithms_available - dirhash.algorithms_guaranteed) + sorted(dirhash.algorithms_available - dirhash.algorithms_guaranteed), ) ), - metavar='' + metavar="", ) filter_options = parser.add_argument_group( - title='Filtering options', + title="Filtering options", description=( - 'Specify what files and directories to include. All files and ' - 'directories (including symbolic links) are included by default. 
The '
-            '--match/--ignore arguments allows for selection using glob/wildcard '
+            "Specify what files and directories to include. All files and "
+            "directories (including symbolic links) are included by default. The "
+            "--match/--ignore arguments allow for selection using glob/wildcard "
             '(".gitignore style") path matching. Paths relative to the root '
-            '`directory` (i.e. excluding the name of the root directory itself) are '
-            'matched against the provided patterns. For example, to only include '
+            "`directory` (i.e. excluding the name of the root directory itself) are "
+            "matched against the provided patterns. For example, to only include "
             'python source files, use: `dirhash path/to/dir -m "*.py"` or to '
-            'exclude hidden files and directories use: '
+            "exclude hidden files and directories use: "
             '`dirhash path/to.dir -i ".*" ".*/"` which is short for '
             '`dirhash path/to.dir -m "*" "!.*" "!.*/"`. By adding the --list '
-            'argument, all included paths, for the given filtering arguments, are '
-            'returned instead of the hash value. For further details see '
-            'https://github.com/andhus/dirhash/README.md#filtering'
-        )
+            "argument, all included paths, for the given filtering arguments, are "
+            "returned instead of the hash value. For further details see "
+            "https://github.com/andhus/dirhash/README.md#filtering"
+        ),
     )
     filter_options.add_argument(
-        '-m', '--match',
-        nargs='+',
-        default=['*'],
+        "-m",
+        "--match",
+        nargs="+",
+        default=["*"],
         help=(
-            'One or several patterns for paths to include. NOTE: patterns '
+            "One or several patterns for paths to include. NOTE: patterns "
             'with an asterisk must be in quotes ("*") or the asterisk '
-            'preceded by an escape character (`*).'
+            "preceded by an escape character (`*)."
         ),
-        metavar=''
+        metavar="",
     )
     filter_options.add_argument(
-        '-i', '--ignore',
-        nargs='+',
+        "-i",
+        "--ignore",
+        nargs="+",
         default=None,
         help=(
-            'One or several patterns for paths to exclude. NOTE: patterns '
+            "One or several patterns for paths to exclude. NOTE: patterns "
             'with an asterisk must be in quotes ("*") or the asterisk '
-            'preceded by an escape character (`*).'
+            "preceded by an escape character (`*)."
         ),
-        metavar=''
+        metavar="",
     )
     filter_options.add_argument(
-        '--empty-dirs',
-        action='store_true',
+        "--empty-dirs",
+        action="store_true",
         default=False,
-        help='Include empty directories (containing no files that meet the matching '
-             'criteria and no non-empty sub directories).'
+        help="Include empty directories (containing no files that meet the matching "
+        "criteria and no non-empty subdirectories).",
     )
     filter_options.add_argument(
-        '--no-linked-dirs',
-        dest='linked_dirs',
-        action='store_false',
-        help='Do not include symbolic links to other directories.'
+        "--no-linked-dirs",
+        dest="linked_dirs",
+        action="store_false",
+        help="Do not include symbolic links to other directories.",
    )
     filter_options.add_argument(
-        '--no-linked-files',
-        dest='linked_files',
-        action='store_false',
-        help='Do not include symbolic links to files.'
+        "--no-linked-files",
+        dest="linked_files",
+        action="store_false",
+        help="Do not include symbolic links to files.",
     )
     parser.set_defaults(linked_dirs=True, linked_files=True)

     protocol_options = parser.add_argument_group(
-        title='Protocol options',
+        title="Protocol options",
         description=(
-            'Specify what properties of files and directories to include and '
-            'whether to allow cyclic links. 
For further details see ' - 'https://github.com/andhus/dirhash/DIRHASH_STANDARD.md#protocol' - ) + "Specify what properties of files and directories to include and " + "whether to allow cyclic links. For further details see " + "https://github.com/andhus/dirhash/DIRHASH_STANDARD.md#protocol" + ), ) protocol_options.add_argument( - '-p', '--properties', - nargs='+', - dest='entry_properties', - default=['data', 'name'], + "-p", + "--properties", + nargs="+", + dest="entry_properties", + default=["data", "name"], help=( - 'List of file/directory properties to include in the hash. Available ' - 'properties are: {} and at least one of name and data must be ' - 'included. Default is [data name] which means that both the name/paths' - ' and content (actual data) of files and directories will be included' + "List of file/directory properties to include in the hash. Available " + "properties are: {} and at least one of name and data must be " + "included. Default is [data name] which means that both the name/paths" + " and content (actual data) of files and directories will be included" ).format(list(dirhash.Protocol.EntryProperties.options)), - metavar='' + metavar="", ) protocol_options.add_argument( - '-c', '--allow-cyclic-links', + "-c", + "--allow-cyclic-links", default=False, - action='store_true', + action="store_true", help=( - 'Allow presence of cyclic links (by hashing the relative path to the ' - 'target directory).' - ) + "Allow presence of cyclic links (by hashing the relative path to the " + "target directory)." + ), ) implementation_options = parser.add_argument_group( - title='Implementation options', - description='' + title="Implementation options", description="" ) implementation_options.add_argument( - '-s', '--chunk-size', + "-s", + "--chunk-size", default=2**20, type=int, - help='The chunk size (in bytes) for reading of files.' + help="The chunk size (in bytes) for reading of files.", ) implementation_options.add_argument( - '-j', '--jobs', + "-j", + "--jobs", type=int, default=1, # TODO make default number of cores? - help='Number of jobs (parallel processes) to use.' + help="Number of jobs (parallel processes) to use.", ) - special_options = parser.add_argument_group(title='Special options') + special_options = parser.add_argument_group(title="Special options") special_options.add_argument( - '-l', '--list', - action='store_true', + "-l", + "--list", + action="store_true", default=False, - help='List the file paths that will be taken into account, given the ' - 'provided filtering options.' 
+ help="List the file paths that will be taken into account, given the " + "provided filtering options.", ) return vars(parser.parse_args(args)) -if __name__ == '__main__': # pragma: no cover +if __name__ == "__main__": # pragma: no cover main() diff --git a/tests/test_cli.py b/tests/test_cli.py index 3886fb9..a9aea3e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,19 +1,13 @@ -from __future__ import print_function, division - import os -import sys import shlex import subprocess - -import dirhash +import sys import pytest +import dirhash -console_script = os.path.join( - os.path.dirname(sys.executable), - 'dirhash' -) +console_script = os.path.join(os.path.dirname(sys.executable), "dirhash") def dirhash_run(argstring, add_env=None): @@ -28,15 +22,15 @@ def dirhash_run(argstring, add_env=None): [console_script] + shlex.split(argstring), stdout=subprocess.PIPE, stderr=subprocess.PIPE, - env=env + env=env, ) output, error = process.communicate() # in python3 output and error are `bytes` as opposed to `str` in python2 if isinstance(output, bytes): - output = output.decode('utf-8') + output = output.decode("utf-8") if isinstance(error, bytes): - error = error.decode('utf-8') + error = error.decode("utf-8") return output, error, process.returncode @@ -54,155 +48,121 @@ def create_default_tree(tmpdir): |__file.ext1 |__file.ext2 """ - dotdir = tmpdir.mkdir('.dir') - dotdir.join('file').write('file in hidden sub-directory') - tmpdir.join(".file").write('hidden file') - dir = tmpdir.mkdir('dir') - dir.join('file').write('file in sub-directory') - tmpdir.mkdir('empty') - tmpdir.join("file").write('file') - tmpdir.join("file.ext1").write('file with extension .ext1') - tmpdir.join("file.ext2").write('file with extension .ext2') - - -class TestCLI(object): + dotdir = tmpdir.mkdir(".dir") + dotdir.join("file").write("file in hidden sub-directory") + tmpdir.join(".file").write("hidden file") + dir = tmpdir.mkdir("dir") + dir.join("file").write("file in sub-directory") + tmpdir.mkdir("empty") + tmpdir.join("file").write("file") + tmpdir.join("file.ext1").write("file with extension .ext1") + tmpdir.join("file.ext2").write("file with extension .ext2") + + +class TestCLI: @pytest.mark.parametrize( - 'argstring, non_default_kwargs', + "argstring, non_default_kwargs", [ - ( - '. -a md5', - {} - ), - ( - '.. -a md5', - {'directory': '..'} - ), - ( - 'target-dir -a md5', - {'directory': 'target-dir'} - ), - ( - '. -a sha256', - {'algorithm': 'sha256'} - ), + (". -a md5", {}), + (".. -a md5", {"directory": ".."}), + ("target-dir -a md5", {"directory": "target-dir"}), + (". -a sha256", {"algorithm": "sha256"}), # Filtering options - ( - '. -a md5 -m "*" "!.*"', - {'match': ['*', '!.*']} - ), + ('. -a md5 -m "*" "!.*"', {"match": ["*", "!.*"]}), ( '. -a md5 --match "d1/*" "d2/*" --ignore "*.txt"', - {'match': ['d1/*', 'd2/*'], 'ignore': ['*.txt']} - ), - ( - '. -a md5 --empty-dirs', - {'empty_dirs': True} - ), - ( - '. -a md5 --no-linked-dirs', - {'linked_dirs': False} - ), - ( - '. -a md5 --no-linked-files', - {'linked_files': False} + {"match": ["d1/*", "d2/*"], "ignore": ["*.txt"]}, ), + (". -a md5 --empty-dirs", {"empty_dirs": True}), + (". -a md5 --no-linked-dirs", {"linked_dirs": False}), + (". -a md5 --no-linked-files", {"linked_files": False}), # Protocol options - ( - '. -a md5 --allow-cyclic-links', - {'allow_cyclic_links': True} - - ), - ( - '. -a md5 --properties name', - {'entry_properties': ['name']} - - ), - ( - '. 
-a md5 --properties name data', - {'entry_properties': ['name', 'data']} - - ), + (". -a md5 --allow-cyclic-links", {"allow_cyclic_links": True}), + (". -a md5 --properties name", {"entry_properties": ["name"]}), + (". -a md5 --properties name data", {"entry_properties": ["name", "data"]}), # Implementation - ( - '. -a md5 -j 10', - {'jobs': 10} - ), - ( - '. -a md5 -s 32000', - {'chunk_size': 32000} - ), - ] + (". -a md5 -j 10", {"jobs": 10}), + (". -a md5 -s 32000", {"chunk_size": 32000}), + ], ) def test_get_kwargs(self, argstring, non_default_kwargs): from dirhash.cli import get_kwargs + kwargs_expected = { - 'list': False, - 'directory': '.', - 'algorithm': 'md5', - 'match': ['*'], - 'ignore': None, - 'empty_dirs': False, - 'linked_dirs': True, - 'linked_files': True, - 'entry_properties': ['data', 'name'], - 'allow_cyclic_links': False, - 'chunk_size': 2 ** 20, - 'jobs': 1 + "list": False, + "directory": ".", + "algorithm": "md5", + "match": ["*"], + "ignore": None, + "empty_dirs": False, + "linked_dirs": True, + "linked_files": True, + "entry_properties": ["data", "name"], + "allow_cyclic_links": False, + "chunk_size": 2**20, + "jobs": 1, } kwargs_expected.update(non_default_kwargs) kwargs = get_kwargs(shlex.split(argstring)) assert kwargs == kwargs_expected @pytest.mark.parametrize( - 'description, argstrings, output', + "description, argstrings, output", [ - ('ARGS WITHOUT EFFECT WHEN LISTING', - ['. -l', - '. --list', - '. -a md5 --list', - '. -a sha256 --list', - '. --properties name --list', - '. --jobs 2 --list', - '. --chunk-size 2 --list'], - ('.dir/file\n' - '.file\n' - 'dir/file\n' - 'file\n' - 'file.ext1\n' - 'file.ext2\n')), - ('IGNORE EXTENSION', - ['. -i "*.ext1" --list', - '. --ignore "*.ext1" --list', - '. -m "*" "!*.ext1" --list', - '. --match "*" "!*.ext1" --list'], - ('.dir/file\n' - '.file\n' - 'dir/file\n' - 'file\n' - 'file.ext2\n')), - ('IGNORE MULTIPLE EXTENSIONS', - ['. -i "*.ext1" "*.ext2" --list', - '. -i "*.ext*" --list'], - ('.dir/file\n' - '.file\n' - 'dir/file\n' - 'file\n')), - ('IGNORE HIDDEN', - ['. -i ".*" ".*/" --list'], - ('dir/file\n' - 'file\n' - 'file.ext1\n' - 'file.ext2\n')), - ('INCLUDE EMPTY', - ['. --empty-dirs --list'], - ('.dir/file\n' - '.file\n' - 'dir/file\n' - 'empty/.\n' - 'file\n' - 'file.ext1\n' - 'file.ext2\n')), - ] + ( + "ARGS WITHOUT EFFECT WHEN LISTING", + [ + ". -l", + ". --list", + ". -a md5 --list", + ". -a sha256 --list", + ". --properties name --list", + ". --jobs 2 --list", + ". --chunk-size 2 --list", + ], + ( + ".dir/file\n" + ".file\n" + "dir/file\n" + "file\n" + "file.ext1\n" + "file.ext2\n" + ), + ), + ( + "IGNORE EXTENSION", + [ + '. -i "*.ext1" --list', + '. --ignore "*.ext1" --list', + '. -m "*" "!*.ext1" --list', + '. --match "*" "!*.ext1" --list', + ], + (".dir/file\n" ".file\n" "dir/file\n" "file\n" "file.ext2\n"), + ), + ( + "IGNORE MULTIPLE EXTENSIONS", + ['. -i "*.ext1" "*.ext2" --list', '. -i "*.ext*" --list'], + (".dir/file\n" ".file\n" "dir/file\n" "file\n"), + ), + ( + "IGNORE HIDDEN", + ['. -i ".*" ".*/" --list'], + ("dir/file\n" "file\n" "file.ext1\n" "file.ext2\n"), + ), + ( + "INCLUDE EMPTY", + [". 
--empty-dirs --list"], + ( + ".dir/file\n" + ".file\n" + "dir/file\n" + "empty/.\n" + "file\n" + "file.ext1\n" + "file.ext2\n" + ), + ), + ], ) def test_list(self, description, argstrings, output, tmpdir): create_default_tree(tmpdir) @@ -210,24 +170,31 @@ def test_list(self, description, argstrings, output, tmpdir): for argstring in argstrings: o, error, returncode = dirhash_run(argstring) assert returncode == 0 - assert error == '' + assert error == "" assert o == output @pytest.mark.parametrize( - 'argstring, kwargs, expected_hashes', + "argstring, kwargs, expected_hashes", [ - ('. -a md5', - {'algorithm': 'md5'}, - ['594c48dde0776b03eddeeb0232190be7', - 'd8ab965636d48e407b73b9dbba4cb928', - '050e7bc9ffcb09c15186c04e0f8026df'] - ), - ('. -a sha256', - {'algorithm': 'sha256'}, - ['23a04964149889e932ba3348fe22442f4f6a3b3fec616a386a70579ee857ab7b', - '7b76bac43e963f9561f37b96b92d7a174094bff230c6efbf1d8bf650e8b40b7a', - '7156da2b2e5a2926eb4b72e65f389343cb6aca0578f0aedcd6f7457abd67d8f5']), - ] + ( + ". -a md5", + {"algorithm": "md5"}, + [ + "594c48dde0776b03eddeeb0232190be7", + "d8ab965636d48e407b73b9dbba4cb928", + "050e7bc9ffcb09c15186c04e0f8026df", + ], + ), + ( + ". -a sha256", + {"algorithm": "sha256"}, + [ + "23a04964149889e932ba3348fe22442f4f6a3b3fec616a386a70579ee857ab7b", + "7b76bac43e963f9561f37b96b92d7a174094bff230c6efbf1d8bf650e8b40b7a", + "7156da2b2e5a2926eb4b72e65f389343cb6aca0578f0aedcd6f7457abd67d8f5", + ], + ), + ], ) def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): # verify same result from cmdline and library + regression test of actual @@ -235,28 +202,28 @@ def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): create_default_tree(tmpdir) with tmpdir.as_cwd(): for add_argstring, add_kwargs, expected_hash in zip( - ['', ' -p data', ' -p name'], + ["", " -p data", " -p name"], [ {}, - {'entry_properties': ['data']}, - {'entry_properties': ['name']}, + {"entry_properties": ["data"]}, + {"entry_properties": ["name"]}, ], - expected_hashes + expected_hashes, ): # run CLI full_argstring = argstring + add_argstring cli_out, error, returncode = dirhash_run(full_argstring) - assert error == '' + assert error == "" assert returncode == 0 - assert cli_out[-1] == '\n' + assert cli_out[-1] == "\n" cli_hash = cli_out[:-1] # run CLI multiproc - full_argstring_mp = argstring + add_argstring + ' --jobs 2' + full_argstring_mp = argstring + add_argstring + " --jobs 2" cli_out_mp, error_mp, returncode_mp = dirhash_run(full_argstring_mp) - assert error_mp == '' + assert error_mp == "" assert returncode_mp == 0 - assert cli_out_mp[-1] == '\n' + assert cli_out_mp[-1] == "\n" cli_hash_mp = cli_out_mp[:-1] # run lib function @@ -268,6 +235,6 @@ def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): def test_error_bad_argument(self, tmpdir): with tmpdir.as_cwd(): - o, error, returncode = dirhash_run('. --chunk-size not_an_int') + o, error, returncode = dirhash_run(". 
--chunk-size not_an_int") assert returncode > 0 - assert error != '' + assert error != "" diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index 336bf38..276c3ad 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -1,38 +1,36 @@ -from __future__ import print_function, division - +import hashlib import os import shutil -import hashlib import tempfile from time import sleep, time import pytest +from scantree import SymlinkRecursionError from dirhash import ( + Filter, + Protocol, _get_hasher_factory, - get_match_patterns, - included_paths, - dirhash, + _parmap, algorithms_available, algorithms_guaranteed, - Protocol, - _parmap, - Filter, - dirhash_impl + dirhash, + dirhash_impl, + get_match_patterns, + included_paths, ) -from scantree import SymlinkRecursionError -class TestGetHasherFactory(object): +class TestGetHasherFactory: def test_get_guaranteed(self): algorithm_and_hasher_factory = [ - ('md5', hashlib.md5), - ('sha1', hashlib.sha1), - ('sha224', hashlib.sha224), - ('sha256', hashlib.sha256), - ('sha384', hashlib.sha384), - ('sha512', hashlib.sha512) + ("md5", hashlib.md5), + ("sha1", hashlib.sha1), + ("sha224", hashlib.sha224), + ("sha256", hashlib.sha256), + ("sha384", hashlib.sha384), + ("sha512", hashlib.sha512), ] assert algorithms_guaranteed == {a for a, _ in algorithm_and_hasher_factory} for algorithm, expected_hasher_factory in algorithm_and_hasher_factory: @@ -45,20 +43,20 @@ def test_get_available(self): try: hasher = hasher_factory() except ValueError as exc: - # Some "available" algorithms are not necessarily available (fails for e.g. - # 'ripemd160' in github actions for python 3.8). See: - # https://stackoverflow.com/questions/72409563/unsupported-hash-type-ripemd160-with-hashlib-in-python # noqa + # Some "available" algorithms are not necessarily available + # (fails for e.g. 'ripemd160' in github actions for python 3.8). 
+ # See: https://stackoverflow.com/questions/72409563/unsupported-hash-type-ripemd160-with-hashlib-in-python # noqa: E501 print(f"Failed to create hasher for {algorithm}: {exc}") assert exc.args[0] == f"unsupported hash type {algorithm}" hasher = None if hasher is not None: - assert hasattr(hasher, 'update') - assert hasattr(hasher, 'hexdigest') + assert hasattr(hasher, "update") + assert hasattr(hasher, "hexdigest") def test_not_available(self): with pytest.raises(ValueError): - _get_hasher_factory('not available') + _get_hasher_factory("not available") def test_bypass_hasher_factory(self): @@ -67,7 +65,7 @@ def test_bypass_hasher_factory(self): assert hasher_factory is hashlib.sha256 # test raise on custom hasher with bad interface - class IncompleteMockHasher(object): + class IncompleteMockHasher: def __init__(self, *args, **kwargs): pass @@ -82,65 +80,65 @@ def update(self, *args, **kwargs): class MockHasher(IncompleteMockHasher): def hexdigest(self): - return '' + return "" hasher_factory = _get_hasher_factory(MockHasher) assert hasher_factory is MockHasher -class TestGetMatchPatterns(object): +class TestGetMatchPatterns: def test_default_match_all(self): ms = get_match_patterns() - assert ms == ['*'] + assert ms == ["*"] def test_only_match(self): - ms = get_match_patterns(match=['a*', 'b*']) - assert ms == ['a*', 'b*'] + ms = get_match_patterns(match=["a*", "b*"]) + assert ms == ["a*", "b*"] def test_only_ignore(self): - ms = get_match_patterns(ignore=['a*', 'b*']) - assert ms == ['*', '!a*', '!b*'] + ms = get_match_patterns(ignore=["a*", "b*"]) + assert ms == ["*", "!a*", "!b*"] def test_match_and_ignore(self): - ms = get_match_patterns(match=['a*'], ignore=['*.ext']) - assert ms == ['a*', '!*.ext'] + ms = get_match_patterns(match=["a*"], ignore=["*.ext"]) + assert ms == ["a*", "!*.ext"] def test_ignore_hidden(self): ms = get_match_patterns(ignore_hidden=True) - assert ms == ['*', '!.*', '!.*/'] + assert ms == ["*", "!.*", "!.*/"] # should not duplicate if present in (general) ignore - ms = get_match_patterns(ignore=['.*'], ignore_hidden=True) - assert ms == ['*', '!.*', '!.*/'] + ms = get_match_patterns(ignore=[".*"], ignore_hidden=True) + assert ms == ["*", "!.*", "!.*/"] - ms = get_match_patterns(ignore=['.*/'], ignore_hidden=True) - assert ms == ['*', '!.*/', '!.*'] + ms = get_match_patterns(ignore=[".*/"], ignore_hidden=True) + assert ms == ["*", "!.*/", "!.*"] - ms = get_match_patterns(ignore=['.*', '.*/'], ignore_hidden=True) - assert ms == ['*', '!.*', '!.*/'] + ms = get_match_patterns(ignore=[".*", ".*/"], ignore_hidden=True) + assert ms == ["*", "!.*", "!.*/"] def test_ignore_extensions(self): - ms = get_match_patterns(ignore_extensions=['.ext']) - assert ms == ['*', '!*.ext'] + ms = get_match_patterns(ignore_extensions=[".ext"]) + assert ms == ["*", "!*.ext"] # automatically adds '.' 
- ms = get_match_patterns(ignore_extensions=['ext']) - assert ms == ['*', '!*.ext'] + ms = get_match_patterns(ignore_extensions=["ext"]) + assert ms == ["*", "!*.ext"] # mixed also works - ms = get_match_patterns(ignore_extensions=['ext1', '.ext2']) - assert ms == ['*', '!*.ext1', '!*.ext2'] + ms = get_match_patterns(ignore_extensions=["ext1", ".ext2"]) + assert ms == ["*", "!*.ext1", "!*.ext2"] # should not duplicate if present in (general) ignore - ms = get_match_patterns(ignore=['*.ext'], ignore_extensions=['.ext']) - assert ms == ['*', '!*.ext'] + ms = get_match_patterns(ignore=["*.ext"], ignore_extensions=[".ext"]) + assert ms == ["*", "!*.ext"] - ms = get_match_patterns(ignore=['*.ext'], ignore_extensions=['ext']) - assert ms == ['*', '!*.ext'] + ms = get_match_patterns(ignore=["*.ext"], ignore_extensions=["ext"]) + assert ms == ["*", "!*.ext"] -class TempDirTest(object): +class TempDirTest: def setup_method(self): self.dir = tempfile.mkdtemp() @@ -156,7 +154,7 @@ def mkdirs(self, dirpath): os.makedirs(self.path_to(dirpath)) def mkfile(self, relpath, content=None): - with open(self.path_to(relpath), 'w') as f: + with open(self.path_to(relpath), "w") as f: if content: f.write(content) @@ -173,297 +171,266 @@ class TestGetIncludedPaths(TempDirTest): # Integration tests with `pathspec` for basic use cases. def test_basic(self): - self.mkdirs('root/d1/d11') - self.mkdirs('root/d2') + self.mkdirs("root/d1/d11") + self.mkdirs("root/d2") - self.mkfile('root/f1') - self.mkfile('root/d1/f1') - self.mkfile('root/d1/d11/f1') - self.mkfile('root/d2/f1') + self.mkfile("root/f1") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/d11/f1") + self.mkfile("root/d2/f1") - expected_filepaths = ['d1/d11/f1', 'd1/f1', 'd2/f1', 'f1'] - filepaths = included_paths(self.path_to('root')) + expected_filepaths = ["d1/d11/f1", "d1/f1", "d2/f1", "f1"] + filepaths = included_paths(self.path_to("root")) assert filepaths == expected_filepaths # end with '/' or not should not matter - filepaths = included_paths(self.path_to('root/')) + filepaths = included_paths(self.path_to("root/")) assert filepaths == expected_filepaths def test_not_a_directory(self): - self.mkdirs('root') - self.mkfile('root/f1') + self.mkdirs("root") + self.mkfile("root/f1") # does not exist with pytest.raises(ValueError): - included_paths(self.path_to('wrong_root')) + included_paths(self.path_to("wrong_root")) with pytest.raises(ValueError): - included_paths(self.path_to('root/f1')) + included_paths(self.path_to("root/f1")) def test_symlinked_file(self): - self.mkdirs('root') - self.mkfile('root/f1') - self.mkfile('linked_file') - self.symlink('linked_file', 'root/f2') + self.mkdirs("root") + self.mkfile("root/f1") + self.mkfile("linked_file") + self.symlink("linked_file", "root/f2") - filepaths = included_paths( - self.path_to('root'), - linked_files=True - ) - assert filepaths == ['f1', 'f2'] + filepaths = included_paths(self.path_to("root"), linked_files=True) + assert filepaths == ["f1", "f2"] - filepaths = included_paths( - self.path_to('root'), - linked_files=False - ) - assert filepaths == ['f1'] + filepaths = included_paths(self.path_to("root"), linked_files=False) + assert filepaths == ["f1"] # default is 'linked_files': True - filepaths = included_paths(self.path_to('root'), ) - assert filepaths == ['f1', 'f2'] + filepaths = included_paths( + self.path_to("root"), + ) + assert filepaths == ["f1", "f2"] def test_symlinked_dir(self): - self.mkdirs('root') - self.mkfile('root/f1') - self.mkdirs('linked_dir') - 
self.mkfile('linked_dir/f1') - self.mkfile('linked_dir/f2') - self.symlink('linked_dir', 'root/d1') + self.mkdirs("root") + self.mkfile("root/f1") + self.mkdirs("linked_dir") + self.mkfile("linked_dir/f1") + self.mkfile("linked_dir/f2") + self.symlink("linked_dir", "root/d1") - filepaths = included_paths( - self.path_to('root'), - linked_dirs=False - ) - assert filepaths == ['f1'] + filepaths = included_paths(self.path_to("root"), linked_dirs=False) + assert filepaths == ["f1"] - filepaths = included_paths( - self.path_to('root'), - linked_dirs=True - ) - assert filepaths == ['d1/f1', 'd1/f2', 'f1'] + filepaths = included_paths(self.path_to("root"), linked_dirs=True) + assert filepaths == ["d1/f1", "d1/f2", "f1"] # default is 'linked_dirs': True - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['d1/f1', 'd1/f2', 'f1'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == ["d1/f1", "d1/f2", "f1"] def test_cyclic_link(self): - self.mkdirs('root/d1') - self.symlink('root', 'root/d1/link_back') + self.mkdirs("root/d1") + self.symlink("root", "root/d1/link_back") with pytest.raises(SymlinkRecursionError) as exc_info: - included_paths( - self.path_to('root'), - allow_cyclic_links=False - ) - assert exc_info.value.real_path == os.path.realpath(self.path_to('root')) - assert exc_info.value.first_path == self.path_to('root/') - assert exc_info.value.second_path == self.path_to('root/d1/link_back') - assert str(exc_info.value).startswith('Symlink recursion:') + included_paths(self.path_to("root"), allow_cyclic_links=False) + assert exc_info.value.real_path == os.path.realpath(self.path_to("root")) + assert exc_info.value.first_path == self.path_to("root/") + assert exc_info.value.second_path == self.path_to("root/d1/link_back") + assert str(exc_info.value).startswith("Symlink recursion:") - filepaths = included_paths( - self.path_to('root'), - allow_cyclic_links=True - ) - assert filepaths == ['d1/link_back/.'] + filepaths = included_paths(self.path_to("root"), allow_cyclic_links=True) + assert filepaths == ["d1/link_back/."] # default is 'allow_cyclic_links': False with pytest.raises(SymlinkRecursionError): - filepaths = included_paths(self.path_to('root')) + filepaths = included_paths(self.path_to("root")) def test_ignore_hidden(self): - self.mkdirs('root/d1') - self.mkdirs('root/.d2') + self.mkdirs("root/d1") + self.mkdirs("root/.d2") - self.mkfile('root/f1') - self.mkfile('root/.f2') - self.mkfile('root/d1/f1') - self.mkfile('root/d1/.f2') - self.mkfile('root/.d2/f1') + self.mkfile("root/f1") + self.mkfile("root/.f2") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/.f2") + self.mkfile("root/.d2/f1") # no ignore - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == [".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"] # with ignore - filepaths = included_paths(self.path_to('root'), match=['*', '!.*']) - assert filepaths == ['d1/f1', 'f1'] + filepaths = included_paths(self.path_to("root"), match=["*", "!.*"]) + assert filepaths == ["d1/f1", "f1"] def test_ignore_hidden_files_only(self): - self.mkdirs('root/d1') - self.mkdirs('root/.d2') + self.mkdirs("root/d1") + self.mkdirs("root/.d2") - self.mkfile('root/f1') - self.mkfile('root/.f2') - self.mkfile('root/d1/f1') - self.mkfile('root/d1/.f2') - self.mkfile('root/.d2/f1') + self.mkfile("root/f1") + self.mkfile("root/.f2") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/.f2") + 
self.mkfile("root/.d2/f1") # no ignore - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == [".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"] # with ignore - filepaths = included_paths(self.path_to('root'), match=['**/*', '!**/.*', '**/.*/*', '!**/.*/.*']) - assert filepaths == ['.d2/f1', 'd1/f1', 'f1'] + filepaths = included_paths( + self.path_to("root"), match=["**/*", "!**/.*", "**/.*/*", "!**/.*/.*"] + ) + assert filepaths == [".d2/f1", "d1/f1", "f1"] def test_ignore_hidden_explicitly_recursive(self): - self.mkdirs('root/d1') - self.mkdirs('root/.d2') + self.mkdirs("root/d1") + self.mkdirs("root/.d2") - self.mkfile('root/f1') - self.mkfile('root/.f2') - self.mkfile('root/d1/f1') - self.mkfile('root/d1/.f2') - self.mkfile('root/.d2/f1') + self.mkfile("root/f1") + self.mkfile("root/.f2") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/.f2") + self.mkfile("root/.d2/f1") # no ignore - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == [".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"] # with ignore - filepaths = included_paths(self.path_to('root'), match=['*', '!**/.*']) - assert filepaths == ['d1/f1', 'f1'] + filepaths = included_paths(self.path_to("root"), match=["*", "!**/.*"]) + assert filepaths == ["d1/f1", "f1"] def test_exclude_hidden_dirs(self): - self.mkdirs('root/d1') - self.mkdirs('root/.d2') - self.mkdirs('root/d1/.d1') + self.mkdirs("root/d1") + self.mkdirs("root/.d2") + self.mkdirs("root/d1/.d1") - self.mkfile('root/f1') - self.mkfile('root/.f2') - self.mkfile('root/d1/f1') - self.mkfile('root/d1/.f2') - self.mkfile('root/.d2/f1') + self.mkfile("root/f1") + self.mkfile("root/.f2") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/.f2") + self.mkfile("root/.d2/f1") # no ignore - filepaths = included_paths(self.path_to('root'), empty_dirs=True) - assert filepaths == ['.d2/f1', '.f2', 'd1/.d1/.', 'd1/.f2', 'd1/f1', 'f1'] + filepaths = included_paths(self.path_to("root"), empty_dirs=True) + assert filepaths == [".d2/f1", ".f2", "d1/.d1/.", "d1/.f2", "d1/f1", "f1"] # with ignore - filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*/'] - ) - assert filepaths == ['.f2', 'd1/.f2', 'd1/f1', 'f1'] + filepaths = included_paths(self.path_to("root"), match=["*", "!.*/"]) + assert filepaths == [".f2", "d1/.f2", "d1/f1", "f1"] def test_exclude_hidden_dirs_and_files(self): - self.mkdirs('root/d1') - self.mkdirs('root/.d2') + self.mkdirs("root/d1") + self.mkdirs("root/.d2") - self.mkfile('root/f1') - self.mkfile('root/.f2') - self.mkfile('root/d1/f1') - self.mkfile('root/d1/.f2') - self.mkfile('root/.d2/f1') + self.mkfile("root/f1") + self.mkfile("root/.f2") + self.mkfile("root/d1/f1") + self.mkfile("root/d1/.f2") + self.mkfile("root/.d2/f1") # no ignore - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == [".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"] # using ignore - filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*/', '!.*'] - ) - assert filepaths == ['d1/f1', 'f1'] + filepaths = included_paths(self.path_to("root"), match=["*", "!.*/", "!.*"]) + assert filepaths == ["d1/f1", "f1"] def test_exclude_extensions(self): - self.mkdirs('root/d1') - - self.mkfile('root/f') 
- self.mkfile('root/f.txt') - self.mkfile('root/f.skip1') - self.mkfile('root/fskip1') - self.mkfile('root/f.skip2') - self.mkfile('root/f.skip1.txt') - self.mkfile('root/f.skip1.skip2') - self.mkfile('root/f.skip1skip2') - self.mkfile('root/d1/f.txt') - self.mkfile('root/d1/f.skip1') + self.mkdirs("root/d1") + + self.mkfile("root/f") + self.mkfile("root/f.txt") + self.mkfile("root/f.skip1") + self.mkfile("root/fskip1") + self.mkfile("root/f.skip2") + self.mkfile("root/f.skip1.txt") + self.mkfile("root/f.skip1.skip2") + self.mkfile("root/f.skip1skip2") + self.mkfile("root/d1/f.txt") + self.mkfile("root/d1/f.skip1") filepaths = included_paths( - self.path_to('root'), - match=['*', '!*.skip1', '!*.skip2'] + self.path_to("root"), match=["*", "!*.skip1", "!*.skip2"] ) assert filepaths == [ - 'd1/f.txt', 'f', 'f.skip1.txt', 'f.skip1skip2', 'f.txt', 'fskip1'] + "d1/f.txt", + "f", + "f.skip1.txt", + "f.skip1skip2", + "f.txt", + "fskip1", + ] def test_empty_dirs_include_vs_exclude(self): - self.mkdirs('root/d1') - self.mkdirs('root/d2') - self.mkdirs('root/d3/d31') - self.mkdirs('root/d4/d41') + self.mkdirs("root/d1") + self.mkdirs("root/d2") + self.mkdirs("root/d3/d31") + self.mkdirs("root/d4/d41") - self.mkfile('root/d1/f') - self.mkfile('root/d3/d31/f') + self.mkfile("root/d1/f") + self.mkfile("root/d3/d31/f") - filepaths = included_paths( - self.path_to('root'), - empty_dirs=False - ) - assert filepaths == ['d1/f', 'd3/d31/f'] + filepaths = included_paths(self.path_to("root"), empty_dirs=False) + assert filepaths == ["d1/f", "d3/d31/f"] # `include_empty=False` is default - filepaths = included_paths(self.path_to('root')) - assert filepaths == ['d1/f', 'd3/d31/f'] + filepaths = included_paths(self.path_to("root")) + assert filepaths == ["d1/f", "d3/d31/f"] - filepaths = included_paths( - self.path_to('root'), - empty_dirs=True - ) - assert filepaths == ['d1/f', 'd2/.', 'd3/d31/f', 'd4/d41/.'] + filepaths = included_paths(self.path_to("root"), empty_dirs=True) + assert filepaths == ["d1/f", "d2/.", "d3/d31/f", "d4/d41/."] def test_empty_dirs_because_of_filter_include_vs_exclude(self): - self.mkdirs('root/d1') - self.mkdirs('root/d2') + self.mkdirs("root/d1") + self.mkdirs("root/d2") - self.mkfile('root/d1/f') - self.mkfile('root/d2/.f') + self.mkfile("root/d1/f") + self.mkfile("root/d2/.f") filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*'], - empty_dirs=False + self.path_to("root"), match=["*", "!.*"], empty_dirs=False ) - assert filepaths == ['d1/f'] + assert filepaths == ["d1/f"] # `include_empty=False` is default filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*'], + self.path_to("root"), + match=["*", "!.*"], ) - assert filepaths == ['d1/f'] + assert filepaths == ["d1/f"] filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*'], - empty_dirs=True + self.path_to("root"), match=["*", "!.*"], empty_dirs=True ) - assert filepaths == ['d1/f', 'd2/.'] + assert filepaths == ["d1/f", "d2/."] def test_empty_dir_inclusion_not_affected_by_match(self): - self.mkdirs('root/d1') - self.mkdirs('root/.d2') + self.mkdirs("root/d1") + self.mkdirs("root/.d2") # NOTE that empty dirs are not excluded by match_patterns: filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*'], - empty_dirs=True + self.path_to("root"), match=["*", "!.*"], empty_dirs=True ) - assert filepaths == ['.d2/.', 'd1/.'] + assert filepaths == [".d2/.", "d1/."] filepaths = included_paths( - self.path_to('root'), - match=['*', '!.*/'], - empty_dirs=True + 
self.path_to("root"), match=["*", "!.*/"], empty_dirs=True ) - assert filepaths == ['.d2/.', 'd1/.'] + assert filepaths == [".d2/.", "d1/."] filepaths = included_paths( - self.path_to('root'), - match=['*', '!d1'], - empty_dirs=True + self.path_to("root"), match=["*", "!d1"], empty_dirs=True ) - assert filepaths == ['.d2/.', 'd1/.'] + assert filepaths == [".d2/.", "d1/."] def dirhash_mp_comp(*args, **kwargs): @@ -476,276 +443,246 @@ def dirhash_mp_comp(*args, **kwargs): class TestDirhash(TempDirTest): def test_guaranteed_algorithms(self): - self.mkdirs('root/d1/d11') - self.mkdirs('root/d2') - self.mkfile('root/f1', 'a') - self.mkfile('root/d1/f1', 'b') - self.mkfile('root/d1/d11/f1', 'c') - self.mkfile('root/d2/f1', 'd') + self.mkdirs("root/d1/d11") + self.mkdirs("root/d2") + self.mkfile("root/f1", "a") + self.mkfile("root/d1/f1", "b") + self.mkfile("root/d1/d11/f1", "c") + self.mkfile("root/d2/f1", "d") for algorithm, expected_hash in [ - ('md5', '3c631c7f5771468a2187494f802fad8f'), - ('sha1', '992aa2d00d2ed94f0c19eff7f151f5c6a7e0cc41'), - ('sha224', '18013e1df933d5781b2eddb94aceeb7ab689643f1df24060fb478999'), - ('sha256', 'ef7e95269fbc0e3478ad31fddd1c7d08' - '907d189c61725332e8a2fd14448fe175'), - ('sha384', '64ef4360c172bc68250f9326ea231cd1' - '46a7fa1afe9d386cee0cae0e9f1b4ad2' - '1df050d1df436cff792bbe81d6698026'), - ('sha512', '7854226eb0278bc136056998890a8399' - 'f85ca383f7c54665026358d28b5dc716' - '0ec654d2bcebf5d60974f82ed820600d' - '8e807ea53d57578d076ec1c82f501208') + ("md5", "3c631c7f5771468a2187494f802fad8f"), + ("sha1", "992aa2d00d2ed94f0c19eff7f151f5c6a7e0cc41"), + ("sha224", "18013e1df933d5781b2eddb94aceeb7ab689643f1df24060fb478999"), + ( + "sha256", + "ef7e95269fbc0e3478ad31fddd1c7d08" "907d189c61725332e8a2fd14448fe175", + ), + ( + "sha384", + "64ef4360c172bc68250f9326ea231cd1" + "46a7fa1afe9d386cee0cae0e9f1b4ad2" + "1df050d1df436cff792bbe81d6698026", + ), + ( + "sha512", + "7854226eb0278bc136056998890a8399" + "f85ca383f7c54665026358d28b5dc716" + "0ec654d2bcebf5d60974f82ed820600d" + "8e807ea53d57578d076ec1c82f501208", + ), ]: - hash_value = dirhash_mp_comp(self.path_to('root'), algorithm) + hash_value = dirhash_mp_comp(self.path_to("root"), algorithm) assert hash_value == expected_hash def test_recursive_descriptor(self): - self.mkdirs('root/d1') - self.mkdirs('root/d2') - self.mkfile('root/f1', 'a') - self.mkfile('root/d1/f12', 'b') - - f1_desc = 'data:a\000name:f1' - f12_desc = 'data:b\000name:f12' - d1_desc = 'dirhash:{}\000name:d1'.format(f12_desc) - d2_desc = 'dirhash:\000name:d2' - - empty_dirs_false_expected = '\000\000'.join([f1_desc, d1_desc]) - empty_dirs_true_expected = '\000\000'.join([f1_desc, d2_desc, d1_desc]) - - empty_dirs_false = dirhash( - self.path_to('root'), - algorithm=IdentityHasher - ) + self.mkdirs("root/d1") + self.mkdirs("root/d2") + self.mkfile("root/f1", "a") + self.mkfile("root/d1/f12", "b") + + f1_desc = "data:a\000name:f1" + f12_desc = "data:b\000name:f12" + d1_desc = f"dirhash:{f12_desc}\000name:d1" + d2_desc = "dirhash:\000name:d2" + + empty_dirs_false_expected = "\000\000".join([f1_desc, d1_desc]) + empty_dirs_true_expected = "\000\000".join([f1_desc, d2_desc, d1_desc]) + + empty_dirs_false = dirhash(self.path_to("root"), algorithm=IdentityHasher) assert empty_dirs_false == empty_dirs_false_expected empty_dirs_true = dirhash( - self.path_to('root'), - algorithm=IdentityHasher, - empty_dirs=True + self.path_to("root"), algorithm=IdentityHasher, empty_dirs=True ) assert empty_dirs_true == empty_dirs_true_expected def 
test_symlinked_file(self): - self.mkdirs('root1') - self.mkfile('root1/f1', 'a') - self.mkfile('linked_file', 'b') - self.symlink('linked_file', 'root1/f2') + self.mkdirs("root1") + self.mkfile("root1/f1", "a") + self.mkfile("linked_file", "b") + self.symlink("linked_file", "root1/f2") - self.mkdirs('root2') - self.mkfile('root2/f1', 'a') - self.mkfile('root2/f2', 'b') + self.mkdirs("root2") + self.mkfile("root2/f1", "a") + self.mkfile("root2/f2", "b") root1_linked_files_true = dirhash_mp_comp( - self.path_to('root1'), algorithm='md5' + self.path_to("root1"), algorithm="md5" ) root1_linked_files_false = dirhash_mp_comp( - self.path_to('root1'), algorithm='md5', - linked_files=False + self.path_to("root1"), algorithm="md5", linked_files=False ) - root2 = dirhash_mp_comp( - self.path_to('root2'), algorithm='md5' - ) + root2 = dirhash_mp_comp(self.path_to("root2"), algorithm="md5") assert root1_linked_files_false != root1_linked_files_true assert root1_linked_files_true == root2 def test_symlinked_dir(self): - self.mkdirs('root1') - self.mkfile('root1/f1', 'a') - self.mkdirs('linked_dir') - self.mkfile('linked_dir/f1', 'b') - self.mkfile('linked_dir/f2', 'c') - self.symlink('linked_dir', 'root1/d1') - - self.mkdirs('root2') - self.mkfile('root2/f1', 'a') - self.mkdirs('root2/d1') - self.mkfile('root2/d1/f1', 'b') - self.mkfile('root2/d1/f2', 'c') + self.mkdirs("root1") + self.mkfile("root1/f1", "a") + self.mkdirs("linked_dir") + self.mkfile("linked_dir/f1", "b") + self.mkfile("linked_dir/f2", "c") + self.symlink("linked_dir", "root1/d1") + + self.mkdirs("root2") + self.mkfile("root2/f1", "a") + self.mkdirs("root2/d1") + self.mkfile("root2/d1/f1", "b") + self.mkfile("root2/d1/f2", "c") root1_linked_dirs_true = dirhash_mp_comp( - self.path_to('root1'), - algorithm='md5', - linked_dirs=True + self.path_to("root1"), algorithm="md5", linked_dirs=True ) root1_linked_dirs_false = dirhash_mp_comp( - self.path_to('root1'), - algorithm='md5', - linked_dirs=False - ) - root2 = dirhash_mp_comp( - self.path_to('root2'), algorithm='md5' + self.path_to("root1"), algorithm="md5", linked_dirs=False ) + root2 = dirhash_mp_comp(self.path_to("root2"), algorithm="md5") assert root1_linked_dirs_false != root1_linked_dirs_true assert root1_linked_dirs_true == root2 def test_cache_used_for_symlinks(self): - self.mkdirs('root/dir') - self.mkfile('root/file', '< one chunk content') + self.mkdirs("root/dir") + self.mkfile("root/file", "< one chunk content") for i in range(10): - self.symlink('root/file', 'root/link_{}'.format(i)) + self.symlink("root/file", f"root/link_{i}") for i in range(10): - self.symlink('root/file', 'root/dir/link_{}'.format(i)) + self.symlink("root/file", f"root/dir/link_{i}") start = time() - dirhash(self.path_to('root'), algorithm=SlowHasher) + dirhash(self.path_to("root"), algorithm=SlowHasher) end = time() elapsed = end - start assert elapsed < SlowHasher.wait_time * 2 def test_raise_on_empty_root_without_include_empty(self): - self.mkdirs('root') + self.mkdirs("root") with pytest.raises(ValueError): - dirhash_mp_comp(self.path_to('root'), 'sha256') + dirhash_mp_comp(self.path_to("root"), "sha256") def test_empty_root_include_empty(self): - self.mkdirs('root') - dirhash_ = dirhash_mp_comp( - self.path_to('root'), - 'sha256', - empty_dirs=True - ) - expected_dirhash = hashlib.sha256(''.encode('utf-8')).hexdigest() + self.mkdirs("root") + dirhash_ = dirhash_mp_comp(self.path_to("root"), "sha256", empty_dirs=True) + expected_dirhash = hashlib.sha256(b"").hexdigest() assert dirhash_ == 
expected_dirhash def test_include_empty(self): - self.mkdirs('root/d1') - self.mkdirs('root/d2') - self.mkfile('root/d1/f') - - args = (self.path_to('root'), 'sha256') - dirhash_ = dirhash_mp_comp( - *args, - empty_dirs=False - ) - dirhash_empty = dirhash_mp_comp( - *args, - empty_dirs=True - ) + self.mkdirs("root/d1") + self.mkdirs("root/d2") + self.mkfile("root/d1/f") + + args = (self.path_to("root"), "sha256") + dirhash_ = dirhash_mp_comp(*args, empty_dirs=False) + dirhash_empty = dirhash_mp_comp(*args, empty_dirs=True) assert dirhash_ != dirhash_empty def test_chunksize(self): - self.mkdirs('root') - self.mkfile('root/numbers.txt', str(range(1000))) + self.mkdirs("root") + self.mkfile("root/numbers.txt", str(range(1000))) - hash_value = dirhash_mp_comp(self.path_to('root'), 'sha256') + hash_value = dirhash_mp_comp(self.path_to("root"), "sha256") for chunk_size in [2**4, 2**8, 2**16]: - assert dirhash_mp_comp( - self.path_to('root'), - 'sha256', - chunk_size=chunk_size - ) == hash_value + assert ( + dirhash_mp_comp(self.path_to("root"), "sha256", chunk_size=chunk_size) + == hash_value + ) def test_data_only(self): - self.mkdirs('root1') - self.mkfile('root1/a.txt', 'abc') - self.mkfile('root1/b.txt', 'def') - self.mkdirs('root2') - self.mkfile('root2/a.txt', 'abc') - self.mkfile('root2/c.txt', 'def') - - hash1 = dirhash_mp_comp(self.path_to('root1'), 'sha256') - hash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256') + self.mkdirs("root1") + self.mkfile("root1/a.txt", "abc") + self.mkfile("root1/b.txt", "def") + self.mkdirs("root2") + self.mkfile("root2/a.txt", "abc") + self.mkfile("root2/c.txt", "def") + + hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256") + hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256") assert hash1 != hash2 # with entry hash remains the same as long as order of files is the # same [dhash1, dhash2] = [ - dirhash_mp_comp( - self.path_to(root), - 'sha256', - entry_properties=['data'] - ) for root in ['root1', 'root2'] + dirhash_mp_comp(self.path_to(root), "sha256", entry_properties=["data"]) + for root in ["root1", "root2"] ] assert dhash1 == dhash2 def test_name_only(self): - self.mkdirs('root1') - self.mkfile('root1/a.txt', 'abc') - self.mkfile('root1/b.txt', 'def') - self.mkdirs('root2') - self.mkfile('root2/a.txt', 'abc') - self.mkfile('root2/b.txt', '___') - - hash1 = dirhash_mp_comp(self.path_to('root1'), 'sha256') - hash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256') + self.mkdirs("root1") + self.mkfile("root1/a.txt", "abc") + self.mkfile("root1/b.txt", "def") + self.mkdirs("root2") + self.mkfile("root2/a.txt", "abc") + self.mkfile("root2/b.txt", "___") + + hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256") + hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256") assert hash1 != hash2 [dhash1, dhash2] = [ - dirhash_mp_comp( - self.path_to(root), - 'sha256', - entry_properties=['name'] - ) for root in ['root1', 'root2'] + dirhash_mp_comp(self.path_to(root), "sha256", entry_properties=["name"]) + for root in ["root1", "root2"] ] assert dhash1 == dhash2 def test_is_link_property(self): - self.mkdirs('root1') - self.mkfile('root1/a.txt', 'abc') - self.mkfile('root1/b.txt', 'def') - self.mkdirs('root2') - self.mkfile('b_target', 'def') - self.mkfile('root2/a.txt', 'abc') - self.symlink('b_target', 'root2/b.txt') - - hash1 = dirhash_mp_comp(self.path_to('root1'), 'sha256') - hash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256') + self.mkdirs("root1") + self.mkfile("root1/a.txt", "abc") + self.mkfile("root1/b.txt", "def") + 
self.mkdirs("root2") + self.mkfile("b_target", "def") + self.mkfile("root2/a.txt", "abc") + self.symlink("b_target", "root2/b.txt") + + hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256") + hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256") assert hash1 == hash2 for entry_properties in [ - ['name', 'data', 'is_link'], - ['name', 'is_link'], - ['data', 'is_link'], + ["name", "data", "is_link"], + ["name", "is_link"], + ["data", "is_link"], ]: [hash1, hash2] = [ dirhash_mp_comp( - self.path_to(root), - 'sha256', - entry_properties=entry_properties - ) for root in ['root1', 'root2'] + self.path_to(root), "sha256", entry_properties=entry_properties + ) + for root in ["root1", "root2"] ] assert hash1 != hash2 def test_raise_on_not_at_least_one_of_name_and_data(self): - self.mkdirs('root1') - self.mkfile('root1/a.txt', 'abc') - dirhash_mp_comp(self.path_to('root1'), 'sha256') # check ok + self.mkdirs("root1") + self.mkfile("root1/a.txt", "abc") + dirhash_mp_comp(self.path_to("root1"), "sha256") # check ok with pytest.raises(ValueError): - dirhash_mp_comp( - self.path_to('root1'), - 'sha256', - entry_properties=[] - ) + dirhash_mp_comp(self.path_to("root1"), "sha256", entry_properties=[]) with pytest.raises(ValueError): dirhash_mp_comp( - self.path_to('root1'), - 'sha256', - entry_properties=['is_link'] + self.path_to("root1"), "sha256", entry_properties=["is_link"] ) def test_multiproc_speedup(self): - self.mkdirs('root/dir') + self.mkdirs("root/dir") num_files = 10 for i in range(num_files): - self.mkfile('root/file_{}'.format(i), '< one chunk content') + self.mkfile(f"root/file_{i}", "< one chunk content") expected_min_elapsed_sequential = SlowHasher.wait_time * num_files start = time() - dirhash(self.path_to('root'), algorithm=SlowHasher) + dirhash(self.path_to("root"), algorithm=SlowHasher) end = time() elapsed_sequential = end - start assert elapsed_sequential > expected_min_elapsed_sequential start = time() - dirhash(self.path_to('root'), algorithm=SlowHasher, jobs=num_files) + dirhash(self.path_to("root"), algorithm=SlowHasher, jobs=num_files) end = time() elapsed_muliproc = end - start assert elapsed_muliproc < 0.9 * expected_min_elapsed_sequential @@ -755,11 +692,11 @@ def test_cache_by_real_path_speedup(self, tmpdir): num_links = 10 # reference run without links - root1 = tmpdir.join('root1') + root1 = tmpdir.join("root1") root1.ensure(dir=True) for i in range(num_links): - file_i = root1.join('file_{}'.format(i)) - file_i.write('< one chunk content', ensure=True) + file_i = root1.join(f"file_{i}") + file_i.write("< one chunk content", ensure=True) wait_time = SlowHasher.wait_time expected_min_elapsed_no_links = wait_time * num_links @@ -771,12 +708,12 @@ def test_cache_by_real_path_speedup(self, tmpdir): overhead = elapsed_no_links - expected_min_elapsed_no_links # all links to same file - root2 = tmpdir.join('root2') + root2 = tmpdir.join("root2") root2.ensure(dir=True) - target_file = tmpdir.join('target_file') + target_file = tmpdir.join("target_file") target_file.ensure() for i in range(num_links): - root2.join('link_{}'.format(i)).mksymlinkto(target_file) + root2.join(f"link_{i}").mksymlinkto(target_file) overhead_margin_factor = 1.5 expected_max_elapsed_with_links = overhead * overhead_margin_factor + wait_time @@ -788,16 +725,16 @@ def test_cache_by_real_path_speedup(self, tmpdir): assert elapsed_with_links < expected_max_elapsed_with_links def test_cache_together_with_multiprocess_speedup(self, tmpdir): - target_file_names = ['target_file_1', 'target_file_2'] + 
target_file_names = ["target_file_1", "target_file_2"] num_links_per_file = 10 num_links = num_links_per_file * len(target_file_names) # reference run without links - root1 = tmpdir.join('root1') + root1 = tmpdir.join("root1") root1.ensure(dir=True) for i in range(num_links): - file_i = root1.join('file_{}'.format(i)) - file_i.write('< one chunk content', ensure=True) + file_i = root1.join(f"file_{i}") + file_i.write("< one chunk content", ensure=True) jobs = 2 wait_time = SlowHasher.wait_time @@ -809,16 +746,18 @@ def test_cache_together_with_multiprocess_speedup(self, tmpdir): assert elapsed_no_links > expected_min_elapsed_no_links overhead = elapsed_no_links - expected_min_elapsed_no_links - root2 = tmpdir.join('root2') + root2 = tmpdir.join("root2") root2.ensure(dir=True) for i, target_file_name in enumerate(target_file_names): target_file = tmpdir.join(target_file_name) - target_file.write('< one chunk content', ensure=True) + target_file.write("< one chunk content", ensure=True) for j in range(num_links_per_file): - root2.join('link_{}_{}'.format(i, j)).mksymlinkto(target_file) + root2.join(f"link_{i}_{j}").mksymlinkto(target_file) overhead_margin_factor = 1.5 - expected_max_elapsed_with_links = overhead * overhead_margin_factor + wait_time * 2 + expected_max_elapsed_with_links = ( + overhead * overhead_margin_factor + wait_time * 2 + ) assert expected_max_elapsed_with_links < expected_min_elapsed_no_links start = time() dirhash(root2, algorithm=SlowHasher, jobs=jobs) @@ -827,84 +766,76 @@ def test_cache_together_with_multiprocess_speedup(self, tmpdir): assert elapsed_mp_with_links < expected_max_elapsed_with_links def test_hash_cyclic_link_to_root(self): - self.mkdirs('root/d1') - self.symlink('root', 'root/d1/link_back') - dirhash( - self.path_to('root'), - 'sha256', - allow_cyclic_links=True - ) + self.mkdirs("root/d1") + self.symlink("root", "root/d1/link_back") + dirhash(self.path_to("root"), "sha256", allow_cyclic_links=True) def test_hash_cyclic_link(self): - self.mkdirs('root/d1/d2') - self.symlink('root/d1', 'root/d1/d2/link_back') - dirhash( - self.path_to('root'), - 'sha256', - allow_cyclic_links=True - ) + self.mkdirs("root/d1/d2") + self.symlink("root/d1", "root/d1/d2/link_back") + dirhash(self.path_to("root"), "sha256", allow_cyclic_links=True) def test_pass_filtering_instance(self): - self.mkdirs('root') - self.mkfile('root/f1', '') - dirhash_impl(self.path_to('root'), 'sha256', filter_=Filter()) + self.mkdirs("root") + self.mkfile("root/f1", "") + dirhash_impl(self.path_to("root"), "sha256", filter_=Filter()) def test_pass_protocol_instance(self): - self.mkdirs('root') - self.mkfile('root/f1', '') - dirhash_impl(self.path_to('root'), 'sha256', protocol=Protocol()) + self.mkdirs("root") + self.mkfile("root/f1", "") + dirhash_impl(self.path_to("root"), "sha256", protocol=Protocol()) def test_raise_on_wrong_type(self): - self.mkdirs('root') - self.mkfile('root/f1', '') + self.mkdirs("root") + self.mkfile("root/f1", "") with pytest.raises(TypeError): - dirhash_impl(self.path_to('root'), 'sha256', filter_='') + dirhash_impl(self.path_to("root"), "sha256", filter_="") with pytest.raises(TypeError): - dirhash_impl(self.path_to('root'), 'sha256', protocol='') + dirhash_impl(self.path_to("root"), "sha256", protocol="") -class SlowHasher(object): +class SlowHasher: wait_time = 0.25 def __init__(self, *args, **kwargs): pass def update(self, data): - if data != b'': + if data != b"": sleep(self.wait_time) def hexdigest(self): - return '' + return "" -class IdentityHasher(object): 
+class IdentityHasher: - def __init__(self, initial_data=b''): - self.datas = [initial_data.decode('utf-8')] + def __init__(self, initial_data=b""): + self.datas = [initial_data.decode("utf-8")] def update(self, data): - self.datas.append(data.decode('utf-8')) + self.datas.append(data.decode("utf-8")) def hexdigest(self): - return ''.join(self.datas) + return "".join(self.datas) -class TestProtocol(object): +class TestProtocol: def test_raise_for_invalid_entry_properties(self): with pytest.raises(ValueError): - Protocol(entry_properties=['not-valid']) + Protocol(entry_properties=["not-valid"]) def test_raise_for_invalid_allow_cyclic_links(self): with pytest.raises(ValueError): - Protocol(allow_cyclic_links='not-valid') + Protocol(allow_cyclic_links="not-valid") def mock_func(x): return x * 2 -@pytest.mark.parametrize('jobs', [1, 2, 4]) +@pytest.mark.parametrize("jobs", [1, 2, 4]) def test_parmap(jobs): inputs = [1, 2, 3, 4] assert _parmap(mock_func, inputs, jobs=jobs) == [2, 4, 6, 8] diff --git a/tox.ini b/tox.ini index cd10b68..f873fe3 100644 --- a/tox.ini +++ b/tox.ini @@ -3,9 +3,11 @@ envlist = py{38,39,310,311,312} [testenv] deps = + pre-commit pytest pytest-cov commands = + pre-commit run --all-files pytest --cov=dirhash --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc tests/ {posargs} [gh-actions] From 7e4695a93725fa6f1084d51c93123596a4c6515a Mon Sep 17 00:00:00 2001 From: Filip Richtarik Date: Thu, 11 Apr 2024 10:28:57 +0200 Subject: [PATCH 37/51] Update test.yml --- .github/workflows/test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fd81a08..b1249b1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,6 +47,9 @@ jobs: # Always run this last as it can push new changes and actions will not rerun. pre-commit: + permissions: + contents: write + pull-requests: write needs: [tests] runs-on: ubuntu-latest steps: From 99df03a14faee9bab92b53d32b96d21aa9540719 Mon Sep 17 00:00:00 2001 From: Filip Richtarik Date: Thu, 11 Apr 2024 10:32:58 +0200 Subject: [PATCH 38/51] Update test.yml --- .github/workflows/test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b1249b1..8956ee4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,7 +6,7 @@ on: - "master" pull_request: branches: - - "*" + - "**" workflow_dispatch: release: types: [published, edited] @@ -46,7 +46,9 @@ jobs: verbose: true # Always run this last as it can push new changes and actions will not rerun. 
+ # not working on pull requests from forks pre-commit: + if: false permissions: contents: write pull-requests: write From 899d7554a4a186252e2db5a687b6115bb0db7f72 Mon Sep 17 00:00:00 2001 From: Filip Richtarik Date: Sun, 14 Apr 2024 12:24:37 +0200 Subject: [PATCH 39/51] fix --- .flake8 | 3 ++ .github/workflows/test.yml | 66 +++++--------------------------------- .pre-commit-config.yaml | 6 +++- benchmark/run.py | 8 ++--- src/dirhash/__init__.py | 11 +++---- src/dirhash/_version.py | 5 ++- src/dirhash/cli.py | 29 ++++++++--------- tests/test_dirhash.py | 11 ------- tox.ini | 3 +- 9 files changed, 41 insertions(+), 101 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..de50b19 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 90 +extend-ignore=E203 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8956ee4..15b2850 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,65 +40,15 @@ jobs: key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} - name: Test with tox run: tox + - name: Commit and Push to Pull Request + if: matrix.python-version == 3.8 + run: | + if [ -n "$(git status -s)" ]; then + git add . + git commit -m "✨ ⭐ Automated commit has been added to your pull request to fix formatting! ⭐ ✨" + git push origin ${{ github.head_ref }} + fi - uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} verbose: true - - # Always run this last as it can push new changes and actions will not rerun. - # not working on pull requests from forks - pre-commit: - if: false - permissions: - contents: write - pull-requests: write - needs: [tests] - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: "3.8" - - - name: Install PreCommit - run: pip install pre-commit - - - uses: actions/cache@v4 - with: - path: ~/.cache/pre-commit - key: ${{ runner.os }}-pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} - restore-keys: | - ${{ runner.os }}-pre-commit- - - - name: PreCommit - id: pre-commit - run: | - if pre-commit run --show-diff-on-failure --color=always --all-files; then - echo "failed=0" >> $GITHUB_OUTPUT - else - echo "failed=1" >> $GITHUB_OUTPUT - fi - if [ -n "$(git status -s)" ]; then - echo "dirty=1" >> $GITHUB_OUTPUT - else - echo "dirty=0" >> $GITHUB_OUTPUT - fi - - # Run a second time to verify that everything has indeed been fixed. - - name: PreCommit verify - if: steps.pre-commit.outputs.failed == 1 - run: | - pre-commit run --show-diff-on-failure --color=always --all-files - - - name: Commit and Push to Pull Request - if: steps.pre-commit.outputs.dirty == 1 - run: | - git add . - git status - git commit -m "✨ ⭐ Automated commit has been added to your pull request to fix formatting! 
⭐ ✨" - git push origin ${{ github.head_ref }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5da4da3..7de1e24 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,11 @@ repos: rev: 7.0.0 hooks: - id: flake8 - args: ["--max-line-length=90", "--extend-ignore=E203,W503"] + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.7 + hooks: + - id: ruff + - id: ruff-format - repo: https://github.com/asottile/pyupgrade rev: v3.15.2 hooks: diff --git a/benchmark/run.py b/benchmark/run.py index d5d9a25..1c27706 100644 --- a/benchmark/run.py +++ b/benchmark/run.py @@ -60,9 +60,7 @@ def require_test_cases(): def time_shell(cmd, runs=1, repetitions=1, setup=None): - time_cmd = "time for i in {{1..{rep}}}; do {cmd}; done".format( - cmd=cmd, rep=repetitions - ) + time_cmd = f"time for i in {{1..{repetitions}}}; do {cmd}; done" if setup is not None: time_cmd = f"{setup}; {time_cmd}" @@ -97,8 +95,8 @@ def get_reference_shell_cmd(dirpath, algorithm): else: raise ValueError("only md5 and sha supported") - return "find {dir} -type f -print0 | sort -z | xargs -0 {alg} | {alg}".format( - dir=dirpath, alg=algorithm + return ( + f"find {dirpath} -type f -print0 | sort -z | xargs -0 {algorithm} | {algorithm}" ) diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index f0d54b7..0e49b64 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -"""dirhash - a python library (and CLI) for hashing of file system directories. -""" +"""dirhash - a python library (and CLI) for hashing of file system directories.""" import hashlib import os @@ -458,9 +457,8 @@ def __init__(self, entry_properties=("name", "data"), allow_cyclic_links=False): entry_properties = set(entry_properties) if not entry_properties.issubset(self.EntryProperties.options): raise ValueError( - "entry properties {} not supported".format( - entry_properties - self.EntryProperties.options - ) + f"entry properties {entry_properties - self.EntryProperties.options} " + "not supported" ) if not ( self.EntryProperties.NAME in entry_properties @@ -476,8 +474,7 @@ def __init__(self, entry_properties=("name", "data"), allow_cyclic_links=False): if not isinstance(allow_cyclic_links, bool): raise ValueError( - "allow_cyclic_link must be a boolean, " - "got {}".format(allow_cyclic_links) + f"allow_cyclic_link must be a boolean, got {allow_cyclic_links}" ) self.allow_cyclic_links = allow_cyclic_links diff --git a/src/dirhash/_version.py b/src/dirhash/_version.py index 1166e3d..aa7d646 100644 --- a/src/dirhash/_version.py +++ b/src/dirhash/_version.py @@ -381,9 +381,8 @@ def git_pieces_from_vcs( if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( - full_tag, - tag_prefix, + pieces["error"] = ( + f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" ) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix) :] diff --git a/src/dirhash/cli.py b/src/dirhash/cli.py index 8354d49..a06db40 100644 --- a/src/dirhash/cli.py +++ b/src/dirhash/cli.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -"""Get hash for the content and/or structure of a directory. -""" +"""Get hash for the content and/or structure of a directory.""" import argparse import sys @@ -39,14 +38,13 @@ def get_kwargs(args): choices=dirhash.algorithms_available, default="md5", help=( - 'Hashing algorithm to use, by default "md5". Always available: {}. 
' - "Additionally available on current platform: {}. Note that the same " - "algorithm may appear multiple times in this set under different names " - "(thanks to OpenSSL) " - "[https://docs.python.org/2/library/hashlib.html]".format( - sorted(dirhash.algorithms_guaranteed), - sorted(dirhash.algorithms_available - dirhash.algorithms_guaranteed), - ) + "Hashing algorithm to use, by default 'md5'. " + f"Always available: {sorted(dirhash.algorithms_guaranteed)}. " + f"Additionally available on current platform: " + f"{sorted(dirhash.algorithms_available - dirhash.algorithms_guaranteed)}. " + "Note that the same algorithm may appear multiple times in this set " + "under different names (thanks to OpenSSL) " + "[https://docs.python.org/2/library/hashlib.html]." ), metavar="", ) @@ -129,11 +127,12 @@ def get_kwargs(args): dest="entry_properties", default=["data", "name"], help=( - "List of file/directory properties to include in the hash. Available " - "properties are: {} and at least one of name and data must be " - "included. Default is [data name] which means that both the name/paths" - " and content (actual data) of files and directories will be included" - ).format(list(dirhash.Protocol.EntryProperties.options)), + "List of file/directory properties to include in the hash. " + f"Available properties are: {list(dirhash.Protocol.EntryProperties.options)} " + "and at least one of name and data must be included. " + "Default is [data name] which means that both the name/paths " + "and content (actual data) of files and directories will be included" + ), metavar="", ) protocol_options.add_argument( diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index 276c3ad..3b7efce 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -22,7 +22,6 @@ class TestGetHasherFactory: - def test_get_guaranteed(self): algorithm_and_hasher_factory = [ ("md5", hashlib.md5), @@ -59,14 +58,12 @@ def test_not_available(self): _get_hasher_factory("not available") def test_bypass_hasher_factory(self): - # test standard hasher hasher_factory = _get_hasher_factory(hashlib.sha256) assert hasher_factory is hashlib.sha256 # test raise on custom hasher with bad interface class IncompleteMockHasher: - def __init__(self, *args, **kwargs): pass @@ -78,7 +75,6 @@ def update(self, *args, **kwargs): # test custom hasher with ok interface class MockHasher(IncompleteMockHasher): - def hexdigest(self): return "" @@ -87,7 +83,6 @@ def hexdigest(self): class TestGetMatchPatterns: - def test_default_match_all(self): ms = get_match_patterns() assert ms == ["*"] @@ -139,7 +134,6 @@ def test_ignore_extensions(self): class TempDirTest: - def setup_method(self): self.dir = tempfile.mkdtemp() @@ -441,7 +435,6 @@ def dirhash_mp_comp(*args, **kwargs): class TestDirhash(TempDirTest): - def test_guaranteed_algorithms(self): self.mkdirs("root/d1/d11") self.mkdirs("root/d2") @@ -545,7 +538,6 @@ def test_symlinked_dir(self): assert root1_linked_dirs_true == root2 def test_cache_used_for_symlinks(self): - self.mkdirs("root/dir") self.mkfile("root/file", "< one chunk content") for i in range(10): @@ -667,7 +659,6 @@ def test_raise_on_not_at_least_one_of_name_and_data(self): ) def test_multiproc_speedup(self): - self.mkdirs("root/dir") num_files = 10 for i in range(num_files): @@ -809,7 +800,6 @@ def hexdigest(self): class IdentityHasher: - def __init__(self, initial_data=b""): self.datas = [initial_data.decode("utf-8")] @@ -821,7 +811,6 @@ def hexdigest(self): class TestProtocol: - def test_raise_for_invalid_entry_properties(self): 
with pytest.raises(ValueError): Protocol(entry_properties=["not-valid"]) diff --git a/tox.ini b/tox.ini index f873fe3..34d71e5 100644 --- a/tox.ini +++ b/tox.ini @@ -2,12 +2,13 @@ envlist = py{38,39,310,311,312} [testenv] +allowlist_externals = bash deps = pre-commit pytest pytest-cov commands = - pre-commit run --all-files + bash -ec 'if [[ ${envpython} == *"py38"* ]]; then pre-commit run --all-files; fi' {posargs} pytest --cov=dirhash --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc tests/ {posargs} [gh-actions] From 276b36bd5c2f7f7c9ef9ca8fce70ba57cd0380d0 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 16 Apr 2024 22:49:03 +0200 Subject: [PATCH 40/51] replaces black, isort and flake8 with ruff and ruff-format --- .flake8 | 3 --- .pre-commit-config.yaml | 23 +++-------------------- benchmark/run.py | 8 ++++---- pyproject.toml | 14 ++++++++++++++ src/dirhash/_version.py | 2 ++ src/dirhash/cli.py | 10 +++++----- tests/test_dirhash.py | 2 +- 7 files changed, 29 insertions(+), 33 deletions(-) delete mode 100644 .flake8 diff --git a/.flake8 b/.flake8 deleted file mode 100644 index de50b19..0000000 --- a/.flake8 +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -max-line-length = 90 -extend-ignore=E203 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7de1e24..393ce81 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,30 +1,13 @@ repos: - - repo: https://github.com/psf/black - rev: 24.3.0 - hooks: - - id: black - args: ["--target-version", "py38"] - - repo: https://github.com/PyCQA/isort - rev: 5.13.2 - hooks: - - id: isort - args: ["--profile", "black"] - repo: https://github.com/pre-commit/mirrors-prettier rev: v3.1.0 hooks: - id: prettier - args: [--prose-wrap=preserve, --print-width=90] - - repo: https://github.com/pycqa/flake8 - rev: 7.0.0 - hooks: - - id: flake8 + args: [--prose-wrap=preserve, --print-width=88] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.3.7 hooks: - id: ruff + args: + - --fix - id: ruff-format - - repo: https://github.com/asottile/pyupgrade - rev: v3.15.2 - hooks: - - id: pyupgrade - args: ["--py38-plus"] diff --git a/benchmark/run.py b/benchmark/run.py index 1c27706..712aa9f 100644 --- a/benchmark/run.py +++ b/benchmark/run.py @@ -65,7 +65,7 @@ def time_shell(cmd, runs=1, repetitions=1, setup=None): time_cmd = f"{setup}; {time_cmd}" realtimes = [] - for i in range(runs): + for _run in range(runs): process = subprocess.run( time_cmd, capture_output=True, text=True, shell=True, check=True ) @@ -77,10 +77,10 @@ def time_shell(cmd, runs=1, repetitions=1, setup=None): min_str, sec_str = t_str.split("m") sec = 60 * int(min_str) + float(sec_str[:-1]) sec_per_rep = sec / repetitions - except: # noqa: E722 + except Exception as exc: raise RuntimeError( f"Failed to parse `time` stderr output: {process.stderr}" - ) + ) from exc realtimes.append(sec_per_rep) return realtimes @@ -167,7 +167,7 @@ def benchmark(dirpath, algorithm, **kwargs): "t_median", ] ] - for (tc, alg), subdf in df.groupby(["test_case", "algorithm"]): + for (_tc, _alg), subdf in df.groupby(["test_case", "algorithm"]): t_ref = subdf[subdf["implementation"] == "shell reference"][ "t_median" ].values[0] diff --git a/pyproject.toml b/pyproject.toml index 31e196f..65e0edc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,17 @@ [build-system] requires = ["setuptools", "versioneer==0.29"] build-backend = "setuptools.build_meta" + +[tool.ruff.lint] +select 
= [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade +] + +[tool.ruff.lint.isort] +known-local-folder = ["dirhash"] diff --git a/src/dirhash/_version.py b/src/dirhash/_version.py index aa7d646..db747a1 100644 --- a/src/dirhash/_version.py +++ b/src/dirhash/_version.py @@ -8,6 +8,8 @@ # Generated by versioneer-0.29 # https://github.com/python-versioneer/python-versioneer +# ruff: noqa + """Git implementation of _version.py.""" import errno diff --git a/src/dirhash/cli.py b/src/dirhash/cli.py index a06db40..ae34de7 100644 --- a/src/dirhash/cli.py +++ b/src/dirhash/cli.py @@ -127,11 +127,11 @@ def get_kwargs(args): dest="entry_properties", default=["data", "name"], help=( - "List of file/directory properties to include in the hash. " - f"Available properties are: {list(dirhash.Protocol.EntryProperties.options)} " - "and at least one of name and data must be included. " - "Default is [data name] which means that both the name/paths " - "and content (actual data) of files and directories will be included" + "List of file/directory properties to include in the hash. Available " + f"properties are: {list(dirhash.Protocol.EntryProperties.options)} and at " + "least one of name and data must be included. Default is [data name] which " + "means that both the name/paths and content (actual data) of files and " + "directories will be included" ), metavar="", ) diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index 3b7efce..df55260 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -429,7 +429,7 @@ def test_empty_dir_inclusion_not_affected_by_match(self): def dirhash_mp_comp(*args, **kwargs): res = dirhash(*args, **kwargs) - res_mp = dirhash(jobs=2, *args, **kwargs) + res_mp = dirhash(*args, **{**kwargs, "jobs": 2}) assert res == res_mp return res From 47f6c279d07057f1280d888a564e4d7881a28bda Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 16 Apr 2024 23:02:25 +0200 Subject: [PATCH 41/51] break out pre-commit in tox.ini --- tox.ini | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tox.ini b/tox.ini index 34d71e5..c687513 100644 --- a/tox.ini +++ b/tox.ini @@ -2,14 +2,16 @@ envlist = py{38,39,310,311,312} [testenv] -allowlist_externals = bash deps = - pre-commit pytest pytest-cov commands = - bash -ec 'if [[ ${envpython} == *"py38"* ]]; then pre-commit run --all-files; fi' {posargs} - pytest --cov=dirhash --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc tests/ {posargs} + pytest --cov=dirhash --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc {posargs:tests} + +[testenv:pre-commit] +skip_install = true +deps = pre-commit +commands = pre-commit run --all-files --show-diff-on-failure [gh-actions] python = From 09b9765959715e485a24a950674da676ab8f7734 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 16 Apr 2024 23:13:44 +0200 Subject: [PATCH 42/51] adds ruff target version --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 65e0edc..a032c1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,6 +2,9 @@ requires = ["setuptools", "versioneer==0.29"] build-backend = "setuptools.build_meta" +[tool.ruff] +target-version = "py38" + [tool.ruff.lint] select = [ "E", # pycodestyle errors From 014fa53d28fc02ab74802f903b6c45852b402b3e Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 16 Apr 2024 23:32:06 +0200 Subject: [PATCH 
43/51] adds pre-commit to envlist --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index c687513..cb48aa0 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py{38,39,310,311,312} +envlist = pre-commit,py{38,39,310,311,312} [testenv] deps = @@ -15,7 +15,7 @@ commands = pre-commit run --all-files --show-diff-on-failure [gh-actions] python = - 3.8: py38 + 3.8: py38, pre-commit 3.9: py39 3.10: py310 3.11: py311 From 548479bbaec40b676eb9413e2a84c1c7245cba83 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 16 Apr 2024 23:36:34 +0200 Subject: [PATCH 44/51] intentionally fails pre-commit to test gha --- src/dirhash/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 0e49b64..4c44e6d 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -147,9 +147,7 @@ def dirhash( linked_files=linked_files, empty_dirs=empty_dirs, ) - protocol = Protocol( - entry_properties=entry_properties, allow_cyclic_links=allow_cyclic_links - ) + protocol = Protocol(entry_properties=entry_properties, allow_cyclic_links=allow_cyclic_links) return dirhash_impl( directory=directory, algorithm=algorithm, From 6831eb8ab1b95cd4bbfe4d92a99850f9e71d70ba Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 16 Apr 2024 23:46:04 +0200 Subject: [PATCH 45/51] revert intentional pre-commit error --- src/dirhash/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 4c44e6d..0e49b64 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -147,7 +147,9 @@ def dirhash( linked_files=linked_files, empty_dirs=empty_dirs, ) - protocol = Protocol(entry_properties=entry_properties, allow_cyclic_links=allow_cyclic_links) + protocol = Protocol( + entry_properties=entry_properties, allow_cyclic_links=allow_cyclic_links + ) return dirhash_impl( directory=directory, algorithm=algorithm, From 5cc14faae37f469d5115d2ffc3b00aaf1dd6aa06 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 16 Apr 2024 23:48:00 +0200 Subject: [PATCH 46/51] removes auto commit (was not running if previous step failed, and mainly only works for main fork and I won't use it) --- .github/workflows/test.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 15b2850..30e693c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,14 +40,6 @@ jobs: key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} - name: Test with tox run: tox - - name: Commit and Push to Pull Request - if: matrix.python-version == 3.8 - run: | - if [ -n "$(git status -s)" ]; then - git add . - git commit -m "✨ ⭐ Automated commit has been added to your pull request to fix formatting! 
⭐ ✨" - git push origin ${{ github.head_ref }} - fi - uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} From 3c62a9c8f18ed041120a7b1ca3f43e2501eea042 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 16 Apr 2024 23:51:34 +0200 Subject: [PATCH 47/51] try out separate pre-commit action --- .github/workflows/test.yml | 9 +++++++++ tox.ini | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 30e693c..704b01d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,6 +12,15 @@ on: types: [published, edited] jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.8" + - uses: pre-commit/action@v3.0.1 + tests: runs-on: ubuntu-latest strategy: diff --git a/tox.ini b/tox.ini index cb48aa0..168724a 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ commands = pre-commit run --all-files --show-diff-on-failure [gh-actions] python = - 3.8: py38, pre-commit + 3.8: py38 3.9: py39 3.10: py310 3.11: py311 From 7b2aad33cfc9cb9c4f2df972b54401002a532e42 Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Tue, 16 Apr 2024 23:56:59 +0200 Subject: [PATCH 48/51] renames workflow ci --- .github/workflows/{test.yml => ci.yml} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename .github/workflows/{test.yml => ci.yml} (98%) diff --git a/.github/workflows/test.yml b/.github/workflows/ci.yml similarity index 98% rename from .github/workflows/test.yml rename to .github/workflows/ci.yml index 704b01d..58c8991 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -name: Run tests +name: CI on: push: From 26be94e4b9709b0bcd7246ba91b70bfa1c7f2059 Mon Sep 17 00:00:00 2001 From: Matthew Feickert Date: Thu, 20 Jun 2024 15:32:36 -0400 Subject: [PATCH 49/51] Add requires-python metadata (#28) * Add requires-python metadata through the addition of setuptools's python_requires in setup.py. - c.f. https://peps.python.org/pep-0621/#requires-python * The addition of requires-python is to provide guards to keep older CPython versions from installing releases that could contain unrunnable code. 
--- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 14e3cb9..4794fb7 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ author="Anders Huss", author_email="andhus@kth.se", license="MIT", + python_requires=">=3.8", install_requires=["scantree"], packages=find_packages("src"), package_dir={"": "src"}, From fc56c76c2f2e60beb9fbc1711bde612e44c2d6ef Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Sun, 4 Aug 2024 00:00:43 +0200 Subject: [PATCH 50/51] Windows support (#29) --- .github/workflows/ci.yml | 4 +- tests/test_cli.py | 22 ++++++++-- tests/test_dirhash.py | 88 ++++++++++++++++++++++++---------------- 3 files changed, 75 insertions(+), 39 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 58c8991..47e7d0d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,11 +22,12 @@ jobs: - uses: pre-commit/action@v3.0.1 tests: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + os: [ubuntu-latest, windows-latest] steps: - uses: actions/checkout@v4 @@ -50,6 +51,7 @@ jobs: - name: Test with tox run: tox - uses: codecov/codecov-action@v4 + if: matrix.python-version == '3.8' && matrix.os == 'ubuntu-latest' with: token: ${{ secrets.CODECOV_TOKEN }} verbose: true diff --git a/tests/test_cli.py b/tests/test_cli.py index a9aea3e..ef34ecd 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,12 +7,18 @@ import dirhash -console_script = os.path.join(os.path.dirname(sys.executable), "dirhash") +console_script = os.path.join( + os.path.dirname(sys.executable), + "dirhash.exe" if os.name == "nt" else "dirhash", +) +if not os.path.isfile(console_script): + print(os.listdir(os.path.dirname(sys.executable))) + raise FileNotFoundError(f"Could not find console script at {console_script}.") +if not os.access(console_script, os.X_OK): + raise PermissionError(f"Console script at {console_script} is not executable.") def dirhash_run(argstring, add_env=None): - assert os.path.isfile(console_script) - assert os.access(console_script, os.X_OK) if add_env: env = os.environ.copy() env.update(add_env) @@ -22,6 +28,7 @@ def dirhash_run(argstring, add_env=None): [console_script] + shlex.split(argstring), stdout=subprocess.PIPE, stderr=subprocess.PIPE, + text=True, env=env, ) output, error = process.communicate() @@ -59,6 +66,13 @@ def create_default_tree(tmpdir): tmpdir.join("file.ext2").write("file with extension .ext2") +def osp(path: str) -> str: + """Normalize path for OS.""" + if os.name == "nt": # pragma: no cover + return path.replace("/", "\\") + return path + + class TestCLI: @pytest.mark.parametrize( "argstring, non_default_kwargs", @@ -171,7 +185,7 @@ def test_list(self, description, argstrings, output, tmpdir): o, error, returncode = dirhash_run(argstring) assert returncode == 0 assert error == "" - assert o == output + assert o == osp(output) @pytest.mark.parametrize( "argstring, kwargs, expected_hashes", diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index df55260..68df656 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -21,6 +21,17 @@ ) +def osp(path: str) -> str: + """Normalize path for OS.""" + if os.name == "nt": # pragma: no cover + return path.replace("/", "\\") + return path + + +def map_osp(paths): + return [osp(path) for path in paths] + + class TestGetHasherFactory: def test_get_guaranteed(self): algorithm_and_hasher_factory = [ @@ -142,7 +153,7 @@ def teardown_method(self): 
shutil.rmtree(self.dir) def path_to(self, relpath): - return os.path.join(self.dir, relpath) + return os.path.join(self.dir, osp(relpath)) def mkdirs(self, dirpath): os.makedirs(self.path_to(dirpath)) @@ -173,7 +184,7 @@ def test_basic(self): self.mkfile("root/d1/d11/f1") self.mkfile("root/d2/f1") - expected_filepaths = ["d1/d11/f1", "d1/f1", "d2/f1", "f1"] + expected_filepaths = map_osp(["d1/d11/f1", "d1/f1", "d2/f1", "f1"]) filepaths = included_paths(self.path_to("root")) assert filepaths == expected_filepaths @@ -220,11 +231,11 @@ def test_symlinked_dir(self): assert filepaths == ["f1"] filepaths = included_paths(self.path_to("root"), linked_dirs=True) - assert filepaths == ["d1/f1", "d1/f2", "f1"] + assert filepaths == map_osp(["d1/f1", "d1/f2", "f1"]) # default is 'linked_dirs': True filepaths = included_paths(self.path_to("root")) - assert filepaths == ["d1/f1", "d1/f2", "f1"] + assert filepaths == map_osp(["d1/f1", "d1/f2", "f1"]) def test_cyclic_link(self): self.mkdirs("root/d1") @@ -237,7 +248,7 @@ def test_cyclic_link(self): assert str(exc_info.value).startswith("Symlink recursion:") filepaths = included_paths(self.path_to("root"), allow_cyclic_links=True) - assert filepaths == ["d1/link_back/."] + assert filepaths == map_osp(["d1/link_back/."]) # default is 'allow_cyclic_links': False with pytest.raises(SymlinkRecursionError): @@ -255,11 +266,11 @@ def test_ignore_hidden(self): # no ignore filepaths = included_paths(self.path_to("root")) - assert filepaths == [".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"] + assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) # with ignore filepaths = included_paths(self.path_to("root"), match=["*", "!.*"]) - assert filepaths == ["d1/f1", "f1"] + assert filepaths == map_osp(["d1/f1", "f1"]) def test_ignore_hidden_files_only(self): self.mkdirs("root/d1") @@ -273,13 +284,13 @@ def test_ignore_hidden_files_only(self): # no ignore filepaths = included_paths(self.path_to("root")) - assert filepaths == [".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"] + assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) # with ignore filepaths = included_paths( self.path_to("root"), match=["**/*", "!**/.*", "**/.*/*", "!**/.*/.*"] ) - assert filepaths == [".d2/f1", "d1/f1", "f1"] + assert filepaths == map_osp([".d2/f1", "d1/f1", "f1"]) def test_ignore_hidden_explicitly_recursive(self): self.mkdirs("root/d1") @@ -293,11 +304,11 @@ def test_ignore_hidden_explicitly_recursive(self): # no ignore filepaths = included_paths(self.path_to("root")) - assert filepaths == [".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"] + assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) # with ignore filepaths = included_paths(self.path_to("root"), match=["*", "!**/.*"]) - assert filepaths == ["d1/f1", "f1"] + assert filepaths == map_osp(["d1/f1", "f1"]) def test_exclude_hidden_dirs(self): self.mkdirs("root/d1") @@ -312,11 +323,13 @@ def test_exclude_hidden_dirs(self): # no ignore filepaths = included_paths(self.path_to("root"), empty_dirs=True) - assert filepaths == [".d2/f1", ".f2", "d1/.d1/.", "d1/.f2", "d1/f1", "f1"] + assert filepaths == map_osp( + [".d2/f1", ".f2", "d1/.d1/.", "d1/.f2", "d1/f1", "f1"] + ) # with ignore filepaths = included_paths(self.path_to("root"), match=["*", "!.*/"]) - assert filepaths == [".f2", "d1/.f2", "d1/f1", "f1"] + assert filepaths == map_osp([".f2", "d1/.f2", "d1/f1", "f1"]) def test_exclude_hidden_dirs_and_files(self): self.mkdirs("root/d1") @@ -330,11 +343,11 @@ def test_exclude_hidden_dirs_and_files(self): # no 
ignore filepaths = included_paths(self.path_to("root")) - assert filepaths == [".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"] + assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) # using ignore filepaths = included_paths(self.path_to("root"), match=["*", "!.*/", "!.*"]) - assert filepaths == ["d1/f1", "f1"] + assert filepaths == map_osp(["d1/f1", "f1"]) def test_exclude_extensions(self): self.mkdirs("root/d1") @@ -353,14 +366,16 @@ def test_exclude_extensions(self): filepaths = included_paths( self.path_to("root"), match=["*", "!*.skip1", "!*.skip2"] ) - assert filepaths == [ - "d1/f.txt", - "f", - "f.skip1.txt", - "f.skip1skip2", - "f.txt", - "fskip1", - ] + assert filepaths == map_osp( + [ + "d1/f.txt", + "f", + "f.skip1.txt", + "f.skip1skip2", + "f.txt", + "fskip1", + ] + ) def test_empty_dirs_include_vs_exclude(self): self.mkdirs("root/d1") @@ -372,14 +387,14 @@ def test_empty_dirs_include_vs_exclude(self): self.mkfile("root/d3/d31/f") filepaths = included_paths(self.path_to("root"), empty_dirs=False) - assert filepaths == ["d1/f", "d3/d31/f"] + assert filepaths == map_osp(["d1/f", "d3/d31/f"]) # `include_empty=False` is default filepaths = included_paths(self.path_to("root")) - assert filepaths == ["d1/f", "d3/d31/f"] + assert filepaths == map_osp(["d1/f", "d3/d31/f"]) filepaths = included_paths(self.path_to("root"), empty_dirs=True) - assert filepaths == ["d1/f", "d2/.", "d3/d31/f", "d4/d41/."] + assert filepaths == map_osp(["d1/f", "d2/.", "d3/d31/f", "d4/d41/."]) def test_empty_dirs_because_of_filter_include_vs_exclude(self): self.mkdirs("root/d1") @@ -391,19 +406,19 @@ def test_empty_dirs_because_of_filter_include_vs_exclude(self): filepaths = included_paths( self.path_to("root"), match=["*", "!.*"], empty_dirs=False ) - assert filepaths == ["d1/f"] + assert filepaths == map_osp(["d1/f"]) # `include_empty=False` is default filepaths = included_paths( self.path_to("root"), match=["*", "!.*"], ) - assert filepaths == ["d1/f"] + assert filepaths == map_osp(["d1/f"]) filepaths = included_paths( self.path_to("root"), match=["*", "!.*"], empty_dirs=True ) - assert filepaths == ["d1/f", "d2/."] + assert filepaths == map_osp(["d1/f", "d2/."]) def test_empty_dir_inclusion_not_affected_by_match(self): self.mkdirs("root/d1") @@ -414,17 +429,17 @@ def test_empty_dir_inclusion_not_affected_by_match(self): filepaths = included_paths( self.path_to("root"), match=["*", "!.*"], empty_dirs=True ) - assert filepaths == [".d2/.", "d1/."] + assert filepaths == map_osp([".d2/.", "d1/."]) filepaths = included_paths( self.path_to("root"), match=["*", "!.*/"], empty_dirs=True ) - assert filepaths == [".d2/.", "d1/."] + assert filepaths == map_osp([".d2/.", "d1/."]) filepaths = included_paths( self.path_to("root"), match=["*", "!d1"], empty_dirs=True ) - assert filepaths == [".d2/.", "d1/."] + assert filepaths == map_osp([".d2/.", "d1/."]) def dirhash_mp_comp(*args, **kwargs): @@ -658,6 +673,11 @@ def test_raise_on_not_at_least_one_of_name_and_data(self): self.path_to("root1"), "sha256", entry_properties=["is_link"] ) + @pytest.mark.skipif( + os.name == "nt", + reason="TODO: not getting expected speedup on Windows.", + # TODO: see https://github.com/andhus/scantree/issues/25 + ) def test_multiproc_speedup(self): self.mkdirs("root/dir") num_files = 10 @@ -704,7 +724,7 @@ def test_cache_by_real_path_speedup(self, tmpdir): target_file = tmpdir.join("target_file") target_file.ensure() for i in range(num_links): - root2.join(f"link_{i}").mksymlinkto(target_file) + 
os.symlink(target_file, root2.join(f"link_{i}")) overhead_margin_factor = 1.5 expected_max_elapsed_with_links = overhead * overhead_margin_factor + wait_time @@ -743,7 +763,7 @@ def test_cache_together_with_multiprocess_speedup(self, tmpdir): target_file = tmpdir.join(target_file_name) target_file.write("< one chunk content", ensure=True) for j in range(num_links_per_file): - root2.join(f"link_{i}_{j}").mksymlinkto(target_file) + os.symlink(target_file, root2.join(f"link_{i}_{j}")) overhead_margin_factor = 1.5 expected_max_elapsed_with_links = ( From 1ead28a0ede6c8f039ab8b8107b71b011b3d435d Mon Sep 17 00:00:00 2001 From: Anders Huss Date: Sun, 4 Aug 2024 00:12:01 +0200 Subject: [PATCH 51/51] sets lower bound for scantree (#30) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4794fb7..708f049 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ author_email="andhus@kth.se", license="MIT", python_requires=">=3.8", - install_requires=["scantree"], + install_requires=["scantree>=0.0.4"], packages=find_packages("src"), package_dir={"": "src"}, include_package_data=True,
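A closing sketch on one subtlety from patch 40 above: the test helper call `dirhash(jobs=2, *args, **kwargs)` was rewritten as `dirhash(*args, **{**kwargs, "jobs": 2})`. The old form trips ruff's flake8-bugbear rule B026 (star-arg unpacking after a keyword argument), newly enabled via the `"B"` selection in `pyproject.toml`, and is also fragile: if `kwargs` ever carried its own `jobs` key, the call would raise a `TypeError`. A minimal, self-contained illustration (the `call` function is a hypothetical stand-in, not the dirhash API):

```python
# Hypothetical stand-in for dirhash(); only the calling convention matters.
def call(*args, **kwargs):
    return args, kwargs

args, kwargs = ("root",), {"jobs": 1}

# Old style -- call(jobs=2, *args, **kwargs) -- is flagged by B026 and,
# with kwargs already carrying a "jobs" key, raises:
#   TypeError: call() got multiple values for keyword argument 'jobs'

# New style: merge explicitly; the later "jobs": 2 entry wins.
merged = {**kwargs, "jobs": 2}
print(call(*args, **merged))  # (('root',), {'jobs': 2})
```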