Source code for pdiffcopy.hashing

# Fast large file synchronization inspired by rsync.
#
# Author: Peter Odding <peter@peterodding.com>
# Last Change: March 6, 2020
# URL: https://pdiffcopy.readthedocs.io

"""Parallel hashing of files using :mod:`multiprocessing` and :mod:`pdiffcopy.mp`."""

# Standard library modules.
import functools
import hashlib
import os

# External dependencies.
from six.moves import range

# Modules included in our package.
from pdiffcopy.mp import WorkerPool

# Public identifiers that require documentation.
__all__ = ("compute_hashes", "hash_worker")


def compute_hashes(filename, block_size, method, concurrency):
    """Compute checksums of a file in blocks (parallel)."""
    with WorkerPool(
        concurrency=concurrency,
        generator_fn=functools.partial(range, 0, os.path.getsize(filename), block_size),
        worker_fn=functools.partial(hash_worker, block_size=block_size, filename=filename, method=method),
    ) as pool:
        for offset, digest in pool:
            yield offset, digest
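
# The generator above produces the block offsets 0, block_size, 2 * block_size,
# ... up to the file size, and WorkerPool hands each offset to hash_worker() in
# a child process. A hypothetical usage sketch (the file name, block size, hash
# method and concurrency below are illustrative assumptions, not part of the
# original module):
#
#   for offset, digest in compute_hashes("some/local/file", block_size=1024 * 1024, method="sha1", concurrency=4):
#       print("%10i %s" % (offset, digest))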


def hash_worker(offset, block_size, filename, method):
    """Worker function to be run in child processes."""
    with open(filename, "rb") as handle:
        handle.seek(offset)
        context = hashlib.new(method)
        context.update(handle.read(block_size))
        return offset, context.hexdigest()
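

# A minimal self-test sketch, added to this listing for illustration (it is not
# part of the original module and assumes pdiffcopy.mp.WorkerPool behaves as
# used above): write a few megabytes of random data to a temporary file, then
# hash it in 1 MiB blocks with two worker processes.
if __name__ == "__main__":
    import tempfile

    # Create a temporary file containing four 1 MiB blocks of random data.
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        temp.write(os.urandom(4 * 1024 * 1024))
    try:
        # Print the offset and SHA-1 digest of each block.
        for offset, digest in compute_hashes(temp.name, block_size=1024 * 1024, method="sha1", concurrency=2):
            print("%10i %s" % (offset, digest))
    finally:
        os.unlink(temp.name)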