# Source code for pdiffcopy.hashing

# Fast large file synchronization inspired by rsync.
#
# Author: Peter Odding <peter@peterodding.com>
# Last Change: March 6, 2020
# URL: https://pdiffcopy.readthedocs.io

"""Parallel hashing of files using :mod:`multiprocessing` and :mod:`pdiffcopy.mp`."""

# Standard library modules.
import functools
import hashlib
import os

# External dependencies.
from six.moves import range

# Modules included in our package.
from pdiffcopy.mp import WorkerPool

# Public identifiers that require documentation.
__all__ = ("compute_hashes", "hash_worker")


def compute_hashes(filename, block_size, method, concurrency):
    """
    Compute checksums of a file in blocks (parallel).

    :param filename: The pathname of the file to hash.
    :param block_size: The number of bytes per block, also used as the
                       step between consecutive block offsets.
    :param method: The name of the hash algorithm, handed to
                   :func:`hash_worker()` (which passes it to
                   :func:`hashlib.new()`).
    :param concurrency: Passed through to :class:`~pdiffcopy.mp.WorkerPool`.
    :returns: A generator of ``(offset, digest)`` tuples, one for each
              result produced by the worker pool.

    This is a generator function, so the file size is not looked up and the
    worker pool is not created until iteration starts.
    """
    with WorkerPool(
        concurrency=concurrency,
        # Block offsets: 0, block_size, 2 * block_size, ... up to (but not
        # including) the size of the file. Wrapped in a partial so that the
        # pool decides when to create the range.
        generator_fn=functools.partial(range, 0, os.path.getsize(filename), block_size),
        # The pool supplies the offset as the positional argument; the other
        # arguments are bound by keyword here.
        worker_fn=functools.partial(hash_worker, block_size=block_size, filename=filename, method=method),
    ) as pool:
        for offset, digest in pool:
            yield offset, digest

def hash_worker(offset, block_size, filename, method):
    """
    Worker function to be run in child processes.

    :param offset: The byte offset in the file where the block starts.
    :param block_size: The maximum number of bytes to read and hash.
    :param filename: The pathname of the file to read.
    :param method: The name of the hash algorithm (passed to
                   :func:`hashlib.new()`).
    :returns: A tuple ``(offset, digest)`` where ``digest`` is the
              hexadecimal digest of the bytes that were read.

    Fewer than `block_size` bytes are hashed when the end of the file is
    reached first (e.g. for the final block of the file).
    """
    # Each call opens its own file handle, so calls don't share a file position.
    with open(filename, "rb") as handle:
        handle.seek(offset)
        context = hashlib.new(method)
        context.update(handle.read(block_size))
        # The offset is returned together with the digest so that the caller
        # can tell which block a result belongs to.
        return offset, context.hexdigest()