From 75e61e77b0d8de07bffffadc28fed90759b0e3ad Mon Sep 17 00:00:00 2001 From: Michał Górny Date: Tue, 13 Feb 2018 11:06:12 +0100 Subject: hash: Read small-ish files into memory via a single call Read files that are known to be small-ish (up to 1 MiB) into memory via a single read() call instead of reading in smaller parts. This is mostly meant to work around a bug in PyPy that causes wrong data when small buffers are repeatedly passed to C extensions. Apparently slurping the whole file does not exhibit this problem in gemato. --- gemato/hash.py | 19 ++++++++++++++++--- gemato/verify.py | 3 ++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/gemato/hash.py b/gemato/hash.py index b5b57cd..d8691e1 100644 --- a/gemato/hash.py +++ b/gemato/hash.py @@ -8,8 +8,8 @@ import io import gemato.exceptions - HASH_BUFFER_SIZE = 65536 +MAX_SLURP_SIZE = 1048576 class SizeHash(object): @@ -67,17 +67,30 @@ def get_hash_by_name(name): raise gemato.exceptions.UnsupportedHash(name) -def hash_file(f, hash_names): +def hash_file(f, hash_names, _apparent_size=0): """ Hash the contents of file object @f using all hashes specified as @hash_names. Returns a dict of (hash_name -> hex value) mappings. + + @_apparent_size can be given as a tip on how large is the file + expected to be. This is a private API used to workaround bug in PyPy + and should not be relied on being present long-term. 
""" hashes = {} for h in hash_names: hashes[h] = get_hash_by_name(h) - for block in iter(lambda: f.read1(HASH_BUFFER_SIZE), b''): + if _apparent_size != 0 and _apparent_size < MAX_SLURP_SIZE: + # if the file is reasonably small, read it all into one buffer; + # we do this since PyPy has some serious bug in dealing with + # passing buffers to C extensions and this apparently fails + # less; https://bitbucket.org/pypy/pypy/issues/2752 + block = f.read() for h in hashes.values(): h.update(block) + else: + for block in iter(lambda: f.read1(HASH_BUFFER_SIZE), b''): + for h in hashes.values(): + h.update(block) return dict((k, h.hexdigest()) for k, h in hashes.items()) diff --git a/gemato/verify.py b/gemato/verify.py index 6193e76..7cef4e5 100644 --- a/gemato/verify.py +++ b/gemato/verify.py @@ -112,7 +112,8 @@ def get_file_metadata(path, hashes): hashes = list(gemato.manifest.manifest_hashes_to_hashlib(e_hashes)) e_hashes.append('__size__') hashes.append('__size__') - checksums = gemato.hash.hash_file(f, hashes) + checksums = gemato.hash.hash_file(f, hashes, + _apparent_size=st.st_size) ret = {} for ek, k in zip(e_hashes, hashes): -- cgit v1.2.3