hash: Read small-ish files into memory via a single call

Read files that are known to be small-ish (under 1 MiB) into memory via a single read() call instead of reading them in smaller parts. This is mostly meant to work around a bug in PyPy that yields wrong data when small buffers are repeatedly passed to C extensions. Apparently, slurping the whole file does not exhibit this problem in gemato.
author: Michał Górny <mgorny@gentoo.org> 2018-02-13 11:06:12 +0100
committer: Michał Górny <mgorny@gentoo.org> 2018-02-13 11:06:12 +0100
commit: 75e61e77b0d8de07bffffadc28fed90759b0e3ad (patch)
tree: b388979b2770ddcebf8a1fe8f27ea98a5f24e571
parent: de7393dcb9d784f2219b97c418b27f8cf32cd622 (diff)
download: gemato-75e61e77b0d8de07bffffadc28fed90759b0e3ad.tar.gz
2 files changed, 18 insertions, 4 deletions
diff --git a/gemato/hash.py b/gemato/hash.py
index b5b57cd..d8691e1 100644
--- a/gemato/hash.py
+++ b/gemato/hash.py
@@ -8,8 +8,8 @@ import io
 
 import gemato.exceptions
 
-
 HASH_BUFFER_SIZE = 65536
+MAX_SLURP_SIZE = 1048576
 
 
 class SizeHash(object):
@@ -67,17 +67,30 @@ def get_hash_by_name(name):
 	raise gemato.exceptions.UnsupportedHash(name)
 
 
-def hash_file(f, hash_names):
+def hash_file(f, hash_names, _apparent_size=0):
 	"""
 	Hash the contents of file object @f using all hashes specified
 	as @hash_names. Returns a dict of (hash_name -> hex value) mappings.
+
+	@_apparent_size can be given as a tip on how large is the file
+	expected to be. This is a private API used to workaround bug in PyPy
+	and should not be relied on being present long-term.
 	"""
 	hashes = {}
 	for h in hash_names:
 		hashes[h] = get_hash_by_name(h)
-	for block in iter(lambda: f.read1(HASH_BUFFER_SIZE), b''):
+	if _apparent_size != 0 and _apparent_size < MAX_SLURP_SIZE:
+		# if the file is reasonably small, read it all into one buffer;
+		# we do this since PyPy has some serious bug in dealing with
+		# passing buffers to C extensions and this apparently fails
+		# less; https://bitbucket.org/pypy/pypy/issues/2752
+		block = f.read()
 		for h in hashes.values():
 			h.update(block)
+	else:
+		for block in iter(lambda: f.read1(HASH_BUFFER_SIZE), b''):
+			for h in hashes.values():
+				h.update(block)
 	return dict((k, h.hexdigest()) for k, h in hashes.items())
 
 
diff --git a/gemato/verify.py b/gemato/verify.py
index 6193e76..7cef4e5 100644
--- a/gemato/verify.py
+++ b/gemato/verify.py
@@ -112,7 +112,8 @@ def get_file_metadata(path, hashes):
         hashes = list(gemato.manifest.manifest_hashes_to_hashlib(e_hashes))
         e_hashes.append('__size__')
         hashes.append('__size__')
-        checksums = gemato.hash.hash_file(f, hashes)
+        checksums = gemato.hash.hash_file(f, hashes,
+                                          _apparent_size=st.st_size)
 
         ret = {}
         for ek, k in zip(e_hashes, hashes):
author	Michał Górny <mgorny@gentoo.org>	2018-02-13 11:06:12 +0100
committer	Michał Górny <mgorny@gentoo.org>	2018-02-13 11:06:12 +0100
commit	75e61e77b0d8de07bffffadc28fed90759b0e3ad (patch)
tree	b388979b2770ddcebf8a1fe8f27ea98a5f24e571
parent	de7393dcb9d784f2219b97c418b27f8cf32cd622 (diff)
download	gemato-75e61e77b0d8de07bffffadc28fed90759b0e3ad.tar.gz