diff --git a/ionhash/__init__.py b/ionhash/__init__.py
index a43c224..fbaf24c 100644
--- a/ionhash/__init__.py
+++ b/ionhash/__init__.py
@@ -11,17 +11,10 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-from six import BytesIO
-
-from amazon.ion.core import ION_STREAM_END_EVENT
 from amazon.ion.simple_types import _IonNature
-from amazon.ion.simpleion import _dump, _FROM_TYPE
-from amazon.ion.writer import blocking_writer
-from amazon.ion.writer_binary import binary_writer
 
+from ionhash.fast_value_hasher import hash_value
 from ionhash.hasher import hashlib_hash_function_provider
-from ionhash.hasher import hash_writer
-from ionhash.hasher import HashEvent
 
 
 # pydoc for this method is DUPLICATED in docs/index.rst
@@ -54,10 +47,7 @@ def ion_hash(self, algorithm=None, hash_function_provider=None):
     else:
         hfp = hash_function_provider
 
-    hw = hash_writer(blocking_writer(binary_writer(), BytesIO()), hfp)
-    _dump(self, hw, _FROM_TYPE)
-    hw.send(ION_STREAM_END_EVENT)
-    return hw.send(HashEvent.DIGEST)
+    return hash_value(self, hfp)
 
 
 # adds the `ion_hash` method to all simpleion value classes:
diff --git a/ionhash/fast_value_hasher.py b/ionhash/fast_value_hasher.py
new file mode 100644
index 0000000..1d9421b
--- /dev/null
+++ b/ionhash/fast_value_hasher.py
@@ -0,0 +1,99 @@
+from amazon.ion.core import IonType
+from amazon.ion.simple_types import IonPyNull
+
+from functools import cmp_to_key
+
+from ionhash.hasher import _bytearray_comparator, _scalar_or_null_split_parts, _serialize_null, \
+    _UPDATE_SCALAR_HASH_BYTES_JUMP_TABLE, _BEGIN_MARKER, _TQ, _END_MARKER, \
+    _BEGIN_MARKER_BYTE, _END_MARKER_BYTE, _TQ_ANNOTATED_VALUE, _escape, _TQ_SYMBOL_SID0
+
+
+class _IonEventDuck:
+    """Looks like an IonEvent, quacks like an IonEvent...
+    Used for sending scalar values to the existing ion_binary_writer serializers.
+    """
+    def __init__(self, value, ion_type):
+        self.value = value
+        self.ion_type = ion_type
+
+
+# H(value) → h(s(value))
+def hash_value(value, hfp):
+    """An implementation of the [Ion Hash algorithm](https://github.com/amzn/ion-hash/blob/gh-pages/docs/spec.md)
+    for the Ion data model that doesn't instantiate any ion_readers or ion_writers.
+
+    Args:
+        value: the Ion value to hash
+        hfp: hash function provider
+
+    Returns:
+        Ion Hash digest of the given Ion value
+    """
+    hash_fn = hfp()
+    hash_fn.update(serialize_value(value, hfp))
+    return hash_fn.digest()
+
+
+# s(value) → serialized bytes
+def serialize_value(value, hfp):
+    """Transforms an Ion value to its Ion Hash serialized representation.
+
+    Args:
+        value: the Ion value to serialize
+        hfp: hash function provider
+
+    Returns:
+        bytes representing the given Ion value, serialized according to the Ion Hash algorithm
+    """
+    if value.ion_annotations:
+        return _s_annotated_value(value, hfp)
+    else:
+        return _s_value(value, hfp)
+
+
+# s(annotated value) → B || TQ || s(annotation1) || s(annotation2) || ... || s(annotationn) || s(value) || E
+def _s_annotated_value(value, hfp):
+    return _BEGIN_MARKER + _TQ_ANNOTATED_VALUE + b''.join([_write_symbol(a) for a in value.ion_annotations]) \
+        + _s_value(value, hfp) + _END_MARKER
+
+
+# s(struct) → B || TQ || escape(concat(sort(H(field1), H(field2), ..., H(fieldn)))) || E
+# s(list) or s(sexp) → B || TQ || s(value1) || s(value2) || ... || s(valuen) || E
+# s(scalar) → B || TQ || escape(representation) || E
+def _s_value(value, hfp):
+    ion_type = value.ion_type
+    is_ion_null = isinstance(value, IonPyNull)
+    if ion_type == IonType.STRUCT and not is_ion_null:
+        field_hashes = [_h_field(field_name, field_value, hfp) for [field_name, field_value] in value.iteritems()]
+        field_hashes.sort(key=cmp_to_key(_bytearray_comparator))
+        return _BEGIN_MARKER + bytes([_TQ[IonType.STRUCT]]) + _escape(b''.join(field_hashes)) + _END_MARKER
+    elif ion_type in [IonType.LIST, IonType.SEXP] and not is_ion_null:
+        return _BEGIN_MARKER + bytes([_TQ[ion_type]]) \
+            + b''.join([bytes(serialize_value(child, hfp)) for child in value]) + _END_MARKER
+    else:
+        serializer = _serialize_null if is_ion_null else _UPDATE_SCALAR_HASH_BYTES_JUMP_TABLE[ion_type]
+        scalar_bytes = serializer(_IonEventDuck(None if is_ion_null else value, ion_type))
+        [tq, representation] = _scalar_or_null_split_parts(ion_type, scalar_bytes)
+        if len(representation) == 0:
+            return bytes([_BEGIN_MARKER_BYTE, tq, _END_MARKER_BYTE])
+        else:
+            return b''.join([_BEGIN_MARKER, bytes([tq]), _escape(representation), _END_MARKER])
+
+
+# H(field) → h(s(fieldname) || s(fieldvalue))
+def _h_field(field_name, field_value, hfp):
+    hash_fn = hfp()
+    hash_fn.update(_write_symbol(field_name) + serialize_value(field_value, hfp))
+    return hash_fn.digest()
+
+
+# Function for writing symbol tokens (annotations and field names).
+# Has simplified logic compared to the regular function because we can make some assumptions about the value:
+# namely, that it never has annotations and is always of type "symbol".
+def _write_symbol(text_or_symbol_token):
+    text = getattr(text_or_symbol_token, 'text', text_or_symbol_token)
+    if text is None:
+        return bytes([_BEGIN_MARKER_BYTE, _TQ_SYMBOL_SID0, _END_MARKER_BYTE])
+    else:
+        return _BEGIN_MARKER + bytes([_TQ[IonType.SYMBOL]]) \
+            + _escape(bytearray(text, encoding="utf-8")) + _END_MARKER
diff --git a/ionhash/hasher.py b/ionhash/hasher.py
index def84c4..2435eb3 100644
--- a/ionhash/hasher.py
+++ b/ionhash/hasher.py
@@ -475,19 +475,12 @@ def _bytearray_comparator(a, b):
 def _escape(_bytes):
     """If _bytes contains one or more BEGIN_MARKER_BYTEs, END_MARKER_BYTEs, or ESCAPE_BYTEs,
     returns a new bytearray with such bytes preceeded by a ESCAPE_BYTE; otherwise, returns
-    the original _bytes unchanged."
+    the original _bytes unchanged.
""" - for b in _bytes: - if b == _BEGIN_MARKER_BYTE or b == _END_MARKER_BYTE or b == _ESCAPE_BYTE: - # found a byte that needs to be escaped; build a new byte array that - # escapes that byte as well as any others - escaped_bytes = bytearray() - for c in _bytes: - if c == _BEGIN_MARKER_BYTE or c == _END_MARKER_BYTE or c == _ESCAPE_BYTE: - escaped_bytes.append(_ESCAPE_BYTE) - escaped_bytes.append(c) - return escaped_bytes - + if _BEGIN_MARKER_BYTE in _bytes or _END_MARKER_BYTE in _bytes or _ESCAPE_BYTE in _bytes: + return _bytes.replace(bytes([_ESCAPE_BYTE]), bytes([_ESCAPE_BYTE, _ESCAPE_BYTE])) \ + .replace(_BEGIN_MARKER, bytes([_ESCAPE_BYTE, _BEGIN_MARKER_BYTE])) \ + .replace(_END_MARKER, bytes([_ESCAPE_BYTE, _END_MARKER_BYTE])) # no escaping needed, return the original _bytes return _bytes diff --git a/tests/test_ion_hash_tests.py b/tests/test_ion_hash_tests.py index 5d99a53..566beb8 100644 --- a/tests/test_ion_hash_tests.py +++ b/tests/test_ion_hash_tests.py @@ -196,14 +196,14 @@ def to_ion_hash(algorithm): _actual_updates, _actual_digests)) - _run_test(ion_test, to_ion_hash) - + # Do not assert on expected_updates because the implementation of ion_hash() is not backed by an ion_writer + _run_test(ion_test, to_ion_hash, should_assert_on_expected_updates=False) _actual_updates = [] _actual_digests = [] -def _run_test(ion_test, digester): +def _run_test(ion_test, digester, should_assert_on_expected_updates=True): expect = ion_test['expect'] for algorithm in expect: expected_updates = [] @@ -211,7 +211,7 @@ def _run_test(ion_test, digester): final_digest = None for sexp in expect[algorithm]: annot = sexp.ion_annotations[0].text - if annot == "update": + if annot == "update" and should_assert_on_expected_updates: expected_updates.append(sexp_to_bytearray(sexp)) pass elif annot == "digest": @@ -224,7 +224,7 @@ def _run_test(ion_test, digester): actual_digest_bytes = digester(algorithm) - if len(expected_updates) > 0: + if should_assert_on_expected_updates and len(expected_updates) > 0: assert _actual_updates == expected_updates if final_digest is not None: