Add faster implementation of _IonNature.ion_hash #22

Merged · 2 commits · Sep 3, 2021
14 changes: 2 additions & 12 deletions ionhash/__init__.py
@@ -11,17 +11,10 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

from six import BytesIO

from amazon.ion.core import ION_STREAM_END_EVENT
from amazon.ion.simple_types import _IonNature
from amazon.ion.simpleion import _dump, _FROM_TYPE
from amazon.ion.writer import blocking_writer
from amazon.ion.writer_binary import binary_writer

from ionhash.fast_value_hasher import hash_value
from ionhash.hasher import hashlib_hash_function_provider
from ionhash.hasher import hash_writer
from ionhash.hasher import HashEvent


# pydoc for this method is DUPLICATED in docs/index.rst
@@ -54,10 +47,7 @@ def ion_hash(self, algorithm=None, hash_function_provider=None):
else:
hfp = hash_function_provider

hw = hash_writer(blocking_writer(binary_writer(), BytesIO()), hfp)
_dump(self, hw, _FROM_TYPE)
hw.send(ION_STREAM_END_EVENT)
return hw.send(HashEvent.DIGEST)
return hash_value(self, hfp)


# adds the `ion_hash` method to all simpleion value classes:
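
A minimal usage sketch of the changed entry point (the Ion literal and the 'md5' algorithm name below are illustrative). Importing ionhash attaches ion_hash() to every simpleion value class, and with this change the call goes through fast_value_hasher.hash_value rather than a hash_writer/binary_writer pipeline:

import ionhash  # imported for its side effect of adding ion_hash() to simpleion values
from amazon.ion import simpleion

# Load any Ion value and compute its Ion Hash digest.
value = simpleion.loads('{name: "hello", scores: [1, 2, 3]}')
digest = value.ion_hash('md5')
print(digest)
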
99 changes: 99 additions & 0 deletions ionhash/fast_value_hasher.py
@@ -0,0 +1,99 @@
from amazon.ion.core import IonType
from amazon.ion.simple_types import IonPyNull

from functools import cmp_to_key

from ionhash.hasher import _bytearray_comparator, _scalar_or_null_split_parts, _serialize_null, \
_UPDATE_SCALAR_HASH_BYTES_JUMP_TABLE, _BEGIN_MARKER, _TQ, _END_MARKER, \
_BEGIN_MARKER_BYTE, _END_MARKER_BYTE, _TQ_ANNOTATED_VALUE, _escape, _TQ_SYMBOL_SID0


class _IonEventDuck:
"""Looks like an IonEvent, quacks like an IonEvent...
Used for sending scalar values to the existing ion_binary_writer serializers.
"""
def __init__(self, value, ion_type):
self.value = value
self.ion_type = ion_type


# H(value) → h(s(value))
def hash_value(value, hfp):
"""An implementation of the [Ion Hash algorithm](https://github.com/amzn/ion-hash/blob/gh-pages/docs/spec.md)
for the Ion data model that doesn't instantiate any ion_readers or ion_writers.

Args:
value: the Ion value to hash
hfp: hash function provider

Returns:
Ion Hash digest of the given Ion value
"""
hash_fn = hfp()
hash_fn.update(serialize_value(value, hfp))
return hash_fn.digest()


# s(value) → serialized bytes
def serialize_value(value, hfp):
"""Transforms an Ion value to its Ion Hash serialized representation.

Args:
value: the Ion value to serialize
hfp: hash function provider

Returns:
bytes representing the given Ion value, serialized according to the Ion Hash algorithm
"""
if value.ion_annotations:
return _s_annotated_value(value, hfp)
else:
return _s_value(value, hfp)


# s(annotated value) → B || TQ || s(annotation1) || s(annotation2) || ... || s(annotationn) || s(value) || E
def _s_annotated_value(value, hfp):
return _BEGIN_MARKER + _TQ_ANNOTATED_VALUE + b''.join([_write_symbol(a) for a in value.ion_annotations]) \
+ _s_value(value, hfp) + _END_MARKER


# s(struct) → B || TQ || escape(concat(sort(H(field1), H(field2), ..., H(fieldn)))) || E
# s(list) or s(sexp) → B || TQ || s(value1) || s(value2) || ... || s(valuen) || E
# s(scalar) → B || TQ || escape(representation) || E
def _s_value(value, hfp):
ion_type = value.ion_type
is_ion_null = isinstance(value, IonPyNull)
if ion_type == IonType.STRUCT and not is_ion_null:
field_hashes = [_h_field(field_name, field_value, hfp) for [field_name, field_value] in value.iteritems()]
field_hashes.sort(key=cmp_to_key(_bytearray_comparator))
return _BEGIN_MARKER + bytes([_TQ[IonType.STRUCT]]) + _escape(b''.join(field_hashes)) + _END_MARKER
elif ion_type in [IonType.LIST, IonType.SEXP] and not is_ion_null:
return _BEGIN_MARKER + bytes([_TQ[ion_type]]) \
+ b''.join([bytes(serialize_value(child, hfp)) for child in value]) + _END_MARKER
else:
serializer = _serialize_null if is_ion_null else _UPDATE_SCALAR_HASH_BYTES_JUMP_TABLE[ion_type]
scalar_bytes = serializer(_IonEventDuck(None if is_ion_null else value, ion_type))
[tq, representation] = _scalar_or_null_split_parts(ion_type, scalar_bytes)
if len(representation) == 0:
return bytes([_BEGIN_MARKER_BYTE, tq, _END_MARKER_BYTE])
else:
return b''.join([_BEGIN_MARKER, bytes([tq]), _escape(representation), _END_MARKER])


# H(field) → h(s(fieldname) || s(fieldvalue))
def _h_field(field_name, field_value, hfp):
hash_fn = hfp()
hash_fn.update(_write_symbol(field_name) + serialize_value(field_value, hfp))
return hash_fn.digest()


# Function for writing symbol tokens (annotations and field names).
# Its logic is simpler than the general serializer because we can assume
# the value has no annotations and is always of type "symbol".
def _write_symbol(text_or_symbol_token):
text = getattr(text_or_symbol_token, 'text', text_or_symbol_token)
if text is None:
return bytes([_BEGIN_MARKER_BYTE, _TQ_SYMBOL_SID0, _END_MARKER_BYTE])
else:
return _BEGIN_MARKER + bytes([_TQ[IonType.SYMBOL]]) \
+ _escape(bytearray(text, encoding="utf-8")) + _END_MARKER
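
A minimal sketch of calling the new fast path directly. Assumption: any zero-argument callable that returns a fresh hashlib-style object with update() and digest() can serve as the hash function provider, since those are the only methods hash_value and _h_field use; hashlib.sha256 appears here purely for illustration:

import hashlib

from amazon.ion import simpleion
from ionhash.fast_value_hasher import hash_value

# hashlib.sha256 is itself a zero-argument callable that returns a new hash object,
# so it satisfies the provider interface used by hash_value.
value = simpleion.loads('{a: 1, b: [true, 2.5, "three"]}')
digest = hash_value(value, hashlib.sha256)
print(digest)
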
17 changes: 5 additions & 12 deletions ionhash/hasher.py
@@ -475,19 +475,12 @@ def _bytearray_comparator(a, b):
def _escape(_bytes):
"""If _bytes contains one or more BEGIN_MARKER_BYTEs, END_MARKER_BYTEs, or ESCAPE_BYTEs,
returns a new bytearray with such bytes preceded by an ESCAPE_BYTE; otherwise, returns
the original _bytes unchanged."
the original _bytes unchanged.
"""
for b in _bytes:
if b == _BEGIN_MARKER_BYTE or b == _END_MARKER_BYTE or b == _ESCAPE_BYTE:
# found a byte that needs to be escaped; build a new byte array that
# escapes that byte as well as any others
escaped_bytes = bytearray()
for c in _bytes:
if c == _BEGIN_MARKER_BYTE or c == _END_MARKER_BYTE or c == _ESCAPE_BYTE:
escaped_bytes.append(_ESCAPE_BYTE)
escaped_bytes.append(c)
return escaped_bytes

if _BEGIN_MARKER_BYTE in _bytes or _END_MARKER_BYTE in _bytes or _ESCAPE_BYTE in _bytes:
return _bytes.replace(bytes([_ESCAPE_BYTE]), bytes([_ESCAPE_BYTE, _ESCAPE_BYTE])) \
.replace(_BEGIN_MARKER, bytes([_ESCAPE_BYTE, _BEGIN_MARKER_BYTE])) \
.replace(_END_MARKER, bytes([_ESCAPE_BYTE, _END_MARKER_BYTE]))
Comment on lines +481 to +483

Should we avoid recomputing bytes([...]) on each call by storing the values?

@popematt (Contributor, Author) · Sep 2, 2021

I don't know. I have also heard that looking up non-local values can be expensive in Python, and if we want to compute once and store the value, then it would need to be a non-local value. I'll create a followup issue to see if it would improve performance.

That's fine with me, although it does seem like a simple enough thing to try given that you already have a basic perf testing setup.

# no escaping needed, return the original _bytes
return _bytes
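
A sketch of the optimization raised in the review thread above; it is not part of this PR, and the constant names are hypothetical. The idea is to compute the one- and two-byte escape sequences once at module scope so _escape() does not rebuild bytes([...]) on every call (inside hasher.py the marker bytes are already module-level names; the import below only stands in for that context):

from ionhash.hasher import _BEGIN_MARKER, _END_MARKER, \
    _BEGIN_MARKER_BYTE, _END_MARKER_BYTE, _ESCAPE_BYTE

# Hypothetical constants, precomputed once at import time.
_ESCAPE = bytes([_ESCAPE_BYTE])
_ESCAPED_ESCAPE = bytes([_ESCAPE_BYTE, _ESCAPE_BYTE])
_ESCAPED_BEGIN = bytes([_ESCAPE_BYTE, _BEGIN_MARKER_BYTE])
_ESCAPED_END = bytes([_ESCAPE_BYTE, _END_MARKER_BYTE])

def _escape(_bytes):
    # Same behavior as the version in this PR, minus the per-call bytes([...]) construction.
    if _BEGIN_MARKER_BYTE in _bytes or _END_MARKER_BYTE in _bytes or _ESCAPE_BYTE in _bytes:
        return _bytes.replace(_ESCAPE, _ESCAPED_ESCAPE) \
                     .replace(_BEGIN_MARKER, _ESCAPED_BEGIN) \
                     .replace(_END_MARKER, _ESCAPED_END)
    return _bytes
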

10 changes: 5 additions & 5 deletions tests/test_ion_hash_tests.py
@@ -196,22 +196,22 @@ def to_ion_hash(algorithm):
_actual_updates,
_actual_digests))

_run_test(ion_test, to_ion_hash)

# Do not assert on expected_updates because the implementation of ion_hash() is not backed by an ion_writer
_run_test(ion_test, to_ion_hash, should_assert_on_expected_updates=False)

_actual_updates = []
_actual_digests = []


def _run_test(ion_test, digester):
def _run_test(ion_test, digester, should_assert_on_expected_updates=True):
expect = ion_test['expect']
for algorithm in expect:
expected_updates = []
expected_digests = []
final_digest = None
for sexp in expect[algorithm]:
annot = sexp.ion_annotations[0].text
if annot == "update":
if annot == "update" and should_assert_on_expected_updates:
expected_updates.append(sexp_to_bytearray(sexp))
pass
elif annot == "digest":
@@ -224,7 +224,7 @@ def _run_test(ion_test, digester):

actual_digest_bytes = digester(algorithm)

if len(expected_updates) > 0:
if should_assert_on_expected_updates and len(expected_updates) > 0:
assert _actual_updates == expected_updates

if final_digest is not None: