Add faster implementation of _IonNature.ion_hash #22

Merged · 2 commits · Sep 3, 2021
14 changes: 2 additions & 12 deletions ionhash/__init__.py
@@ -11,17 +11,10 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

from six import BytesIO

from amazon.ion.core import ION_STREAM_END_EVENT
from amazon.ion.simple_types import _IonNature
from amazon.ion.simpleion import _dump, _FROM_TYPE
from amazon.ion.writer import blocking_writer
from amazon.ion.writer_binary import binary_writer

from ionhash.fast_value_hasher import hash_value
from ionhash.hasher import hashlib_hash_function_provider
from ionhash.hasher import hash_writer
from ionhash.hasher import HashEvent


# pydoc for this method is DUPLICATED in docs/index.rst
@@ -54,10 +47,7 @@ def ion_hash(self, algorithm=None, hash_function_provider=None):
else:
hfp = hash_function_provider

hw = hash_writer(blocking_writer(binary_writer(), BytesIO()), hfp)
_dump(self, hw, _FROM_TYPE)
hw.send(ION_STREAM_END_EVENT)
return hw.send(HashEvent.DIGEST)
return hash_value(self, hfp)


# adds the `ion_hash` method to all simpleion value classes:
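
A minimal usage sketch of the changed entry point (the Ion literal and the 'md5' algorithm name below are illustrative). Importing ionhash attaches ion_hash() to every simpleion value class, and with this change the call goes through fast_value_hasher.hash_value rather than a hash_writer/binary_writer pipeline:

import ionhash  # imported for its side effect of adding ion_hash() to simpleion values
from amazon.ion import simpleion

# Load any Ion value and compute its Ion Hash digest.
value = simpleion.loads('{name: "hello", scores: [1, 2, 3]}')
digest = value.ion_hash('md5')
print(digest)
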
99 changes: 99 additions & 0 deletions ionhash/fast_value_hasher.py
@@ -0,0 +1,99 @@
from amazon.ion.core import IonType
from amazon.ion.simple_types import IonPyNull

from functools import cmp_to_key

from ionhash.hasher import _bytearray_comparator, _scalar_or_null_split_parts, _serialize_null, \
_UPDATE_SCALAR_HASH_BYTES_JUMP_TABLE, _BEGIN_MARKER, _TQ, _END_MARKER, \
_BEGIN_MARKER_BYTE, _END_MARKER_BYTE, _TQ_ANNOTATED_VALUE, _escape, _TQ_SYMBOL_SID0


class _IonEventDuck:
"""Looks like an IonEvent, quacks like an IonEvent...
Used for sending scalar values to the existing ion_binary_writer serializers.
"""
def __init__(self, value, ion_type):
self.value = value
self.ion_type = ion_type


# H(value) → h(s(value))
def hash_value(value, hfp):
"""An implementation of the [Ion Hash algorithm](https://github.com/amzn/ion-hash/blob/gh-pages/docs/spec.md)
for the Ion data model that doesn't instantiate any ion_readers or ion_writers.

Args:
value: the Ion value to hash
hfp: hash function provider

Returns:
Ion Hash digest of the given Ion value
"""
hash_fn = hfp()
hash_fn.update(serialize_value(value, hfp))
return hash_fn.digest()


# s(value) → serialized bytes
def serialize_value(value, hfp):
"""Transforms an Ion value to its Ion Hash serialized representation.

Args:
value: the Ion value to serialize
hfp: hash function provider

Returns:
bytes representing the given Ion value, serialized according to the Ion Hash algorithm
"""
if value.ion_annotations:
return _s_annotated_value(value, hfp)
else:
return _s_value(value, hfp)


# s(annotated value) → B || TQ || s(annotation1) || s(annotation2) || ... || s(annotationn) || s(value) || E
def _s_annotated_value(value, hfp):
return _BEGIN_MARKER + _TQ_ANNOTATED_VALUE + b''.join([_write_symbol(a) for a in value.ion_annotations]) \
+ _s_value(value, hfp) + _END_MARKER


# s(struct) → B || TQ || escape(concat(sort(H(field1), H(field2), ..., H(fieldn)))) || E
# s(list) or s(sexp) → B || TQ || s(value1) || s(value2) || ... || s(valuen) || E
# s(scalar) → B || TQ || escape(representation) || E
def _s_value(value, hfp):
ion_type = value.ion_type
is_ion_null = isinstance(value, IonPyNull)
if ion_type == IonType.STRUCT and not is_ion_null:
field_hashes = [_h_field(field_name, field_value, hfp) for [field_name, field_value] in value.iteritems()]
field_hashes.sort(key=cmp_to_key(_bytearray_comparator))
return _BEGIN_MARKER + bytes([_TQ[IonType.STRUCT]]) + _escape(b''.join(field_hashes)) + _END_MARKER
elif ion_type in [IonType.LIST, IonType.SEXP] and not is_ion_null:
return _BEGIN_MARKER + bytes([_TQ[ion_type]]) \
+ b''.join([bytes(serialize_value(child, hfp)) for child in value]) + _END_MARKER
else:
serializer = _serialize_null if is_ion_null else _UPDATE_SCALAR_HASH_BYTES_JUMP_TABLE[ion_type]
scalar_bytes = serializer(_IonEventDuck(None if is_ion_null else value, ion_type))
[tq, representation] = _scalar_or_null_split_parts(ion_type, scalar_bytes)
if len(representation) == 0:
return bytes([_BEGIN_MARKER_BYTE, tq, _END_MARKER_BYTE])
else:
return b''.join([_BEGIN_MARKER, bytes([tq]), _escape(representation), _END_MARKER])


# H(field) → h(s(fieldname) || s(fieldvalue))
def _h_field(field_name, field_value, hfp):
hash_fn = hfp()
hash_fn.update(_write_symbol(field_name) + serialize_value(field_value, hfp))
return hash_fn.digest()


# Function for writing symbol tokens (annotations and field names).
# Its logic is simpler than the general serializer because we can assume
# the value has no annotations and is always of type "symbol".
def _write_symbol(text_or_symbol_token):
text = getattr(text_or_symbol_token, 'text', text_or_symbol_token)
if text is None:
return bytes([_BEGIN_MARKER_BYTE, _TQ_SYMBOL_SID0, _END_MARKER_BYTE])
else:
return _BEGIN_MARKER + bytes([_TQ[IonType.SYMBOL]]) \
+ _escape(bytearray(text, encoding="utf-8")) + _END_MARKER
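
A minimal sketch of calling the new fast path directly. Assumption: any zero-argument callable that returns a fresh hashlib-style object with update() and digest() can serve as the hash function provider, since those are the only methods hash_value and _h_field use; hashlib.sha256 appears here purely for illustration:

import hashlib

from amazon.ion import simpleion
from ionhash.fast_value_hasher import hash_value

# hashlib.sha256 is itself a zero-argument callable that returns a new hash object,
# so it satisfies the provider interface used by hash_value.
value = simpleion.loads('{a: 1, b: [true, 2.5, "three"]}')
digest = hash_value(value, hashlib.sha256)
print(digest)
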
17 changes: 5 additions & 12 deletions ionhash/hasher.py
@@ -475,19 +475,12 @@ def _bytearray_comparator(a, b):
def _escape(_bytes):
"""If _bytes contains one or more BEGIN_MARKER_BYTEs, END_MARKER_BYTEs, or ESCAPE_BYTEs,
returns a new bytearray with such bytes preceded by an ESCAPE_BYTE; otherwise, returns
the original _bytes unchanged."
the original _bytes unchanged.
"""
for b in _bytes:
if b == _BEGIN_MARKER_BYTE or b == _END_MARKER_BYTE or b == _ESCAPE_BYTE:
# found a byte that needs to be escaped; build a new byte array that
# escapes that byte as well as any others
escaped_bytes = bytearray()
for c in _bytes:
if c == _BEGIN_MARKER_BYTE or c == _END_MARKER_BYTE or c == _ESCAPE_BYTE:
escaped_bytes.append(_ESCAPE_BYTE)
escaped_bytes.append(c)
return escaped_bytes

if _BEGIN_MARKER_BYTE in _bytes or _END_MARKER_BYTE in _bytes or _ESCAPE_BYTE in _bytes:
return _bytes.replace(bytes([_ESCAPE_BYTE]), bytes([_ESCAPE_BYTE, _ESCAPE_BYTE])) \
.replace(_BEGIN_MARKER, bytes([_ESCAPE_BYTE, _BEGIN_MARKER_BYTE])) \
.replace(_END_MARKER, bytes([_ESCAPE_BYTE, _END_MARKER_BYTE]))
Comment on lines +481 to +483

Should we avoid recomputing bytes([...]) on each call by storing the values?

@popematt (Contributor, Author) · Sep 2, 2021

I don't know. I have also heard that looking up non-local values can be expensive in Python, and if we want to compute once and store the value, then it would need to be a non-local value. I'll create a followup issue to see if it would improve performance.

That's fine with me, although it does seem like a simple enough thing to try given that you already have a basic perf testing setup.

# no escaping needed, return the original _bytes
return _bytes
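
A sketch of the optimization raised in the review thread above; it is not part of this PR, and the constant names are hypothetical. The idea is to compute the one- and two-byte escape sequences once at module scope so _escape() does not rebuild bytes([...]) on every call (inside hasher.py the marker bytes are already module-level names; the import below only stands in for that context):

from ionhash.hasher import _BEGIN_MARKER, _END_MARKER, \
    _BEGIN_MARKER_BYTE, _END_MARKER_BYTE, _ESCAPE_BYTE

# Hypothetical constants, precomputed once at import time.
_ESCAPE = bytes([_ESCAPE_BYTE])
_ESCAPED_ESCAPE = bytes([_ESCAPE_BYTE, _ESCAPE_BYTE])
_ESCAPED_BEGIN = bytes([_ESCAPE_BYTE, _BEGIN_MARKER_BYTE])
_ESCAPED_END = bytes([_ESCAPE_BYTE, _END_MARKER_BYTE])

def _escape(_bytes):
    # Same behavior as the version in this PR, minus the per-call bytes([...]) construction.
    if _BEGIN_MARKER_BYTE in _bytes or _END_MARKER_BYTE in _bytes or _ESCAPE_BYTE in _bytes:
        return _bytes.replace(_ESCAPE, _ESCAPED_ESCAPE) \
                     .replace(_BEGIN_MARKER, _ESCAPED_BEGIN) \
                     .replace(_END_MARKER, _ESCAPED_END)
    return _bytes
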

10 changes: 5 additions & 5 deletions tests/test_ion_hash_tests.py
@@ -196,22 +196,22 @@ def to_ion_hash(algorithm):
_actual_updates,
_actual_digests))

_run_test(ion_test, to_ion_hash)

# Do not assert on expected_updates because the implementation of ion_hash() is not backed by an ion_writer
_run_test(ion_test, to_ion_hash, should_assert_on_expected_updates=False)

_actual_updates = []
_actual_digests = []


def _run_test(ion_test, digester):
def _run_test(ion_test, digester, should_assert_on_expected_updates=True):
expect = ion_test['expect']
for algorithm in expect:
expected_updates = []
expected_digests = []
final_digest = None
for sexp in expect[algorithm]:
annot = sexp.ion_annotations[0].text
if annot == "update":
if annot == "update" and should_assert_on_expected_updates:
expected_updates.append(sexp_to_bytearray(sexp))
pass
elif annot == "digest":
@@ -224,7 +224,7 @@ def _run_test(ion_test, digester):

actual_digest_bytes = digester(algorithm)

if len(expected_updates) > 0:
if should_assert_on_expected_updates and len(expected_updates) > 0:
assert _actual_updates == expected_updates

if final_digest is not None: