Skip to content

Commit

Permalink
Merge pull request #764 from benjeffery/metadata-schema-equality
Browse files Browse the repository at this point in the history
Add equality methods for metadata schemas and canonicalise string representation
  • Loading branch information
mergify[bot] authored Aug 12, 2020
2 parents 05f8d40 + 0ef84d2 commit 59d3c30
Show file tree
Hide file tree
Showing 4 changed files with 185 additions and 5 deletions.
136 changes: 134 additions & 2 deletions python/tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ def test_metadata_schema(self):
"additionalProperties": False,
}
ms = metadata.MetadataSchema(schema)
self.assertEqual(str(ms), json.dumps(schema))
self.assertEqual(str(ms), tskit.canonical_json(schema))
# Missing required properties
with self.assertRaises(exceptions.MetadataValidationError):
ms.validate_and_encode_row({})
Expand All @@ -344,6 +344,72 @@ def test_parse(self):
with self.assertRaises(ValueError):
metadata.parse_metadata_schema(json.dumps({"codec": "json"})[:-1])

def test_canonical_string(self):
    # The same schema written with two different key orders. OrderedDict
    # keeps the insertion order, so a plain json.dumps of each one yields
    # different text.
    declared_order = collections.OrderedDict(
        [
            ("codec", "json"),
            ("title", "Example Metadata"),
            ("type", "object"),
            (
                "properties",
                collections.OrderedDict(
                    [("one", {"type": "string"}), ("two", {"type": "number"})]
                ),
            ),
            ("required", ["one", "two"]),
            ("additionalProperties", False),
        ]
    )
    shuffled_order = collections.OrderedDict(
        [
            ("type", "object"),
            (
                "properties",
                collections.OrderedDict(
                    [("two", {"type": "number"}), ("one", {"type": "string"})]
                ),
            ),
            ("required", ["one", "two"]),
            ("additionalProperties", False),
            ("title", "Example Metadata"),
            ("codec", "json"),
        ]
    )
    # Sanity check: the naive encodings really do differ...
    self.assertNotEqual(json.dumps(declared_order), json.dumps(shuffled_order))
    # ...whereas the string form of the MetadataSchema does not.
    declared_text = str(metadata.MetadataSchema(declared_order))
    shuffled_text = str(metadata.MetadataSchema(shuffled_order))
    self.assertEqual(declared_text, shuffled_text)

def test_equality(self):
    def plain_schema(type_of_two):
        # Schema as a plain dict; only the type of property "two" varies.
        return {
            "codec": "json",
            "title": "Example Metadata",
            "type": "object",
            "properties": {
                "one": {"type": "string"},
                "two": {"type": type_of_two},
            },
            "required": ["one", "two"],
            "additionalProperties": False,
        }

    reference = metadata.MetadataSchema(plain_schema("number"))
    # Same content as the reference, but with the keys in another order.
    reordered = metadata.MetadataSchema(
        collections.OrderedDict(
            [
                ("type", "object"),
                (
                    "properties",
                    collections.OrderedDict(
                        [
                            ("two", {"type": "number"}),
                            ("one", {"type": "string"}),
                        ]
                    ),
                ),
                ("required", ["one", "two"]),
                ("additionalProperties", False),
                ("title", "Example Metadata"),
                ("codec", "json"),
            ]
        )
    )
    # Genuinely different content: property "two" is a string here.
    different = metadata.MetadataSchema(plain_schema("string"))

    # Both operators are exercised explicitly, so assertTrue/assertFalse
    # are used rather than assertEqual/assertNotEqual.
    for equal_schema in (reference, reordered):
        self.assertTrue(reference == equal_schema)
        self.assertFalse(reference != equal_schema)
    self.assertTrue(reference != different)
    self.assertFalse(reference == different)

def test_bad_top_level_type(self):
for bad_type in ["array", "boolean", "integer", "null", "number", "string"]:
schema = {
Expand Down Expand Up @@ -388,11 +454,19 @@ def test_json_codec(self):
# Valid row data
row_data = {"one": "tree", "two": 5}
self.assertEqual(
ms.validate_and_encode_row(row_data), json.dumps(row_data).encode()
ms.validate_and_encode_row(row_data),
tskit.canonical_json(row_data).encode(),
)
self.assertEqual(ms.decode_row(json.dumps(row_data).encode()), row_data)
# Round trip
self.assertEqual(ms.decode_row(ms.validate_and_encode_row(row_data)), row_data)
# Test canonical encoding
row_data = collections.OrderedDict(one="tree", two=5)
row_data2 = collections.OrderedDict(two=5, one="tree")
self.assertNotEqual(json.dumps(row_data), json.dumps(row_data2))
self.assertEqual(
ms.validate_and_encode_row(row_data), ms.validate_and_encode_row(row_data2)
)

def test_msgpack_codec(self):
class MsgPackCodec(metadata.AbstractMetadataCodec):
Expand Down Expand Up @@ -1626,3 +1700,61 @@ def test_population(self):
self.assertDictEqual(
metadata.MetadataSchema(schema).decode_row(example), expected
)


class TestTableCollectionEquality(unittest.TestCase):
    # Checks that table collection comparison and the stored schema string
    # do not depend on the key order used when setting metadata or schema.

    def test_equality(self):
        tree_seq = msprime.simulate(10, random_seed=42)
        left = tree_seq.dump_tables()
        right = tree_seq.dump_tables()
        schema_forward = collections.OrderedDict(
            [
                ("codec", "json"),
                ("title", "Example Metadata"),
                ("type", "object"),
                (
                    "properties",
                    collections.OrderedDict(
                        [
                            ("one", {"type": "string"}),
                            ("two", {"type": "number"}),
                        ]
                    ),
                ),
                ("required", ["one", "two"]),
                ("additionalProperties", False),
            ]
        )
        schema_shuffled = collections.OrderedDict(
            [
                ("type", "object"),
                (
                    "properties",
                    collections.OrderedDict(
                        [
                            ("two", {"type": "number"}),
                            ("one", {"type": "string"}),
                        ]
                    ),
                ),
                ("required", ["one", "two"]),
                ("additionalProperties", False),
                ("title", "Example Metadata"),
                ("codec", "json"),
            ]
        )
        # Only one side has a schema: the collections must differ.
        left.metadata_schema = metadata.MetadataSchema(schema_forward)
        self.assertNotEqual(left, right)
        # Same schema with reordered keys on the other side: equal again.
        right.metadata_schema = metadata.MetadataSchema(schema_shuffled)
        self.assertEqual(left, right)
        # Repeat the same pattern for the metadata itself.
        left.metadata = collections.OrderedDict([("one", "tree"), ("two", 5)])
        self.assertNotEqual(left, right)
        right.metadata = collections.OrderedDict([("two", 5), ("one", "tree")])
        self.assertEqual(left, right)

    def test_fixing_uncanonical(self):
        tree_seq = msprime.simulate(10, random_seed=42)
        tables = tree_seq.dump_tables()
        schema = collections.OrderedDict(
            [
                ("codec", "json"),
                ("title", "Example Metadata"),
                ("type", "object"),
                (
                    "properties",
                    collections.OrderedDict(
                        [
                            ("one", {"type": "string"}),
                            ("two", {"type": "number"}),
                        ]
                    ),
                ),
                ("required", ["one", "two"]),
                ("additionalProperties", False),
            ]
        )
        # Set with low-level to emulate loading.
        tables.ll_tables.metadata_schema = json.dumps(schema)
        stored_before = tables.ll_tables.metadata_schema
        self.assertNotEqual(stored_before, tskit.canonical_json(schema))
        # Re-assigning through the high-level property should rewrite the
        # stored string in canonical form.
        tables.metadata_schema = tables.metadata_schema
        stored_after = tables.ll_tables.metadata_schema
        self.assertEqual(stored_after, tskit.canonical_json(schema))
29 changes: 29 additions & 0 deletions python/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"""
Tests for functions in util.py
"""
import collections
import itertools
import math
import pickle
Expand All @@ -34,6 +35,34 @@
from tskit import UNKNOWN_TIME


class TestCanonicalJSON(unittest.TestCase):
    def test_canonical_json(self):
        # Nested mapping whose keys include a leading space, a digit and an
        # underscore, to pin down the sort order of such keys.
        nested = collections.OrderedDict(
            {
                "b": 1,
                "z": {},
                " space": 42,
                "1": "number",
                "_": "underscore",
            }
        )
        # (input, expected encoding) pairs, checked in order.
        cases = [
            # List order is preserved.
            ([3, 2, 1], "[3,2,1]"),
            # Mapping keys are sorted and no whitespace is emitted.
            (
                collections.OrderedDict(c=3, b=2, a=1),
                '{"a":1,"b":2,"c":3}',
            ),
            # Sorting also applies to nested mappings.
            (
                collections.OrderedDict(c="3", b=nested, a="1"),
                '{"a":"1","b":{" space":42,"1":"number",'
                '"_":"underscore","b":1,"z":{}},"c":"3"}',
            ),
        ]
        for value, expected in cases:
            self.assertEqual(util.canonical_json(value), expected)


class TestUnknownTime(unittest.TestCase):
def test_unknown_time(self):
self.assertTrue(math.isnan(UNKNOWN_TIME))
Expand Down
12 changes: 9 additions & 3 deletions python/tskit/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

import jsonschema

import tskit
import tskit.exceptions as exceptions


Expand Down Expand Up @@ -111,7 +112,7 @@ def __init__(self, schema: Mapping[str, Any]) -> None:
pass

def encode(self, obj: Any) -> bytes:
return json.dumps(obj).encode()
return tskit.canonical_json(obj).encode()

def decode(self, encoded: bytes) -> Any:
return json.loads(encoded.decode())
Expand Down Expand Up @@ -516,14 +517,17 @@ def __init__(self, schema: Optional[Mapping[str, Any]]) -> None:
# does not.
schema = codec_cls.modify_schema(schema)
codec_instance = codec_cls(schema)
self._string = json.dumps(schema)
self._string = tskit.canonical_json(schema)
self._validate_row = TSKITMetadataSchemaValidator(schema).validate
self.encode_row = codec_instance.encode
self.decode_row = codec_instance.decode

def __str__(self) -> str:
return self._string

def __eq__(self, other) -> bool:
    """
    Compares this schema with ``other`` using their stored string forms.

    :param other: The object to compare against.
    :return: True if ``other`` is a MetadataSchema whose string form is
        identical to this one's; ``NotImplemented`` if ``other`` is not a
        MetadataSchema, so that Python falls back to its default handling
        and ``schema == <unrelated object>`` evaluates to False rather
        than raising AttributeError.
    """
    # NOTE(review): defining __eq__ without __hash__ makes instances
    # unhashable; add a matching __hash__ if schemas need to be used in
    # sets or as dict keys.
    if not isinstance(other, MetadataSchema):
        return NotImplemented
    return self._string == other._string

@property
def schema(self) -> Optional[Mapping[str, Any]]:
# Make schema read-only
Expand Down Expand Up @@ -567,7 +571,9 @@ def parse_metadata_schema(encoded_schema: str) -> MetadataSchema:
return MetadataSchema(schema=None)
else:
try:
decoded = json.loads(encoded_schema)
decoded = json.loads(
encoded_schema, object_pairs_hook=collections.OrderedDict
)
except json.decoder.JSONDecodeError:
raise ValueError(f"Metadata schema is not JSON, found {encoded_schema}")
return MetadataSchema(decoded)
13 changes: 13 additions & 0 deletions python/tskit/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,26 @@
"""
Module responsible for various utility functions used in other modules.
"""
import json
import struct

import numpy as np

from tskit import UNKNOWN_TIME


def canonical_json(obj):
    """
    Encodes ``obj`` as JSON in a canonical form, with mapping keys sorted
    and no whitespace between tokens, so that encoded data can be compared
    at the byte level.

    :param Any obj: Python object to encode
    :return: The encoded string
    :rtype: str
    """
    # Bare "," and ":" drop the spaces json.dumps inserts by default.
    compact_separators = (",", ":")
    return json.dumps(obj, separators=compact_separators, sort_keys=True)


def is_unknown_time(time):
"""
As the default unknown mutation time is NAN equality always fails. This
Expand Down

0 comments on commit 59d3c30

Please sign in to comment.