From 717afcb7e25033e638f42e4b114edb570ec9a619 Mon Sep 17 00:00:00 2001 From: Nico Matentzoglu Date: Tue, 6 Aug 2024 10:12:31 +0300 Subject: [PATCH] Replace semantic_similarity_* slots by similarity_* slots (#386) Replace `semantic_similarity_score` with `similarity_score` and `semantic_similarity_measure` with `similarity_measure` in the data model (https://github.com/mapping-commons/sssom/issues/385). The main rationale is that we frequently want to record similarity scores for measures that are not semantic, such as Levenshtein distance. Co-authored-by: Emily Hartley --- CHANGELOG.md | 1 + examples/schema/similarity_score.sssom.tsv | 15 +++++++ src/docs/spec-formats-tsv.md | 5 +++ src/sssom_schema/schema/sssom_schema.yaml | 46 +++++++++++----------- 4 files changed, 45 insertions(+), 22 deletions(-) create mode 100644 examples/schema/similarity_score.sssom.tsv diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ab083c2..d1032251 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Next - Add the concept of "propagatable slots". 
+- Replace `semantic_similarity_score` with `similarity_score` and `semantic_similarity_measure` with `similarity_measure` in the data model ([issue](https://github.com/mapping-commons/sssom/issues/385)) ## SSSOM version 0.15.1 diff --git a/examples/schema/similarity_score.sssom.tsv b/examples/schema/similarity_score.sssom.tsv new file mode 100644 index 00000000..eb28b910 --- /dev/null +++ b/examples/schema/similarity_score.sssom.tsv @@ -0,0 +1,15 @@ +#curie_map: +# HP: http://purl.obolibrary.org/obo/HP_ +# MP: http://purl.obolibrary.org/obo/MP_ +# orcid: https://orcid.org/ +# wikidata: https://www.wikidata.org/entity/ +#mapping_set_id: https://w3id.org/sssom/commons/examples/similarity_score.sssom.tsv +#license: "https://creativecommons.org/publicdomain/zero/1.0/" +#creator_id: +# - orcid:0000-0002-7356-1779 +#mapping_provider: "https://w3id.org/sssom/core_team" +#comment: This is an example file for the SSSOM for illustration only. Its contents are entirely fabricated. +subject_id predicate_id object_id mapping_justification similarity_score similarity_measure +HP:0009124 skos:exactMatch MP:0000003 semapv:LexicalSimilarityThresholdMatching 0.8 wikidata:Q865360 +HP:0008551 skos:exactMatch MP:0000018 semapv:LexicalSimilarityThresholdMatching 0.4 wikidata:Q865360 +HP:0000411 skos:exactMatch MP:0000021 semapv:SemanticSimilarityThresholdMatching 1.0 wikidata:Q1784941 diff --git a/src/docs/spec-formats-tsv.md b/src/docs/spec-formats-tsv.md index 6a12dc7f..4ea9853a 100644 --- a/src/docs/spec-formats-tsv.md +++ b/src/docs/spec-formats-tsv.md @@ -179,6 +179,11 @@ Any other value in the `match_term_type` slot MUST be treated as an error. If the set already contains `subject_type` and `object_type` slots, any `match_term_type` slot can be silently ignored. 
+#### semantic_similarity_score and semantic_similarity_measure + +Initial versions of this specification defined a `semantic_similarity_score` slot to store the semantic similarity, and a `semantic_similarity_measure` slot to describe how the semantic similarity is assessed. In SSSOM 1.0, those slots were replaced by more generic `similarity_score` and `similarity_measure` slots. + +Upon encountering a `semantic_similarity_score` (respectively `semantic_similarity_measure`) slot, implementations supporting pre-1.0 versions MUST silently transform it into a `similarity_score` (respectively `similarity_measure`) slot. No changes to the value of the slot are required. ## Canonical SSSOM/TSV format diff --git a/src/sssom_schema/schema/sssom_schema.yaml b/src/sssom_schema/schema/sssom_schema.yaml index 877ae9e5..0c3c5aa8 100644 --- a/src/sssom_schema/schema/sssom_schema.yaml +++ b/src/sssom_schema/schema/sssom_schema.yaml @@ -556,34 +556,36 @@ slots: - https://github.com/mapping-commons/sssom/issues/166 - https://github.com/mapping-commons/sssom/pull/258 - https://github.com/mapping-commons/sssom/blob/master/examples/schema/curation_rule_text.sssom.tsv - semantic_similarity_score: - description: A score between 0 and 1 to denote the semantic similarity, where - 1 denotes equivalence. - range: double - minimum_value: 0.0 - maximum_value: 1.0 - semantic_similarity_measure: - description: The measure used for computing the the semantic similarity score. - To make processing this field as unambiguous as possible, we recommend using - wikidata identifiers, but wikipedia pages could also be acceptable. - range: string - examples: - - value: https://www.wikidata.org/wiki/Q865360 - description: (the Wikidata identifier for the Jaccard index measure). similarity_score: - description: A score between 0 and 1 to denote the similarity, where - 1 denotes equivalence. 
+ description: A score between 0 and 1 to denote the similarity between two entities, where + 1 denotes equivalence, and 0 denotes disjointness. The score is meant to be used in conjunction + with the similarity_measure field, to document, for example, the lexical or semantic match + of a matching algorithm. range: double minimum_value: 0.0 maximum_value: 1.0 + see_also: + - https://github.com/mapping-commons/sssom/issues/385 + - https://github.com/mapping-commons/sssom/pull/386 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/similarity_score.sssom.tsv similarity_measure: - description: The measure used for computing the the similarity score. + description: The measure used for computing a similarity score. + This field is meant to be used in conjunction with the similarity_score field, to document, + for example, the lexical or semantic match of a matching algorithm. To make processing this field as unambiguous as possible, we recommend using - wikidata identifiers, but wikipedia pages could also be acceptable. + wikidata CURIEs, but the type of this field is deliberately unspecified. range: string examples: - - value: https://www.wikidata.org/wiki/Q865360 - description: (the Wikidata identifier for the Jaccard index measure). + - value: https://www.wikidata.org/entity/Q865360 + description: (the Wikidata IRI for the Jaccard index measure). + - value: wikidata:Q865360 + description: (the Wikidata CURIE for the Jaccard index measure). + - value: Levenshtein distance + description: (a score to measure the distance between two character sequences). + see_also: + - https://github.com/mapping-commons/sssom/issues/385 + - https://github.com/mapping-commons/sssom/pull/386 + - https://github.com/mapping-commons/sssom/blob/master/examples/schema/similarity_score.sssom.tsv issue_tracker_item: description: The issue tracker item discussing this mapping. 
range: EntityReference @@ -696,8 +698,8 @@ classes: - match_string - subject_preprocessing - object_preprocessing - - semantic_similarity_score - - semantic_similarity_measure + - similarity_score + - similarity_measure - see_also - issue_tracker_item - other