From 45f8471cc0e48c07b457e180a9868bc495503ec3 Mon Sep 17 00:00:00 2001 From: solonovamax Date: Tue, 3 Oct 2023 14:43:48 -0400 Subject: [PATCH] Large docs refactor Signed-off-by: solonovamax --- kt-string-similarity/dokka/includes/edit.md | 236 ++++++++++ .../dokka/includes/interfaces.md | 51 +++ .../dokka/includes/kt-string-similarity.md | 14 +- .../dokka/includes/stringsimilarity.md | 412 +++++------------- .../stringsimilarity/RatcliffObershelp.kt | 8 +- .../interfaces/NormalizedStringDistance.kt | 2 +- .../interfaces/NormalizedStringSimilarity.kt | 2 +- 7 files changed, 407 insertions(+), 318 deletions(-) create mode 100644 kt-string-similarity/dokka/includes/edit.md create mode 100644 kt-string-similarity/dokka/includes/interfaces.md diff --git a/kt-string-similarity/dokka/includes/edit.md b/kt-string-similarity/dokka/includes/edit.md new file mode 100644 index 0000000..5d4792f --- /dev/null +++ b/kt-string-similarity/dokka/includes/edit.md @@ -0,0 +1,236 @@ +# Package ca.solostudios.stringsimilarity.edit + +This package contains the edit-based string measure implementations. + +## Algorithms + +### [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] + +The [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance between two words is the minimum number of +single-character edits (insertions, deletions, or substitutions) required to change one word into the other. + +It is a metric string distance. This class implements the dynamic programming approach, +which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\). + +#### Example + +```kotlin +val levenshtein = Levenshtein() + +println(levenshtein.distance("My string", "My \$tring")) // prints 1.0 +``` + +### [Normalized Levenshtein][ca.solostudios.stringsimilarity.edit.NormalizedLevenshtein] + +This is computed as the [levenshtein distance][ca.solostudios.stringsimilarity.edit.Levenshtein] +normalized to be in the range \\([0.0, 1.0]\\). 
+ +It is a metric string distance. This class implements the dynamic programming approach, +which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\). + +#### Example + +```kotlin +val normLevenshtein = NormalizedLevenshtein() + +println(normLevenshtein.distance("My string", "My \$tring")) // prints 0.10526315789473684 +``` + +### [Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] + +Similar to the [Levenshtein distance][ca.solostudios.stringsimilarity.edit.Levenshtein], +the [Damerau-Levenshtein distance][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] with transposition +(also sometimes called unrestricted Damerau-Levenshtein distance) is the minimum number of operations needed to transform +one string into the other, where an operation is defined as an insertion, deletion, or substitution of a single character, +or a **transposition of two adjacent characters**. + +It is a metric string distance. This class implements the dynamic programming approach, +which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\). + +This is not to be confused with the optimal string alignment distance, which is an extension where no substring can be +edited more than once. 
+ +#### Example + +```kotlin +val damerau = DamerauLevenshtein() + +println(damerau.distance("ABCDEF", "ABDCEF")) // prints 1.0 + +// 2 substitutions +println(damerau.distance("ABCDEF", "BACDFE")) // prints 2.0 + +// 1 deletion +println(damerau.distance("ABCDEF", "ABCDE")) // prints 1.0 +println(damerau.distance("ABCDEF", "BCDEF")) // prints 1.0 +println(damerau.distance("ABCDEF", "ABCGDEF")) // prints 1.0 + +// All different +println(damerau.distance("ABCDEF", "POIU")) // prints 6.0 + +// Transpose +println(damerau.distance("CA", "ABC")) // prints 2.0 +``` + +### [Normalized Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.NormalizedDamerauLevenshtein] + +This is computed as the [Damerau-Levenshtein distance][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] +normalized to be in the range \\([0.0, 1.0]\\). + +It is a metric string distance. This class implements the dynamic programming approach, +which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\). 
+ +#### Example + +```kotlin +val damerau = NormalizedDamerauLevenshtein() + +println(damerau.distance("ABCDEF", "ABDCEF")) // prints 0.15384615384615385 + +// 2 substitutions +println(damerau.distance("ABCDEF", "BACDFE")) // prints 0.2857142857142857 + +// 1 deletion +println(damerau.distance("ABCDEF", "ABCDE")) // prints 0.16666666666666666 +println(damerau.distance("ABCDEF", "BCDEF")) // prints 0.16666666666666666 +println(damerau.distance("ABCDEF", "ABCGDEF")) // prints 0.14285714285714285 + +// All different +println(damerau.distance("ABCDEF", "POIU")) // prints 0.75 + +// Transpose +println(damerau.distance("CA", "ABC")) // prints 0.5714285714285714 +``` + +### [Optimal String Alignment][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment] + +The [Optimal String Alignment distance][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment] variant +of [Damerau-Levenshtein distance][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] +(sometimes called the restricted edit distance) computes the number of edit operations needed +to make the strings equal under the condition that **no substring is edited more than once**, +whereas the true [Damerau-Levenshtein distance][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] +presents no such restriction. +The difference from the algorithm for the [Levenshtein distance][ca.solostudios.stringsimilarity.edit.Levenshtein] is the +addition of one recurrence for the transposition operations. + +It is not a true metric, as the triangle inequality does not hold. This class implements the dynamic programming approach, +which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\). 
+ +#### Example + +```kotlin +val osa = OptimalStringAlignment() + +println(osa.distance("ABCDEF", "ABDCEF")) // prints 1.0 + +// 2 substitutions +println(osa.distance("ABCDEF", "BACDFE")) // prints 2.0 + +// 1 deletion +println(osa.distance("ABCDEF", "ABCDE")) // prints 1.0 +println(osa.distance("ABCDEF", "BCDEF")) // prints 1.0 +println(osa.distance("ABCDEF", "ABCGDEF")) // prints 1.0 + +// All different +println(osa.distance("ABCDEF", "POIU")) // prints 6.0 + +println(osa.distance("CA", "ABC")) // prints 3.0 +``` + +### [Normalized Optimal String Alignment][ca.solostudios.stringsimilarity.edit.NormalizedOptimalStringAlignment] + +This is computed as the [Optimal String Alignment][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment] +normalized to be in the range \\([0.0, 1.0]\\). + +It is not a true metric, as the triangle inequality does not hold. This class implements the dynamic programming approach, +which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\). + +#### Example + +```kotlin +val osa = NormalizedOptimalStringAlignment() + +println(osa.distance("ABCDEF", "ABDCEF")) // prints 0.15384615384615385 + +// 2 substitutions +println(osa.distance("ABCDEF", "BACDFE")) // prints 0.2857142857142857 + +// 1 deletion +println(osa.distance("ABCDEF", "ABCDE")) // prints 0.16666666666666666 +println(osa.distance("ABCDEF", "BCDEF")) // prints 0.16666666666666666 +println(osa.distance("ABCDEF", "ABCGDEF")) // prints 0.14285714285714285 + +// All different +println(osa.distance("ABCDEF", "POIU")) // prints 0.75 + +// Transpose +println(osa.distance("CA", "ABC")) // prints 0.75 +``` + +### [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS] + +The [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS] (LCS) problem consists in finding the longest +subsequence common to two (or more) sequences. 
+It differs from problems of finding common substrings: unlike substrings, subsequences are not required to +occupy consecutive positions within the original sequences. + +It is used by the diff utility, by Git for reconciling multiple changes, etc. + +The [LCS distance][ca.solostudios.stringsimilarity.edit.LCS] is equivalent +to the [Levenshtein distance][ca.solostudios.stringsimilarity.edit.Levenshtein] when only insertions and deletions are +allowed (no substitution), or when the cost of a substitution is double the cost of an insertion or deletion. + +It is a metric string distance. This class implements the dynamic programming approach, +which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\)[@ft-a]. + +#### Example + +```kotlin +val lcs = LCS() + +println(lcs.distance("AGCAT", "GAC")) // prints 4.0 + +println(lcs.distance("AGCAT", "AGCT")) // prints 1.0 +``` + +### [Normalized Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.NormalizedLCS] + +This is computed as the [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS] +normalized to be in the range \\([0.0, 1.0]\\). + +It is a metric string distance. This class implements the dynamic programming approach, +which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\)[@ft-a]. + +#### Example + +```kotlin +val normalizedLCS = NormalizedLCS() + +println(normalizedLCS.distance("ABCDEFG", "ABCDEFHJKL")) // prints 0.45454545454545453 + +println(normalizedLCS.distance("ABDEF", "ABDIF")) // prints 0.3333333333333333 +``` + +

Notes

+
+
    +
  1. + +K.S. Larsen proposed an algorithm that computes the length of LCS in time +\\(O(\\log(m) \\times \\log(n))\\).[@ref-4] But the algorithm has a memory requirement \\(O(m \\times n^2)\\) and was thus not +implemented here. +
  2. +
+
+ +

References

+
+
    +
  1. + +Larsen, K. S. (1992-10). Length of maximal common subsequences. DAIMI Report +Series, 21(426). +[[sci-hub]](https://sci-hub.st/10.7146/dpb.v21i426.6740) +
  2. +
+
diff --git a/kt-string-similarity/dokka/includes/interfaces.md b/kt-string-similarity/dokka/includes/interfaces.md new file mode 100644 index 0000000..0439a35 --- /dev/null +++ b/kt-string-similarity/dokka/includes/interfaces.md @@ -0,0 +1,51 @@ +# Package ca.solostudios.stringsimilarity.interfaces + +This package contains all the interfaces for string measures. + +## Normalized, metric, similarity and distance + +Although the topic might seem simple, a lot of different algorithms exist to measure text similarity or distance. +Therefore, the library defines some interfaces to categorize them. + +### (Normalized) Similarity and Distance + +- [StringSimilarity][ca.solostudios.stringsimilarity.interfaces.StringSimilarity]: Implementing algorithms define a + similarity between + strings (0 means strings are completely different). +- [NormalizedStringSimilarity][ca.solostudios.stringsimilarity.interfaces.NormalizedStringSimilarity]: The interface + extends [StringSimilarity][ca.solostudios.stringsimilarity.interfaces.StringSimilarity]. + Implementing algorithms compute a similarity that has been normalized based on the number of operations performed. + This means that for non-weighted implementations, the result will always be between 0 and 1. + [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] is an example of this. +- [StringDistance][ca.solostudios.stringsimilarity.interfaces.StringDistance]: Implementing algorithms define a distance + between strings (0 means strings are identical), like [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] for example. + The maximum distance value depends on the algorithm. +- [NormalizedStringDistance][ca.solostudios.stringsimilarity.interfaces.NormalizedStringDistance]: This interface + extends [StringDistance][ca.solostudios.stringsimilarity.interfaces.StringDistance]. + Implementing algorithms compute a distance that has been normalized based on the number of operations performed. 
+ This means that for non-weighted implementations, the result will always be in the range \\([0, 1]\\). + [NormalizedLevenshtein][ca.solostudios.stringsimilarity.edit.NormalizedLevenshtein] is an example of this. + +Generally, algorithms that +implement [NormalizedStringSimilarity][ca.solostudios.stringsimilarity.interfaces.NormalizedStringSimilarity] +also implement [NormalizedStringDistance][ca.solostudios.stringsimilarity.interfaces.NormalizedStringDistance]. +This is because the similarity can be computed as \\(1 - \\text{distance}\\), +and the distance can be computed as \\(1 - \\text{similarity}\\). + +> Note: This is only applicable if the result is *always* between 0 and 1. + +### Metric Distances + +The [MetricStringDistance][ca.solostudios.stringsimilarity.interfaces.MetricStringDistance] +interface indicates that the implementing class is a metric distance, +which means that it satisfies the required axioms to be considered metric. +Read [MetricStringDistance][ca.solostudios.stringsimilarity.interfaces.MetricStringDistance] for more information. + +A lot of nearest-neighbor search algorithms and indexing structures rely on the triangle inequality. +You can check "Similarity Search, The Metric Space Approach" by Zezula et al. for a survey. +These cannot be used with non-metric similarity measures. + +### Edit Measures + +The edit measure interfaces indicate when a specific algorithm is edit-based. +See the `edit` package for all implementors. 
diff --git a/kt-string-similarity/dokka/includes/kt-string-similarity.md b/kt-string-similarity/dokka/includes/kt-string-similarity.md index 2ce7382..b2cb61c 100644 --- a/kt-string-similarity/dokka/includes/kt-string-similarity.md +++ b/kt-string-similarity/dokka/includes/kt-string-similarity.md @@ -26,14 +26,14 @@ The "cost" columns gives an estimation of the computational/memory costs to comp | Name | Distance | Similarity | Normalized | Metric | Memory cost | Execution cost | |--------------------------------------------|:--------:|:----------:|:----------:|:------:|----------------------|------------------------------------| -| Levenshtein | ☒ | ☐ | ☐ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | -| Damerau-Levenshtein[@ft-c] | ☒ | ☐ | ☐ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | -| Optimal String Alignment[@ft-c] | ☒ | ☐ | ☐ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | -| Longest Common Subsequence | ☒ | ☐ | ☐ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a][@ft-b] | +| Levenshtein | ☒ | ☒ | ☐ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | +| Damerau-Levenshtein[@ft-c] | ☒ | ☒ | ☐ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | +| Optimal String Alignment[@ft-c] | ☒ | ☒ | ☐ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | +| Longest Common Subsequence | ☒ | ☒ | ☐ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a][@ft-b] | | Normalized Levenshtein | ☒ | ☒ | ☒ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | -| Normalized Damerau-Levenshtein[@ft-c] | ☒ | ☐ | ☒ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | -| Normalized Optimal String Alignment[@ft-c] | ☒ | ☐ | ☒ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | -| Normalized Longest Common Subsequence | ☒ | ☐ | ☒ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a][@ft-b] | +| Normalized Damerau-Levenshtein[@ft-c] | ☒ | ☒ | ☒ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | +| 
Normalized Optimal String Alignment[@ft-c] | ☒ | ☒ | ☒ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a] | +| Normalized Longest Common Subsequence | ☒ | ☒ | ☒ | ☒ | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a][@ft-b] | | Cosine similarity | ☒ | ☒ | ☒ | ☐ | \\(O(m + n)\\) | \\(O(m + n)\\) | | Jaccard index | ☒ | ☒ | ☒ | ☒ | \\(O(m + n)\\) | \\(O(m + n)\\) | | Jaro-Winkler | ☒ | ☒ | ☒ | ☐ | \\(O(m + n)\\) | \\(O(m \\times n)\\) | diff --git a/kt-string-similarity/dokka/includes/stringsimilarity.md b/kt-string-similarity/dokka/includes/stringsimilarity.md index e23245f..a6f131c 100644 --- a/kt-string-similarity/dokka/includes/stringsimilarity.md +++ b/kt-string-similarity/dokka/includes/stringsimilarity.md @@ -1,204 +1,45 @@ # Package ca.solostudios.stringsimilarity -Package containing all the string similarity algorithms +This package contains most of the string measure implementations. ## Algorithms -### Normalized, metric, similarity and distance - -Although the topic might seem simple, a lot of different algorithms exist to measure text similarity or distance. -Therefore, the library defines some interfaces to categorize them. - -#### (Normalized) similarity and distance - -- [StringSimilarity][ca.solostudios.stringsimilarity.interfaces.StringSimilarity]: Implementing algorithms define a - similarity between - strings (0 means strings are completely different). -- [NormalizedStringSimilarity][ca.solostudios.stringsimilarity.interfaces.NormalizedStringSimilarity]: The interface - extends [StringSimilarity][ca.solostudios.stringsimilarity.interfaces.StringSimilarity]. - Implementing algorithms compute a similarity that has been normalized based on the number of operations performed. - This means that for non-weighted implementations, the result will always be between 0 and 1. - [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] is an example of this. 
-- [StringDistance][ca.solostudios.stringsimilarity.interfaces.StringDistance]: Implementing algorithms define a distance - between strings - (0 means strings are identical), like [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] for example. - The maximum distance value depends on the algorithm. -- [NormalizedStringDistance][ca.solostudios.stringsimilarity.interfaces.NormalizedStringDistance]: This interface - extends [StringDistance][ca.solostudios.stringsimilarity.interfaces.StringDistance]. - Implementing algorithms compute a distance that has been normalized based on the number of operations performed. - This means that for non-weighted implementations, the result will always be between \\([0, 1]\\). - [NormalizedLevenshtein][ca.solostudios.stringsimilarity.edit.NormalizedLevenshtein] is an example of this. - -Generally, algorithms that -implement [NormalizedStringSimilarity][ca.solostudios.stringsimilarity.interfaces.NormalizedStringSimilarity] -also implement [NormalizedStringDistance][ca.solostudios.stringsimilarity.interfaces.NormalizedStringDistance], and -\\(\text{similarity} = 1 - \text{distance}\\). -
-( !! Only if the result is *always* between 0 and 1. !! ) - -But there are a few exceptions, like N-Gram similarity and distance (Kondrak). - -#### Metric distances - -The [MetricStringDistance][ca.solostudios.stringsimilarity.interfaces.MetricStringDistance] interface: A few of the -distances are actually -metric distances, which means that verify the triangle inequality \\(d(x, y) <= d(x,z) + d(z,y)\\). -For example, [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] is a metric distance, but -[NormalizedLevenshtein][ca.solostudios.stringsimilarity.edit.NormalizedLevenshtein] is not. - -A lot of nearest-neighbor search algorithms and indexing structures rely on the triangle inequality. -You can check "Similarity Search, The Metric Space Approach" by Zezula et al. for a survey. -These cannot be used with non metric similarity measures. +### Edit-based measures (Levenshtein, LCS, Damerau-Levenshtein, etc.) + +All edit-based string measures are located in the `edit` package. ### Shingles (n-gram) based similarity and distance -A few algorithms work by converting strings into sets of n-grams (sequences of n characters, also sometimes called -k-shingles). +A few algorithms work by converting strings into sets of n-grams +(sequences of n characters, also sometimes called k-shingles). The similarity or distance between the strings is then the similarity or distance between the sets. -Some of them, like [jaccard][ca.solostudios.stringsimilarity.Jaccard], consider strings as sets of shingles, and don't -consider the number -of occurences of each shingle. +Some of them, like the [Jaccard index][ca.solostudios.stringsimilarity.Jaccard], consider strings as sets of shingles, and don't +consider the number of occurrences of each shingle. Others, like [cosine similarity][ca.solostudios.stringsimilarity.Cosine], work using what is sometimes called the -profile of the strings, -which takes into account the number of occurences of each shingle. 
+profile of the strings, which takes into account the number of occurrences of each shingle. For these algorithms, another use case is possible when dealing with large datasets: 1. Compute the set or profile representation of all the strings 2. Compute the similarity between sets or profiles -### [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] - -The [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance between two words is the minimum number of -single-character edits -(insertions, deletions or substitutions) required to change one word into the other. - -It is a metric string distance. -This implementation uses dynamic programming (Wagner–Fischer algorithm), with only 2 rows of data. -The space requirement is thus \\(O(m)\\) and the algorithm runs in \\(O(m \\times n)\\). - -#### Example - -```kotlin -val levenshtein = Levenshtein() - -println(levenshtein.distance("My string", "My \$tring")) // prints 1.0 -``` - -### [Normalized Levenshtein][ca.solostudios.stringsimilarity.edit.NormalizedLevenshtein] - -This distance is computed as [levenshtein distance][ca.solostudios.stringsimilarity.edit.Levenshtein] divided by the -length of the longest -string. -The resulting value is always in the interval \\([0.0, 1.0]\\) but it is not a metric anymore! - -The similarity is computed as 1 - normalized distance. - -```kotlin -val normLevenshtein = NormalizedLevenshtein() - -println(normLevenshtein.distance("My string", "My \$tring")) // prints 0.1111111111111111 -``` - -### [Weighted Levenshtein][ca.solostudios.stringsimilarity.WeightedLevenshtein] - -An implementation of [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] that allows to define different -weights for different -character substitutions. - -This algorithm is usually used for optical character recognition (OCR) applications. 
-For OCR, the cost of substituting P and R is lower then the cost of substituting P and M for example because because -from and OCR point of -view P is similar to R. - -It can also be used for keyboard typing auto-correction. -Here the cost of substituting T and R is lower for example because these are located next to each other on an AZERTY or -QWERTY keyboard. -Hence the probability that the user mistyped the characters is higher. - -```kotlin -val weightedLevenshtein = WeightedLevenshtein() { old, new -> - if (old == 't' && new == 'r') 0.5 else 1.0 -} - -println(weightedLevenshtein.distance("String1", "Srring2")) // prints 1.5 -``` - -### [Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] - -Similar to [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein], -[Damerau-Levenshtein distance][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] with transposition -(also sometimes calls unrestricted Damerau-Levenshtein distance) is the minimum number of operations needed to transform -one string into the -other, where an operation is defined as an insertion, deletion, or substitution of a single character, or a * -*transposition of two adjacent -characters**. - -It does respect triangle inequality, and is thus a metric distance. - -This is not to be confused with the optimal string alignment distance, which is an extension where no substring can be -edited more than -once. 
- -```kotlin -val damerau = Damerau() - -println(damerau.distance("ABCDEF", "ABDCEF")) // prints 1.0 - -// 2 substitutions -println(d.distance("ABCDEF", "BACDFE")) // prints 2.0 - -// 1 deletion -println(d.distance("ABCDEF", "ABCDE")) // prints 1.0 -println(d.distance("ABCDEF", "BCDEF")) // prints 1.0 -println(d.distance("ABCDEF", "ABCGDEF")) // prints 1.0 - -// All different -println(d.distance("ABCDEF", "POIU")) // prints 6.0 -``` - -### [Optimal String Alignment][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment] - -The [Optimal String Alignment][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment] variant -of [Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] (sometimes called the restricted edit -distance) computes the number of -edit operations needed to make the strings equal under the condition that **no substring is edited more than once**, -whereas the -true [Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] presents no such restriction. -The difference from the algorithm for [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance is the -addition of one recurrence -for the transposition -operations. - -Note that for the [optimal string alignment][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment] distance, the -triangle inequality does -not hold and so it is not a true metric. - -```kotlin -val optimalStringAlignment = OptimalStringAlignment() - -println(optimalStringAlignment.distance("CA", "ABC")) // prints 3.0 -``` - ### [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] -[Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] is a string edit distance that was developed in the area of -record linkage -(duplicate detection) (Winkler, 1990). -The Jaro–Winkler distance metric is designed and best suited for short strings such as person names, and to detect -transposition typos. 
+[Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] is a string distance that was developed in the area of +record linkage (duplicate detection), as defined by Winkler[@ref-1]. +The Jaro–Winkler distance metric is designed and best suited for short strings such as person names, +and to detect transposition typos. [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] computes the similarity between 2 strings, and the returned -value lies in the -interval \\([0.0, 1.0]\\). +value lies in the interval \\([0.0, 1.0]\\). It is (roughly) a variation of [Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein], where the -transposition of 2 close -characters is considered less important than the transposition of 2 characters that are far from each other. +transposition of 2 close characters is considered less important +than the transposition of 2 characters that are far from each other. [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] penalizes additions or substitutions that cannot be expressed as transpositions. -The distance is computed as \\(1 - [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler]\\) similarity. +#### Example ```kotlin val jaroWinkler = JaroWinkler() @@ -210,76 +51,15 @@ println(jaroWinkler.similarity("My string", "My tsring")) // prints 0.9740740656 println(jaroWinkler.similarity("My string", "My ntrisg")) // prints 0.8962963223457336 ``` -### [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS] - -The [longest common subsequence][ca.solostudios.stringsimilarity.edit.LCS] (LCS) problem consists in finding the longest -subsequence common to two (or more) sequences. -It differs from problems of finding common substrings: unlike substrings, subsequences are not required to -occupy consecutive positions within the original sequences. - -It is used by the diff utility, by Git for reconciling multiple changes, etc. 
- -The [LCS][ca.solostudios.stringsimilarity.edit.LCS] distance between strings X (of length n) and Y (of length m) is -\\(n + m - 2 |LCS(X, Y)|\\), -\\(\\text{min} = 0\\), -\\(\\text{max} = n + m\\) - -[LCS][ca.solostudios.stringsimilarity.edit.LCS] distance is equivalent -to [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance when only insertion and deletion is -allowed (no substitution), or -when the cost of the substitution is the double of the cost of an insertion or deletion. - -This class implements the dynamic programming approach, which has a space requirement \\(O(m \\times n)\\), and -computation cost -\\(O(m \\times n)\\). - -In "Length of Maximal Common Subsequences", K.S. Larsen proposed an algorithm that computes the length -of [LCS][ca.solostudios.stringsimilarity.edit.LCS] in time \\(O(log(m) \\times log(n))\\). -But the algorithm has a memory requirement \\(O(m \\times n^2)\\) and was thus not implemented here. - -```kotlin -val longestCommonSubsequence = LongestCommonSubsequence() - -println(longestCommonSubsequence.distance("AGCAT", "GAC")) // prints 4.0 - -println(longestCommonSubsequence.distance("AGCAT", "AGCT")) // prints 1.0 -``` - -### [Normalized Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.NormalizedLCS] - -This distance is computed as [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS] divided by the -length of the longest -string. -The resulting value is always in the interval \\([0.0, 1.0]\\) but it is not a metric anymore! - -The similarity is computed as 1 - normalized distance. 
- -The distance is computed as \\(1 - |LCS(s1, s2)| / max(|s1|, |s2|)\\) - -```kotlin -val normalizedLCS = NormalizedLCS() - -// LCS: "ABCDEF" => length = 6 -// longest = "ABCDEFHJKL" => length = 10 -// => 1 - 6/10 = 0.4 -println(normalizedLCS.distance("ABCDEFG", "ABCDEFHJKL")) // prints 0.4 - -// LCS: "ABDF" => length = 4 -// longest = "ABDEF" => length = 5 -// => 1 - 4 / 5 = 0.2 -println(normalizedLCS.distance("ABDEF", "ABDIF")) // prints 0.2 -``` - ### [N-Gram][ca.solostudios.stringsimilarity.NGram] -[Normalized N-Gram][ca.solostudios.stringsimilarity.NGram] distance as defined by Kondrak, -"N-Gram Similarity and Distance", String Processing and Information Retrieval, -Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126. +[N-Gram][ca.solostudios.stringsimilarity.NGram] similarity/distance is a distance used to measure the similarity of two strings, +that always lies in the range \\([0.0, 1.0]\\). -The algorithm uses affixing with special character '`\\n`' to increase the weight of first characters. +The algorithm uses affixing with special character '`\\0`' to increase the weight of first characters. The normalization is achieved by dividing the total similarity score the original length of the longest word. -In the paper, Kondrak also defines a similarity measure, which is not implemented (yet). +#### Example ```kotlin val twogram = NGram(2) @@ -297,101 +77,90 @@ A few algorithms work by converting strings into sets of n-grams (sequences of n k-shingles). The similarity or distance between the strings is then the similarity or distance between the sets. -The cost for computing these similarities and distances is mainly domnitated by k-shingling -(converting the strings into sequences of k characters). -Therefore there are typically two use cases for these algorithms: +> Note: although it may seem it is, the [N-Gram][ca.solostudios.stringsimilarity.NGram] similarity/distance is not shingle-based. 
-Directly compute the distance between strings: +The cost for computing these similarities and distances is mainly dominated by k-shingling +(converting the strings into sequences of k characters). -```kotlin -val dig = QGram(2) +There are typically two use cases for these algorithms: -// AB BC CD CE -// 1 1 1 0 -// 1 1 0 1 -// Total: 2 +1. Directly compute the distance between strings: + ```kotlin + val dig = QGram(2) -println(dig.distance("ABCD", "ABCE")) // prints 2 -``` + // AB BC CD CE + // 1 1 1 0 + // 1 1 0 1 + // Total: 2 -Or, for large datasets, pre-compute the profile of all strings. -The similarity can then be computed between profiles: + println(dig.distance("ABCD", "ABCE")) // prints 2 + ``` -```kotlin -/** - * Example of computing cosine similarity with pre-computed profiles. - */ -val s1 = "My first string" -val s2 = "My other string..." +2. For large datasets, pre-compute the profile of all strings. + The similarity can then be computed between profiles: + ```kotlin + /** + * Example of computing cosine similarity with pre-computed profiles. + */ + val s1 = "My first string" + val s2 = "My other string..." -// Let's work with sequences of 2 characters... -val cosine = new Cosine(2) + // Let's work with sequences of 2 characters... + val cosine = Cosine(2) -// Pre-compute the profile of strings -val profile1 = cosine.profile(s1) -val profile2 = cosine.profile(s2) + // Pre-compute the profile of strings + val profile1 = cosine.profile(s1) + val profile2 = cosine.profile(s2) -// ... + // ... -println(cosine.similarity(profile1, profile2)) // prints 0.516185 -``` + println(cosine.similarity(profile1, profile2)) // prints 0.516185 + ``` Pay attention, this only works if the same KShingling object is used to parse all input strings! 
#### [Q-Gram][ca.solostudios.stringsimilarity.QGram] [Q-gram][ca.solostudios.stringsimilarity.QGram] distance, as defined by Ukkonen in -"Approximate string-matching with q-grams and maximal matches" +"Approximate string-matching with q-grams and maximal matches"[@ref-3]. The distance between two strings is defined as the L1 norm of the difference of their profiles (the number of occurences -of -each [n-gram][ca.solostudios.stringsimilarity.NGram]): \\(SUM( |V1_i - V2_i| )\\). -[Q-gram][ca.solostudios.stringsimilarity.QGram] distance is a lower bound -on [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] -distance, but can be computed in \\(O(m + n)\\), where [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] -requires -\\(O(m \\times n)\\) +of each [n-gram][ca.solostudios.stringsimilarity.NGram]): \\(\\sum_{i=1}^n \\lvert v_{1,i} - v_{2,i} \\rvert\\). +[Q-gram][ca.solostudios.stringsimilarity.QGram] distance is a lower bound on +the [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance, but can be computed in \\(O(m + n)\\) time, +whereas the [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance requires \\(O(m \\times n)\\) time. #### [Cosine similarity][ca.solostudios.stringsimilarity.Cosine] -The similarity between the two strings is the cosine of the angle between these two vectors representation, and is -computed as \\(V1 \\cdot V2 / (|V1| * |V2|)\\) - -Distance is computed as \\(1 - \text{cosine similarity}\\). +The similarity between the two strings is the cosine of the angle between these two vector representations, and is +computed as \\(\\frac{\\vec{v_1} \\cdot \\vec{v_2}}{\\lVert\\vec{v_1}\\rVert \\times \\lVert\\vec{v_2}\\rVert}\\). 
#### [Jaccard index][ca.solostudios.stringsimilarity.Jaccard] Like [Q-Gram][ca.solostudios.stringsimilarity.QGram] distance, the input strings are first converted into sets of -n-grams (sequences of n -characters, also called k-shingles), but this time the cardinality of -each [n-gram][ca.solostudios.stringsimilarity.NGram] is not taken into -account. +n-grams (sequences of n characters, also called k-shingles), but this time the cardinality of +each [n-gram][ca.solostudios.stringsimilarity.NGram] is not taken into account. Each input string is simply a set of n-grams. -The [Jaccard index][ca.solostudios.stringsimilarity.Jaccard] is then computed as \\(|V1 \\cap V2| / |V1 \\cup V2|\\). +The [Jaccard index][ca.solostudios.stringsimilarity.Jaccard] is then computed as +\\(\\frac{\\lVert V_1 \\cap V_2 \\rVert}{\\lVert V_1 \\cup V_2 \\rVert}\\). -Distance is computed as 1 - similarity. [Jaccard index][ca.solostudios.stringsimilarity.Jaccard] is a metric distance. -#### [Sorensen-Dice coefficient][ca.solostudios.stringsimilarity.SorensenDice] - -Similar to [Jaccard index][ca.solostudios.stringsimilarity.Jaccard], but this time the similarity is computed as \\(2 * -|V1 \\cap V2| / ( -|V1| + |V2|)\\). +#### [Sørensen-Dice coefficient][ca.solostudios.stringsimilarity.SorensenDice] -Distance is computed as 1 - similarity. +Similar to the [Jaccard index][ca.solostudios.stringsimilarity.Jaccard], but this time the similarity is computed as +\\(\\frac{2 \\times \\lVert V_1 \\cap V_2 \\rVert}{\\lVert V_1 \\rVert + \\lVert V_2 \\rVert}\\). ### [Ratcliff-Obershelp][ca.solostudios.stringsimilarity.RatcliffObershelp] -[Ratcliff/Obershelp Pattern Recognition][ca.solostudios.stringsimilarity.RatcliffObershelp], also known as Gestalt -Pattern Matching, is a -string-matching algorithm for determining the similarity of two strings. -It was developed in 1983 by John W. Ratcliff and John A. Obershelp and published in the Dr. 
Dobb's Journal in July 1988 +[Ratcliff/Obershelp Pattern Recognition][ca.solostudios.stringsimilarity.RatcliffObershelp], +also known as Gestalt Pattern Matching, is a string-matching algorithm for determining the similarity of two strings. +It was developed in 1983 by John W. Ratcliff and John A. Obershelp and published in the Dr. Dobb's Journal in July 1988[@ref-4]. [Ratcliff/Obershelp][ca.solostudios.stringsimilarity.RatcliffObershelp] computes the similarity between 2 strings, and -the returned value -lies in the interval \\([0.0, 1.0]\\). +the returned value lies in the interval \\([0.0, 1.0]\\). -The distance is computed as 1 - Ratcliff/Obershelp similarity. +#### Example ```kotlin val ratcliffObershelp = RatcliffObershelp() @@ -405,16 +174,18 @@ println(ratcliffObershelp.similarity("My string", "My ntrisg")) // prints 0.7777 ### Experimental -#### [SIFT4][ca.solostudios.stringsimilarity.Sift4] +#### [Sift4][ca.solostudios.stringsimilarity.Sift4] -[SIFT4][ca.solostudios.stringsimilarity.Sift4] is a general purpose string distance algorithm inspired -by [JaroWinkler][ca.solostudios.stringsimilarity.JaroWinkler] -and [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS]. +[Sift4][ca.solostudios.stringsimilarity.Sift4] is a general purpose string distance algorithm inspired +by [JaroWinkler][ca.solostudios.stringsimilarity.JaroWinkler] and [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS]. It was developed to produce a distance measure that matches as close as possible to the human perception of string distance. -Hence it takes into account elements like character substitution, character distance, longest common subsequence etc. +Hence, it takes into account elements like character substitution, character distance, longest common subsequence etc. + It was developed using experimental testing, and without theoretical background. 
+#### Example + ```kotlin val s1 = "This is the first string" val s2 = "And this is another string" @@ -425,3 +196,34 @@ val result = sift4.distance(s1, s2) assertEquals(expectedResult, result, 0.0) ``` + +

References

+
+
    +
  1. + +Winkler, W. E. (1990). String comparator metrics and enhanced decision rules +in the fellegi-sunter model of record linkage. *Proceedings of the Survey +Research Methods Section*, 354-359. +
  2. +
  3. + +Kondrak, G. (2005-11-02). N-gram similarity and distance. In String processing +and information retrieval, lecture notes in computer science (Pages 115-126). +Springer Berlin Heidelberg. +[[sci-hub]](https://sci-hub.st/10.1007/11575832_13) +
  4. +
  5. + +Ukkonen, E. (1992-01). Approximate string matching with q-grams and maximal +matches. *Theoretical Computer Science*, *92*(1), 191–211. +[[sci-hub]](https://sci-hub.st/10.1016/0304-3975(92)90143-4) +
  6. +
  7. + +Ratcliff, J., & Metzener, D. E. (1988-07-01). Pattern matching: The gestalt +approach. *Dr. Dobb’s Journal*, *13*(7), 46. +https://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970?pgno=5 +
  8. +
+
diff --git a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/RatcliffObershelp.kt b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/RatcliffObershelp.kt index 2236392..01064c4 100644 --- a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/RatcliffObershelp.kt +++ b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/RatcliffObershelp.kt @@ -35,16 +35,16 @@ import ca.solostudios.stringsimilarity.interfaces.NormalizedStringSimilarity * similarity between strings. * * The similarity is defined as - * \(D_{ro} = \frac{2K_m}{\lVert S_1 \rVert + \lVert S_2 \rVert}\). + * \(D_{ro} = \frac{2K_m}{\lVert X \rVert + \lVert Y \rVert}\). * Where \(K_m\) us the number of matching characters. * * The distance is computed as * \(1 - similarity(X, Y)\). * * #### References - * Ratcliff, J., & Metzener, D. E. (1988-07-01). Pattern matching: The gestalt ap- - * proach. *Dr. Dobb’s Journal*, *13*(7), 46. https://www.drdobbs.com/database/ - * pattern-matching-the-gestalt-approach/184407970?pgno=5 + * Ratcliff, J., & Metzener, D. E. (1988-07-01). Pattern matching: The gestalt + * approach. *Dr. Dobb’s Journal*, *13*(7), 46. 
+ * https://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970?pgno=5 * * @author [Ligi](https://github.com/dxpux), solonovamax, Ported to java from .net by denmase */ diff --git a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringDistance.kt b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringDistance.kt index 1405f83..099f929 100644 --- a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringDistance.kt +++ b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringDistance.kt @@ -30,7 +30,7 @@ package ca.solostudios.stringsimilarity.interfaces /** * Normalized string distances return a normalized distance between two strings. * - * The returned distance is always in the range \([0, 1]\). + * The returned distance is always in the range \([0, 1]\). * - `0` indicates that both strings are *equivalent*. Equivalent strings are not necessarily identical. * - `1` indicates that neither string have anything in common. * - If two strings are identical, then it should always return `0`. diff --git a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringSimilarity.kt b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringSimilarity.kt index 6af9cec..85e2e17 100644 --- a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringSimilarity.kt +++ b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringSimilarity.kt @@ -30,7 +30,7 @@ package ca.solostudios.stringsimilarity.interfaces /** * Normalized string similarities return a normalized similarity between two strings. * - * The returned distance is always in the range \([0, 1]\). 
+ * The returned similarity is always in the range \([0, 1]\). * - `0` indicates that neither string have anything in common. * - `1` indicates that both strings are equivalent. Equivalent strings are not necessarily identical. * - If two strings are identical, then it should always return `1`.