From 45f8471cc0e48c07b457e180a9868bc495503ec3 Mon Sep 17 00:00:00 2001
From: solonovamax <solonovamax@12oclockpoint.com>
Date: Tue, 3 Oct 2023 14:43:48 -0400
Subject: [PATCH] Large docs refactor

Signed-off-by: solonovamax <solonovamax@12oclockpoint.com>
---
 kt-string-similarity/dokka/includes/edit.md   | 236 ++++++++++
 .../dokka/includes/interfaces.md              |  51 +++
 .../dokka/includes/kt-string-similarity.md    |  14 +-
 .../dokka/includes/stringsimilarity.md        | 412 +++++-------------
 .../stringsimilarity/RatcliffObershelp.kt     |   8 +-
 .../interfaces/NormalizedStringDistance.kt    |   2 +-
 .../interfaces/NormalizedStringSimilarity.kt  |   2 +-
 7 files changed, 407 insertions(+), 318 deletions(-)
 create mode 100644 kt-string-similarity/dokka/includes/edit.md
 create mode 100644 kt-string-similarity/dokka/includes/interfaces.md

diff --git a/kt-string-similarity/dokka/includes/edit.md b/kt-string-similarity/dokka/includes/edit.md
new file mode 100644
index 0000000..5d4792f
--- /dev/null
+++ b/kt-string-similarity/dokka/includes/edit.md
@@ -0,0 +1,236 @@
+# Package ca.solostudios.stringsimilarity.edit
+
+This package contains the edit-based string measure implementations.
+
+## Algorithms
+
+### [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein]
+
+The [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance between two words is the minimum number of
+single-character edits (insertions, deletions, or substitutions) required to change one word into the other.
+
+It is a metric string distance. This class implements the dynamic programming approach,
+which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\).
+
+#### Example
+
+```kotlin
+val levenshtein = Levenshtein()
+
+println(levenshtein.distance("My string", "My \$tring")) // prints 1.0
+```
+
+### [Normalized Levenshtein][ca.solostudios.stringsimilarity.edit.NormalizedLevenshtein]
+
+This is computed as the [Levenshtein distance][ca.solostudios.stringsimilarity.edit.Levenshtein]
+normalized to be in the range \\(&#91;0.0, 1.0&#93;\\).
+
+It is a metric string distance. This class implements the dynamic programming approach,
+which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\).
+
+#### Example
+
+```kotlin
+val normLevenshtein = NormalizedLevenshtein()
+
+println(normLevenshtein.distance("My string", "My \$tring")) // prints 0.10526315789473684
+```
+
+### [Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein]
+
+Similar to the [Levenshtein distance][ca.solostudios.stringsimilarity.edit.Levenshtein],
+the [Damerau-Levenshtein distance][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] with transposition
+(also sometimes called unrestricted Damerau-Levenshtein distance) is the minimum number of operations needed to transform
+one string into the other, where an operation is defined as an insertion, deletion, or substitution of a single character,
+or a **transposition of two adjacent characters**.
+
+It is a metric string distance. This class implements the dynamic programming approach,
+which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\).
+
+This is not to be confused with the optimal string alignment distance, which is an extension where no substring can be
+edited more than once.
+
+#### Example
+
+```kotlin
+val damerau = DamerauLevenshtein()
+
+println(damerau.distance("ABCDEF", "ABDCEF")) // prints 1.0
+
+// 2 transpositions
+println(damerau.distance("ABCDEF", "BACDFE")) // prints 2.0
+
+// 1 deletion
+println(damerau.distance("ABCDEF", "ABCDE")) // prints 1.0
+println(damerau.distance("ABCDEF", "BCDEF")) // prints 1.0
+println(damerau.distance("ABCDEF", "ABCGDEF")) // prints 1.0
+
+// All different
+println(damerau.distance("ABCDEF", "POIU")) // prints 6.0
+
+// Transpose
+println(damerau.distance("CA", "ABC")) // prints 2.0
+```
+
+### [Normalized Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.NormalizedDamerauLevenshtein]
+
+This is computed as the [Damerau-Levenshtein distance][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein]
+normalized to be in the range \\(&#91;0.0, 1.0&#93;\\).
+
+It is a metric string distance. This class implements the dynamic programming approach,
+which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\).
+
+#### Example
+
+```kotlin
+val damerau = NormalizedDamerauLevenshtein()
+
+println(damerau.distance("ABCDEF", "ABDCEF")) // prints 0.15384615384615385
+
+// 2 transpositions
+println(damerau.distance("ABCDEF", "BACDFE")) // prints 0.2857142857142857
+
+// 1 deletion
+println(damerau.distance("ABCDEF", "ABCDE")) // prints 0.16666666666666666
+println(damerau.distance("ABCDEF", "BCDEF")) // prints 0.16666666666666666
+println(damerau.distance("ABCDEF", "ABCGDEF")) // prints 0.14285714285714285
+
+// All different
+println(damerau.distance("ABCDEF", "POIU")) // prints 0.75
+
+// Transpose
+println(damerau.distance("CA", "ABC")) // prints 0.5714285714285714
+```
+
+### [Optimal String Alignment][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment]
+
+The [Optimal String Alignment distance][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment] variant
+of [Damerau-Levenshtein distance][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein]
+(sometimes called the restricted edit distance) computes the number of edit operations needed
+to make the strings equal under the condition that **no substring is edited more than once**,
+whereas the true [Damerau-Levenshtein distance][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein]
+presents no such restriction.
+The difference from the algorithm for the [Levenshtein distance][ca.solostudios.stringsimilarity.edit.Levenshtein] is the
+addition of one recurrence for the transposition operations.
+
+Note that the triangle inequality does not hold for this distance, so it is not a true metric. This class implements the dynamic programming approach,
+which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\).
+
+#### Example
+
+```kotlin
+val osa = OptimalStringAlignment()
+
+println(osa.distance("ABCDEF", "ABDCEF")) // prints 1.0
+
+// 2 transpositions
+println(osa.distance("ABCDEF", "BACDFE")) // prints 2.0
+
+// 1 deletion
+println(osa.distance("ABCDEF", "ABCDE")) // prints 1.0
+println(osa.distance("ABCDEF", "BCDEF")) // prints 1.0
+println(osa.distance("ABCDEF", "ABCGDEF")) // prints 1.0
+
+// All different
+println(osa.distance("ABCDEF", "POIU")) // prints 6.0
+
+println(osa.distance("CA", "ABC")) // prints 3.0
+```
+
+### [Normalized Optimal String Alignment][ca.solostudios.stringsimilarity.edit.NormalizedOptimalStringAlignment]
+
+This is computed as the [Optimal String Alignment distance][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment]
+normalized to be in the range \\(&#91;0.0, 1.0&#93;\\).
+
+It is a metric string distance. This class implements the dynamic programming approach,
+which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\).
+
+#### Example
+
+```kotlin
+val osa = NormalizedOptimalStringAlignment()
+
+println(osa.distance("ABCDEF", "ABDCEF")) // prints 0.15384615384615385
+
+// 2 transpositions
+println(osa.distance("ABCDEF", "BACDFE")) // prints 0.2857142857142857
+
+// 1 deletion
+println(osa.distance("ABCDEF", "ABCDE")) // prints 0.16666666666666666
+println(osa.distance("ABCDEF", "BCDEF")) // prints 0.16666666666666666
+println(osa.distance("ABCDEF", "ABCGDEF")) // prints 0.14285714285714285
+
+// All different
+println(osa.distance("ABCDEF", "POIU")) // prints 0.75
+
+// Transpose
+println(osa.distance("CA", "ABC")) // prints 0.75
+```
+
+### [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS]
+
+The [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS] (LCS) problem consists in finding the longest
+subsequence common to two (or more) sequences.
+It differs from problems of finding common substrings: unlike substrings, subsequences are not required to
+occupy consecutive positions within the original sequences.
+
+It is used by the diff utility, by Git for reconciling multiple changes, etc.
+
+The [LCS distance][ca.solostudios.stringsimilarity.edit.LCS] is equivalent
+to the [Levenshtein distance][ca.solostudios.stringsimilarity.edit.Levenshtein] when only insertions and deletions are
+allowed (no substitution), or when the cost of a substitution is double the cost of an insertion or deletion.
+
+It is a metric string distance. This class implements the dynamic programming approach,
+which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\)[@ft-a].
+
+#### Example
+
+```kotlin
+val lcs = LCS()
+
+println(lcs.distance("AGCAT", "GAC")) // prints 4.0
+
+println(lcs.distance("AGCAT", "AGCT")) // prints 1.0
+```
+
+### [Normalized Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.NormalizedLCS]
+
+This is computed as the [Longest Common Subsequence distance][ca.solostudios.stringsimilarity.edit.LCS]
+normalized to be in the range \\(&#91;0.0, 1.0&#93;\\).
+
+It is a metric string distance. This class implements the dynamic programming approach,
+which has a space requirement \\(O(m \\times n)\\), and computation cost \\(O(m \\times n)\\)[@ft-a].
+
+#### Example
+
+```kotlin
+val normalizedLCS = NormalizedLCS()
+
+println(normalizedLCS.distance("ABCDEFG", "ABCDEFHJKL")) // prints 0.45454545454545453
+
+println(normalizedLCS.distance("ABDEF", "ABDIF")) // prints 0.3333333333333333
+```
+
+<h2 class="footnotes-header">Notes</h2>
+<div class="footnotes">
+<ol>
+<li id="footnote-a">
+
+K.S. Larsen proposed an algorithm that computes the length of LCS in time
+\\(O(\\log(m) \\times \\log(n))\\).[@ref-1] But the algorithm has a memory requirement \\(O(m \\times n^2)\\) and was thus not
+implemented here.
+</li>
+</ol>
+</div>
+
+<h2 class="references-header">References</h2>
+<div class="references">
+<ol>
+<li id="reference-1">
+
+Larsen, K. S. (1992-10). Length of maximal common subsequences. DAIMI Report
+Series, 21(426).
+<https://doi.org/10.7146/dpb.v21i426.6740><sup>[&#91;sci-hub&#93;](https://sci-hub.st/10.7146/dpb.v21i426.6740)</sup>
+</li>
+</ol>
+</div>
diff --git a/kt-string-similarity/dokka/includes/interfaces.md b/kt-string-similarity/dokka/includes/interfaces.md
new file mode 100644
index 0000000..0439a35
--- /dev/null
+++ b/kt-string-similarity/dokka/includes/interfaces.md
@@ -0,0 +1,51 @@
+# Package ca.solostudios.stringsimilarity.interfaces
+
+This package contains all the interfaces for string measures.
+
+## Normalized, metric, similarity and distance
+
+Although the topic might seem simple, a lot of different algorithms exist to measure text similarity or distance.
+Therefore, the library defines some interfaces to categorize them.
+
+### (Normalized) Similarity and Distance
+
+- [StringSimilarity][ca.solostudios.stringsimilarity.interfaces.StringSimilarity]: Implementing algorithms define a
+  similarity between
+  strings (0 means strings are completely different).
+- [NormalizedStringSimilarity][ca.solostudios.stringsimilarity.interfaces.NormalizedStringSimilarity]: The interface
+  extends [StringSimilarity][ca.solostudios.stringsimilarity.interfaces.StringSimilarity].
+  Implementing algorithms compute a similarity that has been normalized based on the number of operations performed.
+  This means that for non-weighted implementations, the result will always be between 0 and 1.
+  [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] is an example of this.
+- [StringDistance][ca.solostudios.stringsimilarity.interfaces.StringDistance]: Implementing algorithms define a distance
+  between strings (0 means strings are identical), like [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] for example.
+  The maximum distance value depends on the algorithm.
+- [NormalizedStringDistance][ca.solostudios.stringsimilarity.interfaces.NormalizedStringDistance]: This interface
+  extends [StringDistance][ca.solostudios.stringsimilarity.interfaces.StringDistance].
+  Implementing algorithms compute a distance that has been normalized based on the number of operations performed.
+  This means that for non-weighted implementations, the result will always be in the range \\(&#91;0, 1&#93;\\).
+  [NormalizedLevenshtein][ca.solostudios.stringsimilarity.edit.NormalizedLevenshtein] is an example of this.
+
+Generally, algorithms that
+implement [NormalizedStringSimilarity][ca.solostudios.stringsimilarity.interfaces.NormalizedStringSimilarity]
+also implement [NormalizedStringDistance][ca.solostudios.stringsimilarity.interfaces.NormalizedStringDistance].
+This is because the similarity can be computed as \\(1 - \\text{distance}\\),
+and the distance can be computed as \\(1 - \\text{similarity}\\).
+
+> Note: This is only applicable if the result is *always* between 0 and 1.
+
+### Metric Distances
+
+The [MetricStringDistance][ca.solostudios.stringsimilarity.interfaces.MetricStringDistance]
+interface indicates that the implementing class is a metric distance,
+which means that it satisfies the required axioms to be considered metric.
+Read [MetricStringDistance][ca.solostudios.stringsimilarity.interfaces.MetricStringDistance] for more information.
+
+A lot of nearest-neighbor search algorithms and indexing structures rely on the triangle inequality.
+You can check "Similarity Search, The Metric Space Approach" by Zezula et al. for a survey.
+These cannot be used with non-metric similarity measures.
+
+### Edit Measures
+
+The edit measure interfaces indicate when a specific algorithm is edit-based.
+See the `edit` package for all implementors.
diff --git a/kt-string-similarity/dokka/includes/kt-string-similarity.md b/kt-string-similarity/dokka/includes/kt-string-similarity.md
index 2ce7382..b2cb61c 100644
--- a/kt-string-similarity/dokka/includes/kt-string-similarity.md
+++ b/kt-string-similarity/dokka/includes/kt-string-similarity.md
@@ -26,14 +26,14 @@ The "cost" columns gives an estimation of the computational/memory costs to comp
 
 | Name                                       | Distance | Similarity | Normalized | Metric | Memory cost          | Execution cost                     |
 |--------------------------------------------|:--------:|:----------:|:----------:|:------:|----------------------|------------------------------------|
-| Levenshtein                                |    ☒     |     ☐      |     ☐      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
-| Damerau-Levenshtein[@ft-c]                 |    ☒     |     ☐      |     ☐      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
-| Optimal String Alignment[@ft-c]            |    ☒     |     ☐      |     ☐      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
-| Longest Common Subsequence                 |    ☒     |     ☐      |     ☐      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a][@ft-b] |
+| Levenshtein                                |    ☒     |     ☒      |     ☐      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
+| Damerau-Levenshtein[@ft-c]                 |    ☒     |     ☒      |     ☐      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
+| Optimal String Alignment[@ft-c]            |    ☒     |     ☒      |     ☐      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
+| Longest Common Subsequence                 |    ☒     |     ☒      |     ☐      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a][@ft-b] |
 | Normalized Levenshtein                     |    ☒     |     ☒      |     ☒      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
-| Normalized Damerau-Levenshtein[@ft-c]      |    ☒     |     ☐      |     ☒      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
-| Normalized Optimal String Alignment[@ft-c] |    ☒     |     ☐      |     ☒      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
-| Normalized Longest Common Subsequence      |    ☒     |     ☐      |     ☒      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a][@ft-b] |
+| Normalized Damerau-Levenshtein[@ft-c]      |    ☒     |     ☒      |     ☒      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
+| Normalized Optimal String Alignment[@ft-c] |    ☒     |     ☒      |     ☒      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a]        |
+| Normalized Longest Common Subsequence      |    ☒     |     ☒      |     ☒      |   ☒    | \\(O(m \\times n)\\) | \\(O(m \\times n)\\)[@ft-a][@ft-b] |
 | Cosine similarity                          |    ☒     |     ☒      |     ☒      |   ☐    | \\(O(m + n)\\)       | \\(O(m + n)\\)                     |
 | Jaccard index                              |    ☒     |     ☒      |     ☒      |   ☒    | \\(O(m + n)\\)       | \\(O(m + n)\\)                     |
 | Jaro-Winkler                               |    ☒     |     ☒      |     ☒      |   ☐    | \\(O(m + n)\\)       | \\(O(m \\times n)\\)               |
diff --git a/kt-string-similarity/dokka/includes/stringsimilarity.md b/kt-string-similarity/dokka/includes/stringsimilarity.md
index e23245f..a6f131c 100644
--- a/kt-string-similarity/dokka/includes/stringsimilarity.md
+++ b/kt-string-similarity/dokka/includes/stringsimilarity.md
@@ -1,204 +1,45 @@
 # Package ca.solostudios.stringsimilarity
 
-Package containing all the string similarity algorithms
+This package contains most of the string measure implementations.
 
 ## Algorithms
 
-### Normalized, metric, similarity and distance
-
-Although the topic might seem simple, a lot of different algorithms exist to measure text similarity or distance.
-Therefore, the library defines some interfaces to categorize them.
-
-#### (Normalized) similarity and distance
-
-- [StringSimilarity][ca.solostudios.stringsimilarity.interfaces.StringSimilarity]: Implementing algorithms define a
-  similarity between
-  strings (0 means strings are completely different).
-- [NormalizedStringSimilarity][ca.solostudios.stringsimilarity.interfaces.NormalizedStringSimilarity]: The interface
-  extends [StringSimilarity][ca.solostudios.stringsimilarity.interfaces.StringSimilarity].
-  Implementing algorithms compute a similarity that has been normalized based on the number of operations performed.
-  This means that for non-weighted implementations, the result will always be between 0 and 1.
-  [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] is an example of this.
-- [StringDistance][ca.solostudios.stringsimilarity.interfaces.StringDistance]: Implementing algorithms define a distance
-  between strings
-  (0 means strings are identical), like [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] for example.
-  The maximum distance value depends on the algorithm.
-- [NormalizedStringDistance][ca.solostudios.stringsimilarity.interfaces.NormalizedStringDistance]: This interface
-  extends [StringDistance][ca.solostudios.stringsimilarity.interfaces.StringDistance].
-  Implementing algorithms compute a distance that has been normalized based on the number of operations performed.
-  This means that for non-weighted implementations, the result will always be between \\([0, 1]\\).
-  [NormalizedLevenshtein][ca.solostudios.stringsimilarity.edit.NormalizedLevenshtein] is an example of this.
-
-Generally, algorithms that
-implement [NormalizedStringSimilarity][ca.solostudios.stringsimilarity.interfaces.NormalizedStringSimilarity]
-also implement [NormalizedStringDistance][ca.solostudios.stringsimilarity.interfaces.NormalizedStringDistance], and
-\\(\text{similarity} = 1 - \text{distance}\\).
-<br/>
-( !! Only if the result is *always* between 0 and 1. !! )
-
-But there are a few exceptions, like N-Gram similarity and distance (Kondrak).
-
-#### Metric distances
-
-The [MetricStringDistance][ca.solostudios.stringsimilarity.interfaces.MetricStringDistance] interface: A few of the
-distances are actually
-metric distances, which means that verify the triangle inequality \\(d(x, y) <= d(x,z) + d(z,y)\\).
-For example, [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] is a metric distance, but
-[NormalizedLevenshtein][ca.solostudios.stringsimilarity.edit.NormalizedLevenshtein] is not.
-
-A lot of nearest-neighbor search algorithms and indexing structures rely on the triangle inequality.
-You can check "Similarity Search, The Metric Space Approach" by Zezula et al. for a survey.
-These cannot be used with non metric similarity measures.
+### Edit-based measures (Levenshtein, LCS, Damerau-Levenshtein, etc.)
+
+All edit-based string measures are located in the `edit` package.
 
 ### Shingles (n-gram) based similarity and distance
 
-A few algorithms work by converting strings into sets of n-grams (sequences of n characters, also sometimes called
-k-shingles).
+A few algorithms work by converting strings into sets of n-grams
+(sequences of n characters, also sometimes called k-shingles).
 The similarity or distance between the strings is then the similarity or distance between the sets.
 
-Some of them, like [jaccard][ca.solostudios.stringsimilarity.Jaccard], consider strings as sets of shingles, and don't
-consider the number
-of occurences of each shingle.
+Some of them, like the [Jaccard index][ca.solostudios.stringsimilarity.Jaccard], consider strings as sets of shingles, and don't
+consider the number of occurrences of each shingle.
 Others, like [cosine similarity][ca.solostudios.stringsimilarity.Cosine], work using what is sometimes called the
-profile of the strings,
-which takes into account the number of occurences of each shingle.
+profile of the strings, which takes into account the number of occurrences of each shingle.
 
 For these algorithms, another use case is possible when dealing with large datasets:
 
 1. Compute the set or profile representation of all the strings
 2. Compute the similarity between sets or profiles
 
-### [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein]
-
-The [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance between two words is the minimum number of
-single-character edits
-(insertions, deletions or substitutions) required to change one word into the other.
-
-It is a metric string distance.
-This implementation uses dynamic programming (Wagner–Fischer algorithm), with only 2 rows of data.
-The space requirement is thus \\(O(m)\\) and the algorithm runs in \\(O(m \\times n)\\).
-
-#### Example
-
-```kotlin
-val levenshtein = Levenshtein()
-
-println(levenshtein.distance("My string", "My \$tring")) // prints 1.0
-```
-
-### [Normalized Levenshtein][ca.solostudios.stringsimilarity.edit.NormalizedLevenshtein]
-
-This distance is computed as [levenshtein distance][ca.solostudios.stringsimilarity.edit.Levenshtein] divided by the
-length of the longest
-string.
-The resulting value is always in the interval \\([0.0, 1.0]\\) but it is not a metric anymore!
-
-The similarity is computed as 1 - normalized distance.
-
-```kotlin
-val normLevenshtein = NormalizedLevenshtein()
-
-println(normLevenshtein.distance("My string", "My \$tring")) // prints 0.1111111111111111
-```
-
-### [Weighted Levenshtein][ca.solostudios.stringsimilarity.WeightedLevenshtein]
-
-An implementation of [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] that allows to define different
-weights for different
-character substitutions.
-
-This algorithm is usually used for optical character recognition (OCR) applications.
-For OCR, the cost of substituting P and R is lower then the cost of substituting P and M for example because because
-from and OCR point of
-view P is similar to R.
-
-It can also be used for keyboard typing auto-correction.
-Here the cost of substituting T and R is lower for example because these are located next to each other on an AZERTY or
-QWERTY keyboard.
-Hence the probability that the user mistyped the characters is higher.
-
-```kotlin
-val weightedLevenshtein = WeightedLevenshtein() { old, new ->
-    if (old == 't' && new == 'r') 0.5 else 1.0
-}
-
-println(weightedLevenshtein.distance("String1", "Srring2")) // prints 1.5
-```
-
-### [Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein]
-
-Similar to [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein],
-[Damerau-Levenshtein distance][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] with transposition
-(also sometimes calls unrestricted Damerau-Levenshtein distance) is the minimum number of operations needed to transform
-one string into the
-other, where an operation is defined as an insertion, deletion, or substitution of a single character, or a *
-*transposition of two adjacent
-characters**.
-
-It does respect triangle inequality, and is thus a metric distance.
-
-This is not to be confused with the optimal string alignment distance, which is an extension where no substring can be
-edited more than
-once.
-
-```kotlin
-val damerau = Damerau()
-
-println(damerau.distance("ABCDEF", "ABDCEF")) // prints 1.0
-
-// 2 substitutions
-println(d.distance("ABCDEF", "BACDFE")) // prints 2.0
-
-// 1 deletion
-println(d.distance("ABCDEF", "ABCDE")) // prints 1.0
-println(d.distance("ABCDEF", "BCDEF")) // prints 1.0
-println(d.distance("ABCDEF", "ABCGDEF")) // prints 1.0
-
-// All different
-println(d.distance("ABCDEF", "POIU")) // prints 6.0
-```
-
-### [Optimal String Alignment][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment]
-
-The [Optimal String Alignment][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment] variant
-of [Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] (sometimes called the restricted edit
-distance) computes the number of
-edit operations needed to make the strings equal under the condition that **no substring is edited more than once**,
-whereas the
-true [Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein] presents no such restriction.
-The difference from the algorithm for [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance is the
-addition of one recurrence
-for the transposition
-operations.
-
-Note that for the [optimal string alignment][ca.solostudios.stringsimilarity.edit.OptimalStringAlignment] distance, the
-triangle inequality does
-not hold and so it is not a true metric.
-
-```kotlin
-val optimalStringAlignment = OptimalStringAlignment()
-
-println(optimalStringAlignment.distance("CA", "ABC")) // prints 3.0
-```
-
 ### [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler]
 
-[Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] is a string edit distance that was developed in the area of
-record linkage
-(duplicate detection) (Winkler, 1990).
-The Jaro–Winkler distance metric is designed and best suited for short strings such as person names, and to detect
-transposition typos.
+[Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] is a string distance that was developed in the area of
+record linkage (duplicate detection), as defined by Winkler[@ref-1].
+The Jaro–Winkler distance metric is designed and best suited for short strings such as person names,
+and to detect transposition typos.
 
 [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] computes the similarity between 2 strings, and the returned
-value lies in the
-interval \\([0.0, 1.0]\\).
+value lies in the interval \\(&#91;0.0, 1.0&#93;\\).
 It is (roughly) a variation of [Damerau-Levenshtein][ca.solostudios.stringsimilarity.edit.DamerauLevenshtein], where the
-transposition of 2 close
-characters is considered less important than the transposition of 2 characters that are far from each other.
+transposition of 2 close characters is considered less important
+than the transposition of 2 characters that are far from each other.
 [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler] penalizes additions or substitutions that cannot be
 expressed as transpositions.
 
-The distance is computed as \\(1 - [Jaro-Winkler][ca.solostudios.stringsimilarity.JaroWinkler]\\) similarity.
+#### Example
 
 ```kotlin
 val jaroWinkler = JaroWinkler()
@@ -210,76 +51,15 @@ println(jaroWinkler.similarity("My string", "My tsring")) // prints 0.9740740656
 println(jaroWinkler.similarity("My string", "My ntrisg")) // prints 0.8962963223457336
 ```
 
-### [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS]
-
-The [longest common subsequence][ca.solostudios.stringsimilarity.edit.LCS] (LCS) problem consists in finding the longest
-subsequence common to two (or more) sequences.
-It differs from problems of finding common substrings: unlike substrings, subsequences are not required to
-occupy consecutive positions within the original sequences.
-
-It is used by the diff utility, by Git for reconciling multiple changes, etc.
-
-The [LCS][ca.solostudios.stringsimilarity.edit.LCS] distance between strings X (of length n) and Y (of length m) is
-\\(n + m - 2 |LCS(X, Y)|\\),
-\\(\\text{min} = 0\\),
-\\(\\text{max} = n + m\\)
-
-[LCS][ca.solostudios.stringsimilarity.edit.LCS] distance is equivalent
-to [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance when only insertion and deletion is
-allowed (no substitution), or
-when the cost of the substitution is the double of the cost of an insertion or deletion.
-
-This class implements the dynamic programming approach, which has a space requirement \\(O(m \\times n)\\), and
-computation cost
-\\(O(m \\times n)\\).
-
-In "Length of Maximal Common Subsequences", K.S. Larsen proposed an algorithm that computes the length
-of [LCS][ca.solostudios.stringsimilarity.edit.LCS] in time \\(O(log(m) \\times log(n))\\).
-But the algorithm has a memory requirement \\(O(m \\times n^2)\\) and was thus not implemented here.
-
-```kotlin
-val longestCommonSubsequence = LongestCommonSubsequence()
-
-println(longestCommonSubsequence.distance("AGCAT", "GAC")) // prints 4.0
-
-println(longestCommonSubsequence.distance("AGCAT", "AGCT")) // prints 1.0
-```
-
-### [Normalized Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.NormalizedLCS]
-
-This distance is computed as [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS] divided by the
-length of the longest
-string.
-The resulting value is always in the interval \\([0.0, 1.0]\\) but it is not a metric anymore!
-
-The similarity is computed as 1 - normalized distance.
-
-The distance is computed as \\(1 - |LCS(s1, s2)| / max(|s1|, |s2|)\\)
-
-```kotlin
-val normalizedLCS = NormalizedLCS()
-
-// LCS: "ABCDEF" => length = 6
-// longest = "ABCDEFHJKL" => length = 10
-// => 1 - 6/10 = 0.4
-println(normalizedLCS.distance("ABCDEFG", "ABCDEFHJKL")) // prints 0.4
-
-// LCS: "ABDF" => length = 4
-// longest = "ABDEF" => length = 5
-// => 1 - 4 / 5 = 0.2
-println(normalizedLCS.distance("ABDEF", "ABDIF")) // prints 0.2
-```
-
 ### [N-Gram][ca.solostudios.stringsimilarity.NGram]
 
-[Normalized N-Gram][ca.solostudios.stringsimilarity.NGram] distance as defined by Kondrak,
-"N-Gram Similarity and Distance", String Processing and Information Retrieval,
-Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.
+[N-Gram][ca.solostudios.stringsimilarity.NGram] similarity/distance is a measure of how similar two strings are,
+and its value always lies in the range \\(&#91;0.0, 1.0&#93;\\).
 
-The algorithm uses affixing with special character '`\\n`' to increase the weight of first characters.
+The algorithm uses affixing with special character '`\\0`' to increase the weight of first characters.
 The normalization is achieved by dividing the total similarity score the original length of the longest word.
 
-In the paper, Kondrak also defines a similarity measure, which is not implemented (yet).
+#### Example
 
 ```kotlin
 val twogram = NGram(2)
@@ -297,101 +77,90 @@ A few algorithms work by converting strings into sets of n-grams (sequences of n
 k-shingles).
 The similarity or distance between the strings is then the similarity or distance between the sets.
 
-The cost for computing these similarities and distances is mainly domnitated by k-shingling
-(converting the strings into sequences of k characters).
-Therefore there are typically two use cases for these algorithms:
+> Note: although it may appear to be, the [N-Gram][ca.solostudios.stringsimilarity.NGram] similarity/distance is not shingle-based.
 
-Directly compute the distance between strings:
+The cost for computing these similarities and distances is mainly dominated by k-shingling
+(converting the strings into sequences of k characters).
 
-```kotlin
-val dig = QGram(2)
+There are typically two use cases for these algorithms:
 
-// AB BC CD CE
-// 1  1  1  0
-// 1  1  0  1
-// Total: 2
+1. Directly compute the distance between strings:
+   ```kotlin
+   val dig = QGram(2)
 
-println(dig.distance("ABCD", "ABCE")) // prints 2
-```
+   // AB BC CD CE
+   // 1  1  1  0
+   // 1  1  0  1
+   // Total: 2
 
-Or, for large datasets, pre-compute the profile of all strings.
-The similarity can then be computed between profiles:
+   println(dig.distance("ABCD", "ABCE")) // prints 2
+   ```
 
-```kotlin
-/**
- * Example of computing cosine similarity with pre-computed profiles.
- */
-val s1 = "My first string"
-val s2 = "My other string..."
+2. For large datasets, pre-compute the profile of all strings.
+   The similarity can then be computed between profiles:
+   ```kotlin
+   /**
+    * Example of computing cosine similarity with pre-computed profiles.
+    */
+   val s1 = "My first string"
+   val s2 = "My other string..."
 
-// Let's work with sequences of 2 characters...
-val cosine = new Cosine(2)
+   // Let's work with sequences of 2 characters...
+   val cosine = Cosine(2)
 
-// Pre-compute the profile of strings
-val profile1 = cosine.profile(s1)
-val profile2 = cosine.profile(s2)
+   // Pre-compute the profile of strings
+   val profile1 = cosine.profile(s1)
+   val profile2 = cosine.profile(s2)
 
-// ...
+   // ...
 
-println(cosine.similarity(profile1, profile2)) // prints 0.516185
-```
+   println(cosine.similarity(profile1, profile2)) // prints 0.516185
+   ```
 
 Pay attention, this only works if the same KShingling object is used to parse all input strings!
 
 #### [Q-Gram][ca.solostudios.stringsimilarity.QGram]
 
 [Q-gram][ca.solostudios.stringsimilarity.QGram] distance, as defined by Ukkonen in
-"Approximate string-matching with q-grams and maximal matches"
+"Approximate string-matching with q-grams and maximal matches"[@ref-3].
 
 The distance between two strings is defined as the L1 norm of the difference of their profiles (the number of occurences
-of
-each [n-gram][ca.solostudios.stringsimilarity.NGram]): \\(SUM( |V1_i - V2_i| )\\).
-[Q-gram][ca.solostudios.stringsimilarity.QGram] distance is a lower bound
-on [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein]
-distance, but can be computed in \\(O(m + n)\\), where [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein]
-requires
-\\(O(m \\times n)\\)
+of each [n-gram][ca.solostudios.stringsimilarity.NGram]): \\(\\sum_{i=1}^n \\lvert v_{1,i} - v_{2,i} \\rvert\\).
+[Q-gram][ca.solostudios.stringsimilarity.QGram] distance is a lower bound on
+the [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance, but can be computed in \\(O(m + n)\\) time,
+whereas the [Levenshtein][ca.solostudios.stringsimilarity.edit.Levenshtein] distance requires \\(O(m \\times n)\\) time.
 
 #### [Cosine similarity][ca.solostudios.stringsimilarity.Cosine]
 
-The similarity between the two strings is the cosine of the angle between these two vectors representation, and is
-computed as \\(V1 \\cdot V2 / (|V1| * |V2|)\\)
-
-Distance is computed as \\(1 - \text{cosine similarity}\\).
+The similarity between the two strings is the cosine of the angle between their two vector representations, and is
+computed as \\(\\frac{\\vec{v_1} \\cdot \\vec{v_2}}{\\lVert\\vec{v_1}\\rVert \\times \\lVert\\vec{v_2}\\rVert}\\).
 
 #### [Jaccard index][ca.solostudios.stringsimilarity.Jaccard]
 
 Like [Q-Gram][ca.solostudios.stringsimilarity.QGram] distance, the input strings are first converted into sets of
-n-grams (sequences of n
-characters, also called k-shingles), but this time the cardinality of
-each [n-gram][ca.solostudios.stringsimilarity.NGram] is not taken into
-account.
+n-grams (sequences of n characters, also called k-shingles), but this time the cardinality of
+each [n-gram][ca.solostudios.stringsimilarity.NGram] is not taken into account.
 Each input string is simply a set of n-grams.
-The [Jaccard index][ca.solostudios.stringsimilarity.Jaccard] is then computed as \\(|V1 \\cap V2| / |V1 \\cup V2|\\).
+The [Jaccard index][ca.solostudios.stringsimilarity.Jaccard] is then computed as
+\\(\\frac{\\lVert V_1 \\cap V_2 \\rVert}{\\lVert V_1 \\cup V_2 \\rVert}\\).
 
-Distance is computed as 1 - similarity.
 [Jaccard index][ca.solostudios.stringsimilarity.Jaccard] is a metric distance.
 
-#### [Sorensen-Dice coefficient][ca.solostudios.stringsimilarity.SorensenDice]
-
-Similar to [Jaccard index][ca.solostudios.stringsimilarity.Jaccard], but this time the similarity is computed as \\(2 *
-|V1 \\cap V2| / (
-|V1| + |V2|)\\).
+#### [Sørensen-Dice coefficient][ca.solostudios.stringsimilarity.SorensenDice]
 
-Distance is computed as 1 - similarity.
+Similar to the [Jaccard index][ca.solostudios.stringsimilarity.Jaccard], but this time the similarity is computed as
+\\(\\frac{2 \\times \\lVert V_1 \\cap V_2 \\rVert}{\\lVert V_1 \\rVert + \\lVert V_2 \\rVert}\\).
 
 ### [Ratcliff-Obershelp][ca.solostudios.stringsimilarity.RatcliffObershelp]
 
-[Ratcliff/Obershelp Pattern Recognition][ca.solostudios.stringsimilarity.RatcliffObershelp], also known as Gestalt
-Pattern Matching, is a
-string-matching algorithm for determining the similarity of two strings.
-It was developed in 1983 by John W. Ratcliff and John A. Obershelp and published in the Dr. Dobb's Journal in July 1988
+[Ratcliff/Obershelp Pattern Recognition][ca.solostudios.stringsimilarity.RatcliffObershelp],
+also known as Gestalt Pattern Matching, is a string-matching algorithm for determining the similarity of two strings.
+It was developed in 1983 by John W. Ratcliff and John A. Obershelp and published in the Dr. Dobb's Journal in July 1988[@ref-4].
 
 [Ratcliff/Obershelp][ca.solostudios.stringsimilarity.RatcliffObershelp] computes the similarity between 2 strings, and
-the returned value
-lies in the interval \\([0.0, 1.0]\\).
+the returned value lies in the interval \\(&#91;0.0, 1.0&#93;\\).
 
-The distance is computed as 1 - Ratcliff/Obershelp similarity.
+#### Example
 
 ```kotlin
 val ratcliffObershelp = RatcliffObershelp()
@@ -405,16 +174,18 @@ println(ratcliffObershelp.similarity("My string", "My ntrisg")) // prints 0.7777
 
 ### Experimental
 
-#### [SIFT4][ca.solostudios.stringsimilarity.Sift4]
+#### [Sift4][ca.solostudios.stringsimilarity.Sift4]
 
-[SIFT4][ca.solostudios.stringsimilarity.Sift4] is a general purpose string distance algorithm inspired
-by [JaroWinkler][ca.solostudios.stringsimilarity.JaroWinkler]
-and [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS].
+[Sift4][ca.solostudios.stringsimilarity.Sift4] is a general purpose string distance algorithm inspired
+by [JaroWinkler][ca.solostudios.stringsimilarity.JaroWinkler] and [Longest Common Subsequence][ca.solostudios.stringsimilarity.edit.LCS].
 It was developed to produce a distance measure that matches as close as possible to the human perception of string
 distance.
-Hence it takes into account elements like character substitution, character distance, longest common subsequence etc.
+Hence, it takes into account elements like character substitution, character distance, longest common subsequence etc.
+
 It was developed using experimental testing, and without theoretical background.
 
+##### Example
+
 ```kotlin
 val s1 = "This is the first string"
 val s2 = "And this is another string"
@@ -425,3 +196,34 @@ val result = sift4.distance(s1, s2)
 
 assertEquals(expectedResult, result, 0.0)
 ```
+
+<h2 class="references-header">References</h2>
+<div class="references">
+<ol>
+<li id="reference-1">
+
+Winkler, W. E. (1990). String comparator metrics and enhanced decision rules
+in the Fellegi-Sunter model of record linkage. *Proceedings of the Survey
+Research Methods Section*, 354-359. <https://eric.ed.gov/?id=ED325505>
+</li>
+<li id="reference-2">
+
+Kondrak, G. (2005-11-02). N-gram similarity and distance. In String processing
+and information retrieval, lecture notes in computer science (Pages 115-126).
+Springer Berlin Heidelberg.
+<https://doi.org/10.1007/11575832_13><sup>[&#91;sci-hub&#93;](https://sci-hub.st/10.1007/11575832_13)</sup>
+</li>
+<li id="reference-3">
+
+Ukkonen, E. (1992-01). Approximate string matching with q-grams and maximal
+matches. *Theoretical Computer Science*, *92*(1), 191–211.
+<https://doi.org/10.1016/0304-3975(92)90143-4><sup>[&#91;sci-hub&#93;](https://sci-hub.st/10.1016/0304-3975(92)90143-4)</sup>
+</li>
+<li id="reference-4">
+
+Ratcliff, J., & Metzener, D. E. (1988-07-01). Pattern matching: The gestalt
+approach. *Dr. Dobb’s Journal*, *13*(7), 46.
+<https://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970?pgno=5>
+</li>
+</ol>
+</div>
diff --git a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/RatcliffObershelp.kt b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/RatcliffObershelp.kt
index 2236392..01064c4 100644
--- a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/RatcliffObershelp.kt
+++ b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/RatcliffObershelp.kt
@@ -35,16 +35,16 @@ import ca.solostudios.stringsimilarity.interfaces.NormalizedStringSimilarity
  * similarity between strings.
  *
  * The similarity is defined as
- * \(D_{ro} = \frac{2K_m}{\lVert S_1 \rVert + \lVert S_2 \rVert}\).
+ * \(D_{ro} = \frac{2K_m}{\lVert X \rVert + \lVert Y \rVert}\).
  * Where \(K_m\) us the number of matching characters.
  *
  * The distance is computed as
  * \(1 - similarity(X, Y)\).
  *
  * #### References
- * Ratcliff, J., & Metzener, D. E. (1988-07-01). Pattern matching: The gestalt ap-
- * proach. *Dr. Dobb’s Journal*, *13*(7), 46. https://www.drdobbs.com/database/
- * pattern-matching-the-gestalt-approach/184407970?pgno=5
+ * Ratcliff, J., & Metzener, D. E. (1988-07-01). Pattern matching: The gestalt
+ * approach. *Dr. Dobb’s Journal*, *13*(7), 46.
+ * https://www.drdobbs.com/database/pattern-matching-the-gestalt-approach/184407970?pgno=5
  *
  * @author [Ligi](https://github.com/dxpux), solonovamax, Ported to java from .net by denmase
  */
diff --git a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringDistance.kt b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringDistance.kt
index 1405f83..099f929 100644
--- a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringDistance.kt
+++ b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringDistance.kt
@@ -30,7 +30,7 @@ package ca.solostudios.stringsimilarity.interfaces
 /**
  * Normalized string distances return a normalized distance between two strings.
  *
- * The returned distance is always in the range \(&#91;0, 1]\).
+ * The returned distance is always in the range \(&#91;0, 1&#93;\).
  * - `0` indicates that both strings are *equivalent*. Equivalent strings are not necessarily identical.
  * - `1` indicates that neither string have anything in common.
  * - If two strings are identical, then it should always return `0`.
diff --git a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringSimilarity.kt b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringSimilarity.kt
index 6af9cec..85e2e17 100644
--- a/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringSimilarity.kt
+++ b/kt-string-similarity/src/commonMain/kotlin/ca/solostudios/stringsimilarity/interfaces/NormalizedStringSimilarity.kt
@@ -30,7 +30,7 @@ package ca.solostudios.stringsimilarity.interfaces
 /**
  * Normalized string similarities return a normalized similarity between two strings.
  *
- * The returned distance is always in the range \(&#91;0, 1]\).
+ * The returned similarity is always in the range \(&#91;0, 1&#93;\).
  * - `0` indicates that neither string have anything in common.
  * - `1` indicates that both strings are equivalent. Equivalent strings are not necessarily identical.
  * - If two strings are identical, then it should always return `1`.