Skip to content

Commit

Permalink
Minor code refactors
Browse files Browse the repository at this point in the history
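- Move JaroWinkler's private `matches` function and `Matches` data class into its companion object
- Make the `distance` implementations of the ShingleBased types (Cosine, Jaccard, SorensenDice) consistent with one another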

Signed-off-by: solonovamax <[email protected]>
  • Loading branch information
solonovamax committed Sep 29, 2023
1 parent 44a8e4e commit ad42d6c
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 46 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,12 @@ public class Cosine(k: Int = DEFAULT_K) : ShingleBased(k), NormalizedStringDista
* @see NormalizedStringDistance
*/
override fun distance(s1: String, s2: String): Double {
// Returns the distance between s1 and s2 as a Double.
// NOTE(review): this is a rendered diff with the +/- markers stripped. Per the hunk
// header (-82,7 +82,12), the next line is the single line the commit removes; the
// six lines that follow it are its replacement.
return 1.0 - similarity(s1, s2)
// Equal strings short-circuit to a distance of 0.0.
if (s1 == s2)
return 0.0
// If either input is shorter than k, short-circuit to 1.0.
// (k is presumably the shingle length handed to ShingleBased — confirm in the class.)
if (s1.length < k || s2.length < k)
return 1.0

// Otherwise build a profile for each string and hand both to distance(...);
// profile() and that overload are defined outside this view.
return distance(profile(s1), profile(s2))
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,12 @@ public class Jaccard(k: Int = DEFAULT_K) : ShingleBased(k), MetricStringDistance
* @see NormalizedStringDistance
*/
override fun distance(s1: String, s2: String): Double {
// Returns the distance between s1 and s2 as a Double.
// NOTE(review): this is a rendered diff with the +/- markers stripped. Per the hunk
// header (-83,7 +83,12), the next line is the single line the commit removes; the
// six lines that follow it are its replacement.
return 1.0 - similarity(s1, s2)
// Equal strings short-circuit to a distance of 0.0.
if (s1 == s2)
return 0.0
// If either input is shorter than k, short-circuit to 1.0.
// (k is presumably the shingle length handed to ShingleBased — confirm in the class.)
if (s1.length < k || s2.length < k)
return 1.0

// Otherwise build a profile for each string and hand both to distance(...);
// profile() and that overload are defined outside this view.
return distance(profile(s1), profile(s2))
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,52 +110,52 @@ public class JaroWinkler(
return 1.0 - similarity(s1, s2)
}

private fun matches(s1: String, s2: String): Matches {
val (shortest, longest) = minMaxByLength(s1, s2)
val searchRange = max(longest.length / 2 - 1, 0)
val matchIndexes = IntArray(shortest.length) { -1 }

val matchFlags = BooleanArray(longest.length)

val matches = shortest.mapIndexedNotNull { index, char ->
val low = max(index - searchRange, 0)
val high = min(index + searchRange + 1, longest.length)
val matchIndex = (low until high).firstOrNull { i ->
!matchFlags[i] && char == longest[i]
}
if (matchIndex != null) {
matchIndexes[index] = matchIndex
matchFlags[matchIndex] = true
char
} else {
null
private companion object {
private const val DEFAULT_THRESHOLD = 0.7
private const val JW_COEFFICIENT = 0.1

private data class Matches(
val matches: Int,
val transpositions: Int,
val commonPrefixLength: Int,
val longestLength: Int,
)

private fun matches(s1: String, s2: String): Matches {
val (shortest, longest) = minMaxByLength(s1, s2)
val searchRange = max(longest.length / 2 - 1, 0)
val matchIndexes = IntArray(shortest.length) { -1 }

val matchFlags = BooleanArray(longest.length)

val matches = shortest.mapIndexedNotNull { index, char ->
val low = max(index - searchRange, 0)
val high = min(index + searchRange + 1, longest.length)
val matchIndex = (low until high).firstOrNull { i ->
!matchFlags[i] && char == longest[i]
}
if (matchIndex != null) {
matchIndexes[index] = matchIndex
matchFlags[matchIndex] = true
char
} else {
null
}
}.size

val ms1 = shortest.filterIndexed { i, _ -> matchIndexes[i] != -1 }.toCharArray()
val ms2 = longest.filterIndexed { i, _ -> matchFlags[i] }.toCharArray()

// val transpositions = ms1.zip(ms2).count { (c1, c2) -> c1 != c2 } / 2
var transpositions = 0
ms1.forEachIndexed { i, c1 ->
if (c1 != ms2[i])
transpositions++
}
}.size

val ms1 = shortest.filterIndexed { i, _ -> matchIndexes[i] != -1 }.toCharArray()
val ms2 = longest.filterIndexed { i, _ -> matchFlags[i] }.toCharArray()
val commonPrefixLength = shortest.commonPrefixWith(longest).length

// val transpositions = ms1.zip(ms2).count { (c1, c2) -> c1 != c2 } / 2
var transpositions = 0
ms1.forEachIndexed { i, c1 ->
if (c1 != ms2[i])
transpositions++
return Matches(matches, transpositions / 2, commonPrefixLength, longest.length)
}

val commonPrefixLength = shortest.commonPrefixWith(longest).length

return Matches(matches, transpositions / 2, commonPrefixLength, longest.length)
}

private data class Matches(
val matches: Int,
val transpositions: Int,
val commonPrefixLength: Int,
val longestLength: Int,
)

private companion object {
private const val DEFAULT_THRESHOLD = 0.7
private const val JW_COEFFICIENT = 0.1
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,12 @@ public class SorensenDice(k: Int = DEFAULT_K) : ShingleBased(k), NormalizedStrin
* @see NormalizedStringDistance
*/
override fun distance(s1: String, s2: String): Double {
// Returns the distance between s1 and s2 as a Double.
// NOTE(review): this is a rendered diff with the +/- markers stripped. Per the hunk
// header (-79,7 +79,12), the next line is the single line the commit removes; the
// six lines that follow it are its replacement.
return 1 - similarity(s1, s2)
// Equal strings short-circuit to a distance of 0.0.
if (s1 == s2)
return 0.0
// If either input is shorter than k, short-circuit to 1.0.
// (k is presumably the shingle length handed to ShingleBased — confirm in the class.)
if (s1.length < k || s2.length < k)
return 1.0

// Otherwise build a profile for each string and hand both to distance(...);
// profile() and that overload are defined outside this view.
return distance(profile(s1), profile(s2))
}

/**
Expand Down

0 comments on commit ad42d6c

Please sign in to comment.