Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add levenshtein distance calculator #260

Merged
merged 2 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/searching/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ The binary search algorithm is a simple search in an ordered array-like compound
## [Dijkstra](./src/dijkstra.cairo)

Dijkstra's algorithm is a graph search algorithm that finds the shortest path from a source node to all other nodes in a weighted graph, ensuring the shortest distances are progressively updated as it explores nodes. It maintains a priority queue of nodes based on their tentative distances from the source and greedily selects the node with the smallest distance at each step.

## [Levenshtein distance](./src/levenshtein_distance.cairo)

The Levenshtein distance is a string metric for measuring the difference between two sequences. It is the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one string into the other. This version of the algorithm optimizes the space complexity. Time complexity: O(nm). Space complexity: O(n).
77 changes: 77 additions & 0 deletions src/searching/src/levenshtein_distance.cairo
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// The Levenshtein Distance
use dict::Felt252DictTrait;


/// Computes the Levenshtein (edit) distance between two byte arrays.
///
/// A single row of the dynamic-programming table is kept in a dictionary keyed by
/// column index and overwritten in place while walking over `arr2` byte by byte.
/// # Arguments
/// * `arr1` - The first byte array.
/// * `arr2` - The second byte array.
/// # Returns
/// * `usize` - The minimum number of single-byte insertions, deletions or
///   substitutions needed to turn one array into the other.
fn levenshtein_distance(arr1: @ByteArray, arr2: @ByteArray) -> usize {
    let width = arr1.len();
    let height = arr2.len();

    // With nothing in `arr1`, every byte of `arr2` has to be inserted.
    if width == 0 {
        return height;
    }

    // Seed the row with the distances from the empty prefix: row[i] = i for i in 0..=width.
    let row_len = width + 1;
    let mut row = felt252_dict_new::<usize>();
    let mut seed: usize = 0;
    loop {
        if seed == row_len {
            break;
        }
        row.insert(seed.into(), seed);
        seed += 1;
    };

    // Process one byte of `arr2` per outer iteration, rewriting `row` in place.
    let mut r: usize = 0;
    loop {
        if r == height {
            break;
        }
        let target_byte = arr2.at(r).unwrap();

        // `diagonal` carries the not-yet-overwritten value one column to the left
        // from the previous row; column 0 of the new row is simply `r + 1`.
        let mut diagonal = row.get(0);
        row.insert(0, r + 1);

        let mut c: usize = 0;
        loop {
            if c == width {
                break;
            }
            let next = c + 1;
            // `left` was already rewritten for the current row, `above` is still the old row.
            let left = row.get(c.into());
            let above = row.get(next.into());
            let source_byte = arr1.at(c).unwrap();

            let via_substitution = if source_byte == target_byte {
                diagonal
            } else {
                diagonal + 1
            };
            // Cheapest way to reach this cell through an insertion or a deletion.
            let via_gap = if above < left {
                above + 1
            } else {
                left + 1
            };
            let best = if via_substitution < via_gap {
                via_substitution
            } else {
                via_gap
            };

            // Remember the old cell before overwriting it: it is the next column's diagonal.
            diagonal = above;
            row.insert(next.into(), best);

            c = next;
        };

        r += 1;
    };

    // The last cell of the final row is the distance between the full arrays.
    row.get(width.into())
}
1 change: 1 addition & 0 deletions src/searching/src/lib.cairo
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Submodules declared by this library file, one per algorithm.
mod binary_search;
mod dijkstra;
mod levenshtein_distance;

// Test modules; only compiled when the `test` configuration is active.
#[cfg(test)]
mod tests;
1 change: 1 addition & 0 deletions src/searching/src/tests.cairo
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
// Test submodules: binary search, Dijkstra and Levenshtein distance.
mod binary_search_test;
mod dijkstra_test;
mod levenshtein_distance_test;
114 changes: 114 additions & 0 deletions src/searching/src/tests/levenshtein_distance_test.cairo
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
use alexandria_searching::levenshtein_distance::levenshtein_distance;


#[test]
#[available_gas(5000000)]
fn bm_search_test_1() {
    // Second argument, "DOG", as raw bytes: 0x44, 0x4f, 0x47.
    let mut dog: ByteArray = Default::default();
    dog.append_byte(0x44_u8);
    dog.append_byte(0x4f_u8);
    dog.append_byte(0x47_u8);

    // First argument, "FROG", as raw bytes: 0x46, 0x52, 0x4f, 0x47.
    let mut frog: ByteArray = Default::default();
    frog.append_byte(0x46_u8);
    frog.append_byte(0x52_u8);
    frog.append_byte(0x4f_u8);
    frog.append_byte(0x47_u8);

    // Expected: two edits separate the two words.
    assert(levenshtein_distance(@frog, @dog) == 2, 'invalid result');
}

#[test]
#[available_gas(5000000)]
fn bm_search_test_2() {
    // Two empty byte arrays are already identical.
    let left: ByteArray = Default::default();
    let right: ByteArray = Default::default();

    assert(levenshtein_distance(@left, @right) == 0, 'invalid result');
}

#[test]
#[available_gas(5000000)]
fn bm_search_test_3() {
    // Empty first argument against the single byte 0x61 ("a").
    let empty: ByteArray = Default::default();
    let mut single: ByteArray = Default::default();
    single.append_byte(0x61_u8);

    assert(levenshtein_distance(@empty, @single) == 1, 'invalid result');
}

#[test]
#[available_gas(5000000)]
fn bm_search_test_4() {
    // The single byte 0x61 ("a") against an empty second argument.
    let empty: ByteArray = Default::default();
    let mut single: ByteArray = Default::default();
    single.append_byte(0x61_u8);

    assert(levenshtein_distance(@single, @empty) == 1, 'invalid result');
}

#[test]
#[available_gas(5000000)]
fn bm_search_test_5() {
    // "a" -> 0x61
    let mut shorter: ByteArray = Default::default();
    shorter.append_byte(0x61_u8);

    // "ab" -> 0x61, 0x62
    let mut longer: ByteArray = Default::default();
    longer.append_byte(0x61_u8);
    longer.append_byte(0x62_u8);

    // The arrays differ only by the trailing byte.
    assert(levenshtein_distance(@longer, @shorter) == 1, 'invalid result');
}

#[test]
#[available_gas(5000000)]
fn bm_search_test_6() {
    // Both arguments hold "foobar" -> 0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72.
    let mut first: ByteArray = Default::default();
    let mut second: ByteArray = Default::default();

    first.append_byte(0x66_u8);
    first.append_byte(0x6f_u8);
    first.append_byte(0x6f_u8);
    first.append_byte(0x62_u8);
    first.append_byte(0x61_u8);
    first.append_byte(0x72_u8);

    second.append_byte(0x66_u8);
    second.append_byte(0x6f_u8);
    second.append_byte(0x6f_u8);
    second.append_byte(0x62_u8);
    second.append_byte(0x61_u8);
    second.append_byte(0x72_u8);

    // Identical inputs need no edits.
    assert(levenshtein_distance(@first, @second) == 0, 'invalid result');
}

#[test]
#[available_gas(5000000)]
fn bm_search_test_7() {
    // Second argument, "barfoo" -> 0x62, 0x61, 0x72, 0x66, 0x6f, 0x6f.
    let mut barfoo: ByteArray = Default::default();
    barfoo.append_byte(0x62_u8);
    barfoo.append_byte(0x61_u8);
    barfoo.append_byte(0x72_u8);
    barfoo.append_byte(0x66_u8);
    barfoo.append_byte(0x6f_u8);
    barfoo.append_byte(0x6f_u8);

    // First argument, "foobar" -> 0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72.
    let mut foobar: ByteArray = Default::default();
    foobar.append_byte(0x66_u8);
    foobar.append_byte(0x6f_u8);
    foobar.append_byte(0x6f_u8);
    foobar.append_byte(0x62_u8);
    foobar.append_byte(0x61_u8);
    foobar.append_byte(0x72_u8);

    // Swapping the two halves of the word is expected to cost six edits.
    assert(levenshtein_distance(@foobar, @barfoo) == 6, 'invalid result');
}