diff --git a/README.md b/README.md index dafc4ce..52de57e 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,15 @@ __RLTK__ An attempt to manually port some of nltk to rust. -Currently in it's infancy: +Currently in its infancy (but growing): * rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> \["\<s>", "a", "b", "c", "\</s>"] * rltk::util::pad_sequence == same as above with customisation * rltk::util::pad_sequence_left == same * rltk::util::pad_sequence_right == same -* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a"], \["b"], \["b"], \["c"]] +* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a", "b"], \["b", "c"]] * rltk::util::bigrams(\["a","b","c"]) == ngrams(..., 2) * rltk::util::trigrams(\["a","b","c"]) == ngrams(..., 3) +* rltk::util::everygrams(\["a","b","c"],2) == \[\["a"], \["a", "b"], \["b"], \["b", "c"]] +* rltk::util::flatten(\[\["a"], \["a", "b"], \["b"], \["b", "c"]]) == \["a", "a", "b", "b", "b", "c"] +* rltk::metrics::edit_distance("rain", "shine") == 3 diff --git a/src/metrics/distance.rs b/src/metrics/distance.rs index 64fab4d..3f52bdb 100644 --- a/src/metrics/distance.rs +++ b/src/metrics/distance.rs @@ -14,8 +14,6 @@ impl Element { } } - - // non recursive implementation requires a table // my guess is that this is more efficient (should check) pub(crate) fn get_edit_distance_table(word1: &str, word2: &str) -> Vec> { diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 04f720c..fabd061 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -1,5 +1,10 @@ pub mod distance; +/// Calculate the Levenshtein edit-distance between two strings. +/// The edit distance is the number of characters that need to be substituted, inserted, or deleted, to transform s1 into s2. +/// For example, transforming “rain” to “shine” requires three steps, consisting of two substitutions and one insertion: +/// “rain” -> “sain” -> “shin” -> “shine”. +/// These operations could have been done in other orders, but at least three steps are needed. 
pub fn edit_distance(s1: &str, s2: &str) -> usize {
    // Build the edit-distance table for the two inputs.
    let table = distance::get_edit_distance_table(s1, s2);
    // The result is read from the cell addressed by both input lengths.
    // NOTE(review): `str::len` counts bytes, not characters — confirm the table
    // is dimensioned the same way for non-ASCII input.
    let (row, col) = (s1.len(), s2.len());
    table[row][col].value
}