added some documentation on the new functions

This commit is contained in:
Sander Hautvast 2022-05-09 22:26:40 +02:00
parent 294158a640
commit 715ea83328
3 changed files with 10 additions and 4 deletions

View file

@ -2,12 +2,15 @@ __RLTK__
An attempt to manually port some of nltk to rust.
Currently in its infancy:
Currently in its infancy (but growing):
* rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> \["\<s>", "a", "b", "c", "\</s>"]
* rltk::util::pad_sequence == same as above with customisation
* rltk::util::pad_sequence_left == same
* rltk::util::pad_sequence_right == same
* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a"], \["b"], \["b"], \["c"]]
* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a", "b"], \["b", "c"]]
* rltk::util::bigrams(\["a","b","c"]) == ngrams(..., 2)
* rltk::util::trigrams(\["a","b","c"]) == ngrams(..., 3)
* rltk::util::everygrams(\["a","b","c"],2) == \[\["a"], \["a", "b"], \["b"], \["b", "c"]]
* rltk::util::flatten(\[\["a"], \["a", "b"], \["b"], \["b", "c"]]) == \["a", "a", "b", "b", "b", "c"]
* rltk::metrics::distance::edit_distance("rain", "shine") -> 3

View file

@ -14,8 +14,6 @@ impl Element {
}
}
// non recursive implementation requires a table
// my guess is that this is more efficient (should check)
pub(crate) fn get_edit_distance_table(word1: &str, word2: &str) -> Vec<Vec<Element>> {

View file

@ -1,5 +1,10 @@
pub mod distance;
/// Returns the Levenshtein edit distance between `s1` and `s2`.
///
/// The edit distance is the minimum number of single-character substitutions, insertions and
/// deletions needed to turn `s1` into `s2`.
///
/// For example, turning "rain" into "shine" takes three steps (two substitutions and one
/// insertion): "rain" -> "sain" -> "shin" -> "shine".
/// The steps may be applied in a different order, but no fewer than three will do.
///
/// NOTE(review): the result cell is looked up with `str::len`, which counts bytes rather than
/// characters — confirm that `get_edit_distance_table` is sized the same way for non-ASCII input.
pub fn edit_distance(s1: &str, s2: &str) -> usize {
    // The distance sits in the table cell addressed by the lengths of both inputs.
    let table = distance::get_edit_distance_table(s1, s2);
    let (row, col) = (s1.len(), s2.len());
    table[row][col].value
}