added some documentation on the new functions
This commit is contained in:
parent
294158a640
commit
715ea83328
3 changed files with 10 additions and 4 deletions
|
|
@ -2,12 +2,15 @@ __RLTK__
|
||||||
|
|
||||||
An attempt to manually port some of nltk to rust.
|
An attempt to manually port some of nltk to rust.
|
||||||
|
|
||||||
Currently in its infancy:
|
Currently in its infancy (but growing):
|
||||||
* rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> \["\<s>", "a", "b", "c", "\</s>"]
|
* rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> \["\<s>", "a", "b", "c", "\</s>"]
|
||||||
* rltk::util::pad_sequence == same as above with customisation
|
* rltk::util::pad_sequence == same as above with customisation
|
||||||
* rltk::util::pad_sequence_left == same
|
* rltk::util::pad_sequence_left == same
|
||||||
* rltk::util::pad_sequence_right == same
|
* rltk::util::pad_sequence_right == same
|
||||||
* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a"], \["b"], \["b"], \["c"]]
|
* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a", "b"], \["b", "c"]]
|
||||||
* rltk::util::bigrams(\["a","b","c"]) == ngrams(..., 2)
|
* rltk::util::bigrams(\["a","b","c"]) == ngrams(..., 2)
|
||||||
* rltk::util::trigrams(\["a","b","c"]) == ngrams(..., 3)
|
* rltk::util::trigrams(\["a","b","c"]) == ngrams(..., 3)
|
||||||
|
* rltk::util::everygrams(\["a","b","c"],2) == \[\["a"], \["a", "b"], \["b"], \["b", "c"]]
|
||||||
|
* rltk::util::flatten(\[\["a"], \["a", "b"], \["b"], \["b", "c"]]) == \["a", "a", "b", "b", "b", "c"]
|
||||||
|
* rltk::metrics::distance::edit_distance("rain", "shine") == 3
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,8 +14,6 @@ impl Element {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// non recursive implementation requires a table
|
// non recursive implementation requires a table
|
||||||
// my guess is that this is more efficient (should check)
|
// my guess is that this is more efficient (should check)
|
||||||
pub(crate) fn get_edit_distance_table(word1: &str, word2: &str) -> Vec<Vec<Element>> {
|
pub(crate) fn get_edit_distance_table(word1: &str, word2: &str) -> Vec<Vec<Element>> {
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,10 @@
|
||||||
pub mod distance;
|
pub mod distance;
|
||||||
|
|
||||||
|
/// Calculate the Levenshtein edit-distance between two strings.
|
||||||
|
/// The edit distance is the number of characters that need to be substituted, inserted, or deleted, to transform s1 into s2.
|
||||||
|
/// For example, transforming “rain” to “shine” requires three steps, consisting of two substitutions and one insertion:
|
||||||
|
/// “rain” -> “sain” -> “shin” -> “shine”.
|
||||||
|
/// These operations could have been done in other orders, but at least three steps are needed.
|
||||||
pub fn edit_distance(s1: &str, s2: &str) -> usize {
|
pub fn edit_distance(s1: &str, s2: &str) -> usize {
|
||||||
distance::get_edit_distance_table(s1, s2)[s1.len()][s2.len()].value
|
distance::get_edit_distance_table(s1, s2)[s1.len()][s2.len()].value
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue