added some documentation on the new functions
This commit is contained in:
parent
294158a640
commit
715ea83328
3 changed files with 10 additions and 4 deletions
|
|
@ -2,12 +2,15 @@ __RLTK__
|
|||
|
||||
An attempt to manually port parts of NLTK to Rust.
|
||||
|
||||
Currently in its infancy:
|
||||
Currently in its infancy (but growing):
|
||||
* rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> \["\<s>", "a", "b", "c", "\</s>"]
|
||||
* rltk::util::pad_sequence == same as above with customisation
|
||||
* rltk::util::pad_sequence_left == same
|
||||
* rltk::util::pad_sequence_right == same
|
||||
* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a"], \["b"], \["b"], \["c"]]
|
||||
* rltk::util::ngrams(\["a","b","c"],2) -> \[\["a", "b"], \["b", "c"]]
|
||||
* rltk::util::bigrams(\["a","b","c"]) == ngrams(..., 2)
|
||||
* rltk::util::trigrams(\["a","b","c"]) == ngrams(..., 3)
|
||||
* rltk::util::everygrams(\["a","b","c"],2) == \[\["a"], \["a", "b"], \["b"], \["b", "c"]]
|
||||
* rltk::util::flatten(\[\["a"], \["a", "b"], \["b"], \["b", "c"]]) == \["a", "a", "b", "b", "b", "c"]
|
||||
* rltk::metrics::distance::edit_distance("rain", "shine") -> 3
|
||||
|
||||
|
|
|
|||
|
|
@ -14,8 +14,6 @@ impl Element {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// non recursive implementation requires a table
|
||||
// my guess is that this is more efficient (should check)
|
||||
pub(crate) fn get_edit_distance_table(word1: &str, word2: &str) -> Vec<Vec<Element>> {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,10 @@
|
|||
pub mod distance;
|
||||
|
||||
/// Calculate the Levenshtein edit-distance between two strings.
|
||||
/// The edit distance is the number of characters that need to be substituted, inserted, or deleted, to transform s1 into s2.
|
||||
/// For example, transforming “rain” to “shine” requires three steps, consisting of two substitutions and one insertion:
|
||||
/// “rain” -> “sain” -> “shin” -> “shine”.
|
||||
/// These operations could have been done in other orders, but at least three steps are needed.
|
||||
pub fn edit_distance(s1: &str, s2: &str) -> usize {
|
||||
distance::get_edit_distance_table(s1, s2)[s1.len()][s2.len()].value
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue