added some more documentation

This commit is contained in:
Sander Hautvast 2022-05-09 22:39:12 +02:00
parent ae7931774d
commit f72da25396
2 changed files with 8 additions and 0 deletions

View file

@ -2,6 +2,12 @@ __RLTK__
An attempt to manually port some of nltk to rust. An attempt to manually port some of nltk to rust.
from https://www.nltk.org/api/nltk.lm.html:
_So as to avoid re-creating the text in memory, both train and vocab are lazy iterators. They are evaluated on demand at training time._
rltk has the same philosophy: everything is done using iterators (on iterators) on string slices.
Currently in its infancy (but growing): Currently in its infancy (but growing):
* rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> \["\<s>", "a", "b", "c", "\</s>"] * rltk::lm::preprocessing::pad_both_ends(\["a","b","c"], 2) -> \["\<s>", "a", "b", "c", "\</s>"]
* rltk::util::pad_sequence == same as above with customisation * rltk::util::pad_sequence == same as above with customisation

View file

@ -5,6 +5,8 @@ pub mod distance;
/// For example, transforming “rain” to “shine” requires three steps, consisting of two substitutions and one insertion: /// For example, transforming “rain” to “shine” requires three steps, consisting of two substitutions and one insertion:
/// “rain” -> “sain” -> “shin” -> “shine”. /// “rain” -> “sain” -> “shin” -> “shine”.
/// These operations could have been done in other orders, but at least three steps are needed. /// These operations could have been done in other orders, but at least three steps are needed.
///
/// Note: the substitution cost is (for now at least) hard-coded as 2, so results can differ from the unit-cost example above.
///
/// Looks up the cell `[s1.len()][s2.len()]` of the table returned by
/// `distance::get_edit_distance_table(s1, s2)` and returns its `value` field.
/// NOTE(review): `str::len` is a byte count, not a character count — confirm the
/// table is dimensioned the same way for non-ASCII input.
pub fn edit_distance(s1: &str, s2: &str) -> usize { pub fn edit_distance(s1: &str, s2: &str) -> usize {
distance::get_edit_distance_table(s1, s2)[s1.len()][s2.len()].value distance::get_edit_distance_table(s1, s2)[s1.len()][s2.len()].value
} }