inclusion of similar work
Some checks failed
Rust / build (push) Has been cancelled

This commit is contained in:
Sander Hautvast 2022-12-18 17:36:43 +01:00
parent 9e919336be
commit b1f6802568
11 changed files with 2930 additions and 34 deletions

View file

@ -6,10 +6,14 @@ edition = "2021"
[dependencies] [dependencies]
unicode-segmentation = "1" unicode-segmentation = "1"
anyhow = "1" anyhow = "1"
#regex = "1" regex="1.5"
lazy_static = "1"
crossbeam-channel = "0.5"
chrono = "0.4"
unicode-segmentation = "1.9.0"
[dev-dependencies] [dev-dependencies]
more-asserts = "0.2.2" more-asserts = "0.3"
[[bin]] [[bin]]
name = "count" name = "count"

View file

@ -0,0 +1,10 @@
àáâäæãåāÀÁÂÄÆÃÅĀ:a
èéêëēėęÈÉÊËĒĖĘ:e
îïíīįìÎÏÍĪĮÌ:i
ûüùúūÛÜÙÚŪ:u
ôöòóœøōõÔÖÒÓŒØŌÕ:o
çćčÇĆČ:c
ÿŸ:y
łŁ:l
ñńÑŃ:n
žźżŽŹŻ:z

13
dat/default_whitelist.dat Normal file
View file

@ -0,0 +1,13 @@
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
.,;
àáâäæãåāÀÁÂÄÆÃÅĀ
èéêëēėęÈÉÊËĒĖĘ
îïíīįìÎÏÍĪĮÌ
ûüùúūÛÜÙÚŪ
ôöòóœøōõÔÖÒÓŒØŌÕ
çćčÇĆČ
ÿŸ
łŁ
ñńÑŃ
žźżŽŹŻ

2438
patch.txt Normal file

File diff suppressed because it is too large Load diff

115
src/bin/clean.rs Normal file
View file

@ -0,0 +1,115 @@
#![warn(clippy::all, clippy::pedantic)]
use std::{env, io, process, thread};
use std::collections::HashMap;
use std::fs::File;
use std::io::BufRead;
use chrono::{DateTime, Local};
use crossbeam_channel::bounded;
use lazy_static::lazy_static;
use unicode_segmentation::UnicodeSegmentation;
use crate::worker::ThreadPool;
const BUFFER_SIZE: usize = 10_000;
/// Cleans an input file: keeps only whitelisted graphemes and prints the result to stdout.
///
/// Usage: `clean [whitelist-filename] input-filename`
fn main() -> Result<(), io::Error> {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        eprintln!("Usage: clean [whitelist-filename] input-filename");
        process::exit(-1);
    }
    // with 3 args the whitelist comes first, so the input is the last argument
    let input_filename = if args.len() == 3 {
        &args[2]
    } else {
        &args[1]
    };
    let start: DateTime<Local> = Local::now();
    let file = File::open(input_filename)?;
    let reader = io::BufReader::new(file);
    let mut worker_pool = ThreadPool::new(4); // TODO turn into cmdline argument
    let (merge_sender, merge_receiver) = bounded::<Option<Vec<String>>>(8);
    // single printer thread; keep the JoinHandle so the last buffer is
    // guaranteed to be written before the process exits
    // (BUG FIX: the thread used to be detached, racing with process exit)
    let printer = thread::spawn(move || {
        loop {
            // a recv error means every sender is gone: treat it as the stop signal
            let maybe_buffer = merge_receiver.recv().unwrap_or(None);
            if let Some(clean_buffer) = maybe_buffer {
                for line in clean_buffer {
                    println!("{}", line);
                }
            } else {
                // None is the break signal (may be made nicer using enum)
                break;
            }
        }
    });
    let mut line_count = 0usize;
    let mut buffer = Vec::with_capacity(BUFFER_SIZE);
    for line in reader.lines().flatten() {
        if line_count < BUFFER_SIZE {
            buffer.push(line);
            line_count += 1;
        } else {
            // multi producer, single consumer. So multiple senders, using clone
            let merge_sender = merge_sender.clone();
            worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
            buffer = Vec::with_capacity(BUFFER_SIZE);
            // BUG FIX: the line that triggered the flush used to be dropped;
            // it now becomes the first line of the fresh buffer
            buffer.push(line);
            line_count = 1;
        }
    }
    // last, partially filled buffer
    worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
    worker_pool.wait();
    // after wait() all senders are dropped, so the printer sees a disconnected
    // channel, drains, and stops; join it so no output is lost
    printer.join().unwrap_or(());
    eprintln!("took {} seconds", Local::now().signed_duration_since(start));
    Ok(())
}
/// Filters every line in `buffer` down to whitelisted graphemes and sends the
/// cleaned buffer to the printer thread.
fn handle_line_buffer(buffer: Vec<String>, sender: &crossbeam_channel::Sender<Option<Vec<String>>>) {
    lazy_static! {
        // the command line is re-read here because lazy_static initialisation
        // runs once, lazily, in whichever worker thread gets here first
        // (renamed to UPPERCASE: lowercase statics trip non_upper_case_globals
        // under this file's #![warn(...)])
        static ref ARGS: Vec<String> = env::args().collect();
        // with 3 args the first one is a custom whitelist file,
        // otherwise the compiled-in default is used
        static ref WHITELIST_OWNED: Vec<String> = if ARGS.len() == 3 {
            read_whitelist(&ARGS[1]).unwrap()
        } else {
            read_std_whitelist().unwrap()
        };
        // borrowed view used for the per-grapheme contains() check
        static ref WHITELIST: Vec<&'static str> =
            WHITELIST_OWNED.iter().map(|s| s.as_str()).collect();
    }
    // one output line per input line, so the capacity is known up front
    let mut clean_buffer = Vec::with_capacity(buffer.len());
    for line in buffer {
        let mut clean_line = String::new();
        line.graphemes(true)
            .filter(|g| WHITELIST.contains(g))
            .for_each(|g| clean_line.push_str(g));
        clean_buffer.push(clean_line);
    }
    // a send failure means the printer thread is gone — that is a bug, so panic
    sender.send(Some(clean_buffer)).unwrap();
}
/// Reads a whitelist file: every grapheme occurring anywhere in the file
/// becomes an allowed grapheme.
///
/// Takes `&str` instead of `&String` (callers passing `&String` still work
/// through deref coercion).
///
/// # Errors
/// Returns `io::Error` when the file cannot be opened.
fn read_whitelist(filename: &str) -> Result<Vec<String>, io::Error> {
    let file = File::open(filename)?;
    let reader = io::BufReader::new(file);
    let mut whitelist = Vec::new();
    for line in reader.lines().flatten() {
        whitelist.extend(line.graphemes(true).map(ToOwned::to_owned));
    }
    Ok(whitelist)
}
/// Builds the whitelist from the default data that is compiled into the binary.
fn read_std_whitelist() -> Result<Vec<String>, io::Error> {
    let whitelist_dat = include_str!("../../dat/default_whitelist.dat");
    // one owned entry per grapheme in the embedded file
    let whitelist: Vec<String> = whitelist_dat
        .graphemes(true)
        .map(ToOwned::to_owned)
        .collect();
    Ok(whitelist)
}

View file

@ -1,3 +1,122 @@
fn main(){ #![warn(clippy::all, clippy::pedantic)]
rltk::pipelines::count();
} use std::{env, process};
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufRead};
use std::io::Error;
use std::sync::{Arc, Mutex};
use std::thread;
use chrono::prelude::*;
use crossbeam_channel::bounded;
use crate::worker::ThreadPool;
const BUFFER_SIZE: usize = 10_000;
/// Creates counts for single words in a text (file). These can be used as preprocessing for (sub)word tokenization.
/// Writes the result to stdout (words with count >= 5), timing info to stderr.
///
/// Removes all interpunction and special characters.
///
/// Uses a fork-join pattern with a worker pool (4 threads; see the TODO below
/// to make that configurable).
pub fn main() -> Result<(), Error> {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        eprintln!("Usage: tokenize [filename]");
        process::exit(-1);
    }
    let input_filename = &args[1];
    let start: DateTime<Local> = Local::now();
    eprintln!("started at {:?}", start);
    let file = File::open(input_filename)?;
    let reader = io::BufReader::new(file);
    // counter for determining if the buffer is full (and ready to be passed to a worker thread)
    let mut line_count = 0usize;
    let mut worker_pool = ThreadPool::new(4); // TODO turn into cmdline argument
    // this is the end result before writing to stdout
    let merge_counter = Arc::new(Mutex::new(HashMap::new()));
    // bounded channel for sending intermediate results (counts for a single buffer) to the merger
    let (merge_sender, merge_receiver) = bounded::<Option<HashMap<String, usize>>>(8);
    // single merger thread folding intermediate counts into the overall result;
    // keep the JoinHandle (BUG FIX: the thread used to be detached, so main
    // could read merge_counter before the channel was fully drained)
    let merger = {
        let merge_counter = Arc::clone(&merge_counter);
        thread::spawn(move || {
            loop {
                // a recv error means every sender is gone: treat it as the stop signal
                let maybe_buffer = merge_receiver.recv().unwrap_or(None);
                if let Some(counter) = maybe_buffer {
                    // only this thread locks while processing; the main thread
                    // reads only after join() below
                    let mut merge_counter = merge_counter.lock().unwrap();
                    // update counts and discard intermediate result
                    for (word, count) in counter {
                        *merge_counter.entry(word).or_insert(0) += count;
                    }
                } else {
                    // None is the break signal (may be made nicer using enum)
                    break;
                }
            }
        })
    };
    // read the input line by line; when the buffer is full, hand it to a worker
    let mut buffer = Vec::with_capacity(BUFFER_SIZE);
    for line in reader.lines().flatten() {
        if line_count < BUFFER_SIZE {
            buffer.push(line);
            line_count += 1;
        } else {
            // multi producer, single consumer. So multiple senders, using clone
            let merge_sender = merge_sender.clone();
            worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
            buffer = Vec::with_capacity(BUFFER_SIZE);
            // BUG FIX: the line that triggered the flush used to be dropped;
            // it now becomes the first line of the fresh buffer
            buffer.push(line);
            line_count = 1;
        }
    }
    // last, partially filled buffer
    worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
    // wait for the workers, then the merger: after wait() all senders are
    // dropped, so the merger drains the channel and exits
    worker_pool.wait();
    merger.join().unwrap_or(());
    for (word, count) in merge_counter.lock().unwrap().iter() {
        // cutoff is 5 (inclusive min word count)
        // TODO parametrize
        if *count > 4 {
            println!("{}: {}", word, count);
        }
    }
    eprintln!("took {} seconds", Local::now().signed_duration_since(start));
    Ok(())
}
/// Counts the (lower-cased) words of one buffer of lines and sends the
/// intermediate counts to the merger thread.
///
/// Words are separated by space, comma, period or semicolon. Consecutive
/// separators yield empty-string tokens, which are counted just like the
/// original implementation did (the >4 cutoff downstream usually hides them).
fn handle_line_buffer(buffer: Vec<String>, sender: &crossbeam_channel::Sender<Option<HashMap<String, usize>>>) {
    let mut counter = HashMap::new();
    for line in buffer {
        for word in line.split(&[' ', ',', '.', ';']) {
            // to_lowercase already yields an owned String
            // (the original chained a redundant .to_owned())
            *counter.entry(word.to_lowercase()).or_insert(0) += 1;
        }
    }
    // a send failure means the merger thread is gone — that is a bug, so panic
    sender.send(Some(counter)).unwrap();
}

3
src/bin/mod.rs Normal file
View file

@ -0,0 +1,3 @@
mod count;
mod clean;
mod substitute;

127
src/bin/substitute.rs Normal file
View file

@ -0,0 +1,127 @@
#![warn(clippy::all, clippy::pedantic)]
use std::{env, io, process, thread};
use std::collections::HashMap;
use std::fs::File;
use std::hash::Hash;
use std::io::BufRead;
use chrono::{DateTime, Local};
use crossbeam_channel::bounded;
use lazy_static::lazy_static;
use unicode_segmentation::UnicodeSegmentation;
use crate::worker::ThreadPool;
const BUFFER_SIZE: usize = 10_000;
/// Substitutes graphemes in an input file according to a substitution table
/// and prints the result to stdout.
///
/// Usage: `substitute [substitution-filename] input-filename`
fn main() -> Result<(), io::Error> {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        // BUG FIX: the message said "clean" — copy-paste from src/bin/clean.rs
        eprintln!("Usage: substitute [substitution-filename] input-filename");
        process::exit(-1);
    }
    // with 3 args the substitution file comes first, so the input is the last argument
    let input_filename = if args.len() == 3 {
        &args[2]
    } else {
        &args[1]
    };
    let start: DateTime<Local> = Local::now();
    let file = File::open(input_filename)?;
    let reader = io::BufReader::new(file);
    let mut worker_pool = ThreadPool::new(4); // TODO turn into cmdline argument
    let (merge_sender, merge_receiver) = bounded::<Option<Vec<String>>>(8);
    // single printer thread; keep the JoinHandle so the last buffer is
    // guaranteed to be written before the process exits
    // (BUG FIX: the thread used to be detached, racing with process exit)
    let printer = thread::spawn(move || {
        loop {
            // a recv error means every sender is gone: treat it as the stop signal
            let maybe_buffer = merge_receiver.recv().unwrap_or(None);
            if let Some(clean_buffer) = maybe_buffer {
                for line in clean_buffer {
                    println!("{}", line);
                }
            } else {
                // None is the break signal (may be made nicer using enum)
                break;
            }
        }
    });
    let mut line_count = 0usize;
    let mut buffer = Vec::with_capacity(BUFFER_SIZE);
    for line in reader.lines().flatten() {
        if line_count < BUFFER_SIZE {
            buffer.push(line);
            line_count += 1;
        } else {
            // multi producer, single consumer. So multiple senders, using clone
            let merge_sender = merge_sender.clone();
            worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
            buffer = Vec::with_capacity(BUFFER_SIZE);
            // BUG FIX: the line that triggered the flush used to be dropped;
            // it now becomes the first line of the fresh buffer
            buffer.push(line);
            line_count = 1;
        }
    }
    // last, partially filled buffer
    worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
    worker_pool.wait();
    // after wait() all senders are dropped, so the printer sees a disconnected
    // channel, drains, and stops; join it so no output is lost
    printer.join().unwrap_or(());
    eprintln!("took {} seconds", Local::now().signed_duration_since(start));
    Ok(())
}
/// Applies the substitution table to every grapheme of every line in `buffer`
/// and sends the result to the printer thread.
fn handle_line_buffer(buffer: Vec<String>, sender: &crossbeam_channel::Sender<Option<Vec<String>>>) {
    lazy_static! {
        // the command line is re-read here because lazy_static initialisation
        // runs once, lazily, in whichever worker thread gets here first
        // (renamed to UPPERCASE: lowercase statics trip non_upper_case_globals
        // under this file's #![warn(...)])
        static ref ARGS: Vec<String> = env::args().collect();
        // with 3 args the first one is a custom substitution file,
        // otherwise the compiled-in default is used
        static ref SUBST: HashMap<String, String> = if ARGS.len() == 3 {
            read_whitelist(&ARGS[1]).unwrap()
        } else {
            read_std_whitelist().unwrap()
        };
        // NOTE: the original also built an unused borrowed WHITELIST map here; removed
    }
    // one output line per input line, so the capacity is known up front
    let mut clean_buffer = Vec::with_capacity(buffer.len());
    for line in buffer {
        let mut clean_line = String::new();
        for g in line.graphemes(true) {
            // BUG FIX: the original expression `*(SUBST.get(g)).or_else(...)`
            // dereferenced an Option and mixed Option<&String> with
            // Option<String>; it did not compile. Substitute when a mapping
            // exists, otherwise keep the grapheme unchanged.
            match SUBST.get(g) {
                Some(replacement) => clean_line.push_str(replacement),
                None => clean_line.push_str(g),
            }
        }
        clean_buffer.push(clean_line);
    }
    // a send failure means the printer thread is gone — that is a bug, so panic
    sender.send(Some(clean_buffer)).unwrap();
}
fn read_whitelist(filename: &String) -> Result<HashMap<String, String>, io::Error> {
let file = File::open(filename)?;
let reader = io::BufReader::new(file);
let mut substitutions = HashMap::new();
for line in reader.lines().flatten() {
let key_value = line.split(":").map(|e|e.to_owned()).collect::<Vec<String>>();
let source = key_value[0].clone();
let dest = key_value[1].clone();
source.graphemes(true).for_each(|token| { substitutions.insert(token.to_owned(), dest.clone()); });
}
Ok(substitutions)
}
/// Builds the substitution table from the default data compiled into the binary.
///
/// BUG FIX: the original split on "\n" and indexed `key_value[1]`, which
/// panics on the empty token produced by a trailing newline in the .dat file;
/// `.lines()` plus skipping lines without a `:` avoids that.
fn read_std_whitelist() -> Result<HashMap<String, String>, io::Error> {
    let substitute_dat = include_str!("../../dat/default_substitute.dat");
    let mut substitutions = HashMap::new();
    for line in substitute_dat.lines() {
        let mut parts = line.split(':');
        if let (Some(source), Some(dest)) = (parts.next(), parts.next()) {
            for token in source.graphemes(true) {
                substitutions.insert(token.to_owned(), dest.to_owned());
            }
        }
    }
    Ok(substitutions)
}

View file

@ -3,6 +3,7 @@ pub mod util;
pub mod metrics; pub mod metrics;
pub mod mat; pub mod mat;
pub mod pipelines; pub mod pipelines;
mod worker;
#[cfg(test)] #[cfg(test)]
pub(crate) mod test; pub(crate) mod test;

View file

@ -1,29 +0,0 @@
use std::collections::BTreeMap;
use std::collections::hash_map::DefaultHasher;
use std::fs::File;
use std::hash::{Hash, Hasher};
use std::io::{self, BufRead};
/// Reads text from stdin, counts every token (split on ASCII punctuation and
/// whitespace) and prints `token:count` pairs, sorted by token.
pub fn count() -> anyhow::Result<()> {
    let mut store: BTreeMap<String, usize> = BTreeMap::new();
    let stdin = io::stdin();
    let handle = stdin.lock();
    for maybe_line in handle.lines() {
        let text = maybe_line?;
        let tokens = text.split(|c: char| c.is_ascii_punctuation() || c.is_whitespace());
        for token in tokens {
            *store.entry(token.to_owned()).or_insert(0) += 1;
        }
    }
    // BTreeMap iterates in key order, so the output is sorted
    for (key, value) in store {
        println!("{}:{}", key, value);
    }
    Ok(())
}
/// Hashes a string with std's default hasher and returns the 64-bit digest.
fn hash(string: &str) -> u64 {
    let mut state = DefaultHasher::new();
    string.hash(&mut state);
    state.finish()
}
pub fn create_binary_bow(file: File) {}

95
src/worker.rs Normal file
View file

@ -0,0 +1,95 @@
use std::sync::{Arc, Mutex};
use std::thread;
use std::time::Duration;
// custom threadpool implementation
// may be replaced by crate, if I find the right one. Have looked at crossbeam and rayon.
// Rayon par_iter for instance doesn't seem usable, because it lacks buffering, which I guess would increase context switching
// this is in principle an uncorroborated thought
// It does use crossbeam_channel::bounded, which is crucial to avoid memory overload (more than 30 Gbs instead of at most ~3 in the current implementation)
// Protocol between the pool and its workers.
enum Message {
    // a closure for a worker to run
    NewJob(Job),
    // tells one worker to exit its receive loop
    Terminate,
}
/// Fixed-size pool of worker threads fed through a bounded crossbeam channel.
pub struct ThreadPool {
    workers: Vec<Worker>,
    // producing side of the job channel; the receiving side is shared by the workers
    sender: crossbeam_channel::Sender<Message>,
}
// A job is any one-shot closure that can be moved to another thread.
type Job = Box<dyn FnOnce() + Send + 'static>;
impl ThreadPool {
    /// Create a new threadpool
    /// size: the number of threads in the pool
    ///
    /// # Panics
    /// Panics when `size` is 0.
    // BUG FIX: the must_use message used to restate the size precondition;
    // it should explain why discarding the return value is a bug
    #[must_use = "the pool must be kept alive for its workers to run"]
    pub fn new(size: usize) -> Self {
        assert!(size > 0);
        // The max number of messages in the channel should be higher as the number of workerthreads grows
        // to avoid idle threads.
        let (sender, receiver) = crossbeam_channel::bounded(size * 2); // just a stab
        let receiver = Arc::new(Mutex::new(receiver));
        let mut workers = Vec::with_capacity(size);
        for _ in 0..size {
            workers.push(Worker::new(Arc::clone(&receiver)));
        }
        Self {
            workers,
            sender,
        }
    }

    /// submit a function for multithreaded processing
    ///
    /// # Panics
    /// Panics when the channel is disconnected (all worker threads gone).
    pub fn execute<F>(&self, f: F) where F: FnOnce() + Send + 'static {
        let job = Box::new(f);
        self.sender.send(Message::NewJob(job)).unwrap();
    }

    /// wait for workerthreads to finish
    pub fn wait(&mut self) {
        // send every Terminate first: any worker may pick up any message, so the
        // original send-then-join interleaving could stall each join for up to
        // the worker's 1-second receive timeout when the "wrong" worker took it
        for _ in &self.workers {
            self.sender.send(Message::Terminate).unwrap_or(());
        }
        for worker in &mut self.workers {
            if let Some(thread) = worker.thread.take() {
                thread.join().unwrap();
            }
        }
    }
}
/// One pool thread. `thread` is `Some` until `ThreadPool::wait` takes and joins it.
struct Worker {
    // Option so wait() can move the JoinHandle out with take()
    thread: Option<thread::JoinHandle<()>>,
}
impl Worker {
    /// Spawns the worker's receive loop on a new thread.
    /// The loop runs jobs until it receives `Terminate` or until no message
    /// arrives within the 1-second timeout.
    fn new(receiver: Arc<Mutex<crossbeam_channel::Receiver<Message>>>) -> Self {
        let thread = thread::spawn(move || {
            loop {
                // idle worker threads die quickly, so avoid this situation
                // the timeout can be increased if that is not what you want
                // For small workloads the timeout is necessary, because some threads may never get the Terminate command and thus prevent the process from ending.
                // NOTE: the MutexGuard temporary lives until the end of this
                // statement, so the lock is held for the whole (blocking)
                // recv_timeout — at most one worker waits on the channel at a time
                let message_r = receiver.lock().unwrap().recv_timeout(Duration::from_secs(1));
                if let Ok(message) = message_r {
                    match message {
                        Message::NewJob(job) => {
                            job();
                        }
                        Message::Terminate => {
                            break;
                        }
                    }
                } else {
                    // timeout or disconnected channel: exit the loop
                    break;
                }
            }
        });
        Self {
            thread: Some(thread),
        }
    }
}