This commit is contained in:
parent
9e919336be
commit
b1f6802568
11 changed files with 2930 additions and 34 deletions
|
|
@ -6,10 +6,14 @@ edition = "2021"
|
||||||
[dependencies]
|
[dependencies]
|
||||||
unicode-segmentation = "1"
|
unicode-segmentation = "1"
|
||||||
anyhow = "1"
|
anyhow = "1"
|
||||||
#regex = "1"
|
regex = "1.5"
|
||||||
|
lazy_static = "1"
|
||||||
|
crossbeam-channel = "0.5"
|
||||||
|
chrono = "0.4"
|
||||||
|
# removed duplicate key: `unicode-segmentation = "1"` is already declared above,
# and Cargo rejects a manifest with the same dependency listed twice
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
more-asserts = "0.2.2"
|
more-asserts = "0.3"
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
name = "count"
|
name = "count"
|
||||||
|
|
|
||||||
10
dat/default_substitute.dat
Normal file
10
dat/default_substitute.dat
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
àáâäæãåāÀÁÂÄÆÃÅĀ:a
|
||||||
|
èéêëēėęÈÉÊËĒĖĘ:e
|
||||||
|
îïíīįìÎÏÍĪĮÌ:i
|
||||||
|
ûüùúūÛÜÙÚŪ:u
|
||||||
|
ôöòóœøōõÔÖÒÓŒØŌÕ:o
|
||||||
|
çćčÇĆČ:c
|
||||||
|
ÿŸ:y
|
||||||
|
łŁ:l
|
||||||
|
ñńÑŃ:n
|
||||||
|
žźżŽŹŻ:z
|
||||||
13
dat/default_whitelist.dat
Normal file
13
dat/default_whitelist.dat
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
abcdefghijklmnopqrstuvwxyz
|
||||||
|
ABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||||
|
.,;
|
||||||
|
àáâäæãåāÀÁÂÄÆÃÅĀ
|
||||||
|
èéêëēėęÈÉÊËĒĖĘ
|
||||||
|
îïíīįìÎÏÍĪĮÌ
|
||||||
|
ûüùúūÛÜÙÚŪ
|
||||||
|
ôöòóœøōõÔÖÒÓŒØŌÕ
|
||||||
|
çćčÇĆČ
|
||||||
|
ÿŸ
|
||||||
|
łŁ
|
||||||
|
ñńÑŃ
|
||||||
|
žźżŽŹŻ
|
||||||
115
src/bin/clean.rs
Normal file
115
src/bin/clean.rs
Normal file
|
|
@ -0,0 +1,115 @@
|
||||||
|
#![warn(clippy::all, clippy::pedantic)]
|
||||||
|
|
||||||
|
use std::{env, io, process, thread};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::BufRead;
|
||||||
|
use chrono::{DateTime, Local};
|
||||||
|
use crossbeam_channel::bounded;
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
|
use crate::worker::ThreadPool;
|
||||||
|
|
||||||
|
const BUFFER_SIZE: usize = 10_000;
|
||||||
|
|
||||||
|
fn main() -> Result<(), io::Error> {
|
||||||
|
let args: Vec<String> = env::args().collect();
|
||||||
|
if args.len() < 2 {
|
||||||
|
eprintln!("Usage: clean [whitelist-filename] input-filename");
|
||||||
|
process::exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
let input_filename = if args.len() == 3 {
|
||||||
|
&args[2]
|
||||||
|
} else {
|
||||||
|
&args[1]
|
||||||
|
};
|
||||||
|
|
||||||
|
let start: DateTime<Local> = Local::now();
|
||||||
|
|
||||||
|
let file = File::open(input_filename)?;
|
||||||
|
let reader = io::BufReader::new(file);
|
||||||
|
|
||||||
|
let mut worker_pool = ThreadPool::new(4); // TODO turn into cmdline argument
|
||||||
|
let (merge_sender, merge_receiver) = bounded::<Option<Vec<String>>>(8);
|
||||||
|
|
||||||
|
{
|
||||||
|
thread::spawn(move || {
|
||||||
|
loop {
|
||||||
|
// handle Error case silently
|
||||||
|
let maybe_buffer = merge_receiver.recv().unwrap_or(None);
|
||||||
|
if let Some(clean_buffer) = maybe_buffer {
|
||||||
|
for line in clean_buffer {
|
||||||
|
println!("{}", line);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// None is the break signal (may be made nicer using enum)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut line_count = 0usize;
|
||||||
|
let mut buffer = Vec::with_capacity(BUFFER_SIZE);
|
||||||
|
for line in reader.lines().flatten() {
|
||||||
|
if line_count < BUFFER_SIZE {
|
||||||
|
buffer.push(line);
|
||||||
|
line_count += 1;
|
||||||
|
} else {
|
||||||
|
// multi producer, single consumer. So multiple senders, using clone
|
||||||
|
let merge_sender = merge_sender.clone();
|
||||||
|
worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
|
||||||
|
buffer = Vec::with_capacity(BUFFER_SIZE);
|
||||||
|
line_count = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// last, partially filled buffer
|
||||||
|
worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
|
||||||
|
|
||||||
|
worker_pool.wait();
|
||||||
|
|
||||||
|
eprintln!("took {} seconds", Local::now().signed_duration_since(start));
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_line_buffer(buffer: Vec<String>, sender: &crossbeam_channel::Sender<Option<Vec<String>>>) {
|
||||||
|
lazy_static! {
|
||||||
|
static ref args: Vec<String> = env::args().collect();
|
||||||
|
static ref w:Vec<String> = if args.len() == 3 {
|
||||||
|
let whitelist_filename = &args[1];
|
||||||
|
read_whitelist(whitelist_filename).unwrap()
|
||||||
|
} else {
|
||||||
|
read_std_whitelist().unwrap()
|
||||||
|
};
|
||||||
|
static ref WHITELIST:Vec<&'static str> = w.iter().map(|s| s.as_str()).collect::<Vec<&'static str>>();
|
||||||
|
}
|
||||||
|
let mut clean_buffer = vec![];
|
||||||
|
for line in buffer {
|
||||||
|
let mut clean_line = String::new();
|
||||||
|
line.graphemes(true).filter(|g| WHITELIST.contains(g)).for_each(|c| clean_line.push_str(c));
|
||||||
|
clean_buffer.push(clean_line);
|
||||||
|
}
|
||||||
|
sender.send(Some(clean_buffer)).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_whitelist(filename: &String) -> Result<Vec<String>, io::Error> {
|
||||||
|
let file = File::open(filename)?;
|
||||||
|
let reader = io::BufReader::new(file);
|
||||||
|
let mut whitelist = vec![];
|
||||||
|
for line in reader.lines().flatten() {
|
||||||
|
line.graphemes(true).for_each(|token| whitelist.push(token.to_owned()));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(whitelist)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_std_whitelist() -> Result<Vec<String>, io::Error> {
|
||||||
|
let whitelist_dat = include_str!("../../dat/default_whitelist.dat");
|
||||||
|
let mut whitelist = vec![];
|
||||||
|
|
||||||
|
whitelist_dat.graphemes(true).for_each(|token| whitelist.push(token.to_owned()));
|
||||||
|
|
||||||
|
Ok(whitelist)
|
||||||
|
}
|
||||||
123
src/bin/count.rs
123
src/bin/count.rs
|
|
@ -1,3 +1,122 @@
|
||||||
fn main(){
|
#![warn(clippy::all, clippy::pedantic)]
|
||||||
rltk::pipelines::count();
|
|
||||||
|
use std::{env, process};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{self, BufRead};
|
||||||
|
use std::io::Error;
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
use std::thread;
|
||||||
|
|
||||||
|
use chrono::prelude::*;
|
||||||
|
use crossbeam_channel::bounded;
|
||||||
|
|
||||||
|
use crate::worker::ThreadPool;
|
||||||
|
|
||||||
|
const BUFFER_SIZE: usize = 10_000;
|
||||||
|
|
||||||
|
/// Creates counts for single words in a text (file). These can be used as preprocessing for (sub)word tokenization
|
||||||
|
/// Writes the result to a new file.
|
||||||
|
///
|
||||||
|
/// removes all interpunction and special characters
|
||||||
|
///
|
||||||
|
/// Uses a fork-join pattern using 8 threads (can be changed in the code).
|
||||||
|
///
|
||||||
|
pub fn main() -> Result<(), Error> {
|
||||||
|
let args: Vec<String> = env::args().collect();
|
||||||
|
if args.len() < 2 {
|
||||||
|
eprintln!("Usage: tokenize [filename]");
|
||||||
|
process::exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let input_filename = &args[1];
|
||||||
|
let start: DateTime<Local> = Local::now();
|
||||||
|
|
||||||
|
eprintln!("started at {:?}", start);
|
||||||
|
let file = File::open(input_filename)?;
|
||||||
|
let reader = io::BufReader::new(file);
|
||||||
|
|
||||||
|
// counter for determining if the buffer is full (and ready to be passed to a worker thread)
|
||||||
|
let mut line_count = 0usize;
|
||||||
|
|
||||||
|
let mut worker_pool = ThreadPool::new(4); // TODO turn into cmdline argument
|
||||||
|
|
||||||
|
// this is the end result before writing to file
|
||||||
|
let merge_counter = Arc::new(Mutex::new(HashMap::new()));
|
||||||
|
|
||||||
|
// bounded channel for sending intermediate results (counts for a single buffer) to the merger
|
||||||
|
// let (merge_sender, merge_receiver) = crossbeam_channel::bounded::<Option<HashMap<String, usize>>>(8); //#threads
|
||||||
|
let (merge_sender, merge_receiver) = bounded::<Option<HashMap<String, usize>>>(8);
|
||||||
|
// crossbeam_channel::bounded::<Option<HashMap<String, usize>>>(8); //#threads
|
||||||
|
|
||||||
|
// create new scope for the merge_counter clone
|
||||||
|
{
|
||||||
|
let merge_counter = Arc::clone(&merge_counter);
|
||||||
|
// create a single thread that waits for intermediate counts to process them in the overall result
|
||||||
|
thread::spawn(move || {
|
||||||
|
loop {
|
||||||
|
// handle Error case silently
|
||||||
|
let maybe_buffer = merge_receiver.recv().unwrap_or(None);
|
||||||
|
if let Some(counter) = maybe_buffer {
|
||||||
|
// create a lock on the merge_counter (this is the only thread while processing). The main thread will read later
|
||||||
|
let mut merge_counter = merge_counter.lock().unwrap();
|
||||||
|
|
||||||
|
// update counts and discard intermediate result
|
||||||
|
for (word, count) in counter {
|
||||||
|
let entry = merge_counter.entry(word).or_insert(0);
|
||||||
|
*entry += count;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// None is the break signal (may be made nicer using enum)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// read a file of text with line separators
|
||||||
|
// when the buffer is full, pass it to a handle_line_buffer function
|
||||||
|
let mut buffer = Vec::with_capacity(BUFFER_SIZE);
|
||||||
|
for line in reader.lines().flatten() {
|
||||||
|
if line_count < BUFFER_SIZE {
|
||||||
|
buffer.push(line);
|
||||||
|
line_count += 1;
|
||||||
|
} else {
|
||||||
|
// multi producer, single consumer. So multiple senders, using clone
|
||||||
|
let merge_sender = merge_sender.clone();
|
||||||
|
worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
|
||||||
|
buffer = Vec::with_capacity(BUFFER_SIZE);
|
||||||
|
line_count = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// last, partially filled buffer
|
||||||
|
worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
|
||||||
|
|
||||||
|
// wait for processing to finish
|
||||||
|
worker_pool.wait();
|
||||||
|
|
||||||
|
for (word, count) in merge_counter.lock().unwrap().iter() {
|
||||||
|
// cutoff is 5 (inclusive min word count)
|
||||||
|
// TODO parametrize
|
||||||
|
if *count > 4 {
|
||||||
|
println!("{}: {}", word, count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
eprintln!("took {} seconds", Local::now().signed_duration_since(start));
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_line_buffer(buffer: Vec<String>, sender: &crossbeam_channel::Sender<Option<HashMap<String, usize>>>) {
|
||||||
|
let mut counter = HashMap::new();
|
||||||
|
|
||||||
|
for line in buffer {
|
||||||
|
for word in line.split(&[' ', ',', '.', ';']) {
|
||||||
|
let count = counter.entry(word.to_lowercase().to_owned()).or_insert(0);
|
||||||
|
*count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sender.send(Some(counter)).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
||||||
3
src/bin/mod.rs
Normal file
3
src/bin/mod.rs
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
mod count;
|
||||||
|
mod clean;
|
||||||
|
mod substitute;
|
||||||
127
src/bin/substitute.rs
Normal file
127
src/bin/substitute.rs
Normal file
|
|
@ -0,0 +1,127 @@
|
||||||
|
#![warn(clippy::all, clippy::pedantic)]
|
||||||
|
|
||||||
|
use std::{env, io, process, thread};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::hash::Hash;
|
||||||
|
use std::io::BufRead;
|
||||||
|
use chrono::{DateTime, Local};
|
||||||
|
use crossbeam_channel::bounded;
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
|
use crate::worker::ThreadPool;
|
||||||
|
|
||||||
|
const BUFFER_SIZE: usize = 10_000;
|
||||||
|
|
||||||
|
fn main() -> Result<(), io::Error> {
|
||||||
|
let args: Vec<String> = env::args().collect();
|
||||||
|
if args.len() < 2 {
|
||||||
|
eprintln!("Usage: clean [whitelist-filename] input-filename");
|
||||||
|
process::exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
let input_filename = if args.len() == 3 {
|
||||||
|
&args[2]
|
||||||
|
} else {
|
||||||
|
&args[1]
|
||||||
|
};
|
||||||
|
|
||||||
|
let start: DateTime<Local> = Local::now();
|
||||||
|
|
||||||
|
let file = File::open(input_filename)?;
|
||||||
|
let reader = io::BufReader::new(file);
|
||||||
|
|
||||||
|
let mut worker_pool = ThreadPool::new(4); // TODO turn into cmdline argument
|
||||||
|
let (merge_sender, merge_receiver) = bounded::<Option<Vec<String>>>(8);
|
||||||
|
|
||||||
|
{
|
||||||
|
thread::spawn(move || {
|
||||||
|
loop {
|
||||||
|
// handle Error case silently
|
||||||
|
let maybe_buffer = merge_receiver.recv().unwrap_or(None);
|
||||||
|
if let Some(clean_buffer) = maybe_buffer {
|
||||||
|
for line in clean_buffer {
|
||||||
|
println!("{}", line);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// None is the break signal (may be made nicer using enum)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut line_count = 0usize;
|
||||||
|
let mut buffer = Vec::with_capacity(BUFFER_SIZE);
|
||||||
|
for line in reader.lines().flatten() {
|
||||||
|
if line_count < BUFFER_SIZE {
|
||||||
|
buffer.push(line);
|
||||||
|
line_count += 1;
|
||||||
|
} else {
|
||||||
|
// multi producer, single consumer. So multiple senders, using clone
|
||||||
|
let merge_sender = merge_sender.clone();
|
||||||
|
worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
|
||||||
|
buffer = Vec::with_capacity(BUFFER_SIZE);
|
||||||
|
line_count = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// last, partially filled buffer
|
||||||
|
worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
|
||||||
|
|
||||||
|
worker_pool.wait();
|
||||||
|
|
||||||
|
eprintln!("took {} seconds", Local::now().signed_duration_since(start));
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_line_buffer(buffer: Vec<String>, sender: &crossbeam_channel::Sender<Option<Vec<String>>>) {
|
||||||
|
lazy_static! {
|
||||||
|
static ref args: Vec<String> = env::args().collect();
|
||||||
|
static ref SUBST:HashMap<String, String> = if args.len() == 3 {
|
||||||
|
let whitelist_filename = &args[1];
|
||||||
|
read_whitelist(whitelist_filename).unwrap()
|
||||||
|
} else {
|
||||||
|
read_std_whitelist().unwrap()
|
||||||
|
};
|
||||||
|
static ref WHITELIST:HashMap<&'static str, &'static str> = SUBST.iter().map(|(s,t)| (s.as_str(),t.as_str())).collect::<HashMap<&'static str, &'static str>>();
|
||||||
|
}
|
||||||
|
let mut clean_buffer = vec![];
|
||||||
|
for line in buffer {
|
||||||
|
let mut clean_line = String::new();
|
||||||
|
line.graphemes(true).map(|g|*(SUBST.get(g)).or_else(||Some(g.to_owned()))).for_each(|c| clean_line.push_str(c.unwrap()));
|
||||||
|
clean_buffer.push(clean_line);
|
||||||
|
}
|
||||||
|
|
||||||
|
sender.send(Some(clean_buffer)).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_whitelist(filename: &String) -> Result<HashMap<String, String>, io::Error> {
|
||||||
|
let file = File::open(filename)?;
|
||||||
|
let reader = io::BufReader::new(file);
|
||||||
|
let mut substitutions = HashMap::new();
|
||||||
|
|
||||||
|
for line in reader.lines().flatten() {
|
||||||
|
let key_value = line.split(":").map(|e|e.to_owned()).collect::<Vec<String>>();
|
||||||
|
let source = key_value[0].clone();
|
||||||
|
let dest = key_value[1].clone();
|
||||||
|
source.graphemes(true).for_each(|token| { substitutions.insert(token.to_owned(), dest.clone()); });
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(substitutions)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_std_whitelist() -> Result<HashMap<String, String>, io::Error> {
|
||||||
|
let substitute_dat = include_str!("../../dat/default_substitute.dat");
|
||||||
|
|
||||||
|
let mut substitutions = HashMap::new();
|
||||||
|
|
||||||
|
for line in substitute_dat.split("\n") {
|
||||||
|
let key_value = line.split(":").map(|e|e.to_owned()).collect::<Vec<String>>();
|
||||||
|
let source = key_value[0].clone();
|
||||||
|
let dest = key_value[1].clone();
|
||||||
|
source.graphemes(true).for_each(|token| { substitutions.insert(token.to_owned(), dest.clone()); });
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(substitutions)
|
||||||
|
}
|
||||||
|
|
@ -3,6 +3,7 @@ pub mod util;
|
||||||
pub mod metrics;
|
pub mod metrics;
|
||||||
pub mod mat;
|
pub mod mat;
|
||||||
pub mod pipelines;
|
pub mod pipelines;
|
||||||
|
mod worker;
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub(crate) mod test;
|
pub(crate) mod test;
|
||||||
|
|
|
||||||
|
|
@ -1,29 +0,0 @@
|
||||||
use std::collections::BTreeMap;
|
|
||||||
use std::collections::hash_map::DefaultHasher;
|
|
||||||
use std::fs::File;
|
|
||||||
use std::hash::{Hash, Hasher};
|
|
||||||
use std::io::{self, BufRead};
|
|
||||||
|
|
||||||
pub fn count() -> anyhow::Result<()> {
|
|
||||||
let mut store: BTreeMap<String, usize> = BTreeMap::new();
|
|
||||||
let stdin = io::stdin();
|
|
||||||
for line in stdin.lock().lines() {
|
|
||||||
let line = line?;
|
|
||||||
for token in line.split(|c: char| c.is_ascii_punctuation() || c.is_whitespace()) {
|
|
||||||
let count = store.entry(token.to_owned()).or_insert(0);
|
|
||||||
*count += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (key, value) in store{
|
|
||||||
println!("{}:{}", key,value);
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Hashes `string` with the standard-library `DefaultHasher` and returns the
/// resulting 64-bit digest.
fn hash(string: &str) -> u64 {
    let mut state = DefaultHasher::new();
    string.hash(&mut state);
    state.finish()
}
|
|
||||||
|
|
||||||
// TODO(review): empty stub — presumably intended to build a binary
// bag-of-words representation from `file`; unimplemented and unused.
pub fn create_binary_bow(file: File) {}
|
|
||||||
95
src/worker.rs
Normal file
95
src/worker.rs
Normal file
|
|
@ -0,0 +1,95 @@
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
use std::thread;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
// custom threadpool implementation
|
||||||
|
// may be replaced by crate, if I find the right one. Have looked at crossbeam and rayon.
|
||||||
|
// Rayon par_iter for instance doesn't seem usable, because it lacks buffering, which I guess would increase context switching
|
||||||
|
// this is in principle an uncorroborated thought
|
||||||
|
// It does use crossbeam_channel::bounded, which is crucial to avoid memory overload (more than 30 Gbs instead of at most ~3 in the current implementation)
|
||||||
|
/// Control messages consumed by the pool's worker threads.
enum Message {
    /// A job (boxed closure) for the receiving worker to run once.
    NewJob(Job),
    /// Ask the receiving worker to shut down its loop.
    Terminate,
}
|
||||||
|
|
||||||
|
/// A fixed-size pool of worker threads fed jobs through a bounded
/// crossbeam channel (the bound provides back-pressure against memory blowup).
pub struct ThreadPool {
    // joined (and drained) by `wait`
    workers: Vec<Worker>,
    // shared job/terminate channel; workers hold the receiving end
    sender: crossbeam_channel::Sender<Message>,
}
|
||||||
|
|
||||||
|
/// A boxed, sendable closure executed exactly once by a worker thread.
type Job = Box<dyn FnOnce() + Send + 'static>;
|
||||||
|
|
||||||
|
impl ThreadPool {
|
||||||
|
/// Create a new threadpool
|
||||||
|
/// size: the number of threads in the pool
|
||||||
|
/// # Panics
|
||||||
|
#[must_use = "size must be >0"]
|
||||||
|
pub fn new(size: usize) -> Self {
|
||||||
|
assert!(size > 0);
|
||||||
|
|
||||||
|
// The max number of messages in the channel should be higher as the number of workerthreads grows
|
||||||
|
// to avoid idle threads.
|
||||||
|
let (sender, receiver) = crossbeam_channel::bounded(size * 2); // just a stab
|
||||||
|
let receiver = Arc::new(Mutex::new(receiver));
|
||||||
|
let mut workers = Vec::with_capacity(size);
|
||||||
|
|
||||||
|
for _ in 0..size {
|
||||||
|
workers.push(Worker::new(Arc::clone(&receiver)));
|
||||||
|
}
|
||||||
|
|
||||||
|
Self {
|
||||||
|
workers,
|
||||||
|
sender,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// submit a function for multithreaded processing
|
||||||
|
pub fn execute<F>(&self, f: F) where F: FnOnce() + Send + 'static {
|
||||||
|
let job = Box::new(f);
|
||||||
|
|
||||||
|
self.sender.send(Message::NewJob(job)).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// wait for workerthreads to finish
|
||||||
|
pub fn wait(&mut self) {
|
||||||
|
for worker in &mut self.workers {
|
||||||
|
self.sender.send(Message::Terminate).unwrap_or(());
|
||||||
|
if let Some(thread) = worker.thread.take() {
|
||||||
|
thread.join().unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A single pool thread; `thread` stays `Some` until `ThreadPool::wait`
/// takes and joins the handle.
struct Worker {
    thread: Option<thread::JoinHandle<()>>,
}
|
||||||
|
|
||||||
|
impl Worker {
    /// Spawns the worker's thread, which loops pulling `Message`s off the
    /// shared receiver until it is told to terminate or the channel goes
    /// quiet/disconnects.
    fn new(receiver: Arc<Mutex<crossbeam_channel::Receiver<Message>>>) -> Self {
        let thread = thread::spawn(move || {
            loop {
                // idle worker threads die quickly, so avoid this situation
                // the timeout can be increased if that is not what you want
                // For small workloads the timeout is necessary, because some theads may never get the Terminate command and thus prevent the process from ending.
                // NOTE(review): the Mutex guard is dropped at the end of this
                // statement, so only the recv itself is serialized.
                let message_r = receiver.lock().unwrap().recv_timeout(Duration::from_secs(1));
                if let Ok(message) = message_r {
                    match message {
                        Message::NewJob(job) => {
                            job();
                        }
                        Message::Terminate => {
                            break;
                        }
                    }
                } else {
                    // timeout or disconnect: both are treated as shutdown
                    break;
                }
            }
        });

        Self {
            thread: Some(thread),
        }
    }
}
|
||||||
Loading…
Add table
Reference in a new issue