rltk/src/bin/count.rs
Sander Hautvast b1f6802568
Some checks failed
Rust / build (push) Has been cancelled
inclusion of similar work
2022-12-18 17:36:43 +01:00

122 lines
4.3 KiB
Rust

#![warn(clippy::all, clippy::pedantic)]
use std::{env, process};
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufRead};
use std::io::Error;
use std::sync::{Arc, Mutex};
use std::thread;
use chrono::prelude::*;
use crossbeam_channel::bounded;
use crate::worker::ThreadPool;
/// Number of lines collected in a buffer before it is handed to a worker for counting.
const BUFFER_SIZE: usize = 10_000;
/// Counts single words in a text file and prints the counts to stdout.
///
/// The result can be used as preprocessing for (sub)word tokenization.
///
/// The input file (first command line argument) is read line by line. Lines are collected in
/// batches of `BUFFER_SIZE`; every batch is counted by `handle_line_buffer` on a `ThreadPool`
/// constructed with `4`, and a single merger thread folds the per-batch counts into one overall
/// map. Words are split on space, comma, period and semicolon and lowercased. Only words that
/// occur at least five times are printed, one `word: count` per line, in unspecified order.
///
/// Start time and elapsed time are reported on stderr. Lines that are not valid UTF-8 are
/// skipped.
///
/// # Errors
///
/// Returns an error when the input file cannot be opened, when reading it fails for a reason
/// other than invalid UTF-8, or when the merger thread stopped early or panicked.
pub fn main() -> Result<(), Error> {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        eprintln!("Usage: tokenize [filename]");
        process::exit(-1);
    }
    let input_filename = &args[1];

    let start: DateTime<Local> = Local::now();
    eprintln!("started at {:?}", start);
    // The wall-clock timestamp above is for display only; elapsed time is measured on the
    // monotonic clock so it prints as a plain number of seconds.
    let timer = std::time::Instant::now();

    let file = File::open(input_filename)?;
    let reader = io::BufReader::new(file);

    let mut worker_pool = ThreadPool::new(4); // TODO turn into cmdline argument

    // this is the end result before writing to stdout
    let merge_counter = Arc::new(Mutex::new(HashMap::<String, usize>::new()));

    // bounded channel for sending intermediate results (counts for a single buffer) to the merger
    let (merge_sender, merge_receiver) = bounded::<Option<HashMap<String, usize>>>(8);

    // A single thread receives the intermediate counts and adds them to the overall result.
    // The handle is kept so main can wait until everything has been merged.
    let merger = {
        let merge_counter = Arc::clone(&merge_counter);
        thread::spawn(move || {
            // `None` is the stop signal; a disconnected channel ends the loop as well.
            while let Ok(Some(counter)) = merge_receiver.recv() {
                let mut merged = merge_counter
                    .lock()
                    .unwrap_or_else(std::sync::PoisonError::into_inner);
                for (word, count) in counter {
                    *merged.entry(word).or_insert(0) += count;
                }
            }
        })
    };

    // Read the text line by line; every full buffer is handed to the worker pool.
    let mut buffer = Vec::with_capacity(BUFFER_SIZE);
    for line in reader.lines() {
        let line = match line {
            Ok(line) => line,
            // The bytes of an undecodable line have already been consumed, so skipping it
            // and continuing with the next line is safe.
            Err(err) if err.kind() == io::ErrorKind::InvalidData => continue,
            // Any other read error is propagated instead of being retried indefinitely.
            Err(err) => return Err(err),
        };
        // Push first, then check: the line that fills the buffer must not be dropped.
        buffer.push(line);
        if buffer.len() >= BUFFER_SIZE {
            let full = std::mem::replace(&mut buffer, Vec::with_capacity(BUFFER_SIZE));
            // multi producer, single consumer: every job gets its own sender clone
            let merge_sender = merge_sender.clone();
            worker_pool.execute(move || handle_line_buffer(full, &merge_sender));
        }
    }

    // last, partially filled buffer
    if !buffer.is_empty() {
        let merge_sender = merge_sender.clone();
        worker_pool.execute(move || handle_line_buffer(buffer, &merge_sender));
    }

    // wait for processing to finish
    // NOTE(review): assumes `wait` returns only after every submitted job has run - confirm
    // against `ThreadPool`.
    worker_pool.wait();

    // All workers have sent their counts, so the stop signal is queued behind them. Joining the
    // merger guarantees the overall result is complete before it is read.
    merge_sender.send(None).map_err(|_| {
        Error::new(
            io::ErrorKind::Other,
            "merger thread stopped before all counts were merged",
        )
    })?;
    merger
        .join()
        .map_err(|_| Error::new(io::ErrorKind::Other, "merger thread panicked"))?;

    let totals = merge_counter
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    for (word, count) in totals.iter() {
        // cutoff is 5 (inclusive min word count)
        // TODO parametrize
        if *count > 4 {
            println!("{}: {}", word, count);
        }
    }

    eprintln!("took {} seconds", timer.elapsed().as_secs_f64());
    Ok(())
}
/// Counts the words in one batch of lines and sends the counts over `sender`.
///
/// Every line is split on space, comma, period and semicolon. Each non-empty piece is
/// lowercased and counted; empty pieces (produced by consecutive delimiters or empty lines)
/// are ignored. The resulting map is sent as `Some(counts)`.
///
/// # Panics
///
/// Panics when sending fails, which crossbeam reports once the receiving side is disconnected.
fn handle_line_buffer(buffer: Vec<String>, sender: &crossbeam_channel::Sender<Option<HashMap<String, usize>>>) {
    const DELIMITERS: &[char] = &[' ', ',', '.', ';'];

    let mut counter: HashMap<String, usize> = HashMap::new();
    for line in buffer {
        let words = line.split(DELIMITERS).filter(|word| !word.is_empty());
        for word in words {
            *counter.entry(word.to_lowercase()).or_insert(0) += 1;
        }
    }
    sender
        .send(Some(counter))
        .expect("the merger must be receiving while workers are counting");
}