From 8acf2a11d5d952afc245bd70804caf7f8559f753 Mon Sep 17 00:00:00 2001 From: Shautvast Date: Fri, 3 Mar 2023 16:30:13 +0100 Subject: [PATCH] bugs fixed, improved loading --- .gitignore | 2 +- README.md | 4 +- src/dataloader.rs | 61 ++++++++++-------- src/main.rs | 6 +- src/net.rs | 161 +++++++++++++++++++++++++++++++++------------- 5 files changed, 158 insertions(+), 76 deletions(-) diff --git a/.gitignore b/.gitignore index 8462796..1fa046f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ /target *.iml .idea -src/data/training.json \ No newline at end of file +src/data/training.jsonde \ No newline at end of file diff --git a/README.md b/README.md index 152c635..1f316a3 100644 --- a/README.md +++ b/README.md @@ -10,4 +10,6 @@ to do: * add unit tests * train using actual training data * evaluate with test/validation data -* make more efficient \ No newline at end of file +* make more efficient + +training_data/test_data not included diff --git a/src/dataloader.rs b/src/dataloader.rs index 44bcd6f..c91d644 100644 --- a/src/dataloader.rs +++ b/src/dataloader.rs @@ -1,62 +1,72 @@ -use std::iter::zip; +use std::fmt::Debug; + use rand::prelude::*; use serde::Deserialize; -pub fn load_data() -> Data { +pub fn load_data() -> (Data, Data) { // the mnist data is structured as // x: [[[pixels]],[[pixels]], etc], // y: [label1, label2, etc] // this is transformed to: // Data : Vec - // DataLine {inputs: Vec, label: f32} - let raw_data: RawData = serde_json::from_slice(include_bytes!("data/unittest.json")).unwrap(); - let mut vec = Vec::new(); - for (x, y) in zip(raw_data.x, raw_data.y) { - vec.push(DataLine { inputs: x, label: onehot(y) }); - } + // DataLine {inputs: Vec, label: f64} + let raw_training_data: Vec = serde_json::from_slice(include_bytes!("data/training.json")).unwrap(); + let raw_test_data: Vec = serde_json::from_slice(include_bytes!("data/test.json")).unwrap(); - Data(vec) + let train = vectorize(raw_training_data); + let test = vectorize(raw_test_data); + + (Data(train), Data(test)) +} + +fn vectorize(raw_training_data: Vec) -> Vec>{ + let mut result = Vec::new(); + for line in raw_training_data { + result.push(DataLine { inputs: line.x, label: onehot(line.y) }); + } + result } #[derive(Deserialize)] struct RawData { - x: Vec>, - y: Vec, + x: Vec, + y: u8, } /// X is type of input /// Y is type of output -pub struct DataLine { +#[derive(Debug, Clone)] +pub struct DataLine where X: Clone, Y: Clone { pub inputs: Vec, pub label: Y, } - -pub struct OneHotVector{ - pub val: usize +/// simple way to encode a onehot vector. An object that returns 1.0 if you get the 'right' index, or 0.0 otherwise +#[derive(Debug, Clone)] +pub struct OneHotVector { + pub val: usize, } -impl OneHotVector{ - fn new(val: usize) -> Self{ - Self{ +impl OneHotVector { + pub fn new(val: usize) -> Self { + Self { val } } - pub fn get(&self, index: usize) -> f32{ + pub fn get(&self, index: usize) -> f64 { if self.val == index { 1.0 } else { 0.0 } } - - } -pub struct Data(pub Vec>); +#[derive(Debug, Clone)] +pub struct Data(pub Vec>) where X: Clone, Y: Clone ; -impl Data { +impl Data where X: Clone, Y: Clone { pub fn shuffle(&mut self) { let mut rng = thread_rng(); self.0.shuffle(&mut rng); @@ -66,7 +76,7 @@ impl Data { self.0.len() } - pub fn is_empty(&self, ) -> bool{ + pub fn is_empty(&self) -> bool { self.0.is_empty() } @@ -77,12 +87,11 @@ impl Data { batches.push(&self.0[offset..offset + batch_size]); offset += batch_size; } - batches.push(&self.0[offset..self.0.len()]); batches } } /// returns a vector as matrix where y is one-hot encoded fn onehot(y: u8) -> OneHotVector { - OneHotVector::new(y as usize) + OneHotVector::new(y as usize) } \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 1f1a7f2..a3881b4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,10 @@ use mnist_rs::dataloader::load_data; fn main() { - let mut net = mnist_rs::net::Network::from(vec![784, 30, 10]); - let training_data = load_data(); + let mut net = mnist_rs::net::Network::gaussian(vec![784, 30, 10]); + let (training_data, test_data) = load_data(); - net.sgd(training_data, 30, 10, 3.0, &None); + net.sgd(training_data, 30, 1, 0.01, Some(test_data)); // let sizes = vec![5,3,2]; diff --git a/src/net.rs b/src/net.rs index 0681a34..020c64d 100644 --- a/src/net.rs +++ b/src/net.rs @@ -1,5 +1,5 @@ use std::iter::zip; -use std::ops::Add; +use std::ops::{Add, Sub}; use nalgebra::DMatrix; use rand::prelude::*; @@ -12,8 +12,8 @@ use crate::mat::add; pub struct Network { _sizes: Vec, num_layers: usize, - pub biases: Vec>, - pub weights: Vec>, + pub biases: Vec>, + pub weights: Vec>, } impl Network { @@ -27,25 +27,42 @@ impl Network { /// layer is assumed to be an input layer, and by convention we /// won't set any biases for those neurons, since biases are only /// ever used in computing the outputs from later layers. - pub fn from(sizes: Vec) -> Self { + pub fn gaussian(sizes: Vec) -> Self { Self { _sizes: sizes.clone(), num_layers: sizes.len(), - biases: biases(sizes[1..].to_vec()), - weights: weights(zip(sizes[..sizes.len() - 1].to_vec(), sizes[1..].to_vec()).collect()), + biases: biases(sizes[1..].to_vec(), |size: &usize| random_matrix(*size, 1)), + weights: weights(zip(sizes[..sizes.len() - 1].to_vec(), sizes[1..].to_vec()).collect(), + |size| random_matrix(size.1, size.0)), } } - fn feed_forward(&self, input: Vec) -> Vec { + /// Creates a network where all weights and biases are set to 1.0 + /// This is for testing the software itself + pub fn ones(sizes: Vec) -> Self { + Self { + _sizes: sizes.clone(), + num_layers: sizes.len(), + biases: biases(sizes[1..].to_vec(), |size: &usize| DMatrix::from_fn(*size, 1, |_, _| 1.0)), + weights: weights(zip(sizes[..sizes.len() - 1].to_vec(), sizes[1..].to_vec()).collect(), + |shape| DMatrix::from_fn(shape.1, shape.0, |_, _| 1.0)), + } + } + + fn feed_forward(&self, input: Vec) -> Vec { + self.feed_forward_activation(input, sigmoid_inplace) + } + + fn feed_forward_activation(&self, input: Vec, activation: fn(&mut f64)) -> Vec { let mut a = DMatrix::from_vec(input.len(), 1, input); for (b, w) in zip(&self.biases, &self.weights) { a = add(b.clone(), w * a).unwrap(); - a.apply(sigmoid_inplace); + a.apply(activation); } - a.column(1).iter().copied().collect() + a.column(0).iter().copied().collect() } - pub fn sgd(&mut self, mut training_data: Data, epochs: usize, minibatch_size: usize, eta: f32, test_data: &Option>) { + pub fn sgd(&mut self, mut training_data: Data, epochs: usize, minibatch_size: usize, eta: f64, test_data: Option>) { for j in 0..epochs { training_data.shuffle(); let mini_batches = training_data.as_batches(minibatch_size); @@ -53,7 +70,7 @@ impl Network { self.update_mini_batch(mini_batch, eta); } - if let Some(test_data) = test_data { + if let Some(test_data) = &test_data { println!("Epoch {}: {} / {}", j, self.evaluate(test_data), test_data.len()); } else { println!("Epoch {} complete", j); @@ -65,50 +82,54 @@ impl Network { /// gradient descent using backpropagation to a single mini batch. /// The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta`` /// is the learning rate. - fn update_mini_batch(&mut self, mini_batch: &[DataLine], eta: f32) { - let mut nabla_b: Vec> = self.biases.iter() + fn update_mini_batch(&mut self, mini_batch: &[DataLine], eta: f64) { + let mut nabla_b: Vec> = self.biases.iter() .map(|b| b.shape()) .map(|s| DMatrix::zeros(s.0, s.1)) .collect(); - let mut nabla_w: Vec> = self.weights.iter() + let mut nabla_w: Vec> = self.weights.iter() .map(|w| w.shape()) .map(|s| DMatrix::zeros(s.0, s.1)) .collect(); for line in mini_batch.iter() { let (delta_nabla_b, delta_nabla_w) = self.backprop(line.inputs.to_vec(), &line.label); + // nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] + // nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)] nabla_b = zip(&nabla_b, &delta_nabla_b).map(|(nb, dnb)| nb.add(dnb)).collect(); nabla_w = zip(&nabla_w, &delta_nabla_w).map(|(nw, dnw)| nw.add(dnw)).collect(); } self.weights = zip(&self.weights, &nabla_w) - .map(|(w, nw)| (w.add_scalar(-eta / mini_batch.len() as f32)).component_mul(nw)).collect(); + .map(|(w, nw)| w.sub(nw.scale(eta / mini_batch.len() as f64))).collect(); + self.biases = zip(&self.biases, &nabla_b) - .map(|(b, nb)| (b.add_scalar(-eta / mini_batch.len() as f32)).component_mul(nb)).collect(); + .map(|(b, nb)| b.sub(nb.scale(eta / mini_batch.len() as f64))).collect(); } /// Return the number of test inputs for which the neural /// network outputs the correct result. Note that the neural /// network's output is assumed to be the index of whichever /// neuron in the final layer has the highest activation. - fn evaluate(&self, test_data: &Data) -> usize { + fn evaluate(&self, test_data: &Data) -> usize { let test_results: Vec<(usize, usize)> = test_data.0.iter() .map(|line| (argmax(self.feed_forward(line.inputs.clone())), line.label.val)) .collect(); - test_results.into_iter().filter(|(x, y)| x == y).count() + + test_results.into_iter().filter(|(x, y)| *x == *y).count() } /// Return a tuple `(nabla_b, nabla_w)` representing the /// gradient for the cost function C_x. `nabla_b` and /// `nabla_w` are layer-by-layer lists of matrices, similar /// to `self.biases` and `self.weights`. - fn backprop(&self, x: Vec, y: &OneHotVector) -> (Vec>, Vec>) { + fn backprop(&self, x: Vec, y: &OneHotVector) -> (Vec>, Vec>) { // zero_grad ie. set gradient to zero - let mut nabla_b: Vec> = self.biases.iter() + let mut nabla_b: Vec> = self.biases.iter() .map(|b| b.shape()) .map(|s| DMatrix::zeros(s.0, s.1)) .collect(); - let mut nabla_w: Vec> = self.weights.iter() + let mut nabla_w: Vec> = self.weights.iter() .map(|w| w.shape()) .map(|s| DMatrix::zeros(s.0, s.1)) .collect(); @@ -119,14 +140,15 @@ impl Network { let mut zs = vec![]; for (b, w) in zip(&self.biases, &self.weights) { - let z = add(w * &activation, b.clone()).unwrap(); + let z = (w * &activation)+b.clone(); zs.push(z.clone()); activation = z.map(sigmoid); activations.push(activation.clone()); } // backward pass // delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1]) - let delta: DMatrix = self.cost_derivative(&activations[activations.len() - 1], y).component_mul(&zs[zs.len() - 1].map(sigmoid_prime)); + let delta: DMatrix = cost_derivative(&activations[activations.len() - 1], y).component_mul(&zs[zs.len() - 1].map(sigmoid_prime)); + // println!("delta {:?}", delta); let index = nabla_b.len() - 1; nabla_b[index] = delta.clone(); @@ -136,75 +158,124 @@ impl Network { let lens_zs = zs.len(); for l in 2..self.num_layers { let z = &zs[lens_zs - l]; - let sp = z.map(sigmoid_prime); let weight = self.weights[self.weights.len() - l + 1].transpose(); - let delta2 = (weight * &delta).component_mul(&sp); + let delta = (weight * &delta).component_mul(&z.map(sigmoid_prime)); let len_nb = nabla_b.len(); - nabla_b[len_nb - l] = delta2.clone(); + nabla_b[len_nb - l] = delta.clone(); let len_nw = nabla_w.len(); - nabla_w[len_nw - l] = delta2 * activations[activations.len() - l - 1].transpose(); + nabla_w[len_nw - l] = delta * activations[activations.len() - l - 1].transpose(); } (nabla_b, nabla_w) } - - fn cost_derivative(&self, output_activations: &DMatrix, y: &OneHotVector) -> DMatrix { - // output_activations - y - let shape = output_activations.shape(); - DMatrix::from_iterator(shape.0, shape.1, output_activations.iter().enumerate() - .map(|(index, a)| a - y.get(index))) - } } -fn argmax(val: Vec) -> usize { +fn cost_derivative(output_activations: &DMatrix, y: &OneHotVector) -> DMatrix { + // output_activations - y + // println!("output {:?}", output_activations); + // println!("expected {:?}", y); + + let shape = output_activations.shape(); + let t = DMatrix::from_iterator(shape.0, shape.1, output_activations.iter().enumerate() + .map(|(index, a)| a - y.get(index))); + // println!("t {:?}",t); + t +} + +fn argmax(val: Vec) -> usize { let mut max = 0.0; let mut index = 0; for (i, x) in val.iter().enumerate() { + // print!("{},",x); if *x > max { index = i; max = *x; } } + // println!(); index } -fn biases(sizes: Vec) -> Vec> { - sizes.iter().map(|size| random_matrix(*size, 1)).collect() +fn biases(sizes: Vec, init: fn(&usize) -> DMatrix) -> Vec> { + sizes.iter().map(init).collect() } -fn weights(sizes: Vec<(usize, usize)>) -> Vec> { - sizes.iter().map(|size| random_matrix(size.1, size.0)).collect() +fn weights(sizes: Vec<(usize, usize)>, init: fn(&(usize, usize)) -> DMatrix) -> Vec> { + sizes.iter().map(init).collect() } -fn random_matrix(rows: usize, cols: usize) -> DMatrix { - let normal: Normal = Normal::new(0.0, 1.0).unwrap(); +fn random_matrix(rows: usize, cols: usize) -> DMatrix { + let normal: Normal = Normal::new(0.0, 1.0).unwrap(); DMatrix::from_fn(rows, cols, |_, _| normal.sample(&mut thread_rng())) } -fn sigmoid_inplace(val: &mut f32) { +fn sigmoid_inplace(val: &mut f64) { *val = sigmoid(*val); } -fn sigmoid(val: f32) -> f32 { +fn sigmoid(val: f64) -> f64 { 1.0 / (1.0 + (-val).exp()) } /// Derivative of the sigmoid function. -fn sigmoid_prime(val: f32) -> f32 { +fn sigmoid_prime(val: f64) -> f64 { sigmoid(val) * (1.0 - sigmoid(val)) } #[cfg(test)] mod test { + use std::convert::identity; use nalgebra::DMatrix; use super::*; #[test] fn test_sigmoid() { - let mut mat: DMatrix = DMatrix::from_vec(1, 1, vec![0.0]); + let mut mat: DMatrix = DMatrix::from_vec(1, 1, vec![0.0]); mat.apply(sigmoid_inplace); assert_eq!(mat.get(0), Some(&0.5)); } + + #[test] + fn test_sigmoid_inplace() { + let mut v = 10.0; + sigmoid_inplace(&mut v); + assert_eq!(0.9999546, v); + } + + #[test] + fn test_sigmoid_prime() { + assert_eq!(0.19661193324148185, sigmoid_prime(1.0)) + } + + #[test] + fn test_argmax() { + assert_eq!(5, argmax(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0])); + } + + #[test] + fn test_cost_derivative() { + let matrix = DMatrix::from_vec(4, 1, vec![0.0, 1.0, 2.0, -2.0]); + let delta = cost_derivative(&matrix, &OneHotVector::new(1)); + assert_eq!(delta, DMatrix::from_vec(4, 1, vec![0.0, 0.0, 2.0, -2.0])); + } + + #[test] + fn test_feedforward() { + // 2 layers of 2 units + let mut net = Network::ones(vec![2, 2]); + + let prediction = net.feed_forward_activation(vec![2.0, 2.0], |a| {}); + assert_eq!(prediction, vec![5.0, 5.0]) + } + + #[test] + fn test_sgd() { + // 2 layers of 2 units + let mut net = Network::ones(vec![2, 2]); + let data = Data(vec![DataLine { inputs: vec![1.0, 1.0], label: OneHotVector::new(1) }]); + net.sgd(data, 1, 1, 0.001, None); + println!("{:?}", net); + } } \ No newline at end of file