From 8acf2a11d5d952afc245bd70804caf7f8559f753 Mon Sep 17 00:00:00 2001
From: Shautvast <sander.hautvast@top-squad.nl>
Date: Fri, 3 Mar 2023 16:30:13 +0100
Subject: [PATCH] bugs fixed, improved loading

---
 .gitignore        |   2 +-
 README.md         |   4 +-
 src/dataloader.rs |  61 ++++++++++--------
 src/main.rs       |   6 +-
 src/net.rs        | 161 +++++++++++++++++++++++++++++++++-------------
 5 files changed, 158 insertions(+), 76 deletions(-)
diff --git a/.gitignore b/.gitignore
index 8462796..1fa046f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 /target
 *.iml
 .idea
-src/data/training.json
\ No newline at end of file
+src/data/training.json
\ No newline at end of file
diff --git a/README.md b/README.md
index 152c635..1f316a3 100644
--- a/README.md
+++ b/README.md
@@ -10,4 +10,6 @@ to do:
 * add unit tests
 * train using actual training data
 * evaluate with test/validation data
-* make more efficient
\ No newline at end of file
+* make more efficient
+
+training_data/test_data not included
diff --git a/src/dataloader.rs b/src/dataloader.rs
index 44bcd6f..c91d644 100644
--- a/src/dataloader.rs
+++ b/src/dataloader.rs
@@ -1,62 +1,72 @@
-use std::iter::zip;
+use std::fmt::Debug;
+
 use rand::prelude::*;
 use serde::Deserialize;
 
-pub fn load_data() -> Data<f32, OneHotVector> {
+pub fn load_data() -> (Data<f64, OneHotVector>, Data<f64, OneHotVector>) {
     // the mnist data is structured as
     // x: [[[pixels]],[[pixels]], etc],
     // y: [label1, label2, etc]
     // this is transformed to:
     // Data : Vec<DataLine>
-    // DataLine {inputs: Vec<pixels as f32>, label: f32}
-    let raw_data: RawData = serde_json::from_slice(include_bytes!("data/unittest.json")).unwrap();
-    let mut vec = Vec::new();
-    for (x, y) in zip(raw_data.x, raw_data.y) {
-        vec.push(DataLine { inputs: x, label: onehot(y) });
-    }
+    // DataLine {inputs: Vec<pixels as f64>, label: OneHotVector}
+    let raw_training_data: Vec<RawData> = serde_json::from_slice(include_bytes!("data/training.json")).unwrap();
+    let raw_test_data: Vec<RawData> = serde_json::from_slice(include_bytes!("data/test.json")).unwrap();
 
-    Data(vec)
+    let train = vectorize(raw_training_data);
+    let test = vectorize(raw_test_data);
+
+    (Data(train), Data(test))
+}
+
+fn vectorize(raw_training_data: Vec<RawData>) -> Vec<DataLine<f64, OneHotVector>> {
+    // Consume each raw record, keeping its inputs and one-hot encoding its label.
+    raw_training_data
+        .into_iter()
+        .map(|record| DataLine { inputs: record.x, label: onehot(record.y) })
+        .collect()
+}
 
 #[derive(Deserialize)]
 struct RawData {
-    x: Vec<Vec<f32>>,
-    y: Vec<u8>,
+    x: Vec<f64>,
+    y: u8,
 }
 
 /// X is type of input
 /// Y is type of output
-pub struct DataLine<X, Y> {
+#[derive(Debug, Clone)]
+pub struct DataLine<X, Y> where X: Clone, Y: Clone {
     pub inputs: Vec<X>,
     pub label: Y,
 }
 
-
-pub struct OneHotVector{
-    pub val: usize
+/// simple way to encode a onehot vector. An object that returns 1.0 if you get the 'right' index, or 0.0 otherwise
+#[derive(Debug, Clone)]
+pub struct OneHotVector {
+    pub val: usize,
 }
 
-impl OneHotVector{
-    fn new(val: usize) -> Self{
-        Self{
+impl OneHotVector {
+    pub fn new(val: usize) -> Self {
+        Self {
             val
         }
     }
 
-    pub fn get(&self, index: usize) -> f32{
+    pub fn get(&self, index: usize) -> f64 {
         if self.val == index {
             1.0
         } else {
             0.0
         }
     }
-
-
 }
 
-pub struct Data<X, Y>(pub Vec<DataLine<X, Y>>);
+#[derive(Debug, Clone)]
+pub struct Data<X, Y>(pub Vec<DataLine<X, Y>>) where X: Clone, Y: Clone ;
 
-impl<X, Y> Data<X, Y> {
+impl<X, Y> Data<X, Y> where X: Clone, Y: Clone {
     pub fn shuffle(&mut self) {
         let mut rng = thread_rng();
         self.0.shuffle(&mut rng);
@@ -66,7 +76,7 @@ impl<X, Y> Data<X, Y> {
         self.0.len()
     }
 
-    pub fn is_empty(&self, ) -> bool{
+    pub fn is_empty(&self) -> bool {
         self.0.is_empty()
     }
 
@@ -77,12 +87,11 @@ impl<X, Y> Data<X, Y> {
             batches.push(&self.0[offset..offset + batch_size]);
             offset += batch_size;
         }
-        batches.push(&self.0[offset..self.0.len()]);
         batches
     }
 }
 
 /// returns a vector as matrix where y is one-hot encoded
 fn onehot(y: u8) -> OneHotVector {
-   OneHotVector::new(y as usize)
+    OneHotVector::new(y as usize)
 }
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index 1f1a7f2..a3881b4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,10 +1,10 @@
 use mnist_rs::dataloader::load_data;
 
 fn main() {
-    let mut net = mnist_rs::net::Network::from(vec![784, 30, 10]);
-    let training_data = load_data();
+    let mut net = mnist_rs::net::Network::gaussian(vec![784, 30, 10]);
+    let (training_data, test_data) = load_data();
 
-    net.sgd(training_data, 30, 10, 3.0, &None);
+    net.sgd(training_data, 30, 1, 0.01, Some(test_data));
 
 
     // let sizes = vec![5,3,2];
diff --git a/src/net.rs b/src/net.rs
index 0681a34..020c64d 100644
--- a/src/net.rs
+++ b/src/net.rs
@@ -1,5 +1,5 @@
 use std::iter::zip;
-use std::ops::Add;
+use std::ops::{Add, Sub};
 
 use nalgebra::DMatrix;
 use rand::prelude::*;
@@ -12,8 +12,8 @@ use crate::mat::add;
 pub struct Network {
     _sizes: Vec<usize>,
     num_layers: usize,
-    pub biases: Vec<DMatrix<f32>>,
-    pub weights: Vec<DMatrix<f32>>,
+    pub biases: Vec<DMatrix<f64>>,
+    pub weights: Vec<DMatrix<f64>>,
 }
 
 impl Network {
@@ -27,25 +27,42 @@ impl Network {
     /// layer is assumed to be an input layer, and by convention we
     /// won't set any biases for those neurons, since biases are only
     /// ever used in computing the outputs from later layers.
-    pub fn from(sizes: Vec<usize>) -> Self {
+    pub fn gaussian(sizes: Vec<usize>) -> Self {
         Self {
             _sizes: sizes.clone(),
             num_layers: sizes.len(),
-            biases: biases(sizes[1..].to_vec()),
-            weights: weights(zip(sizes[..sizes.len() - 1].to_vec(), sizes[1..].to_vec()).collect()),
+            biases: biases(sizes[1..].to_vec(), |size: &usize| random_matrix(*size, 1)),
+            weights: weights(zip(sizes[..sizes.len() - 1].to_vec(), sizes[1..].to_vec()).collect(),
+                             |size| random_matrix(size.1, size.0)),
         }
     }
 
-    fn feed_forward(&self, input: Vec<f32>) -> Vec<f32> {
+    /// Builds a network in which every weight and every bias equals 1.0.
+    /// Nothing is random here, so it is meant for testing the software itself.
+    pub fn ones(sizes: Vec<usize>) -> Self {
+        let layer_pairs = zip(sizes[..sizes.len() - 1].to_vec(), sizes[1..].to_vec()).collect();
+        Self {
+            num_layers: sizes.len(),
+            biases: biases(sizes[1..].to_vec(), |rows: &usize| DMatrix::from_element(*rows, 1, 1.0)),
+            weights: weights(layer_pairs, |pair| DMatrix::from_element(pair.1, pair.0, 1.0)),
+            _sizes: sizes,
+        }
+    }
+
+    fn feed_forward(&self, input: Vec<f64>) -> Vec<f64> {
+        self.feed_forward_activation(input, sigmoid_inplace) // forward pass with the sigmoid activation
+    }
+
+    fn feed_forward_activation(&self, input: Vec<f64>, activation: fn(&mut f64)) -> Vec<f64> {
         let mut a = DMatrix::from_vec(input.len(), 1, input);
         for (b, w) in zip(&self.biases, &self.weights) {
             a = add(b.clone(), w * a).unwrap();
-            a.apply(sigmoid_inplace);
+            a.apply(activation);
         }
-        a.column(1).iter().copied().collect()
+        a.column(0).iter().copied().collect()
     }
 
-    pub fn sgd(&mut self, mut training_data: Data<f32, OneHotVector>, epochs: usize, minibatch_size: usize, eta: f32, test_data: &Option<Data<f32, OneHotVector>>) {
+    pub fn sgd(&mut self, mut training_data: Data<f64, OneHotVector>, epochs: usize, minibatch_size: usize, eta: f64, test_data: Option<Data<f64, OneHotVector>>) {
         for j in 0..epochs {
             training_data.shuffle();
             let mini_batches = training_data.as_batches(minibatch_size);
@@ -53,7 +70,7 @@ impl Network {
                 self.update_mini_batch(mini_batch, eta);
             }
 
-            if let Some(test_data) = test_data {
+            if let Some(test_data) = &test_data {
                 println!("Epoch {}: {} / {}", j, self.evaluate(test_data), test_data.len());
             } else {
                 println!("Epoch {} complete", j);
@@ -65,50 +82,54 @@ impl Network {
     /// gradient descent using backpropagation to a single mini batch.
     /// The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
     /// is the learning rate.
-    fn update_mini_batch(&mut self, mini_batch: &[DataLine<f32, OneHotVector>], eta: f32) {
-        let mut nabla_b: Vec<DMatrix<f32>> = self.biases.iter()
+    fn update_mini_batch(&mut self, mini_batch: &[DataLine<f64, OneHotVector>], eta: f64) {
+        let mut nabla_b: Vec<DMatrix<f64>> = self.biases.iter()
             .map(|b| b.shape())
             .map(|s| DMatrix::zeros(s.0, s.1))
             .collect();
-        let mut nabla_w: Vec<DMatrix<f32>> = self.weights.iter()
+        let mut nabla_w: Vec<DMatrix<f64>> = self.weights.iter()
             .map(|w| w.shape())
             .map(|s| DMatrix::zeros(s.0, s.1))
             .collect();
         for line in mini_batch.iter() {
             let (delta_nabla_b, delta_nabla_w) = self.backprop(line.inputs.to_vec(), &line.label);
 
+            // nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
+            // nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
             nabla_b = zip(&nabla_b, &delta_nabla_b).map(|(nb, dnb)| nb.add(dnb)).collect();
             nabla_w = zip(&nabla_w, &delta_nabla_w).map(|(nw, dnw)| nw.add(dnw)).collect();
         }
 
         self.weights = zip(&self.weights, &nabla_w)
-            .map(|(w, nw)| (w.add_scalar(-eta / mini_batch.len() as f32)).component_mul(nw)).collect();
+            .map(|(w, nw)| w.sub(nw.scale(eta / mini_batch.len() as f64))).collect();
+
         self.biases = zip(&self.biases, &nabla_b)
-            .map(|(b, nb)| (b.add_scalar(-eta / mini_batch.len() as f32)).component_mul(nb)).collect();
+            .map(|(b, nb)| b.sub(nb.scale(eta / mini_batch.len() as f64))).collect();
     }
 
     /// Return the number of test inputs for which the neural
     /// network outputs the correct result. Note that the neural
     /// network's output is assumed to be the index of whichever
     /// neuron in the final layer has the highest activation.
-    fn evaluate(&self, test_data: &Data<f32, OneHotVector>) -> usize {
+    fn evaluate(&self, test_data: &Data<f64, OneHotVector>) -> usize {
         let test_results: Vec<(usize, usize)> = test_data.0.iter()
             .map(|line| (argmax(self.feed_forward(line.inputs.clone())), line.label.val))
             .collect();
-        test_results.into_iter().filter(|(x, y)| x == y).count()
+
+        test_results.into_iter().filter(|(x, y)| *x == *y).count()
     }
 
     /// Return a tuple `(nabla_b, nabla_w)` representing the
     /// gradient for the cost function C_x.  `nabla_b` and
     /// `nabla_w` are layer-by-layer lists of matrices, similar
     /// to `self.biases` and `self.weights`.
-    fn backprop(&self, x: Vec<f32>, y: &OneHotVector) -> (Vec<DMatrix<f32>>, Vec<DMatrix<f32>>) {
+    fn backprop(&self, x: Vec<f64>, y: &OneHotVector) -> (Vec<DMatrix<f64>>, Vec<DMatrix<f64>>) {
         // zero_grad ie. set gradient to zero
-        let mut nabla_b: Vec<DMatrix<f32>> = self.biases.iter()
+        let mut nabla_b: Vec<DMatrix<f64>> = self.biases.iter()
             .map(|b| b.shape())
             .map(|s| DMatrix::zeros(s.0, s.1))
             .collect();
-        let mut nabla_w: Vec<DMatrix<f32>> = self.weights.iter()
+        let mut nabla_w: Vec<DMatrix<f64>> = self.weights.iter()
             .map(|w| w.shape())
             .map(|s| DMatrix::zeros(s.0, s.1))
             .collect();
@@ -119,14 +140,15 @@ impl Network {
         let mut zs = vec![];
 
         for (b, w) in zip(&self.biases, &self.weights) {
-            let z = add(w * &activation, b.clone()).unwrap();
+            let z = (w * &activation)+b.clone();
             zs.push(z.clone());
             activation = z.map(sigmoid);
             activations.push(activation.clone());
         }
         // backward pass
         // delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
-        let delta: DMatrix<f32> = self.cost_derivative(&activations[activations.len() - 1], y).component_mul(&zs[zs.len() - 1].map(sigmoid_prime));
+        let delta: DMatrix<f64> = cost_derivative(&activations[activations.len() - 1], y).component_mul(&zs[zs.len() - 1].map(sigmoid_prime));
+        // println!("delta {:?}", delta);
         let index = nabla_b.len() - 1;
         nabla_b[index] = delta.clone();
 
@@ -136,75 +158,124 @@ impl Network {
         let lens_zs = zs.len();
         for l in 2..self.num_layers {
             let z = &zs[lens_zs - l];
-            let sp = z.map(sigmoid_prime);
             let weight = self.weights[self.weights.len() - l + 1].transpose();
-            let delta2 = (weight * &delta).component_mul(&sp);
+            let delta = (weight * &delta).component_mul(&z.map(sigmoid_prime));
             let len_nb = nabla_b.len();
-            nabla_b[len_nb - l] = delta2.clone();
+            nabla_b[len_nb - l] = delta.clone();
             let len_nw = nabla_w.len();
-            nabla_w[len_nw - l] = delta2 * activations[activations.len() - l - 1].transpose();
+            nabla_w[len_nw - l] = delta * activations[activations.len() - l - 1].transpose();
         }
 
         (nabla_b, nabla_w)
     }
-
-    fn cost_derivative(&self, output_activations: &DMatrix<f32>, y: &OneHotVector) -> DMatrix<f32> {
-        // output_activations - y
-        let shape = output_activations.shape();
-        DMatrix::from_iterator(shape.0, shape.1, output_activations.iter().enumerate()
-            .map(|(index, a)| a - y.get(index)))
-    }
 }
 
-fn argmax(val: Vec<f32>) -> usize {
+/// Derivative of the cost with respect to the output activations, i.e.
+/// `output_activations - y`, element by element.
+///
+/// `y` is never materialised as a matrix: `y.get(index)` supplies the one-hot
+/// value for the element at linear position `index` of `output_activations`.
+fn cost_derivative(output_activations: &DMatrix<f64>, y: &OneHotVector) -> DMatrix<f64> {
+    let (rows, cols) = output_activations.shape();
+    let differences = output_activations.iter().enumerate()
+        .map(|(index, a)| a - y.get(index));
+    DMatrix::from_iterator(rows, cols, differences)
+}
+
+fn argmax(val: Vec<f64>) -> usize {
     let mut max = 0.0;
     let mut index = 0;
     for (i, x) in val.iter().enumerate() {
+        // print!("{},",x);
         if *x > max {
             index = i;
             max = *x;
         }
     }
+    // println!();
     index
 }
 
-fn biases(sizes: Vec<usize>) -> Vec<DMatrix<f32>> {
-    sizes.iter().map(|size| random_matrix(*size, 1)).collect()
+fn biases(sizes: Vec<usize>, init: fn(&usize) -> DMatrix<f64>) -> Vec<DMatrix<f64>> {
+    sizes.iter().map(init).collect()
 }
 
-fn weights(sizes: Vec<(usize, usize)>) -> Vec<DMatrix<f32>> {
-    sizes.iter().map(|size| random_matrix(size.1, size.0)).collect()
+fn weights(sizes: Vec<(usize, usize)>, init: fn(&(usize, usize)) -> DMatrix<f64>) -> Vec<DMatrix<f64>> {
+    sizes.iter().map(init).collect()
 }
 
-fn random_matrix(rows: usize, cols: usize) -> DMatrix<f32> {
-    let normal: Normal<f32> = Normal::new(0.0, 1.0).unwrap();
+fn random_matrix(rows: usize, cols: usize) -> DMatrix<f64> {
+    let normal: Normal<f64> = Normal::new(0.0, 1.0).unwrap();
 
     DMatrix::from_fn(rows, cols, |_, _| normal.sample(&mut thread_rng()))
 }
 
-fn sigmoid_inplace(val: &mut f32) {
+fn sigmoid_inplace(val: &mut f64) {
     *val = sigmoid(*val);
 }
 
-fn sigmoid(val: f32) -> f32 {
+fn sigmoid(val: f64) -> f64 {
     1.0 / (1.0 + (-val).exp())
 }
 
 /// Derivative of the sigmoid function.
-fn sigmoid_prime(val: f32) -> f32 {
+fn sigmoid_prime(val: f64) -> f64 {
     sigmoid(val) * (1.0 - sigmoid(val))
 }
 
 #[cfg(test)]
 mod test {
+    use std::convert::identity;
     use nalgebra::DMatrix;
 
     use super::*;
 
     #[test]
     fn test_sigmoid() {
-        let mut mat: DMatrix<f32> = DMatrix::from_vec(1, 1, vec![0.0]);
+        let mut mat: DMatrix<f64> = DMatrix::from_vec(1, 1, vec![0.0]);
         mat.apply(sigmoid_inplace);
         assert_eq!(mat.get(0), Some(&0.5));
     }
+
+    #[test]
+    fn test_sigmoid_inplace() {
+        let mut v: f64 = 10.0;
+        sigmoid_inplace(&mut v);
+        assert!((v - 0.9999546).abs() < 1e-7, "sigmoid(10.0) = {}", v); // literal is rounded, so compare with a tolerance
+    }
+
+    #[test]
+    fn test_sigmoid_prime() {
+        assert!((sigmoid_prime(1.0) - 0.19661193324148185).abs() < 1e-12); // exp() precision may vary by platform
+    }
+
+    #[test]
+    fn test_argmax() {
+        assert_eq!(5, argmax(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0])); // single maximum sits at index 5
+    }
+
+    #[test]
+    fn test_cost_derivative() {
+        let matrix = DMatrix::from_vec(4, 1, vec![0.0, 1.0, 2.0, -2.0]);
+        let delta = cost_derivative(&matrix, &OneHotVector::new(1)); // label 1: 1.0 is subtracted at index 1 only
+        assert_eq!(delta, DMatrix::from_vec(4, 1, vec![0.0, 0.0, 2.0, -2.0]));
+    }
+
+    #[test]
+    fn test_feedforward() {
+        // 2 layers of 2 units; all weights and biases are 1.0 and the activation is a no-op
+        let net = Network::ones(vec![2, 2]);
+
+        let prediction = net.feed_forward_activation(vec![2.0, 2.0], |_| {});
+        assert_eq!(prediction, vec![5.0, 5.0]) // per unit: 1.0 * 2.0 + 1.0 * 2.0 + 1.0 (bias)
+    }
+
+    #[test]
+    fn test_sgd() {
+        // smoke test: 2 layers of 2 units, one training line; the result is printed, not asserted
+        let mut net = Network::ones(vec![2, 2]);
+        let data = Data(vec![DataLine { inputs: vec![1.0, 1.0], label: OneHotVector::new(1) }]);
+        net.sgd(data, 1, 1, 0.001, None); // 1 epoch, minibatch size 1, eta 0.001, no test data
+        println!("{:?}", net);
+    }
 }
\ No newline at end of file