diff --git a/convert_pickle.py b/convert_pickle.py
new file mode 100644
index 0000000..b69cf53
--- /dev/null
+++ b/convert_pickle.py
@@ -0,0 +1,22 @@
+import pickle
+import gzip
+import json
+
+# Load the data from the .pkl.gz file
+with gzip.open("mnist.pkl.gz", "rb") as f:
+    training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
+
+# Define a helper function to convert the data into JSON serializable format
+def convert_data(data):
+    features, labels = data
+    return [{"x": features[i].tolist(), "y": int(labels[i])} for i in range(len(features))]
+
+# Convert and save to JSON
+with open("training_data.json", "w") as train_json:
+    json.dump(convert_data(training_data), train_json)
+
+with open("validation_data.json", "w") as val_json:
+    json.dump(convert_data(validation_data), val_json)
+
+with open("test_data.json", "w") as test_json:
+    json.dump(convert_data(test_data), test_json)
\ No newline at end of file
diff --git a/src/dataloader.rs b/src/dataloader.rs
index c91d644..4787e63 100644
--- a/src/dataloader.rs
+++ b/src/dataloader.rs
@@ -1,4 +1,5 @@
 use std::fmt::Debug;
+use nalgebra::DMatrix;
 use rand::prelude::*;
 use serde::Deserialize;
 
@@ -10,8 +11,8 @@ pub fn load_data() -> (Data<f64, OneHotVector>, Data<f64, OneHotVector>) {
     // this is transformed to:
     // Data : Vec<DataLine>
    // DataLine {inputs: Vec<f64>, label: f64}
-    let raw_training_data: Vec<RawData> = serde_json::from_slice(include_bytes!("data/training.json")).unwrap();
-    let raw_test_data: Vec<RawData> = serde_json::from_slice(include_bytes!("data/test.json")).unwrap();
+    let raw_training_data: Vec<RawData> = serde_json::from_slice(include_bytes!("data/training_data.json")).unwrap();
+    let raw_test_data: Vec<RawData> = serde_json::from_slice(include_bytes!("data/test_data.json")).unwrap();
 
     let train = vectorize(raw_training_data);
     let test = vectorize(raw_test_data);
@@ -19,10 +20,10 @@ pub fn load_data() -> (Data<f64, OneHotVector>, Data<f64, OneHotVector>) {
     (Data(train), Data(test))
 }
 
-fn vectorize(raw_training_data: Vec<RawData>) -> Vec<DataLine<f64, OneHotVector>>{
+fn vectorize(raw_training_data: Vec<RawData>) -> Vec<DataLine<f64, OneHotVector>> {
     let mut result = Vec::new();
     for line in raw_training_data {
-        result.push(DataLine { inputs: line.x, label: onehot(line.y) });
+        result.push(DataLine { inputs: DMatrix::from_vec(line.x.len(), 1, line.x), label: onehot(line.y) });
     }
     result
 }
@@ -37,7 +38,7 @@ struct RawData {
 /// Y is type of output
 #[derive(Debug, Clone)]
 pub struct DataLine<X, Y> where X: Clone, Y: Clone {
-    pub inputs: Vec<X>,
+    pub inputs: DMatrix<X>,
     pub label: Y,
 }
 
@@ -64,7 +65,7 @@ impl OneHotVector {
 }
 
 #[derive(Debug, Clone)]
-pub struct Data<X, Y>(pub Vec<DataLine<X, Y>>) where X: Clone, Y: Clone ;
+pub struct Data<X, Y>(pub Vec<DataLine<X, Y>>) where X: Clone, Y: Clone;
 
 impl<X, Y> Data<X, Y> where X: Clone, Y: Clone {
     pub fn shuffle(&mut self) {
diff --git a/src/main.rs b/src/main.rs
index a3881b4..7240a8c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,16 +1,12 @@
 use mnist_rs::dataloader::load_data;
+use std::time::Instant;
 
 fn main() {
     let mut net = mnist_rs::net::Network::gaussian(vec![784, 30, 10]);
 
     let (training_data, test_data) = load_data();
 
-    net.sgd(training_data, 30, 1, 0.01, Some(test_data));
-
-
-    // let sizes = vec![5,3,2];
-    // let net = mnist_rs::net::Network::from(sizes);
-    // println!("biases {:?}", net.biases.iter().map(|b|b.shape()).collect::<Vec<_>>());
-    // println!("weights {:?}", net.weights.iter().map(|b|b.shape()).collect::<Vec<_>>());
-
+    let t0 = Instant::now();
+    net.sgd(training_data, 30, 10, 3.0, Some(test_data));
+    println!("{}", t0.elapsed().as_millis());
 }
\ No newline at end of file
diff --git a/src/net.rs b/src/net.rs
index b91bc2e..7a7089d 100644
--- a/src/net.rs
+++ b/src/net.rs
@@ -48,17 +48,17 @@ impl Network {
         }
     }
 
-    fn feed_forward(&self, input: Vec<f64>) -> Vec<f64> {
+    fn feed_forward(&self, input: &DMatrix<f64>) -> DMatrix<f64> {
         self.feed_forward_activation(input, sigmoid_inplace)
     }
 
-    fn feed_forward_activation(&self, input: Vec<f64>, activation: fn(&mut f64)) -> Vec<f64> {
-        let mut a = DMatrix::from_vec(input.len(), 1, input);
+    fn feed_forward_activation(&self, input: &DMatrix<f64>, activation: fn(&mut f64)) -> DMatrix<f64> {
+        let mut a = input.clone();
         for (b, w) in zip(&self.biases, &self.weights) {
-            a = b.clone()+ w * a;
+            a = b + w * a;
             a.apply(activation);
         }
-        a.column(0).iter().copied().collect()
+        a
     }
 
     pub fn sgd(&mut self, mut training_data: Data<f64, OneHotVector>, epochs: usize, minibatch_size: usize, eta: f64, test_data: Option<Data<f64, OneHotVector>>) {
@@ -82,12 +82,10 @@ impl Network {
     /// The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
     /// is the learning rate.
     fn update_mini_batch(&mut self, mini_batch: &[DataLine<f64, OneHotVector>], eta: f64) {
-        let (mut nabla_b, mut nabla_w) = self.zero_gradient();
+        let (mut nabla_b, mut nabla_w) = self.zero_gradient();
         for line in mini_batch.iter() {
-            let (delta_nabla_b, delta_nabla_w) = self.backprop(line.inputs.to_vec(), &line.label);
+            let (delta_nabla_b, delta_nabla_w) = self.backprop(&line.inputs, &line.label);
 
-            // nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
-            // nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
             nabla_b = zip(&nabla_b, &delta_nabla_b).map(|(nb, dnb)| nb.add(dnb)).collect();
             nabla_w = zip(&nabla_w, &delta_nabla_w).map(|(nw, dnw)| nw.add(dnw)).collect();
         }
@@ -105,7 +103,7 @@ impl Network {
     /// neuron in the final layer has the highest activation.
     fn evaluate(&self, test_data: &Data<f64, OneHotVector>) -> usize {
         let test_results: Vec<(usize, usize)> = test_data.0.iter()
-            .map(|line| (argmax(self.feed_forward(line.inputs.clone())), line.label.val))
+            .map(|line| (argmax(self.feed_forward(&line.inputs)), line.label.val))
             .collect();
 
         test_results.into_iter().filter(|(x, y)| *x == *y).count()
@@ -115,30 +113,27 @@ impl Network {
     /// Return a tuple `(nabla_b, nabla_w)` representing the
     /// gradient for the cost function C_x. `nabla_b` and
     /// `nabla_w` are layer-by-layer lists of matrices, similar
     /// to `self.biases` and `self.weights`.
-    fn backprop(&self, x: Vec<f64>, y: &OneHotVector) -> (Vec<DMatrix<f64>>, Vec<DMatrix<f64>>) {
-        let (mut nabla_b, mut nabla_w) = self.zero_gradient();
+    fn backprop(&self, x: &DMatrix<f64>, y: &OneHotVector) -> (Vec<DMatrix<f64>>, Vec<DMatrix<f64>>) {
+        let (mut nabla_b, mut nabla_w) = self.zero_gradient();
 
         // feedforward
-        let mut activation = DMatrix::from_vec(x.len(), 1, x);
+        let mut activation = x.clone();
         let mut activations = vec![activation.clone()];
         let mut zs = vec![];
         for (b, w) in zip(&self.biases, &self.weights) {
-            let z = (w * &activation)+b.clone();
+            let z = (w * activation) + b;
             zs.push(z.clone());
             activation = z.map(sigmoid);
             activations.push(activation.clone());
         }
 
         // backward pass
-        // delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
         let delta: DMatrix<f64> = cost_derivative(&activations[activations.len() - 1], y).component_mul(&zs[zs.len() - 1].map(sigmoid_prime));
-        // println!("delta {:?}", delta);
         let index = nabla_b.len() - 1;
         nabla_b[index] = delta.clone();
         let index = nabla_w.len() - 1;
-        let ac = &activations[activations.len() - 2].transpose();
-        nabla_w[index] = &delta * ac;
+        nabla_w[index] = &delta * (&activations[activations.len() - 2].transpose());
         let lens_zs = zs.len();
         for l in 2..self.num_layers {
             let z = &zs[lens_zs - l];
@@ -164,32 +159,25 @@ impl Network {
             .collect();
         (nabla_b, nabla_w)
     }
-
 }
 
 fn cost_derivative(output_activations: &DMatrix<f64>, y: &OneHotVector) -> DMatrix<f64> {
-    // output_activations - y
-    // println!("output {:?}", output_activations);
-    // println!("expected {:?}", y);
-
     let shape = output_activations.shape();
-    let t = DMatrix::from_iterator(shape.0, shape.1, output_activations.iter().enumerate()
-        .map(|(index, a)| a - y.get(index)));
-    // println!("t {:?}",t);
-    t
+    DMatrix::from_iterator(shape.0, shape.1, output_activations.iter().enumerate()
+        .map(|(index, a)| a - y.get(index)))
 }
 
-fn argmax(val: Vec<f64>) -> usize {
+/// index of max value
+/// only meaningful for single row or column matrix
+fn argmax(val: DMatrix<f64>) -> usize {
     let mut max = 0.0;
     let mut index = 0;
     for (i, x) in val.iter().enumerate() {
-        // print!("{},",x);
         if *x > max {
             index = i;
             max = *x;
         }
     }
-    // println!();
     index
 }
@@ -247,7 +235,7 @@ mod test {
 
     #[test]
     fn test_argmax() {
-        assert_eq!(5, argmax(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0]));
+        assert_eq!(5, argmax(DMatrix::from_vec(10, 1, vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0])));
     }
 
     #[test]
@@ -262,15 +250,15 @@ mod test {
         // 2 layers of 2 units
         let mut net = Network::ones(vec![2, 2]);
 
-        let prediction = net.feed_forward_activation(vec![2.0, 2.0], |a| {});
-        assert_eq!(prediction, vec![5.0, 5.0])
+        let prediction = net.feed_forward_activation(&DMatrix::from_vec(2, 1, vec![2.0, 2.0]), |a| {});
+        assert_eq!(prediction, DMatrix::from_vec(2, 1, vec![5.0, 5.0]))
     }
 
     #[test]
     fn test_sgd() {
         // 2 layers of 2 units
         let mut net = Network::ones(vec![2, 2]);
-        let data = Data(vec![DataLine { inputs: vec![1.0, 1.0], label: OneHotVector::new(1) }]);
+        let data = Data(vec![DataLine { inputs: DMatrix::from_vec(2, 1, vec![1.0, 1.0]), label: OneHotVector::new(1) }]);
         net.sgd(data, 1, 1, 0.001, None);
         println!("{:?}", net);
     }