From c3e532f1c1f3d2c103c4e643c9adf4b958209031 Mon Sep 17 00:00:00 2001
From: Corey Lowman
Date: Thu, 23 Feb 2023 17:38:59 -0500
Subject: [PATCH] Fast alloc and binary kernel optimizations

---
 Cargo.toml                                    |  3 +-
 src/tensor/cpu/allocate.rs                    | 34 +++++++++---
 src/tensor/cpu/device.rs                      |  2 +-
 src/tensor/cpu/iterate.rs                     | 55 ++++++++++++++-----
 src/tensor/cpu/mod.rs                         |  2 +-
 src/tensor_ops/utilities/binary_op_macros.cuh | 12 ++--
 src/tensor_ops/utilities/cpu_kernels.rs       | 39 +++++++------
 src/tensor_ops/utilities/cuda_kernels.rs      |  4 --
 8 files changed, 97 insertions(+), 54 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 9a8a69ad6..a8654bda2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,8 +35,9 @@ cudarc = { version = "0.7.2", default-features = false, optional = true }
 num-traits = { version = "0.2.15", default-features = false }
 
 [features]
-default = ["std", "numpy"]
+default = ["std", "numpy", "fast_alloc"]
 std = ["no-std-compat/std", "rand/std", "rand_distr/std", "cudarc?/std", "matrixmultiply/threading"]
+fast_alloc = ["std"]
 nightly = []
 numpy = ["dep:zip", "std"]
 cblas = ["dep:cblas-sys", "dep:libc"]
diff --git a/src/tensor/cpu/allocate.rs b/src/tensor/cpu/allocate.rs
index b533185cb..ae903f632 100644
--- a/src/tensor/cpu/allocate.rs
+++ b/src/tensor/cpu/allocate.rs
@@ -20,9 +20,18 @@ impl<S: Shape, E: Default + Clone> StridedArray<S, E> {
     pub(crate) fn try_new_with(shape: S, elem: E) -> Result<Self, CpuError> {
         let numel = shape.num_elements();
         let strides: S::Concrete = shape.strides();
-        let mut data: Vec<E> = Vec::new();
-        data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
-        data.resize(numel, elem);
+
+        #[cfg(feature = "fast_alloc")]
+        let data = std::vec![elem; numel];
+
+        #[cfg(not(feature = "fast_alloc"))]
+        let data = {
+            let mut data: Vec<E> = Vec::new();
+            data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
+            data.resize(numel, elem);
+            data
+        };
+
         let data = Arc::new(data);
         Ok(StridedArray {
             data,
@@ -32,13 +41,22 @@
     }
 
     #[inline]
-    pub(crate) fn try_new_like(other: &Self, elem: E) -> Result<Self, CpuError> {
+    pub(crate) fn try_new_like(other: &Self) -> Result<Self, CpuError> {
         let numel = other.data.len();
         let shape = other.shape;
         let strides = other.strides;
-        let mut data: Vec<E> = Vec::new();
-        data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
-        data.resize(numel, elem);
+
+        #[cfg(feature = "fast_alloc")]
+        let data = std::vec![Default::default(); numel];
+
+        #[cfg(not(feature = "fast_alloc"))]
+        let data = {
+            let mut data: Vec<E> = Vec::new();
+            data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
+            data.resize(numel, Default::default());
+            data
+        };
+
         let data = Arc::new(data);
         Ok(StridedArray {
             data,
@@ -88,7 +106,7 @@ impl<E: Dtype> SampleTensor<E> for Cpu {
         src: &S,
         distr: D,
     ) -> Result<Tensor<S::Shape, E, Self>, Self::Err> {
-        let mut storage = StridedArray::try_new_with(*src.shape(), Default::default())?;
+        let mut storage = StridedArray::new(*src.shape())?;
         {
             let mut rng = self.rng.lock().unwrap();
             for v in storage.buf_iter_mut() {
diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs
index 53f3e21cc..dd038513d 100644
--- a/src/tensor/cpu/device.rs
+++ b/src/tensor/cpu/device.rs
@@ -88,7 +88,7 @@ impl DeviceStorage for Cpu {
         &self,
         storage: &Self::Storage<S, E>,
     ) -> Result<Self::Storage<S, E>, Self::Err> {
-        StridedArray::try_new_like(storage, Default::default())
+        StridedArray::try_new_like(storage)
     }
 
     fn random_u64(&self) -> u64 {
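Why the `fast_alloc` path is faster: `std::vec![elem; numel]` lowers to a specialized `from_elem`, which for an all-zero bit pattern can request zeroed memory from the allocator (calloc-style) instead of reserving and then writing every element; the old `try_reserve` + `resize` path keeps out-of-memory reporting as a `CpuError` at the cost of always touching each element. A minimal standalone sketch of the two strategies (hypothetical helper names, `f32` elements; not code from this patch):

```rust
// Sketch of the two allocation strategies that try_new_with selects
// between via #[cfg(feature = "fast_alloc")].
fn alloc_fast(numel: usize) -> Vec<f32> {
    // vec![0.0; numel] can be served by a zeroed allocation,
    // skipping the per-element initialization loop entirely.
    vec![0.0; numel]
}

fn alloc_fallible(numel: usize) -> Result<Vec<f32>, &'static str> {
    // The non-fast path reports allocation failure as an error value
    // instead of aborting the process.
    let mut data: Vec<f32> = Vec::new();
    data.try_reserve(numel).map_err(|_| "out of memory")?;
    data.resize(numel, 0.0);
    Ok(data)
}

fn main() {
    assert_eq!(alloc_fast(1024), alloc_fallible(1024).unwrap());
}
```

Note the `fast_alloc = ["std"]` feature definition above: the macro path relies on the std allocator, so the feature implies `std`.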
diff --git a/src/tensor/cpu/iterate.rs b/src/tensor/cpu/iterate.rs
index 01949ffbd..ccff7acf3 100644
--- a/src/tensor/cpu/iterate.rs
+++ b/src/tensor/cpu/iterate.rs
@@ -3,33 +3,50 @@ use crate::shapes::{BroadcastStridesTo, Shape};
 use std::sync::Arc;
 use std::vec::Vec;
 
-struct NdIndex<S: Shape> {
+pub(crate) struct NdIndex<S: Shape> {
     indices: S::Concrete,
     shape: S::Concrete,
     strides: S::Concrete,
     next: Option<usize>,
+    contiguous: Option<usize>,
 }
 
 impl<S: Shape> NdIndex<S> {
-    fn new(shape: S, strides: S::Concrete) -> Self {
-        let indices: S::Concrete = Default::default();
-        let i: usize = strides
-            .into_iter()
-            .zip(indices.into_iter())
-            .map(|(a, b)| a * b)
-            .sum();
+    #[inline]
+    pub(crate) fn new(shape: S, strides: S::Concrete) -> Self {
         Self {
-            indices,
+            indices: Default::default(),
             shape: shape.concrete(),
             strides,
-            next: Some(i),
+            next: Some(0),
+            contiguous: (strides == shape.strides()).then(|| shape.num_elements()),
         }
     }
 }
 
 impl<S: Shape> NdIndex<S> {
     #[inline(always)]
-    fn get_with_idx(&mut self) -> Option<(usize, S::Concrete)> {
+    pub(crate) fn next(&mut self) -> Option<usize> {
+        match self.contiguous {
+            Some(numel) => match self.next.as_mut() {
+                Some(i) => {
+                    let idx = *i;
+                    let next = idx + 1;
+                    if next >= numel {
+                        self.next = None;
+                    } else {
+                        *i = next;
+                    }
+                    Some(idx)
+                }
+                None => None,
+            },
+            None => self.next_with_idx().map(|(i, _)| i),
+        }
+    }
+
+    #[inline(always)]
+    fn next_with_idx(&mut self) -> Option<(usize, S::Concrete)> {
         match (S::NUM_DIMS, self.next.as_mut()) {
             (_, None) => None,
             (0, Some(i)) => {
@@ -85,14 +102,17 @@ pub(crate) struct StridedMutIndexIter<'a, S: Shape, E> {
 }
 
 impl<S: Shape, E: Clone> StridedArray<S, E> {
+    #[inline]
     pub(crate) fn buf_iter(&self) -> std::slice::Iter<'_, E> {
         self.data.iter()
     }
 
+    #[inline]
     pub(crate) fn buf_iter_mut(&mut self) -> std::slice::IterMut<'_, E> {
         std::sync::Arc::make_mut(&mut self.data).iter_mut()
     }
 
+    #[inline]
     pub(crate) fn iter(&self) -> StridedRefIter<S, E> {
         StridedRefIter {
             data: self.data.as_ref(),
@@ -100,6 +120,7 @@ impl<S: Shape, E: Clone> StridedArray<S, E> {
         }
     }
 
+    #[inline]
     pub(crate) fn iter_mut(&mut self) -> StridedMutIter<S, E> {
         StridedMutIter {
             data: std::sync::Arc::make_mut(&mut self.data),
@@ -107,6 +128,7 @@ impl<S: Shape, E: Clone> StridedArray<S, E> {
         }
     }
 
+    #[inline]
     pub(crate) fn iter_with_index(&self) -> StridedRefIndexIter<S, E> {
         StridedRefIndexIter {
             data: self.data.as_ref(),
@@ -114,6 +136,7 @@ impl<S: Shape, E: Clone> StridedArray<S, E> {
         }
     }
 
+    #[inline]
     pub(crate) fn iter_mut_with_index(&mut self) -> StridedMutIndexIter<S, E> {
         StridedMutIndexIter {
             data: std::sync::Arc::make_mut(&mut self.data),
@@ -123,6 +146,7 @@
 }
 
 impl<S: Shape, E: Clone> StridedArray<S, E> {
+    #[inline]
     pub(crate) fn iter_as<Dst: Shape>(&self, dst: &Dst) -> StridedRefIter<Dst, E>
     where
         S: BroadcastStridesTo<Dst>,
@@ -133,6 +157,7 @@
         }
     }
 
+    #[inline]
     pub(crate) fn iter_mut_as<Dst: Shape>(&mut self, dst: &Dst) -> StridedMutIter<Dst, E>
     where
         S: BroadcastStridesTo<Dst>,
@@ -155,7 +180,7 @@ impl<'q, S: Shape, E> LendingIterator for StridedRefIter<'q, S, E> {
     type Item<'a> = &'a E where Self: 'a;
     #[inline(always)]
     fn next(&'_ mut self) -> Option<Self::Item<'_>> {
-        self.index.get_with_idx().map(|(i, _)| &self.data[i])
+        self.index.next().map(|i| &self.data[i])
     }
 }
 
@@ -163,7 +188,7 @@ impl<'q, S: Shape, E> LendingIterator for StridedMutIter<'q, S, E> {
     type Item<'a> = &'a mut E where Self: 'a;
     #[inline(always)]
     fn next(&'_ mut self) -> Option<Self::Item<'_>> {
-        self.index.get_with_idx().map(|(i, _)| &mut self.data[i])
+        self.index.next().map(|i| &mut self.data[i])
     }
 }
 
@@ -172,7 +197,7 @@ impl<'q, S: Shape, E> LendingIterator for StridedRefIndexIter<'q, S, E> {
     #[inline(always)]
     fn next(&'_ mut self) -> Option<Self::Item<'_>> {
         self.index
-            .get_with_idx()
+            .next_with_idx()
            .map(|(i, idx)| (&self.data[i], idx))
     }
 }
@@ -182,7 +207,7 @@ impl<'q, S: Shape, E> LendingIterator for StridedMutIndexIter<'q, S, E> {
     #[inline(always)]
     fn next(&'_ mut self) -> Option<Self::Item<'_>> {
         self.index
-            .get_with_idx()
+            .next_with_idx()
             .map(|(i, idx)| (&mut self.data[i], idx))
     }
 }
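The new `contiguous` field is what enables the fast path: it records `numel` only when the array's actual strides equal the canonical row-major strides of its shape, in which case logical position and buffer offset coincide and `next()` degenerates to a bare counter over `0..numel`. A standalone sketch of that layout test (toy helper over plain slices rather than the crate's `Shape` types; not code from this patch):

```rust
// Row-major ("C order") strides: the innermost dimension moves fastest.
fn row_major_strides(shape: &[usize]) -> Vec<usize> {
    let mut strides = vec![1usize; shape.len()];
    for d in (0..shape.len().saturating_sub(1)).rev() {
        strides[d] = strides[d + 1] * shape[d + 1];
    }
    strides
}

fn main() {
    let shape = [2usize, 3, 4];
    // Freshly allocated storage: strides match, so NdIndex::next can count
    // straight through 0..24 with no per-dimension index arithmetic.
    assert_eq!(row_major_strides(&shape), vec![12, 4, 1]);
    // A [4] tensor broadcast to [2, 3, 4] has strides [0, 0, 1]; the equality
    // test fails and iteration falls back to the general strided walk.
    assert_ne!(row_major_strides(&shape), vec![0, 0, 1]);
}
```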
diff --git a/src/tensor/cpu/mod.rs b/src/tensor/cpu/mod.rs
index 78e36afc2..48ae35ec4 100644
--- a/src/tensor/cpu/mod.rs
+++ b/src/tensor/cpu/mod.rs
@@ -5,7 +5,7 @@ mod iterate;
 mod views;
 
 pub(crate) use device::StridedArray;
-pub(crate) use iterate::LendingIterator;
+pub(crate) use iterate::{LendingIterator, NdIndex};
 pub(crate) use views::{View, ViewMut};
 
 pub use device::{Cpu, CpuError};
diff --git a/src/tensor_ops/utilities/binary_op_macros.cuh b/src/tensor_ops/utilities/binary_op_macros.cuh
index a5d71c0bd..d0764920e 100644
--- a/src/tensor_ops/utilities/binary_op_macros.cuh
+++ b/src/tensor_ops/utilities/binary_op_macros.cuh
@@ -10,8 +10,7 @@ extern "C" __global__ void FORWARD( \
     const size_t *lhs_strides, \
     const TYPENAME *rhs, \
     const size_t *rhs_strides, \
-    TYPENAME *out, \
-    const size_t *out_strides \
+    TYPENAME *out \
 ) { \
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
     if (i >= numel) { \
@@ -20,7 +19,6 @@
 \
     unsigned int lhs_i = get_strided_index(i, num_dims, dims, lhs_strides); \
     unsigned int rhs_i = get_strided_index(i, num_dims, dims, rhs_strides); \
-    unsigned int out_i = get_strided_index(i, num_dims, dims, out_strides); \
 \
     TYPENAME x = lhs[lhs_i]; \
     TYPENAME y = rhs[rhs_i]; \
@@ -28,7 +26,7 @@
 \
     FUNC\
 \
-    out[out_i] = fx; \
+    out[i] = fx; \
 } \
 \
 extern "C" __global__ void BACKWARD( \
@@ -42,8 +40,7 @@ extern "C" __global__ void BACKWARD( \
     const TYPENAME *rhs, \
     TYPENAME *grad_rhs, \
     const size_t *rhs_strides, \
-    const TYPENAME *grad_out, \
-    const size_t *out_strides \
+    const TYPENAME *grad_out \
 ) { \
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
     if (i >= numel) { \
@@ -52,11 +49,10 @@
 \
     unsigned int lhs_i = get_strided_index(i, num_dims, dims, lhs_strides); \
     unsigned int rhs_i = get_strided_index(i, num_dims, dims, rhs_strides); \
-    unsigned int out_i = get_strided_index(i, num_dims, dims, out_strides); \
 \
     TYPENAME x = lhs[lhs_i]; \
     TYPENAME y = rhs[rhs_i]; \
-    TYPENAME go = grad_out[out_i]; \
+    TYPENAME go = grad_out[i]; \
 \
     TYPENAME dfdx, dfdy; \
     DERIVATIVES \
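The kernel change leans on the same invariant: `out` (and later `grad_out`) is freshly allocated by the caller, so its strides are exactly the row-major strides of `dims`, and `get_strided_index(i, num_dims, dims, out_strides)` would always return `i`. Only the possibly-broadcast inputs still need the strided lookup. A standalone Rust sketch of the index arithmetic that helper performs (assumed semantics of `get_strided_index`, which this patch does not show; not code from this patch):

```rust
// Mirror of the per-dimension decomposition done by get_strided_index:
// peel off each dimension's coordinate from the linear id, innermost first,
// and dot it with that dimension's stride.
fn strided_index(mut i: usize, dims: &[usize], strides: &[usize]) -> usize {
    let mut idx = 0;
    for d in (0..dims.len()).rev() {
        idx += (i % dims[d]) * strides[d];
        i /= dims[d];
    }
    idx
}

fn main() {
    let dims = [2, 3];
    let out_strides = [3, 1]; // contiguous [2, 3] output
    let lhs_strides = [0, 1]; // a [3] tensor broadcast across dim 0
    for i in 0..6 {
        // For the contiguous output the map is the identity, which is why
        // the kernels can write out[i] and read grad_out[i] directly.
        assert_eq!(strided_index(i, &dims, &out_strides), i);
        // The broadcast input repeats: ids 0..6 hit elements 0, 1, 2, 0, 1, 2.
        assert_eq!(strided_index(i, &dims, &lhs_strides), i % 3);
    }
}
```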
diff --git a/src/tensor_ops/utilities/cpu_kernels.rs b/src/tensor_ops/utilities/cpu_kernels.rs
index 7e9b7724a..fb446aa1f 100644
--- a/src/tensor_ops/utilities/cpu_kernels.rs
+++ b/src/tensor_ops/utilities/cpu_kernels.rs
@@ -1,7 +1,7 @@
 use super::ops::{BinaryKernel, UnaryKernel};
 use crate::{
     shapes::{Dtype, Shape},
-    tensor::cpu::{Cpu, LendingIterator, StridedArray},
+    tensor::cpu::{Cpu, LendingIterator, NdIndex, StridedArray},
 };
 
 pub trait UnaryDerivative<E> {
@@ -22,6 +22,8 @@ impl<E: Dtype, Op: UnaryDerivative<E>> UnaryKernel<Op, E> for Cpu {
         inp: &Self::Storage<S, E>,
     ) -> Result<Self::Storage<S, E>, Self::Err> {
         let mut out: Self::Storage<S, E> = inp.clone();
+        // NOTE: we can iterate over buf here because we know inp & out
+        // have exact same strides due to clone.
         for x in out.buf_iter_mut() {
             *x = op.f(x);
         }
@@ -52,10 +54,13 @@ impl<E: Dtype, Op: BinaryDerivative<E>> BinaryKernel<Op, E> for Cpu {
         rhs: &Self::Storage<S, E>,
     ) -> Result<Self::Storage<S, E>, Self::Err> {
         let mut out: Self::Storage<S, E> = StridedArray::new(lhs.shape)?;
+
         let mut lhs_iter = lhs.iter();
         let mut rhs_iter = rhs.iter();
-        let mut out_iter = out.iter_mut();
-        while let Some((o, (l, r))) = out_iter.next().zip(lhs_iter.next().zip(rhs_iter.next())) {
+        // NOTE: we can use buf_iter_mut() here because StridedArray::new makes a contiguous array
+        for o in out.buf_iter_mut() {
+            let l = lhs_iter.next().unwrap();
+            let r = rhs_iter.next().unwrap();
             *o = op.f(l, r);
         }
         Ok(out)
@@ -69,19 +74,21 @@
     }
 
     fn backward<S: Shape>(
         &self,
         op: Op,
         lhs: &Self::Storage<S, E>,
         grad_lhs: &mut Self::Storage<S, E>,
         rhs: &Self::Storage<S, E>,
         grad_rhs: &mut Self::Storage<S, E>,
         grad_out: &Self::Storage<S, E>,
     ) -> Result<(), Self::Err> {
-        let mut lhs_iter = lhs.iter();
-        let mut rhs_iter = rhs.iter();
-        let mut grad_lhs_iter = grad_lhs.iter_mut();
-        let mut grad_rhs_iter = grad_rhs.iter_mut();
-        let mut grad_out_iter = grad_out.iter();
-        for _ in 0..lhs.shape.num_elements() {
-            let l = lhs_iter.next().unwrap();
-            let r = rhs_iter.next().unwrap();
-            let go = *grad_out_iter.next().unwrap();
-            let gl = grad_lhs_iter.next().unwrap();
-            *gl += op.dfdx(l, r) * go;
-            let gr = grad_rhs_iter.next().unwrap();
-            *gr += op.dfdy(l, r) * go;
+        let mut lhs_idx = NdIndex::new(lhs.shape, lhs.strides);
+        let mut rhs_idx = NdIndex::new(rhs.shape, rhs.strides);
+        let lhs_buf = lhs.data.as_ref();
+        let rhs_buf = rhs.data.as_ref();
+        let grad_lhs_buf = std::sync::Arc::make_mut(&mut grad_lhs.data);
+        let grad_rhs_buf = std::sync::Arc::make_mut(&mut grad_rhs.data);
+        // NOTE: we can use .buf_iter() here because we know grad_out is
+        // contiguous: it was allocated with StridedArray::new in forward
+        for &go in grad_out.buf_iter() {
+            let lhs_i = lhs_idx.next().unwrap();
+            let rhs_i = rhs_idx.next().unwrap();
+            let l = &lhs_buf[lhs_i];
+            let r = &rhs_buf[rhs_i];
+            grad_lhs_buf[lhs_i] += op.dfdx(l, r) * go;
+            grad_rhs_buf[rhs_i] += op.dfdy(l, r) * go;
         }
         Ok(())
     }
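The rewritten backward pass walks the single contiguous `grad_out` buffer once and drives a separate `NdIndex` per operand, so a broadcast operand (stride 0) accumulates several contributions into the same slot via `+=`. A toy end-to-end check of that accumulation pattern for `z = x * y` with `y` broadcast (standalone sketch over plain arrays, reusing the hypothetical `strided_index` helper from the note above; not code from this patch):

```rust
// Same per-dimension index decomposition as in the previous sketch.
fn strided_index(mut i: usize, dims: &[usize], strides: &[usize]) -> usize {
    let mut idx = 0;
    for d in (0..dims.len()).rev() {
        idx += (i % dims[d]) * strides[d];
        i /= dims[d];
    }
    idx
}

fn main() {
    let dims = [2, 3];
    let x_strides = [3, 1]; // x: [2, 3], contiguous
    let y_strides = [0, 1]; // y: [3], broadcast to [2, 3]
    let x = [1.0f64, 2.0, 3.0, 4.0, 5.0, 6.0];
    let y = [10.0f64, 20.0, 30.0];
    let grad_out = [1.0f64; 6]; // contiguous, like the forward output
    let (mut gx, mut gy) = ([0.0f64; 6], [0.0f64; 3]);
    for (i, &go) in grad_out.iter().enumerate() {
        let xi = strided_index(i, &dims, &x_strides);
        let yi = strided_index(i, &dims, &y_strides);
        gx[xi] += y[yi] * go; // d(x*y)/dx = y
        gy[yi] += x[xi] * go; // d(x*y)/dy = x
    }
    // Each y element accumulates gradient from both rows it was broadcast to.
    assert_eq!(gy, [5.0, 7.0, 9.0]);
    assert_eq!(gx, [10.0, 20.0, 30.0, 10.0, 20.0, 30.0]);
}
```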
diff --git a/src/tensor_ops/utilities/cuda_kernels.rs b/src/tensor_ops/utilities/cuda_kernels.rs
index 92388b81c..9fb7a9b6d 100644
--- a/src/tensor_ops/utilities/cuda_kernels.rs
+++ b/src/tensor_ops/utilities/cuda_kernels.rs
@@ -138,7 +138,6 @@ impl<E: Dtype + AsKernelParam, K: BinaryOpCudaKernel<E> + AsKernelParam> BinaryKernel<K, E> for
         let dims: CudaSlice<usize> = self.dev.take_async(shape.concrete().into())?;
         let lhs_strides: CudaSlice<usize> = self.dev.take_async(lhs.strides.into())?;
         let rhs_strides: CudaSlice<usize> = self.dev.take_async(rhs.strides.into())?;
-        let out_strides: CudaSlice<usize> = self.dev.take_async(strides.into())?;
 
         let fwd_fn = self.dev.get_func(K::MODULE_NAME, K::FWD_FN_NAME).unwrap();
         let cfg = LaunchConfig::for_num_elems(numel as u32);
@@ -152,7 +151,6 @@
             rhs.data.as_ref(), // const float *rhs,
             &rhs_strides,      // const size_t *rhs_strides,
             &mut storage,      // float *out,
-            &out_strides,      // const size_t *out_strides
         );
         unsafe { fwd_fn.launch_async(cfg, params) }?;
         Ok(CudaArray {
@@ -177,7 +175,6 @@
         let dims: CudaSlice<usize> = self.dev.take_async(lhs.shape.concrete().into())?;
         let lhs_strides: CudaSlice<usize> = self.dev.take_async(lhs.strides.into())?;
         let rhs_strides: CudaSlice<usize> = self.dev.take_async(rhs.strides.into())?;
-        let out_strides: CudaSlice<usize> = self.dev.take_async(grad_out.strides.into())?;
 
         let cfg = LaunchConfig::for_num_elems(numel as u32);
         let params = (
@@ -192,7 +189,6 @@
             Arc::make_mut(&mut grad_rhs.data), // float *grad_rhs,
             &rhs_strides,                      // const size_t *rhs_strides,
             grad_out.data.as_ref(),            // const float *grad_out,
-            &out_strides,                      // const size_t *out_strides
         );
         unsafe { bwd_fn.launch_async(cfg, params) }?;
         Ok(())
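Two closing observations. The `params` tuple passed to `launch_async` must stay in exact positional lockstep with the argument lists of the `FORWARD`/`BACKWARD` kernels in binary_op_macros.cuh, which is why the `&out_strides` entries are deleted here in the same commit that drops the `out_strides` parameters from the kernel signatures (it also saves one host-to-device stride upload per launch). And since `fast_alloc` joins the default feature set, downstream users who still want the fallible `try_reserve` allocation path can opt out with something like `cargo build --no-default-features --features std,numpy` (assuming they still want the other default features).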