From c3e532f1c1f3d2c103c4e643c9adf4b958209031 Mon Sep 17 00:00:00 2001
From: Corey Lowman
Date: Thu, 23 Feb 2023 17:38:59 -0500
Subject: [PATCH] Fast alloc and binary kernel optimizations

---
 Cargo.toml                                    |  3 +-
 src/tensor/cpu/allocate.rs                    | 34 +++++++++---
 src/tensor/cpu/device.rs                      |  2 +-
 src/tensor/cpu/iterate.rs                     | 55 ++++++++++++++-----
 src/tensor/cpu/mod.rs                         |  2 +-
 src/tensor_ops/utilities/binary_op_macros.cuh | 12 ++--
 src/tensor_ops/utilities/cpu_kernels.rs       | 39 +++++++------
 src/tensor_ops/utilities/cuda_kernels.rs      |  4 --
 8 files changed, 97 insertions(+), 54 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 9a8a69ad6..a8654bda2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,8 +35,9 @@ cudarc = { version = "0.7.2", default-features = false, optional = true }
 num-traits = { version = "0.2.15", default-features = false }
 
 [features]
-default = ["std", "numpy"]
+default = ["std", "numpy", "fast_alloc"]
 std = ["no-std-compat/std", "rand/std", "rand_distr/std", "cudarc?/std", "matrixmultiply/threading"]
+fast_alloc = ["std"]
 nightly = []
 numpy = ["dep:zip", "std"]
 cblas = ["dep:cblas-sys", "dep:libc"]
diff --git a/src/tensor/cpu/allocate.rs b/src/tensor/cpu/allocate.rs
index b533185cb..ae903f632 100644
--- a/src/tensor/cpu/allocate.rs
+++ b/src/tensor/cpu/allocate.rs
@@ -20,9 +20,18 @@ impl<S: Shape, E: Default + Clone> StridedArray<S, E> {
     pub(crate) fn try_new_with(shape: S, elem: E) -> Result<Self, CpuError> {
         let numel = shape.num_elements();
         let strides: S::Concrete = shape.strides();
-        let mut data: Vec<E> = Vec::new();
-        data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
-        data.resize(numel, elem);
+
+        #[cfg(feature = "fast_alloc")]
+        let data = std::vec![elem; numel];
+
+        #[cfg(not(feature = "fast_alloc"))]
+        let data = {
+            let mut data: Vec<E> = Vec::new();
+            data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
+            data.resize(numel, elem);
+            data
+        };
+
         let data = Arc::new(data);
         Ok(StridedArray {
             data,
@@ -32,13 +41,22 @@
     }
 
     #[inline]
-    pub(crate) fn try_new_like(other: &Self, elem: E) -> Result<Self, CpuError> {
+    pub(crate) fn try_new_like(other: &Self) -> Result<Self, CpuError> {
         let numel = other.data.len();
         let shape = other.shape;
         let strides = other.strides;
-        let mut data: Vec<E> = Vec::new();
-        data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
-        data.resize(numel, elem);
+
+        #[cfg(feature = "fast_alloc")]
+        let data = std::vec![Default::default(); numel];
+
+        #[cfg(not(feature = "fast_alloc"))]
+        let data = {
+            let mut data: Vec<E> = Vec::new();
+            data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
+            data.resize(numel, Default::default());
+            data
+        };
+
         let data = Arc::new(data);
         Ok(StridedArray {
             data,
@@ -88,7 +106,7 @@ impl<E: Dtype> SampleTensor<E> for Cpu {
         src: &S,
         distr: D,
     ) -> Result<Tensor<S::Shape, E, Self>, Self::Err> {
-        let mut storage = StridedArray::try_new_with(*src.shape(), Default::default())?;
+        let mut storage = StridedArray::new(*src.shape())?;
         {
             let mut rng = self.rng.lock().unwrap();
             for v in storage.buf_iter_mut() {
diff --git a/src/tensor/cpu/device.rs b/src/tensor/cpu/device.rs
index 53f3e21cc..dd038513d 100644
--- a/src/tensor/cpu/device.rs
+++ b/src/tensor/cpu/device.rs
@@ -88,7 +88,7 @@ impl DeviceStorage for Cpu {
         &self,
         storage: &Self::Storage<S, E>,
     ) -> Result<Self::Storage<S, E>, Self::Err> {
-        StridedArray::try_new_like(storage, Default::default())
+        StridedArray::try_new_like(storage)
     }
 
     fn random_u64(&self) -> u64 {
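Why the `fast_alloc` path is faster: `std::vec![elem; numel]` lowers to a specialized `from_elem`, which for an all-zero bit pattern can request zeroed memory from the allocator (calloc-style) instead of reserving and then writing every element; the old `try_reserve` + `resize` path keeps out-of-memory reporting as a `CpuError` at the cost of always touching each element. A minimal standalone sketch of the two strategies (hypothetical helper names, `f32` elements; not code from this patch):

```rust
// Sketch of the two allocation strategies that try_new_with selects
// between via #[cfg(feature = "fast_alloc")].
fn alloc_fast(numel: usize) -> Vec<f32> {
    // vec![0.0; numel] can be served by a zeroed allocation,
    // skipping the per-element initialization loop entirely.
    vec![0.0; numel]
}

fn alloc_fallible(numel: usize) -> Result<Vec<f32>, &'static str> {
    // The non-fast path reports allocation failure as an error value
    // instead of aborting the process.
    let mut data: Vec<f32> = Vec::new();
    data.try_reserve(numel).map_err(|_| "out of memory")?;
    data.resize(numel, 0.0);
    Ok(data)
}

fn main() {
    assert_eq!(alloc_fast(1024), alloc_fallible(1024).unwrap());
}
```

Note the `fast_alloc = ["std"]` feature definition above: the macro path relies on the std allocator, so the feature implies `std`.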
diff --git a/src/tensor/cpu/iterate.rs b/src/tensor/cpu/iterate.rs
index 01949ffbd..ccff7acf3 100644
--- a/src/tensor/cpu/iterate.rs
+++ b/src/tensor/cpu/iterate.rs
@@ -3,33 +3,50 @@ use crate::shapes::{BroadcastStridesTo, Shape};
 use std::sync::Arc;
 use std::vec::Vec;
 
-struct NdIndex<S: Shape> {
+pub(crate) struct NdIndex<S: Shape> {
     indices: S::Concrete,
     shape: S::Concrete,
     strides: S::Concrete,
     next: Option<usize>,
+    contiguous: Option<usize>,
 }
 
 impl<S: Shape> NdIndex<S> {
-    fn new(shape: S, strides: S::Concrete) -> Self {
-        let indices: S::Concrete = Default::default();
-        let i: usize = strides
-            .into_iter()
-            .zip(indices.into_iter())
-            .map(|(a, b)| a * b)
-            .sum();
+    #[inline]
+    pub(crate) fn new(shape: S, strides: S::Concrete) -> Self {
         Self {
-            indices,
+            indices: Default::default(),
             shape: shape.concrete(),
             strides,
-            next: Some(i),
+            next: Some(0),
+            contiguous: (strides == shape.strides()).then(|| shape.num_elements()),
         }
     }
 }
 
 impl<S: Shape> NdIndex<S> {
     #[inline(always)]
-    fn get_with_idx(&mut self) -> Option<(usize, S::Concrete)> {
+    pub(crate) fn next(&mut self) -> Option<usize> {
+        match self.contiguous {
+            Some(numel) => match self.next.as_mut() {
+                Some(i) => {
+                    let idx = *i;
+                    let next = idx + 1;
+                    if next >= numel {
+                        self.next = None;
+                    } else {
+                        *i = next;
+                    }
+                    Some(idx)
+                }
+                None => None,
+            },
+            None => self.next_with_idx().map(|(i, _)| i),
+        }
+    }
+
+    #[inline(always)]
+    fn next_with_idx(&mut self) -> Option<(usize, S::Concrete)> {
         match (S::NUM_DIMS, self.next.as_mut()) {
             (_, None) => None,
             (0, Some(i)) => {
@@ -85,14 +102,17 @@ pub(crate) struct StridedMutIndexIter<'a, S: Shape, E> {
 }
 
 impl<S: Shape, E: Clone> StridedArray<S, E> {
+    #[inline]
     pub(crate) fn buf_iter(&self) -> std::slice::Iter<'_, E> {
         self.data.iter()
     }
 
+    #[inline]
     pub(crate) fn buf_iter_mut(&mut self) -> std::slice::IterMut<'_, E> {
         std::sync::Arc::make_mut(&mut self.data).iter_mut()
     }
 
+    #[inline]
     pub(crate) fn iter(&self) -> StridedRefIter<S, E> {
         StridedRefIter {
             data: self.data.as_ref(),
@@ -100,6 +120,7 @@ impl<S: Shape, E: Clone> StridedArray<S, E> {
         }
     }
 
+    #[inline]
     pub(crate) fn iter_mut(&mut self) -> StridedMutIter<S, E> {
         StridedMutIter {
             data: std::sync::Arc::make_mut(&mut self.data),
@@ -107,6 +128,7 @@ impl<S: Shape, E: Clone> StridedArray<S, E> {
         }
     }
 
+    #[inline]
     pub(crate) fn iter_with_index(&self) -> StridedRefIndexIter<S, E> {
         StridedRefIndexIter {
             data: self.data.as_ref(),
@@ -114,6 +136,7 @@ impl<S: Shape, E: Clone> StridedArray<S, E> {
         }
     }
 
+    #[inline]
     pub(crate) fn iter_mut_with_index(&mut self) -> StridedMutIndexIter<S, E> {
         StridedMutIndexIter {
             data: std::sync::Arc::make_mut(&mut self.data),
@@ -123,6 +146,7 @@
 }
 
 impl<S: Shape, E: Clone> StridedArray<S, E> {
+    #[inline]
     pub(crate) fn iter_as<Dst: Shape>(&self, dst: &Dst) -> StridedRefIter<Dst, E>
     where
         S: BroadcastStridesTo<Dst>,
@@ -133,6 +157,7 @@
         }
     }
 
+    #[inline]
     pub(crate) fn iter_mut_as<Dst: Shape>(&mut self, dst: &Dst) -> StridedMutIter<Dst, E>
     where
         S: BroadcastStridesTo<Dst>,
@@ -155,7 +180,7 @@ impl<'q, S: Shape, E> LendingIterator for StridedRefIter<'q, S, E> {
     type Item<'a> = &'a E where Self: 'a;
     #[inline(always)]
     fn next(&'_ mut self) -> Option<Self::Item<'_>> {
-        self.index.get_with_idx().map(|(i, _)| &self.data[i])
+        self.index.next().map(|i| &self.data[i])
     }
 }
 
@@ -163,7 +188,7 @@ impl<'q, S: Shape, E> LendingIterator for StridedMutIter<'q, S, E> {
     type Item<'a> = &'a mut E where Self: 'a;
     #[inline(always)]
     fn next(&'_ mut self) -> Option<Self::Item<'_>> {
-        self.index.get_with_idx().map(|(i, _)| &mut self.data[i])
+        self.index.next().map(|i| &mut self.data[i])
     }
 }
 
@@ -172,7 +197,7 @@ impl<'q, S: Shape, E> LendingIterator for StridedRefIndexIter<'q, S, E> {
     #[inline(always)]
     fn next(&'_ mut self) -> Option<Self::Item<'_>> {
         self.index
-            .get_with_idx()
+            .next_with_idx()
            .map(|(i, idx)| (&self.data[i], idx))
     }
 }
@@ -182,7 +207,7 @@ impl<'q, S: Shape, E> LendingIterator for StridedMutIndexIter<'q, S, E> {
     #[inline(always)]
     fn next(&'_ mut self) -> Option<Self::Item<'_>> {
         self.index
-            .get_with_idx()
+            .next_with_idx()
             .map(|(i, idx)| (&mut self.data[i], idx))
     }
 }
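The new `contiguous` field is what enables the fast path: it records `numel` only when the array's actual strides equal the canonical row-major strides of its shape, in which case logical position and buffer offset coincide and `next()` degenerates to a bare counter over `0..numel`. A standalone sketch of that layout test (toy helper over plain slices rather than the crate's `Shape` types; not code from this patch):

```rust
// Row-major ("C order") strides: the innermost dimension moves fastest.
fn row_major_strides(shape: &[usize]) -> Vec<usize> {
    let mut strides = vec![1usize; shape.len()];
    for d in (0..shape.len().saturating_sub(1)).rev() {
        strides[d] = strides[d + 1] * shape[d + 1];
    }
    strides
}

fn main() {
    let shape = [2usize, 3, 4];
    // Freshly allocated storage: strides match, so NdIndex::next can count
    // straight through 0..24 with no per-dimension index arithmetic.
    assert_eq!(row_major_strides(&shape), vec![12, 4, 1]);
    // A [4] tensor broadcast to [2, 3, 4] has strides [0, 0, 1]; the equality
    // test fails and iteration falls back to the general strided walk.
    assert_ne!(row_major_strides(&shape), vec![0, 0, 1]);
}
```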
diff --git a/src/tensor/cpu/mod.rs b/src/tensor/cpu/mod.rs
index 78e36afc2..48ae35ec4 100644
--- a/src/tensor/cpu/mod.rs
+++ b/src/tensor/cpu/mod.rs
@@ -5,7 +5,7 @@ mod iterate;
 mod views;
 
 pub(crate) use device::StridedArray;
-pub(crate) use iterate::LendingIterator;
+pub(crate) use iterate::{LendingIterator, NdIndex};
 pub(crate) use views::{View, ViewMut};
 
 pub use device::{Cpu, CpuError};
diff --git a/src/tensor_ops/utilities/binary_op_macros.cuh b/src/tensor_ops/utilities/binary_op_macros.cuh
index a5d71c0bd..d0764920e 100644
--- a/src/tensor_ops/utilities/binary_op_macros.cuh
+++ b/src/tensor_ops/utilities/binary_op_macros.cuh
@@ -10,8 +10,7 @@ extern "C" __global__ void FORWARD( \
     const size_t *lhs_strides, \
     const TYPENAME *rhs, \
     const size_t *rhs_strides, \
-    TYPENAME *out, \
-    const size_t *out_strides \
+    TYPENAME *out \
 ) { \
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
     if (i >= numel) { \
@@ -20,7 +19,6 @@
 \
     unsigned int lhs_i = get_strided_index(i, num_dims, dims, lhs_strides); \
     unsigned int rhs_i = get_strided_index(i, num_dims, dims, rhs_strides); \
-    unsigned int out_i = get_strided_index(i, num_dims, dims, out_strides); \
 \
     TYPENAME x = lhs[lhs_i]; \
     TYPENAME y = rhs[rhs_i]; \
@@ -28,7 +26,7 @@
 \
     FUNC\
 \
-    out[out_i] = fx; \
+    out[i] = fx; \
 } \
 \
 extern "C" __global__ void BACKWARD( \
@@ -42,8 +40,7 @@ extern "C" __global__ void BACKWARD( \
     const TYPENAME *rhs, \
     TYPENAME *grad_rhs, \
     const size_t *rhs_strides, \
-    const TYPENAME *grad_out, \
-    const size_t *out_strides \
+    const TYPENAME *grad_out \
 ) { \
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
     if (i >= numel) { \
@@ -52,11 +49,10 @@
 \
     unsigned int lhs_i = get_strided_index(i, num_dims, dims, lhs_strides); \
     unsigned int rhs_i = get_strided_index(i, num_dims, dims, rhs_strides); \
-    unsigned int out_i = get_strided_index(i, num_dims, dims, out_strides); \
 \
     TYPENAME x = lhs[lhs_i]; \
     TYPENAME y = rhs[rhs_i]; \
-    TYPENAME go = grad_out[out_i]; \
+    TYPENAME go = grad_out[i]; \
 \
     TYPENAME dfdx, dfdy; \
     DERIVATIVES \
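The kernel change leans on the same invariant: `out` (and later `grad_out`) is freshly allocated by the caller, so its strides are exactly the row-major strides of `dims`, and `get_strided_index(i, num_dims, dims, out_strides)` would always return `i`. Only the possibly-broadcast inputs still need the strided lookup. A standalone Rust sketch of the index arithmetic that helper performs (assumed semantics of `get_strided_index`, which this patch does not show; not code from this patch):

```rust
// Mirror of the per-dimension decomposition done by get_strided_index:
// peel off each dimension's coordinate from the linear id, innermost first,
// and dot it with that dimension's stride.
fn strided_index(mut i: usize, dims: &[usize], strides: &[usize]) -> usize {
    let mut idx = 0;
    for d in (0..dims.len()).rev() {
        idx += (i % dims[d]) * strides[d];
        i /= dims[d];
    }
    idx
}

fn main() {
    let dims = [2, 3];
    let out_strides = [3, 1]; // contiguous [2, 3] output
    let lhs_strides = [0, 1]; // a [3] tensor broadcast across dim 0
    for i in 0..6 {
        // For the contiguous output the map is the identity, which is why
        // the kernels can write out[i] and read grad_out[i] directly.
        assert_eq!(strided_index(i, &dims, &out_strides), i);
        // The broadcast input repeats: ids 0..6 hit elements 0, 1, 2, 0, 1, 2.
        assert_eq!(strided_index(i, &dims, &lhs_strides), i % 3);
    }
}
```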
diff --git a/src/tensor_ops/utilities/cpu_kernels.rs b/src/tensor_ops/utilities/cpu_kernels.rs
index 7e9b7724a..fb446aa1f 100644
--- a/src/tensor_ops/utilities/cpu_kernels.rs
+++ b/src/tensor_ops/utilities/cpu_kernels.rs
@@ -1,7 +1,7 @@
 use super::ops::{BinaryKernel, UnaryKernel};
 use crate::{
     shapes::{Dtype, Shape},
-    tensor::cpu::{Cpu, LendingIterator, StridedArray},
+    tensor::cpu::{Cpu, LendingIterator, NdIndex, StridedArray},
 };
 
 pub trait UnaryDerivative<E> {
@@ -22,6 +22,8 @@ impl<E: Dtype, Op: UnaryDerivative<E>> UnaryKernel<Op, E> for Cpu {
         inp: &Self::Storage<S, E>,
     ) -> Result<Self::Storage<S, E>, Self::Err> {
         let mut out: Self::Storage<S, E> = inp.clone();
+        // NOTE: we can iterate over buf here because we know inp & out
+        // have exact same strides due to clone.
         for x in out.buf_iter_mut() {
             *x = op.f(x);
         }
@@ -52,10 +54,13 @@ impl<E: Dtype, Op: BinaryDerivative<E>> BinaryKernel<Op, E> for Cpu {
         rhs: &Self::Storage<S, E>,
     ) -> Result<Self::Storage<S, E>, Self::Err> {
         let mut out: Self::Storage<S, E> = StridedArray::new(lhs.shape)?;
+
         let mut lhs_iter = lhs.iter();
         let mut rhs_iter = rhs.iter();
-        let mut out_iter = out.iter_mut();
-        while let Some((o, (l, r))) = out_iter.next().zip(lhs_iter.next().zip(rhs_iter.next())) {
+        // NOTE: we can use buf_iter_mut() here because StridedArray::new makes a contiguous array
+        for o in out.buf_iter_mut() {
+            let l = lhs_iter.next().unwrap();
+            let r = rhs_iter.next().unwrap();
             *o = op.f(l, r);
         }
         Ok(out)
@@ -69,19 +74,21 @@
     }
 
     fn backward<S: Shape>(
         &self,
         op: Op,
         lhs: &Self::Storage<S, E>,
         grad_lhs: &mut Self::Storage<S, E>,
         rhs: &Self::Storage<S, E>,
         grad_rhs: &mut Self::Storage<S, E>,
         grad_out: &Self::Storage<S, E>,
     ) -> Result<(), Self::Err> {
-        let mut lhs_iter = lhs.iter();
-        let mut rhs_iter = rhs.iter();
-        let mut grad_lhs_iter = grad_lhs.iter_mut();
-        let mut grad_rhs_iter = grad_rhs.iter_mut();
-        let mut grad_out_iter = grad_out.iter();
-        for _ in 0..lhs.shape.num_elements() {
-            let l = lhs_iter.next().unwrap();
-            let r = rhs_iter.next().unwrap();
-            let go = *grad_out_iter.next().unwrap();
-            let gl = grad_lhs_iter.next().unwrap();
-            *gl += op.dfdx(l, r) * go;
-            let gr = grad_rhs_iter.next().unwrap();
-            *gr += op.dfdy(l, r) * go;
+        let mut lhs_idx = NdIndex::new(lhs.shape, lhs.strides);
+        let mut rhs_idx = NdIndex::new(rhs.shape, rhs.strides);
+        let lhs_buf = lhs.data.as_ref();
+        let rhs_buf = rhs.data.as_ref();
+        let grad_lhs_buf = std::sync::Arc::make_mut(&mut grad_lhs.data);
+        let grad_rhs_buf = std::sync::Arc::make_mut(&mut grad_rhs.data);
+        // NOTE: we can use .buf_iter() here because we know grad_out is
+        // contiguous: it was allocated with StridedArray::new in forward
+        for &go in grad_out.buf_iter() {
+            let lhs_i = lhs_idx.next().unwrap();
+            let rhs_i = rhs_idx.next().unwrap();
+            let l = &lhs_buf[lhs_i];
+            let r = &rhs_buf[rhs_i];
+            grad_lhs_buf[lhs_i] += op.dfdx(l, r) * go;
+            grad_rhs_buf[rhs_i] += op.dfdy(l, r) * go;
         }
         Ok(())
     }
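The rewritten backward pass walks the single contiguous `grad_out` buffer once and drives a separate `NdIndex` per operand, so a broadcast operand (stride 0) accumulates several contributions into the same slot via `+=`. A toy end-to-end check of that accumulation pattern for `z = x * y` with `y` broadcast (standalone sketch over plain arrays, reusing the hypothetical `strided_index` helper from the note above; not code from this patch):

```rust
// Same per-dimension index decomposition as in the previous sketch.
fn strided_index(mut i: usize, dims: &[usize], strides: &[usize]) -> usize {
    let mut idx = 0;
    for d in (0..dims.len()).rev() {
        idx += (i % dims[d]) * strides[d];
        i /= dims[d];
    }
    idx
}

fn main() {
    let dims = [2, 3];
    let x_strides = [3, 1]; // x: [2, 3], contiguous
    let y_strides = [0, 1]; // y: [3], broadcast to [2, 3]
    let x = [1.0f64, 2.0, 3.0, 4.0, 5.0, 6.0];
    let y = [10.0f64, 20.0, 30.0];
    let grad_out = [1.0f64; 6]; // contiguous, like the forward output
    let (mut gx, mut gy) = ([0.0f64; 6], [0.0f64; 3]);
    for (i, &go) in grad_out.iter().enumerate() {
        let xi = strided_index(i, &dims, &x_strides);
        let yi = strided_index(i, &dims, &y_strides);
        gx[xi] += y[yi] * go; // d(x*y)/dx = y
        gy[yi] += x[xi] * go; // d(x*y)/dy = x
    }
    // Each y element accumulates gradient from both rows it was broadcast to.
    assert_eq!(gy, [5.0, 7.0, 9.0]);
    assert_eq!(gx, [10.0, 20.0, 30.0, 10.0, 20.0, 30.0]);
}
```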
diff --git a/src/tensor_ops/utilities/cuda_kernels.rs b/src/tensor_ops/utilities/cuda_kernels.rs
index 92388b81c..9fb7a9b6d 100644
--- a/src/tensor_ops/utilities/cuda_kernels.rs
+++ b/src/tensor_ops/utilities/cuda_kernels.rs
@@ -138,7 +138,6 @@ impl<E: Dtype + AsKernelParam, K: BinaryOpCudaKernel<E> + AsKernelParam> BinaryKernel<K, E> for
         let dims: CudaSlice<usize> = self.dev.take_async(shape.concrete().into())?;
         let lhs_strides: CudaSlice<usize> = self.dev.take_async(lhs.strides.into())?;
         let rhs_strides: CudaSlice<usize> = self.dev.take_async(rhs.strides.into())?;
-        let out_strides: CudaSlice<usize> = self.dev.take_async(strides.into())?;
 
         let fwd_fn = self.dev.get_func(K::MODULE_NAME, K::FWD_FN_NAME).unwrap();
         let cfg = LaunchConfig::for_num_elems(numel as u32);
@@ -152,7 +151,6 @@
             rhs.data.as_ref(), // const float *rhs,
             &rhs_strides,      // const size_t *rhs_strides,
             &mut storage,      // float *out,
-            &out_strides,      // const size_t *out_strides
         );
         unsafe { fwd_fn.launch_async(cfg, params) }?;
         Ok(CudaArray {
@@ -177,7 +175,6 @@
         let dims: CudaSlice<usize> = self.dev.take_async(lhs.shape.concrete().into())?;
         let lhs_strides: CudaSlice<usize> = self.dev.take_async(lhs.strides.into())?;
         let rhs_strides: CudaSlice<usize> = self.dev.take_async(rhs.strides.into())?;
-        let out_strides: CudaSlice<usize> = self.dev.take_async(grad_out.strides.into())?;
 
         let cfg = LaunchConfig::for_num_elems(numel as u32);
         let params = (
@@ -192,7 +189,6 @@
             Arc::make_mut(&mut grad_rhs.data), // float *grad_rhs,
             &rhs_strides,                      // const size_t *rhs_strides,
             grad_out.data.as_ref(),            // const float *grad_out,
-            &out_strides,                      // const size_t *out_strides
         );
         unsafe { bwd_fn.launch_async(cfg, params) }?;
         Ok(())
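Two closing observations. The `params` tuple passed to `launch_async` must stay in exact positional lockstep with the argument lists of the `FORWARD`/`BACKWARD` kernels in binary_op_macros.cuh, which is why the `&out_strides` entries are deleted here in the same commit that drops the `out_strides` parameters from the kernel signatures (it also saves one host-to-device stride upload per launch). And since `fast_alloc` joins the default feature set, downstream users who still want the fallible `try_reserve` allocation path can opt out with something like `cargo build --no-default-features --features std,numpy` (assuming they still want the other default features).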