Adds fast_alloc feature and binary kernel optimizations #481

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged 1 commit on Feb 23, 2023
Cargo.toml (2 additions, 1 deletion)
@@ -35,8 +35,9 @@ cudarc = { version = "0.7.2", default-features = false, optional = true }
num-traits = { version = "0.2.15", default-features = false }

[features]
default = ["std", "numpy"]
default = ["std", "numpy", "fast_alloc"]
std = ["no-std-compat/std", "rand/std", "rand_distr/std", "cudarc?/std", "matrixmultiply/threading"]
fast_alloc = ["std"]
nightly = []
numpy = ["dep:zip", "std"]
cblas = ["dep:cblas-sys", "dep:libc"]
src/tensor/cpu/allocate.rs (26 additions, 8 deletions)
@@ -20,9 +20,18 @@ impl<S: Shape, E: Default + Clone> StridedArray<S, E> {
pub(crate) fn try_new_with(shape: S, elem: E) -> Result<Self, CpuError> {
let numel = shape.num_elements();
let strides: S::Concrete = shape.strides();
let mut data: Vec<E> = Vec::new();
data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
data.resize(numel, elem);

#[cfg(feature = "fast_alloc")]
let data = std::vec![elem; numel];

#[cfg(not(feature = "fast_alloc"))]
let data = {
let mut data: Vec<E> = Vec::new();
data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
data.resize(numel, elem);
data
};

let data = Arc::new(data);
Ok(StridedArray {
data,
@@ -32,13 +41,22 @@ impl<S: Shape, E: Default + Clone> StridedArray<S, E> {
}

#[inline]
pub(crate) fn try_new_like(other: &Self, elem: E) -> Result<Self, CpuError> {
pub(crate) fn try_new_like(other: &Self) -> Result<Self, CpuError> {
let numel = other.data.len();
let shape = other.shape;
let strides = other.strides;
let mut data: Vec<E> = Vec::new();
data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
data.resize(numel, elem);

#[cfg(feature = "fast_alloc")]
let data = std::vec![Default::default(); numel];

#[cfg(not(feature = "fast_alloc"))]
let data = {
let mut data: Vec<E> = Vec::new();
data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
data.resize(numel, Default::default());
data
};

let data = Arc::new(data);
Ok(StridedArray {
data,
@@ -88,7 +106,7 @@ impl<E: Unit> SampleTensor<E> for Cpu {
src: &S,
distr: D,
) -> Result<Tensor<S::Shape, E, Self>, Self::Err> {
let mut storage = StridedArray::try_new_with(*src.shape(), Default::default())?;
let mut storage = StridedArray::new(*src.shape())?;
{
let mut rng = self.rng.lock().unwrap();
for v in storage.buf_iter_mut() {
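Why the fast_alloc path can be faster (a standalone micro-benchmark sketch, not part of the diff; the element type, buffer size, and timing harness are illustrative assumptions): `std::vec![elem; numel]` allocates and fills in one step, and for zero-valued elements the standard library can hand back already-zeroed memory instead of writing every element, while the fallible `try_reserve` + `resize` path always performs the fill explicitly. The checked path's upside is that out-of-memory becomes a recoverable `CpuError::OutOfMemory` instead of an abort, which is why it stays available behind `not(feature = "fast_alloc")`.

```rust
use std::time::Instant;

fn main() {
    let numel = 50_000_000;

    // `vec![0.0; n]` can be served by zeroed pages from the allocator,
    // so no per-element write pass is needed for a zero fill.
    let t = Instant::now();
    let fast: Vec<f32> = vec![0.0; numel];
    println!("vec![0.0; n]        : {:?}", t.elapsed());

    // The fallible path reports OOM as an error instead of aborting,
    // but `resize` always writes every element after the reservation.
    let t = Instant::now();
    let mut checked: Vec<f32> = Vec::new();
    checked.try_reserve(numel).expect("allocation failed");
    checked.resize(numel, 0.0);
    println!("try_reserve + resize: {:?}", t.elapsed());

    assert_eq!(fast.len(), checked.len());
}
```

Actual numbers depend on the allocator and OS; the point is only that the two paths do different amounts of work for the common zero-fill case.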
src/tensor/cpu/device.rs (1 addition, 1 deletion)
@@ -88,7 +88,7 @@ impl DeviceStorage for Cpu {
&self,
storage: &Self::Storage<S, E>,
) -> Result<Self::Storage<S, E>, Self::Err> {
StridedArray::try_new_like(storage, Default::default())
StridedArray::try_new_like(storage)
}

fn random_u64(&self) -> u64 {
src/tensor/cpu/iterate.rs (40 additions, 15 deletions)
@@ -3,33 +3,50 @@ use crate::shapes::{BroadcastStridesTo, Shape};
use std::sync::Arc;
use std::vec::Vec;

struct NdIndex<S: Shape> {
pub(crate) struct NdIndex<S: Shape> {
indices: S::Concrete,
shape: S::Concrete,
strides: S::Concrete,
next: Option<usize>,
contiguous: Option<usize>,
}

impl<S: Shape> NdIndex<S> {
fn new(shape: S, strides: S::Concrete) -> Self {
let indices: S::Concrete = Default::default();
let i: usize = strides
.into_iter()
.zip(indices.into_iter())
.map(|(a, b)| a * b)
.sum();
#[inline]
pub(crate) fn new(shape: S, strides: S::Concrete) -> Self {
Self {
indices,
indices: Default::default(),
shape: shape.concrete(),
strides,
next: Some(i),
next: Some(0),
contiguous: (strides == shape.strides()).then(|| shape.num_elements()),
}
}
}

impl<S: Shape> NdIndex<S> {
#[inline(always)]
fn get_with_idx(&mut self) -> Option<(usize, S::Concrete)> {
pub(crate) fn next(&mut self) -> Option<usize> {
match self.contiguous {
Some(numel) => match self.next.as_mut() {
Some(i) => {
let idx = *i;
let next = idx + 1;
if next >= numel {
self.next = None;
} else {
*i = next;
}
Some(idx)
}
None => None,
},
None => self.next_with_idx().map(|(i, _)| i),
}
}

#[inline(always)]
fn next_with_idx(&mut self) -> Option<(usize, S::Concrete)> {
match (S::NUM_DIMS, self.next.as_mut()) {
(_, None) => None,
(0, Some(i)) => {
@@ -85,35 +102,41 @@ pub(crate) struct StridedMutIndexIter<'a, S: Shape, E> {
}

impl<S: Shape, E: Clone> StridedArray<S, E> {
#[inline]
pub(crate) fn buf_iter(&self) -> std::slice::Iter<'_, E> {
self.data.iter()
}

#[inline]
pub(crate) fn buf_iter_mut(&mut self) -> std::slice::IterMut<'_, E> {
std::sync::Arc::make_mut(&mut self.data).iter_mut()
}

#[inline]
pub(crate) fn iter(&self) -> StridedRefIter<S, E> {
StridedRefIter {
data: self.data.as_ref(),
index: NdIndex::new(self.shape, self.strides),
}
}

#[inline]
pub(crate) fn iter_mut(&mut self) -> StridedMutIter<S, E> {
StridedMutIter {
data: std::sync::Arc::make_mut(&mut self.data),
index: NdIndex::new(self.shape, self.strides),
}
}

#[inline]
pub(crate) fn iter_with_index(&self) -> StridedRefIndexIter<S, E> {
StridedRefIndexIter {
data: self.data.as_ref(),
index: NdIndex::new(self.shape, self.strides),
}
}

#[inline]
pub(crate) fn iter_mut_with_index(&mut self) -> StridedMutIndexIter<S, E> {
StridedMutIndexIter {
data: std::sync::Arc::make_mut(&mut self.data),
@@ -123,6 +146,7 @@ impl<S: Shape, E: Clone> StridedArray<S, E> {
}

impl<S: Shape, E: Clone> StridedArray<S, E> {
#[inline]
pub(crate) fn iter_as<Axes, Dst: Shape>(&self, dst: &Dst) -> StridedRefIter<Dst, E>
where
S: BroadcastStridesTo<Dst, Axes>,
@@ -133,6 +157,7 @@ impl<S: Shape, E: Clone> StridedArray<S, E> {
}
}

#[inline]
pub(crate) fn iter_mut_as<Axes, Dst: Shape>(&mut self, dst: &Dst) -> StridedMutIter<Dst, E>
where
S: BroadcastStridesTo<Dst, Axes>,
@@ -155,15 +180,15 @@ impl<'q, S: Shape, E> LendingIterator for StridedRefIter<'q, S, E> {
type Item<'a> = &'a E where Self: 'a;
#[inline(always)]
fn next(&'_ mut self) -> Option<Self::Item<'_>> {
self.index.get_with_idx().map(|(i, _)| &self.data[i])
self.index.next().map(|i| &self.data[i])
}
}

impl<'q, S: Shape, E> LendingIterator for StridedMutIter<'q, S, E> {
type Item<'a> = &'a mut E where Self: 'a;
#[inline(always)]
fn next(&'_ mut self) -> Option<Self::Item<'_>> {
self.index.get_with_idx().map(|(i, _)| &mut self.data[i])
self.index.next().map(|i| &mut self.data[i])
}
}

@@ -172,7 +197,7 @@ impl<'q, S: Shape, E> LendingIterator for StridedRefIndexIter<'q, S, E> {
#[inline(always)]
fn next(&'_ mut self) -> Option<Self::Item<'_>> {
self.index
.get_with_idx()
.next_with_idx()
.map(|(i, idx)| (&self.data[i], idx))
}
}
@@ -182,7 +207,7 @@ impl<'q, S: Shape, E> LendingIterator for StridedMutIndexIter<'q, S, E> {
#[inline(always)]
fn next(&'_ mut self) -> Option<Self::Item<'_>> {
self.index
.get_with_idx()
.next_with_idx()
.map(|(i, idx)| (&mut self.data[i], idx))
}
}
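The core of the iterate.rs change is the new `contiguous` field plus the fast `NdIndex::next`: when the stored strides equal the shape's default row-major strides (`strides == shape.strides()`), the flat offsets are just 0, 1, ..., numel - 1, so the per-dimension bookkeeping in `next_with_idx` can be skipped. Below is a self-contained sketch of that idea with hypothetical names (`default_strides`, `FlatIndex`); it is not the crate's API.

```rust
/// Default row-major strides for a shape: the last dimension has stride 1.
fn default_strides(dims: &[usize]) -> Vec<usize> {
    let mut strides = vec![1; dims.len()];
    for i in (0..dims.len().saturating_sub(1)).rev() {
        strides[i] = strides[i + 1] * dims[i + 1];
    }
    strides
}

/// Mirrors the `next`/`contiguous` counter used by the fast path.
struct FlatIndex {
    next: Option<usize>,
    numel: usize,
}

impl FlatIndex {
    fn next(&mut self) -> Option<usize> {
        let idx = self.next?;
        self.next = if idx + 1 < self.numel { Some(idx + 1) } else { None };
        Some(idx)
    }
}

fn main() {
    let dims = [2, 3];

    // A dense (2, 3) array has strides (3, 1), which match the defaults,
    // so iteration degenerates to the counter 0, 1, 2, 3, 4, 5.
    assert_eq!(default_strides(&dims), vec![3, 1]);
    let mut it = FlatIndex { next: Some(0), numel: 6 };
    let flat: Vec<usize> = std::iter::from_fn(|| it.next()).collect();
    assert_eq!(flat, vec![0, 1, 2, 3, 4, 5]);

    // A broadcast axis (stride 0) or a permuted layout does not match the
    // defaults, so the fast path would not apply to it.
    assert_ne!(vec![0, 1], default_strides(&dims));
}
```

In the real `NdIndex`, the same check is `(strides == shape.strides()).then(|| shape.num_elements())`; layouts that fail it keep using the original strided walk in `next_with_idx`.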
src/tensor/cpu/mod.rs (1 addition, 1 deletion)
@@ -5,7 +5,7 @@ mod iterate;
mod views;

pub(crate) use device::StridedArray;
pub(crate) use iterate::LendingIterator;
pub(crate) use iterate::{LendingIterator, NdIndex};
pub(crate) use views::{View, ViewMut};

pub use device::{Cpu, CpuError};
src/tensor_ops/utilities/binary_op_macros.cuh (4 additions, 8 deletions)
@@ -10,8 +10,7 @@ extern "C" __global__ void FORWARD( \
const size_t *lhs_strides, \
const TYPENAME *rhs, \
const size_t *rhs_strides, \
TYPENAME *out, \
const size_t *out_strides \
TYPENAME *out \
) { \
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
if (i >= numel) { \
@@ -20,15 +19,14 @@ \
\
unsigned int lhs_i = get_strided_index(i, num_dims, dims, lhs_strides); \
unsigned int rhs_i = get_strided_index(i, num_dims, dims, rhs_strides); \
unsigned int out_i = get_strided_index(i, num_dims, dims, out_strides); \
\
TYPENAME x = lhs[lhs_i]; \
TYPENAME y = rhs[rhs_i]; \
TYPENAME fx; \
\
FUNC\
\
out[out_i] = fx; \
out[i] = fx; \
} \
\
extern "C" __global__ void BACKWARD( \
@@ -42,8 +40,7 @@ extern "C" __global__ void BACKWARD( \
const TYPENAME *rhs, \
TYPENAME *grad_rhs, \
const size_t *rhs_strides, \
const TYPENAME *grad_out, \
const size_t *out_strides \
const TYPENAME *grad_out \
) { \
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
if (i >= numel) { \
@@ -52,11 +49,10 @@ \
\
unsigned int lhs_i = get_strided_index(i, num_dims, dims, lhs_strides); \
unsigned int rhs_i = get_strided_index(i, num_dims, dims, rhs_strides); \
unsigned int out_i = get_strided_index(i, num_dims, dims, out_strides); \
\
TYPENAME x = lhs[lhs_i]; \
TYPENAME y = rhs[rhs_i]; \
TYPENAME go = grad_out[out_i]; \
TYPENAME go = grad_out[i]; \
\
TYPENAME dfdx, dfdy; \
DERIVATIVES \
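The CUDA kernels can drop `out_strides` because the output (and `grad_out`) buffer is freshly allocated by the host and therefore contiguous: with default row-major strides, strided index computation is the identity, so `out[i]` already names the right element and one `get_strided_index` call per thread disappears. The Rust snippet below is a hedged model of that argument; the `get_strided_index` shown re-implements the helper's row-major semantics for illustration and is not the CUDA source.

```rust
/// Decompose a flat row-major position into per-dimension indices and
/// re-project them through the given strides.
fn get_strided_index(mut i: usize, dims: &[usize], strides: &[usize]) -> usize {
    let mut offset = 0;
    for (&d, &s) in dims.iter().zip(strides.iter()).rev() {
        offset += (i % d) * s;
        i /= d;
    }
    offset
}

fn main() {
    let dims = [2, 3, 4];
    let contiguous = [12, 4, 1]; // default row-major strides for (2, 3, 4)
    let numel: usize = dims.iter().product();

    // For a contiguous output the mapping is the identity, so `out[i]` and
    // the old `out[get_strided_index(i, ...)]` touch the same element.
    for i in 0..numel {
        assert_eq!(get_strided_index(i, &dims, &contiguous), i);
    }

    // Inputs can still be broadcast or permuted, e.g. a stride-0 leading axis,
    // which is why lhs/rhs keep their stride arrays in the kernels.
    assert_eq!(get_strided_index(13, &dims, &[0, 4, 1]), 1);
}
```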
src/tensor_ops/utilities/cpu_kernels.rs (23 additions, 16 deletions)
@@ -1,7 +1,7 @@
use super::ops::{BinaryKernel, UnaryKernel};
use crate::{
shapes::{Dtype, Shape},
tensor::cpu::{Cpu, LendingIterator, StridedArray},
tensor::cpu::{Cpu, LendingIterator, NdIndex, StridedArray},
};

pub trait UnaryDerivative<E> {
@@ -22,6 +22,8 @@ impl<E: Dtype, Op: UnaryDerivative<E>> UnaryKernel<Op, E> for Cpu {
inp: &Self::Storage<S, E>,
) -> Result<Self::Storage<S, E>, Self::Err> {
let mut out: Self::Storage<S, E> = inp.clone();
// NOTE: we can iterate over buf here because we know inp & out
// have exact same strides due to clone.
for x in out.buf_iter_mut() {
*x = op.f(x);
}
@@ -52,10 +54,13 @@ impl<E: Dtype, Op: BinaryDerivative<E>> BinaryKernel<Op, E> for Cpu {
rhs: &Self::Storage<S, E>,
) -> Result<Self::Storage<S, E>, Self::Err> {
let mut out: Self::Storage<S, E> = StridedArray::new(lhs.shape)?;

let mut lhs_iter = lhs.iter();
let mut rhs_iter = rhs.iter();
let mut out_iter = out.iter_mut();
while let Some((o, (l, r))) = out_iter.next().zip(lhs_iter.next().zip(rhs_iter.next())) {
// NOTE: we can use buf_iter_mut() here because StridedArray::new makes a contiguous array
for o in out.buf_iter_mut() {
let l = lhs_iter.next().unwrap();
let r = rhs_iter.next().unwrap();
*o = op.f(l, r);
}
Ok(out)
@@ -69,19 +74,21 @@ impl<E: Dtype, Op: BinaryDerivative<E>> BinaryKernel<Op, E> for Cpu {
grad_rhs: &mut Self::Storage<S, E>,
grad_out: &Self::Storage<S, E>,
) -> Result<(), Self::Err> {
let mut lhs_iter = lhs.iter();
let mut rhs_iter = rhs.iter();
let mut grad_lhs_iter = grad_lhs.iter_mut();
let mut grad_rhs_iter = grad_rhs.iter_mut();
let mut grad_out_iter = grad_out.iter();
for _ in 0..lhs.shape.num_elements() {
let l = lhs_iter.next().unwrap();
let r = rhs_iter.next().unwrap();
let go = *grad_out_iter.next().unwrap();
let gl = grad_lhs_iter.next().unwrap();
*gl += op.dfdx(l, r) * go;
let gr = grad_rhs_iter.next().unwrap();
*gr += op.dfdy(l, r) * go;
let mut lhs_idx = NdIndex::new(lhs.shape, lhs.strides);
let mut rhs_idx = NdIndex::new(rhs.shape, rhs.strides);
let lhs_buf = lhs.data.as_ref();
let rhs_buf = rhs.data.as_ref();
let grad_lhs_buf = std::sync::Arc::make_mut(&mut grad_lhs.data);
let grad_rhs_buf = std::sync::Arc::make_mut(&mut grad_rhs.data);
// NOTE: we can use .buf_iter() here because we know the outcome of this op is
// contiguous from forward
for &go in grad_out.buf_iter() {
let lhs_i = lhs_idx.next().unwrap();
let rhs_i = rhs_idx.next().unwrap();
let l = &lhs_buf[lhs_i];
let r = &rhs_buf[rhs_i];
grad_lhs_buf[lhs_i] += op.dfdx(l, r) * go;
grad_rhs_buf[rhs_i] += op.dfdy(l, r) * go;
}
Ok(())
}
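The rewritten CPU backward walks `grad_out` with a plain slice iterator (it is contiguous because forward produced it via `StridedArray::new`) and uses two `NdIndex` walkers to translate each flat position into the possibly-broadcast lhs/rhs layouts, accumulating straight into the gradient buffers. The sketch below models that accumulation pattern with hypothetical, self-contained pieces (`binary_backward`, closure-based index sequences); note how a broadcast operand receives several `+=` contributions into the same slot, which is exactly the reduction a broadcast gradient needs.

```rust
/// Standalone model of the backward loop: walk grad_out flat, translate each
/// position into lhs/rhs offsets via caller-supplied index sequences, and
/// accumulate gradients in place.
fn binary_backward(
    lhs: &[f32], mut lhs_idx: impl FnMut() -> usize, grad_lhs: &mut [f32],
    rhs: &[f32], mut rhs_idx: impl FnMut() -> usize, grad_rhs: &mut [f32],
    grad_out: &[f32],
    dfdx: impl Fn(f32, f32) -> f32,
    dfdy: impl Fn(f32, f32) -> f32,
) {
    for &go in grad_out {
        let (li, ri) = (lhs_idx(), rhs_idx());
        grad_lhs[li] += dfdx(lhs[li], rhs[ri]) * go;
        grad_rhs[ri] += dfdy(lhs[li], rhs[ri]) * go;
    }
}

fn main() {
    // z = x * y, with x of shape (3,) and y a scalar broadcast across it.
    let lhs = [1.0, 2.0, 3.0];
    let rhs = [10.0];
    let (mut grad_lhs, mut grad_rhs) = ([0.0f32; 3], [0.0f32; 1]);
    let grad_out = [1.0, 1.0, 1.0];

    let mut i = 0;
    binary_backward(
        &lhs, move || { let v = i; i += 1; v }, &mut grad_lhs,
        &rhs, || 0, &mut grad_rhs,
        &grad_out,
        |_x, y| y, // d(x*y)/dx = y
        |x, _y| x, // d(x*y)/dy = x
    );

    assert_eq!(grad_lhs, [10.0, 10.0, 10.0]);
    // The scalar's gradient sums over the broadcast axis: 1 + 2 + 3.
    assert_eq!(grad_rhs, [6.0]);
}
```

In the actual diff the index sequences are `NdIndex::new(lhs.shape, lhs.strides)` and `NdIndex::new(rhs.shape, rhs.strides)`, and the mutable buffers come from `std::sync::Arc::make_mut`.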
src/tensor_ops/utilities/cuda_kernels.rs (0 additions, 4 deletions)
@@ -138,7 +138,6 @@ impl<E: Dtype, K: BinaryOpCudaKernel<E> + AsKernelParam> BinaryKernel<K, E> for
let dims: CudaSlice<usize> = self.dev.take_async(shape.concrete().into())?;
let lhs_strides: CudaSlice<usize> = self.dev.take_async(lhs.strides.into())?;
let rhs_strides: CudaSlice<usize> = self.dev.take_async(rhs.strides.into())?;
let out_strides: CudaSlice<usize> = self.dev.take_async(strides.into())?;

let fwd_fn = self.dev.get_func(K::MODULE_NAME, K::FWD_FN_NAME).unwrap();
let cfg = LaunchConfig::for_num_elems(numel as u32);
@@ -152,7 +151,6 @@ impl<E: Dtype, K: BinaryOpCudaKernel<E> + AsKernelParam> BinaryKernel<K, E> for
rhs.data.as_ref(), // const float *rhs,
&rhs_strides, // const size_t *rhs_strides,
&mut storage, // float *out,
&out_strides, // const size_t *out_strides
);
unsafe { fwd_fn.launch_async(cfg, params) }?;
Ok(CudaArray {
@@ -177,7 +175,6 @@ impl<E: Dtype, K: BinaryOpCudaKernel<E> + AsKernelParam> BinaryKernel<K, E> for
let dims: CudaSlice<usize> = self.dev.take_async(lhs.shape.concrete().into())?;
let lhs_strides: CudaSlice<usize> = self.dev.take_async(lhs.strides.into())?;
let rhs_strides: CudaSlice<usize> = self.dev.take_async(rhs.strides.into())?;
let out_strides: CudaSlice<usize> = self.dev.take_async(grad_out.strides.into())?;

let cfg = LaunchConfig::for_num_elems(numel as u32);
let params = (
@@ -192,7 +189,6 @@ impl<E: Dtype, K: BinaryOpCudaKernel<E> + AsKernelParam> BinaryKernel<K, E> for
Arc::make_mut(&mut grad_rhs.data), // float *grad_rhs,
&rhs_strides, // const size_t *rhs_strides,
grad_out.data.as_ref(), // const float *grad_out,
&out_strides, // const size_t *out_strides
);
unsafe { bwd_fn.launch_async(cfg, params) }?;
Ok(())