3 changes: 2 additions & 1 deletion Cargo.toml
@@ -50,4 +50,5 @@ mnist = "0.5.0"
 indicatif = "0.16.2"

 [build-dependencies]
-rustc_version = "0.4.0"
+rustc_version = "0.4.0"
+glob = "0.3.0"
31 changes: 31 additions & 0 deletions build.rs
@@ -8,10 +8,41 @@ fn main() {
println!("cargo:rustc-cfg=feature=\"nightly\"");
}

#[cfg(feature = "cuda")]
cuda::build_ptx();

#[cfg(feature = "intel-mkl")]
intel_mkl::link().unwrap();
}

#[cfg(feature = "cuda")]
mod cuda {
    pub fn build_ptx() {
        // TODO: build the .ptx files in the source tree and skip nvcc when they already exist

Review comment (Owner Author): I think this can be done later, I'm not even sure if it's necessary. Once we have all the kernels in place we can see, but no need to complicate something that's pretty simple atm

        let out_dir = std::env::var("OUT_DIR").unwrap();
        let kernel_paths: Vec<std::path::PathBuf> = glob::glob("src/**/*.cu")
            .unwrap()
            .map(|p| p.unwrap())
            .collect();
        println!("cargo:warning=Found kernels {kernel_paths:?}");
        for kernel_path in kernel_paths {
            println!("cargo:rerun-if-changed={}", kernel_path.display());
            let output = std::process::Command::new("nvcc")
                .arg("--ptx")
                .args(["--output-directory", &out_dir])
                .arg(&kernel_path)
                .output()
                .unwrap();

            assert!(
                output.status.success(),
                "nvcc error while compiling {kernel_path:?}: {:?}",
                output
            );
        }
    }
}
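Aside on the TODO above: a hedged sketch of the freshness check it hints at, using only std. `ptx_is_fresh` is a hypothetical name, not part of this PR; nvcc could be skipped whenever it returns true.

use std::path::Path;

// Editor's sketch: true when `ptx` exists and is at least as new as `cu`.
fn ptx_is_fresh(cu: &Path, ptx: &Path) -> bool {
    match (cu.metadata(), ptx.metadata()) {
        (Ok(cu_meta), Ok(ptx_meta)) => match (cu_meta.modified(), ptx_meta.modified()) {
            (Ok(cu_time), Ok(ptx_time)) => ptx_time >= cu_time,
            _ => false, // no mtime support on this platform: recompile to be safe
        },
        _ => false, // missing .ptx (or unreadable .cu): recompile
    }
}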

#[cfg(feature = "intel-mkl")]
mod intel_mkl {
    //! This script links to Intel MKL when the `intel-mkl` feature is enabled.
3 changes: 2 additions & 1 deletion src/shapes/shape.rs
@@ -101,7 +101,8 @@ pub trait Shape:
         + std::ops::IndexMut<usize>
         + Send
         + Sync
-        + IntoIterator<Item = usize>;
+        + IntoIterator<Item = usize>
+        + Into<std::vec::Vec<usize>>;

     /// All the axes of this shape
     type AllAxes: Axes;
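The new `Into<std::vec::Vec<usize>>` bound is what lets the CUDA code later in this diff upload a concrete shape via `take_async(shape.concrete().into())`. A minimal illustration, assuming a rank-2 concrete type of `[usize; 2]`:

// [usize; N] converts to Vec<usize> via std's `impl From<[T; N]> for Vec<T>`,
// so any Concrete type satisfying the new bound can become an owned Vec.
fn as_device_dims(concrete: [usize; 2]) -> Vec<usize> {
    concrete.into()
}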
29 changes: 29 additions & 0 deletions src/tensor_ops/abs/abs.cu
@@ -0,0 +1,29 @@
struct AbsKernelOp {};

extern "C" __global__ void abs_forward(
    const AbsKernelOp op,
    const size_t numel,
    const float *inp,
    float *out
) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numel) {
        return;
    }
    out[i] = fabsf(inp[i]); // fabsf avoids the integer abs overload
}

extern "C" __global__ void abs_backward(
    const AbsKernelOp op,
    const size_t numel,
    const float *inp,
    float *grad_inp,
    const float *grad_out
) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numel) {
        return;
    }
    // d|x|/dx is sign(x): -1 for negative inputs, +1 for positive, 0 at 0.
    float dx = inp[i] == 0.0f ? 0.0f : (signbit(inp[i]) ? -1.0f : 1.0f);
    grad_inp[i] += dx * grad_out[i];
}
26 changes: 8 additions & 18 deletions src/tensor_ops/abs/cuda_kernel.rs
@@ -1,20 +1,10 @@
-use crate::{shapes::Shape, tensor::Cuda, tensor_ops::ops::UnaryKernel};
+use crate::tensor_ops::cuda_kernels::UnaryOpCudaKernel;

-impl UnaryKernel<super::AbsKernelOp, f32> for Cuda {
-    fn forward<S: Shape>(
-        &self,
-        op: super::AbsKernelOp,
-        inp: &Self::Storage<S, f32>,
-    ) -> Result<Self::Storage<S, f32>, Self::Err> {
-        todo!()
-    }
-    fn backward<S: Shape>(
-        &self,
-        op: super::AbsKernelOp,
-        inp: &Self::Storage<S, f32>,
-        grad_inp: &mut Self::Storage<S, f32>,
-        grad_out: &Self::Storage<S, f32>,
-    ) -> Result<(), Self::Err> {
-        todo!()
-    }
+unsafe impl cudarc::device::AsKernelParam for super::AbsKernelOp {}
+
+impl UnaryOpCudaKernel for super::AbsKernelOp {
+    const PTX_SRC: &'static str = include_str!(concat!(env!("OUT_DIR"), "/abs.ptx"));
+    const MODULE_NAME: &'static str = "abs";
+    const FWD_FN_NAME: &'static str = "abs_forward";
+    const BWD_FN_NAME: &'static str = "abs_backward";
 }
1 change: 1 addition & 0 deletions src/tensor_ops/abs/mod.rs
@@ -6,6 +6,7 @@ mod cuda_kernel;
use super::ops::{try_unary_op, UnaryKernel};
use crate::{gradients::Tape, shapes::*, tensor::Tensor};

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct AbsKernelOp;

172 changes: 172 additions & 0 deletions src/tensor_ops/cuda_kernels.rs
@@ -0,0 +1,172 @@
use crate::{
    shapes::Shape,
    tensor::cuda::{Cuda, CudaArray},
    tensor_ops::ops::{BinaryKernel, UnaryKernel},
};
use cudarc::device::{AsKernelParam, CudaSlice, LaunchAsync, LaunchConfig};
use std::sync::Arc;

pub trait UnaryOpCudaKernel {
    /// Compiled by build.rs
    const PTX_SRC: &'static str;

    /// Unique name for the kernel
    const MODULE_NAME: &'static str;

    /// Name of function in the .cu file
    const FWD_FN_NAME: &'static str;

    /// Name of function in the .cu file
    const BWD_FN_NAME: &'static str;

    const ALL_FN_NAMES: [&'static str; 2] = [Self::FWD_FN_NAME, Self::BWD_FN_NAME];
}

impl<K: UnaryOpCudaKernel + AsKernelParam> UnaryKernel<K, f32> for Cuda {
    fn forward<S: Shape>(
        &self,
        op: K,
        inp: &Self::Storage<S, f32>,
    ) -> Result<Self::Storage<S, f32>, Self::Err> {
        if !self.dev.has_func(K::MODULE_NAME, K::FWD_FN_NAME) {
            self.dev
                .load_ptx(K::PTX_SRC.into(), K::MODULE_NAME, &K::ALL_FN_NAMES)?;
        }

        let numel = inp.data.len();
        let mut storage = self.dev.alloc_zeros_async::<f32>(numel)?;

        let fwd_fn = self.dev.get_func(K::MODULE_NAME, K::FWD_FN_NAME).unwrap();
        let cfg = LaunchConfig::for_num_elems(numel as u32);

Review comment (Owner Author): Should probably make a helper method for computing a good version of this launch config - need to take advantage of threads & blocks.

        let params = (
            op,
            numel,             // const size_t numel,
            inp.data.as_ref(), // const float *inp,
            &mut storage,      // float *out
        );
        unsafe { fwd_fn.launch_async(cfg, params) }?;

        Ok(CudaArray {
            data: Arc::new(storage),
            shape: inp.shape,
            strides: inp.strides,
        })
    }

    fn backward<S: Shape>(
        &self,
        op: K,
        inp: &Self::Storage<S, f32>,
        grad_inp: &mut Self::Storage<S, f32>,
        grad_out: &Self::Storage<S, f32>,
    ) -> Result<(), Self::Err> {
        let bwd_fn = self.dev.get_func(K::MODULE_NAME, K::BWD_FN_NAME).unwrap();
        let numel = inp.data.len();
        let cfg = LaunchConfig::for_num_elems(numel as u32);
        let params = (
            op,
            numel,                             // const size_t numel,
            inp.data.as_ref(),                 // const float *inp,
            Arc::make_mut(&mut grad_inp.data), // float *grad_inp,
            grad_out.data.as_ref(),            // const float *grad_out
        );
        unsafe { bwd_fn.launch_async(cfg, params) }?;
        Ok(())
    }
}
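The review comment above asks for a launch-config helper that balances threads and blocks. A hedged sketch of one option: `launch_cfg` and `NUM_THREADS` are hypothetical names, while `LaunchConfig` and its `grid_dim`/`block_dim`/`shared_mem_bytes` fields are the cudarc type already imported in this file.

fn launch_cfg(numel: u32) -> LaunchConfig {
    const NUM_THREADS: u32 = 128; // threads per block; a value that would need tuning
    let num_blocks = (numel + NUM_THREADS - 1) / NUM_THREADS; // ceiling division
    LaunchConfig {
        grid_dim: (num_blocks, 1, 1),
        block_dim: (NUM_THREADS, 1, 1),
        shared_mem_bytes: 0,
    }
}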

pub trait BinaryOpCudaKernel {
    /// Compiled by build.rs
    const PTX_SRC: &'static str;

    /// Unique name for the kernel
    const MODULE_NAME: &'static str;

    /// Name of function in the .cu file
    const FWD_FN_NAME: &'static str;

    /// Name of function in the .cu file
    const BWD_FN_NAME: &'static str;

    const ALL_FN_NAMES: [&'static str; 2] = [Self::FWD_FN_NAME, Self::BWD_FN_NAME];
}

impl<K: BinaryOpCudaKernel> BinaryKernel<K, f32> for Cuda {
    fn forward<S: Shape>(
        &self,
        _: K,
        lhs: &Self::Storage<S, f32>,
        rhs: &Self::Storage<S, f32>,
    ) -> Result<Self::Storage<S, f32>, Self::Err> {
        if !self.dev.has_func(K::MODULE_NAME, K::FWD_FN_NAME) {
            self.dev
                .load_ptx(K::PTX_SRC.into(), K::MODULE_NAME, &K::ALL_FN_NAMES)?;
        }

        let shape = lhs.shape;
        let strides = lhs.shape.strides();
        let numel = shape.num_elements();

        let mut storage = self.dev.alloc_zeros_async::<f32>(numel)?;

        let dims: CudaSlice<usize> = self.dev.take_async(shape.concrete().into())?;
        let lhs_strides: CudaSlice<usize> = self.dev.take_async(lhs.strides.into())?;
        let rhs_strides: CudaSlice<usize> = self.dev.take_async(rhs.strides.into())?;
        let out_strides: CudaSlice<usize> = self.dev.take_async(strides.into())?;

        let fwd_fn = self.dev.get_func(K::MODULE_NAME, K::FWD_FN_NAME).unwrap();
        let cfg = LaunchConfig::for_num_elems(numel as u32);
        let params = (
            numel,             // const size_t numel,
            S::NUM_DIMS,       // const size_t num_dims,
            &dims,             // const size_t *dims,
            lhs.data.as_ref(), // const float *lhs,
            &lhs_strides,      // const size_t *lhs_strides,
            rhs.data.as_ref(), // const float *rhs,
            &rhs_strides,      // const size_t *rhs_strides,
            &mut storage,      // float *out,
            &out_strides,      // const size_t *out_strides
        );
        unsafe { fwd_fn.launch_async(cfg, params) }?;
        Ok(CudaArray {
            data: Arc::new(storage),
            shape,
            strides,
        })
    }

    fn backward<S: Shape>(
        &self,
        _: K,
        lhs: &Self::Storage<S, f32>,
        grad_lhs: &mut Self::Storage<S, f32>,
        rhs: &Self::Storage<S, f32>,
        grad_rhs: &mut Self::Storage<S, f32>,
        grad_out: &Self::Storage<S, f32>,
    ) -> Result<(), Self::Err> {
        let bwd_fn = self.dev.get_func(K::MODULE_NAME, K::BWD_FN_NAME).unwrap();
        let numel = lhs.shape.num_elements();

        let dims: CudaSlice<usize> = self.dev.take_async(lhs.shape.concrete().into())?;
        let lhs_strides: CudaSlice<usize> = self.dev.take_async(lhs.strides.into())?;
        let rhs_strides: CudaSlice<usize> = self.dev.take_async(rhs.strides.into())?;
        let out_strides: CudaSlice<usize> = self.dev.take_async(grad_out.strides.into())?;

Review comment (Owner Author), on lines +150 to +153: These same values were also allocated in the forward call - a potential improvement for the future is pre-allocating them. Though these are only used in binary ops - if a tensor is only ever used in a unary op then it doesn't need to allocate these

        let cfg = LaunchConfig::for_num_elems(numel as u32);
        let params = (
            numel,                             // const size_t numel,
            S::NUM_DIMS,                       // const size_t num_dims,
            &dims,                             // const size_t *dims,
            lhs.data.as_ref(),                 // const float *lhs,
            Arc::make_mut(&mut grad_lhs.data), // float *grad_lhs,
            &lhs_strides,                      // const size_t *lhs_strides,
            rhs.data.as_ref(),                 // const float *rhs,
            Arc::make_mut(&mut grad_rhs.data), // float *grad_rhs,
            &rhs_strides,                      // const size_t *rhs_strides,
            grad_out.data.as_ref(),            // const float *grad_out,
            &out_strides,                      // const size_t *out_strides
        );
        unsafe { bwd_fn.launch_async(cfg, params) }?;
        Ok(())
    }
}
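On the pre-allocation idea in the review comment above, one hypothetical direction (editor's sketch; only `CudaSlice` and `take_async` come from this PR, and the device type path is an assumption) is a host-side cache keyed by the dims/strides vector, so repeated binary ops on one shape upload it only once:

use std::collections::HashMap;

// Hypothetical cache: binary ops on a shape already seen reuse its device buffer.
#[derive(Default)]
struct DeviceUsizeCache {
    bufs: HashMap<Vec<usize>, Arc<CudaSlice<usize>>>,
}

impl DeviceUsizeCache {
    /// Returns a cached device copy of `key`, uploading it on first use.
    fn get_or_upload(
        &mut self,
        dev: &Arc<cudarc::device::CudaDevice>,
        key: Vec<usize>,
    ) -> Arc<CudaSlice<usize>> {
        self.bufs
            .entry(key.clone())
            .or_insert_with(|| Arc::new(dev.take_async(key).expect("host-to-device copy failed")))
            .clone()
    }
}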
70 changes: 70 additions & 0 deletions src/tensor_ops/div/binary_div.cu
@@ -0,0 +1,70 @@
// Maps a linear element index to a memory offset given dims and strides,
// decoding dimensions from innermost to outermost.
__device__ unsigned int get_strided_index(
    unsigned int idx,
    const size_t num_dims,
    const size_t *dims,
    const size_t *strides
) {
    unsigned int strided_i = 0;
    for (unsigned int d = 0; d < num_dims; d++) {
        unsigned int dim_idx = num_dims - 1 - d;
        strided_i += (idx % dims[dim_idx]) * strides[dim_idx];
        idx /= dims[dim_idx];
    }
    return strided_i;
}
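// Editor's illustration (not in the original file): with dims = {2, 3} and
// row-major strides {3, 1}, linear index i = 4 decodes as 4 % 3 = 1 (column)
// and 4 / 3 = 1 (row), giving offset 1*3 + 1*1 = 4. For an rhs broadcast
// across rows (strides {0, 1}) the same i yields 1*0 + 1*1 = 1, which is how
// one kernel serves broadcast inputs without materializing them.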

extern "C" __global__ void binary_div_forward(
const size_t numel,
const size_t num_dims,
const size_t *dims,
const float *lhs,
const size_t *lhs_strides,
const float *rhs,
const size_t *rhs_strides,
float *out,
const size_t *out_strides
) {
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= numel) {
return;
}

unsigned int lhs_i = get_strided_index(i, num_dims, dims, lhs_strides);
unsigned int rhs_i = get_strided_index(i, num_dims, dims, rhs_strides);
unsigned int out_i = get_strided_index(i, num_dims, dims, out_strides);

out[out_i] = lhs[lhs_i] / rhs[rhs_i];
}

extern "C" __global__ void binary_div_backward(
const size_t numel,
const size_t num_dims,
const size_t *dims,
const float *lhs,
float *grad_lhs,
const size_t *lhs_strides,
const float *rhs,
float *grad_rhs,
const size_t *rhs_strides,
const float *grad_out,
const size_t *out_strides
) {
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= numel) {
return;
}

unsigned int lhs_i = get_strided_index(i, num_dims, dims, lhs_strides);
unsigned int rhs_i = get_strided_index(i, num_dims, dims, rhs_strides);
unsigned int out_i = get_strided_index(i, num_dims, dims, out_strides);

auto x = lhs[lhs_i];
auto y = rhs[rhs_i];
auto go = grad_out[out_i];

float dfdx = 1.0 / y;
grad_lhs[lhs_i] += dfdx * go;

float dfdy = -x / (y * y);
grad_rhs[rhs_i] += dfdy * go;
}
4 changes: 2 additions & 2 deletions src/tensor_ops/div/cpu_kernel.rs
@@ -2,10 +2,10 @@ use crate::tensor_ops::cpu_kernels::{BinaryDerivative, UnaryDerivative};

 impl UnaryDerivative<f32> for super::ScalarDivKernelOp<f32> {
     fn f(&self, x: &f32) -> f32 {
-        x / self.0
+        x / self.scalar
     }
     fn df(&self, _: &f32) -> f32 {
-        1.0 / self.0
+        1.0 / self.scalar
     }
 }
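The `self.0` to `self.scalar` rename implies the op struct moved from a tuple field to a named one, presumably mirroring the `#[repr(C)]` treatment `AbsKernelOp` receives for kernel parameters. A hedged guess at the definition — only the field name is evidenced by this diff, the rest is assumption:

#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ScalarDivKernelOp<E> {
    pub scalar: E, // named field replacing the old tuple field `.0`
}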
