# Implementing abs/exp/div/sum_to cuda kernels #331
Changes from all commits
@@ -0,0 +1,29 @@

```cpp
struct AbsKernelOp {};

extern "C" __global__ void abs_forward(
    const AbsKernelOp op,
    const size_t numel,
    const float *inp,
    float *out
) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numel) {
        return;
    }
    // fabsf avoids accidentally selecting an integer abs overload.
    out[i] = fabsf(inp[i]);
}

extern "C" __global__ void abs_backward(
    const AbsKernelOp op,
    const size_t numel,
    const float *inp,
    float *grad_inp,
    const float *grad_out
) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numel) {
        return;
    }
    // d|x|/dx is -1 for negative x, +1 for positive x, defined here as 0 at x == 0.
    float dx = inp[i] == 0.0 ? 0.0 : (signbit(inp[i]) ? -1.0 : 1.0);
    grad_inp[i] += dx * grad_out[i];
}
```
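As a sanity check on `abs_backward`, here is the same rule as a plain-Rust reference (a hypothetical helper for testing, not part of this PR):

```rust
/// Hypothetical CPU reference for abs_backward: d|x|/dx is -1 for x < 0,
/// +1 for x > 0, and defined as 0 at x == 0, accumulating like the kernel's +=.
fn abs_backward_ref(inp: &[f32], grad_out: &[f32], grad_inp: &mut [f32]) {
    for ((gi, &x), &go) in grad_inp.iter_mut().zip(inp).zip(grad_out) {
        let dx = if x == 0.0 {
            0.0
        } else if x.is_sign_negative() {
            -1.0
        } else {
            1.0
        };
        *gi += dx * go;
    }
}
```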
```diff
@@ -1,20 +1,10 @@
-use crate::{shapes::Shape, tensor::Cuda, tensor_ops::ops::UnaryKernel};
-
-impl UnaryKernel<super::AbsKernelOp, f32> for Cuda {
-    fn forward<S: Shape>(
-        &self,
-        op: super::AbsKernelOp,
-        inp: &Self::Storage<S, f32>,
-    ) -> Result<Self::Storage<S, f32>, Self::Err> {
-        todo!()
-    }
-    fn backward<S: Shape>(
-        &self,
-        op: super::AbsKernelOp,
-        inp: &Self::Storage<S, f32>,
-        grad_inp: &mut Self::Storage<S, f32>,
-        grad_out: &Self::Storage<S, f32>,
-    ) -> Result<(), Self::Err> {
-        todo!()
-    }
-}
+use crate::tensor_ops::cuda_kernels::UnaryOpCudaKernel;
+
+unsafe impl cudarc::device::AsKernelParam for super::AbsKernelOp {}
+
+impl UnaryOpCudaKernel for super::AbsKernelOp {
+    const PTX_SRC: &'static str = include_str!(concat!(env!("OUT_DIR"), "/abs.ptx"));
+    const MODULE_NAME: &'static str = "abs";
+    const FWD_FN_NAME: &'static str = "abs_forward";
+    const BWD_FN_NAME: &'static str = "abs_backward";
+}
```
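The other unary ops in this PR (`exp`, for one) presumably plug in the same way; a sketch of what that could look like, assuming an `ExpKernelOp` struct and an `exp.ptx` emitted by build.rs (neither shown in this diff):

```rust
// Sketch only: ExpKernelOp and exp.ptx are assumptions mirroring the abs impl.
unsafe impl cudarc::device::AsKernelParam for super::ExpKernelOp {}

impl UnaryOpCudaKernel for super::ExpKernelOp {
    const PTX_SRC: &'static str = include_str!(concat!(env!("OUT_DIR"), "/exp.ptx"));
    const MODULE_NAME: &'static str = "exp";
    const FWD_FN_NAME: &'static str = "exp_forward";
    const BWD_FN_NAME: &'static str = "exp_backward";
}
```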
@@ -0,0 +1,172 @@

```rust
use crate::{
    shapes::Shape,
    tensor::cuda::{Cuda, CudaArray},
    tensor_ops::ops::{BinaryKernel, UnaryKernel},
};
use cudarc::device::{AsKernelParam, CudaSlice, LaunchAsync, LaunchConfig};
use std::sync::Arc;

pub trait UnaryOpCudaKernel {
    /// Compiled by build.rs
    const PTX_SRC: &'static str;

    /// Unique name for the kernel
    const MODULE_NAME: &'static str;

    /// Name of the forward function in the .cu file
    const FWD_FN_NAME: &'static str;

    /// Name of the backward function in the .cu file
    const BWD_FN_NAME: &'static str;

    const ALL_FN_NAMES: [&'static str; 2] = [Self::FWD_FN_NAME, Self::BWD_FN_NAME];
}

impl<K: UnaryOpCudaKernel + AsKernelParam> UnaryKernel<K, f32> for Cuda {
    fn forward<S: Shape>(
        &self,
        op: K,
        inp: &Self::Storage<S, f32>,
    ) -> Result<Self::Storage<S, f32>, Self::Err> {
        // Lazily JIT-load the module the first time any of its kernels is used.
        if !self.dev.has_func(K::MODULE_NAME, K::FWD_FN_NAME) {
            self.dev
                .load_ptx(K::PTX_SRC.into(), K::MODULE_NAME, &K::ALL_FN_NAMES)?;
        }

        let numel = inp.data.len();
        let mut storage = self.dev.alloc_zeros_async::<f32>(numel)?;

        let fwd_fn = self.dev.get_func(K::MODULE_NAME, K::FWD_FN_NAME).unwrap();
        let cfg = LaunchConfig::for_num_elems(numel as u32);
```

Review comment on the `LaunchConfig::for_num_elems` call above: Should probably make a helper method for computing a good version of this launch config - need to take advantage of threads & blocks.
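A sketch of what such a helper might look like (the 128-thread block size is an assumed default, not a tuned value):

```rust
// Sketch of the suggested helper: pick a block size and derive the grid size.
fn launch_cfg(numel: u32) -> LaunchConfig {
    const NUM_THREADS: u32 = 128; // assumed default, would want tuning
    let num_blocks = (numel + NUM_THREADS - 1) / NUM_THREADS; // ceiling division
    LaunchConfig {
        grid_dim: (num_blocks, 1, 1),
        block_dim: (NUM_THREADS, 1, 1),
        shared_mem_bytes: 0,
    }
}
```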
```rust
        let params = (
            op,
            numel,             // const size_t numel,
            inp.data.as_ref(), // const float *inp,
            &mut storage,      // float *out
        );
        unsafe { fwd_fn.launch_async(cfg, params) }?;

        Ok(CudaArray {
            data: Arc::new(storage),
            shape: inp.shape,
            strides: inp.strides,
        })
    }

    fn backward<S: Shape>(
        &self,
        op: K,
        inp: &Self::Storage<S, f32>,
        grad_inp: &mut Self::Storage<S, f32>,
        grad_out: &Self::Storage<S, f32>,
    ) -> Result<(), Self::Err> {
        // `forward` always runs first, so the module is already loaded here.
        let bwd_fn = self.dev.get_func(K::MODULE_NAME, K::BWD_FN_NAME).unwrap();
        let numel = inp.data.len();
        let cfg = LaunchConfig::for_num_elems(numel as u32);
        let params = (
            op,
            numel,                             // const size_t numel,
            inp.data.as_ref(),                 // const float *inp,
            Arc::make_mut(&mut grad_inp.data), // float *grad_inp,
            grad_out.data.as_ref(),            // const float *grad_out
        );
        unsafe { bwd_fn.launch_async(cfg, params) }?;
        Ok(())
    }
}

pub trait BinaryOpCudaKernel {
    /// Compiled by build.rs
    const PTX_SRC: &'static str;

    /// Unique name for the kernel
    const MODULE_NAME: &'static str;

    /// Name of the forward function in the .cu file
    const FWD_FN_NAME: &'static str;

    /// Name of the backward function in the .cu file
    const BWD_FN_NAME: &'static str;

    const ALL_FN_NAMES: [&'static str; 2] = [Self::FWD_FN_NAME, Self::BWD_FN_NAME];
}

impl<K: BinaryOpCudaKernel> BinaryKernel<K, f32> for Cuda {
    fn forward<S: Shape>(
        &self,
        _: K,
        lhs: &Self::Storage<S, f32>,
        rhs: &Self::Storage<S, f32>,
    ) -> Result<Self::Storage<S, f32>, Self::Err> {
        if !self.dev.has_func(K::MODULE_NAME, K::FWD_FN_NAME) {
            self.dev
                .load_ptx(K::PTX_SRC.into(), K::MODULE_NAME, &K::ALL_FN_NAMES)?;
        }

        let shape = lhs.shape;
        let strides = lhs.shape.strides();
        let numel = shape.num_elements();

        let mut storage = self.dev.alloc_zeros_async::<f32>(numel)?;

        // Upload shape/stride metadata so the kernel can index broadcast inputs.
        let dims: CudaSlice<usize> = self.dev.take_async(shape.concrete().into())?;
        let lhs_strides: CudaSlice<usize> = self.dev.take_async(lhs.strides.into())?;
        let rhs_strides: CudaSlice<usize> = self.dev.take_async(rhs.strides.into())?;
        let out_strides: CudaSlice<usize> = self.dev.take_async(strides.into())?;

        let fwd_fn = self.dev.get_func(K::MODULE_NAME, K::FWD_FN_NAME).unwrap();
        let cfg = LaunchConfig::for_num_elems(numel as u32);
        let params = (
            numel,             // const size_t numel,
            S::NUM_DIMS,       // const size_t num_dims,
            &dims,             // const size_t *dims,
            lhs.data.as_ref(), // const float *lhs,
            &lhs_strides,      // const size_t *lhs_strides,
            rhs.data.as_ref(), // const float *rhs,
            &rhs_strides,      // const size_t *rhs_strides,
            &mut storage,      // float *out,
            &out_strides,      // const size_t *out_strides
        );
        unsafe { fwd_fn.launch_async(cfg, params) }?;
        Ok(CudaArray {
            data: Arc::new(storage),
            shape,
            strides,
        })
    }

    fn backward<S: Shape>(
        &self,
        _: K,
        lhs: &Self::Storage<S, f32>,
        grad_lhs: &mut Self::Storage<S, f32>,
        rhs: &Self::Storage<S, f32>,
        grad_rhs: &mut Self::Storage<S, f32>,
        grad_out: &Self::Storage<S, f32>,
    ) -> Result<(), Self::Err> {
        let bwd_fn = self.dev.get_func(K::MODULE_NAME, K::BWD_FN_NAME).unwrap();
        let numel = lhs.shape.num_elements();

        let dims: CudaSlice<usize> = self.dev.take_async(lhs.shape.concrete().into())?;
        let lhs_strides: CudaSlice<usize> = self.dev.take_async(lhs.strides.into())?;
        let rhs_strides: CudaSlice<usize> = self.dev.take_async(rhs.strides.into())?;
        let out_strides: CudaSlice<usize> = self.dev.take_async(grad_out.strides.into())?;
```

Review comment on lines +150 to +153 (the four `take_async` uploads above): These same values were also allocated in the forward call - a potential improvement for the future is pre-allocating them. Though these are only used in binary ops - if a tensor is only ever used in a unary op then it doesn't need to allocate these.
```rust
        let cfg = LaunchConfig::for_num_elems(numel as u32);
        let params = (
            numel,                             // const size_t numel,
            S::NUM_DIMS,                       // const size_t num_dims,
            &dims,                             // const size_t *dims,
            lhs.data.as_ref(),                 // const float *lhs,
            Arc::make_mut(&mut grad_lhs.data), // float *grad_lhs,
            &lhs_strides,                      // const size_t *lhs_strides,
            rhs.data.as_ref(),                 // const float *rhs,
            Arc::make_mut(&mut grad_rhs.data), // float *grad_rhs,
            &rhs_strides,                      // const size_t *rhs_strides,
            grad_out.data.as_ref(),            // const float *grad_out,
            &out_strides,                      // const size_t *out_strides
        );
        unsafe { bwd_fn.launch_async(cfg, params) }?;
        Ok(())
    }
}
```
@@ -0,0 +1,70 @@

```cpp
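// get_strided_index (below) maps a logical, contiguous element index onto a
// physical offset via a tensor's strides, which is what lets these kernels
// read broadcast (stride-0) inputs in place.
// Worked example: dims = {2, 3}, contiguous strides = {3, 1}, idx = 4:
//   col = 4 % 3 = 1, row = (4 / 3) % 2 = 1, so strided_i = 1*1 + 1*3 = 4.
// With broadcast strides = {0, 1}, the same idx = 4 maps to strided_i = 1.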
__device__ unsigned int get_strided_index(
    unsigned int idx,
    const size_t num_dims,
    const size_t *dims,
    const size_t *strides
) {
    unsigned int strided_i = 0;
    for (unsigned int d = 0; d < num_dims; d++) {
        unsigned int dim_idx = num_dims - 1 - d;
        strided_i += (idx % dims[dim_idx]) * strides[dim_idx];
        idx /= dims[dim_idx];
    }
    return strided_i;
}

extern "C" __global__ void binary_div_forward(
    const size_t numel,
    const size_t num_dims,
    const size_t *dims,
    const float *lhs,
    const size_t *lhs_strides,
    const float *rhs,
    const size_t *rhs_strides,
    float *out,
    const size_t *out_strides
) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numel) {
        return;
    }

    unsigned int lhs_i = get_strided_index(i, num_dims, dims, lhs_strides);
    unsigned int rhs_i = get_strided_index(i, num_dims, dims, rhs_strides);
    unsigned int out_i = get_strided_index(i, num_dims, dims, out_strides);

    out[out_i] = lhs[lhs_i] / rhs[rhs_i];
}

extern "C" __global__ void binary_div_backward(
    const size_t numel,
    const size_t num_dims,
    const size_t *dims,
    const float *lhs,
    float *grad_lhs,
    const size_t *lhs_strides,
    const float *rhs,
    float *grad_rhs,
    const size_t *rhs_strides,
    const float *grad_out,
    const size_t *out_strides
) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numel) {
        return;
    }

    unsigned int lhs_i = get_strided_index(i, num_dims, dims, lhs_strides);
    unsigned int rhs_i = get_strided_index(i, num_dims, dims, rhs_strides);
    unsigned int out_i = get_strided_index(i, num_dims, dims, out_strides);

    auto x = lhs[lhs_i];
    auto y = rhs[rhs_i];
    auto go = grad_out[out_i];

    float dfdx = 1.0 / y;
    grad_lhs[lhs_i] += dfdx * go;

    float dfdy = -x / (y * y);
    grad_rhs[rhs_i] += dfdy * go;
}
```
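For reference, the two factors in `binary_div_backward` are just the quotient rule: for $f(x, y) = x / y$,

$$\frac{\partial f}{\partial x} = \frac{1}{y}, \qquad \frac{\partial f}{\partial y} = -\frac{x}{y^2},$$

which is what `dfdx` and `dfdy` compute before scaling by `grad_out` and accumulating.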
Review comment: I think this can be done later, I'm not even sure if it's necessary. Once we have all the kernels in place we can see, but no need to complicate something that's pretty simple atm.