coreylowman · coreylowman · Jan 8, 2023 · Jan 7, 2023 · Jan 8, 2023
diff --git a/Cargo.toml b/Cargo.toml
@@ -31,7 +31,7 @@ matrixmultiply = { version = "0.3.2", default-features = false }
 zip = { version = "0.6.2", default-features = false, optional = true }
 cblas-sys = { version = "0.1.4", default-features = false, optional = true }
 libc = { version = "0.2", default-features = false, optional = true }
-cudarc = { version = "0.5.0", default-features = false, optional = true }
+cudarc = { version = "0.5.1", default-features = false, optional = true }
 
 [features]
 default = ["std", "numpy"]

diff --git a/src/tensor_ops/broadcast_to/broadcast_to.cu b/src/tensor_ops/broadcast_to/broadcast_to.cu
@@ -0,0 +1,11 @@
+// Element-wise accumulation: out[k] += inp[k] for every k < numel, one element per thread.
+extern "C" __global__ void sum(
+    const size_t numel, // threads whose global index is >= numel do nothing
+    const float *inp,   // values to add (only read)
+    float *out          // accumulator, updated in place
+) {
+    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < numel) {
+        out[idx] += inp[idx];
+    }
+}
diff --git a/src/tensor_ops/broadcast_to/cuda_kernel.rs b/src/tensor_ops/broadcast_to/cuda_kernel.rs
@@ -1,4 +1,10 @@
-use crate::{shapes::*, tensor::Cuda};
+use crate::shapes::*;
+use crate::tensor::cuda::{Cuda, CudaArray};
+
+use cudarc::device::{LaunchAsync, LaunchConfig};
+use std::sync::Arc;
+
+const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/broadcast_to.ptx"));
 
 impl<E: Dtype> super::BroadcastKernel<E> for Cuda {
     fn forward<Src: Shape, Dst: Shape, Ax: Axes>(
@@ -9,7 +15,11 @@ impl<E: Dtype> super::BroadcastKernel<E> for Cuda {
     where
         Src: BroadcastShapeTo<Dst, Ax>,
     {
-        todo!()
+        Ok(CudaArray {
+            data: inp.data.clone(),
+            shape: dst,
+            strides: inp.shape.broadcast_strides(inp.strides),
+        })
     }
     fn backward<Src: Shape, Dst: Shape, Ax: Axes>(
         &self,
@@ -19,6 +29,21 @@ impl<E: Dtype> super::BroadcastKernel<E> for Cuda {
     where
         Src: BroadcastShapeTo<Dst, Ax>,
     {
-        todo!()
+        if !self.dev.has_func("broadcast_to", "sum") {
+            self.dev
+                .load_ptx(PTX_SRC.into(), "broadcast_to", &["sum"])?;
+        }
+
+        let f = self.dev.get_func("broadcast_to", "sum").unwrap();
+
+        let numel = grad_inp.data.len();
+        let cfg = LaunchConfig::for_num_elems(numel as u32);
+        let params = (
+            numel,                             // const size_t numel,
+            grad_out.data.as_ref(),            // const float *inp,
+            Arc::make_mut(&mut grad_inp.data), // float *out
+        );
+        unsafe { f.launch_async(cfg, params) }?;
+        Ok(())
     }
 }
diff --git a/src/tensor_ops/permute_to/cuda_kernel.rs b/src/tensor_ops/permute_to/cuda_kernel.rs
@@ -1,5 +1,10 @@
 use crate::shapes::*;
-use crate::tensor::Cuda;
+use crate::tensor::cuda::{Cuda, CudaArray};
+
+use cudarc::device::{LaunchAsync, LaunchConfig};
+use std::sync::Arc;
+
+const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/permute_to.ptx"));
 
 impl<E: Dtype> super::PermuteKernel<E> for Cuda {
     fn forward<Src: Shape, Dst: Shape, Ax: Axes>(
@@ -9,7 +14,11 @@ impl<E: Dtype> super::PermuteKernel<E> for Cuda {
     where
         Src: PermuteShapeTo<Dst, Ax>,
     {
-        todo!()
+        Ok(CudaArray {
+            data: inp.data.clone(),
+            shape: inp.shape.permuted(),
+            strides: inp.shape.permute_strides(inp.strides),
+        })
     }
     fn backward<Src: Shape, Dst: Shape, Ax: Axes>(
         &self,
@@ -19,6 +28,20 @@ impl<E: Dtype> super::PermuteKernel<E> for Cuda {
     where
         Src: PermuteShapeTo<Dst, Ax>,
     {
-        todo!()
+        if !self.dev.has_func("permute_to", "sum") {
+            self.dev.load_ptx(PTX_SRC.into(), "permute_to", &["sum"])?;
+        }
+
+        let f = self.dev.get_func("permute_to", "sum").unwrap();
+
+        let numel = grad_inp.data.len();
+        let cfg = LaunchConfig::for_num_elems(numel as u32);
+        let params = (
+            numel,                             // const size_t numel,
+            grad_out.data.as_ref(),            // const float *inp,
+            Arc::make_mut(&mut grad_inp.data), // float *out
+        );
+        unsafe { f.launch_async(cfg, params) }?;
+        Ok(())
     }
 }
diff --git a/src/tensor_ops/permute_to/permute_to.cu b/src/tensor_ops/permute_to/permute_to.cu
@@ -0,0 +1,11 @@
+// Adds each inp[k] onto out[k] for k in [0, numel); every thread handles a single element.
+extern "C" __global__ void sum(
+    const size_t numel, // upper bound on the indices that are processed
+    const float *inp,   // source values (only read)
+    float *out          // destination, accumulated into in place
+) {
+    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < numel) {
+        out[idx] += inp[idx];
+    }
+}