69 changes: 66 additions & 3 deletions src/tensor_ops/dropout/cuda_kernel.rs
@@ -1,12 +1,58 @@
-use crate::{shapes::Shape, tensor::Cuda, tensor_ops::ops::UnaryKernel};
+use crate::{
+    shapes::Shape,
+    tensor::cuda::{Cuda, CudaArray},
+    tensor_ops::ops::UnaryKernel,
+};
+
+use std::{sync::Arc, vec::Vec};
+
+use cudarc::driver::{LaunchAsync, LaunchConfig};
+
+use rand::{rngs::StdRng, Rng, SeedableRng};
+use rand_distr::Standard;
+
+const MODULE_NAME: &str = "dropout";
+const FWD_FN_NAME: &str = "dropout_forward";
+const BWD_FN_NAME: &str = "dropout_backward";
+const ALL_FN_NAMES: [&str; 2] = [FWD_FN_NAME, BWD_FN_NAME];
+const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/dropout.ptx"));
 
 impl UnaryKernel<super::DropoutKernelOp, f32> for Cuda {
     fn forward<S: Shape>(
         &self,
         op: super::DropoutKernelOp,
         inp: &Self::Storage<S, f32>,
     ) -> Result<Self::Storage<S, f32>, Self::Err> {
-        todo!()
+        let noise = {
+            let mut rng = StdRng::seed_from_u64(op.seed);
+            let mut noise: Vec<f32> = Vec::with_capacity(inp.data.len());
+            noise.resize_with(inp.data.len(), || rng.sample(Standard));
+            self.dev.take_async(noise)
+        }?;
+
+        if !self.dev.has_func(MODULE_NAME, FWD_FN_NAME) {
+            self.dev
+                .load_ptx(PTX_SRC.into(), MODULE_NAME, &ALL_FN_NAMES)?;
+        }
+
+        let numel = inp.data.len();
+        let mut storage = self.dev.alloc_zeros_async::<f32>(numel)?;
+
+        let fwd_fn = self.dev.get_func(MODULE_NAME, FWD_FN_NAME).unwrap();
+        let cfg = LaunchConfig::for_num_elems(numel as u32);
+        let params = (
+            op.prob,           // const float prob,
+            numel,             // const size_t numel,
+            inp.data.as_ref(), // const float *inp,
+            &noise,            // const float *noise,
+            &mut storage,      // float *out
+        );
+        unsafe { fwd_fn.launch_async(cfg, params) }?;
+        Ok(CudaArray {
+            data: Arc::new(storage),
+            shape: inp.shape,
+            strides: inp.strides,
+        })
     }
     fn backward<S: Shape>(
         &self,
@@ -15,6 +61,23 @@ impl UnaryKernel<super::DropoutKernelOp, f32> for Cuda {
         grad_inp: &mut Self::Storage<S, f32>,
         grad_out: &Self::Storage<S, f32>,
     ) -> Result<(), Self::Err> {
-        todo!()
+        let noise = {
+            let mut rng = StdRng::seed_from_u64(op.seed);
+            let mut noise: Vec<f32> = Vec::with_capacity(inp.data.len());
+            noise.resize_with(inp.data.len(), || rng.sample(Standard));
+            self.dev.take_async(noise)
+        }?;
+        let bwd_fn = self.dev.get_func(MODULE_NAME, BWD_FN_NAME).unwrap();
+        let numel = inp.data.len();
+        let cfg = LaunchConfig::for_num_elems(numel as u32);
+        let params = (
+            op.prob,                           // const float prob,
+            numel,                             // const size_t numel,
+            &noise,                            // const float *noise,
+            Arc::make_mut(&mut grad_inp.data), // float *grad_inp,
+            grad_out.data.as_ref(),            // const float *grad_out
+        );
+        unsafe { bwd_fn.launch_async(cfg, params) }?;
+        Ok(())
     }
 }
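
Note how `forward` and `backward` never persist the dropout mask: each pass reseeds `StdRng` from `op.seed`, regenerates an identical noise buffer on the host, and uploads it with `take_async`. A minimal standalone sketch of the determinism this relies on (plain `rand`/`rand_distr`, not part of this diff; `noise_from_seed` is a made-up helper name):

```rust
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::Standard;

// Rebuild a noise buffer from a seed, exactly as both passes' host code does.
fn noise_from_seed(seed: u64, n: usize) -> Vec<f32> {
    let mut rng = StdRng::seed_from_u64(seed);
    (0..n).map(|_| rng.sample(Standard)).collect()
}

fn main() {
    let a = noise_from_seed(42, 8);
    let b = noise_from_seed(42, 8);
    // Same seed => identical sequence, so forward and backward agree on
    // which elements were dropped without ever storing the mask.
    assert_eq!(a, b);
}
```

The trade-off is that the backward pass regenerates and re-uploads `numel` host floats instead of keeping the mask resident in device memory.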
30 changes: 30 additions & 0 deletions src/tensor_ops/dropout/dropout.cu
@@ -0,0 +1,30 @@
+extern "C" __global__ void dropout_forward(
+    const float prob,
+    const size_t numel,
+    const float *inp,
+    const float *noise,
+    float *out
+) {
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= numel) {
+        return;
+    }
+
+    float scalar = (noise[i] < prob) ? 0.0 : (1.0 / (1.0 - prob));
+    out[i] = inp[i] * scalar;
+}
+
+extern "C" __global__ void dropout_backward(
+    const float prob,
+    const size_t numel,
+    const float *noise,
+    float *grad_inp,
+    const float *grad_out
+) {
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= numel) {
+        return;
+    }
+
+    grad_inp[i] += (noise[i] < prob) ? 0.0 : (grad_out[i] / (1.0 - prob));
+}
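
Both kernels implement inverted dropout: an element is zeroed when its noise sample falls below `prob`, and survivors are scaled by `1 / (1 - prob)`, so the output equals the input in expectation (`E[out] = (1 - prob) * inp / (1 - prob) = inp`) and no rescaling is needed at inference time. A CPU reference sketch mirroring the two kernels (illustrative only, not part of the PR):

```rust
// Host-side mirror of dropout_forward: zero where noise < prob, else scale.
fn dropout_forward_cpu(prob: f32, inp: &[f32], noise: &[f32], out: &mut [f32]) {
    for i in 0..inp.len() {
        let scalar = if noise[i] < prob { 0.0 } else { 1.0 / (1.0 - prob) };
        out[i] = inp[i] * scalar;
    }
}

// Host-side mirror of dropout_backward: note `+=` — gradients accumulate.
fn dropout_backward_cpu(prob: f32, noise: &[f32], grad_inp: &mut [f32], grad_out: &[f32]) {
    for i in 0..grad_inp.len() {
        grad_inp[i] += if noise[i] < prob { 0.0 } else { grad_out[i] / (1.0 - prob) };
    }
}
```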
11 changes: 5 additions & 6 deletions src/tensor_ops/dropout/mod.rs
@@ -6,6 +6,7 @@ mod cuda_kernel
 use super::ops::{try_unary_op, UnaryKernel};
 use crate::{gradients::Tape, shapes::*, tensor::Tensor};
 
+#[repr(C)]
 #[derive(Debug, Clone, Copy)]
 pub struct DropoutKernelOp {
     pub seed: u64,
@@ -52,9 +53,7 @@ impl<S: Shape, E: Dtype, D: UnaryKernel<DropoutKernelOp, E>, T: Tape<D>> Tensor<
 
 #[cfg(test)]
 mod tests {
-    use crate::tensor::*;
-    use crate::tensor_ops::*;
-    use crate::tests::{assert_close, TestDevice};
+    use crate::{tensor::*, tensor_ops::*, tests::*};
 
     #[test]
     fn test_dropout_all_0d() {
@@ -94,9 +93,9 @@ mod tests {
         assert_close(&r.array(), &[[0.125, 0.25, -0.5], [0.0, 0.0, 1.25]]);
         // NOTE: .exp() so we ensure result grad is used properly
         let g = r.exp().mean().backward();
-        assert_eq!(
-            g.get(&t).array(),
-            [[0.47214523, 0.5350107, 0.2527211], [0.0, 0.0, 1.4543099]]
+        assert_close(
+            &g.get(&t).array(),
+            &[[0.47214523, 0.5350107, 0.2527211], [0.0, 0.0, 1.4543099]],
         );
     }
 }
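
End-to-end, the op is used as in the tests above; here is a hypothetical usage sketch against the CUDA device (the `dfdx::prelude` import, `Cuda: Default`, and the exact tensor-construction calls are assumptions, not shown in this diff):

```rust
use dfdx::prelude::*;

fn main() {
    let dev: Cuda = Default::default();
    let t = dev.tensor([[1.0f32, 2.0, -4.0], [-1.0, 6.0, 10.0]]);
    // trace() attaches a gradient tape; dropout(0.5) zeroes elements whose
    // noise sample is < 0.5 and doubles the survivors (1 / (1 - 0.5) = 2).
    let r = t.trace().dropout(0.5);
    let grads = r.exp().mean().backward();
    let _g = grads.get(&t); // zero gradient wherever dropout masked the input
}
```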