69 changes: 66 additions & 3 deletions src/tensor_ops/dropout/cuda_kernel.rs
@@ -1,12 +1,58 @@
-use crate::{shapes::Shape, tensor::Cuda, tensor_ops::ops::UnaryKernel};
+use crate::{
+    shapes::Shape,
+    tensor::cuda::{Cuda, CudaArray},
+    tensor_ops::ops::UnaryKernel,
+};
+
+use std::{sync::Arc, vec::Vec};
+
+use cudarc::driver::{LaunchAsync, LaunchConfig};
+
+use rand::{rngs::StdRng, Rng, SeedableRng};
+use rand_distr::Standard;
+
+const MODULE_NAME: &str = "dropout";
+const FWD_FN_NAME: &str = "dropout_forward";
+const BWD_FN_NAME: &str = "dropout_backward";
+const ALL_FN_NAMES: [&str; 2] = [FWD_FN_NAME, BWD_FN_NAME];
+const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/dropout.ptx"));
 
 impl UnaryKernel<super::DropoutKernelOp, f32> for Cuda {
     fn forward<S: Shape>(
         &self,
         op: super::DropoutKernelOp,
         inp: &Self::Storage<S, f32>,
     ) -> Result<Self::Storage<S, f32>, Self::Err> {
-        todo!()
+        let noise = {
+            let mut rng = StdRng::seed_from_u64(op.seed);
+            let mut noise: Vec<f32> = Vec::with_capacity(inp.data.len());
+            noise.resize_with(inp.data.len(), || rng.sample(Standard));
+            self.dev.take_async(noise)
+        }?;
+
+        if !self.dev.has_func(MODULE_NAME, FWD_FN_NAME) {
+            self.dev
+                .load_ptx(PTX_SRC.into(), MODULE_NAME, &ALL_FN_NAMES)?;
+        }
+
+        let numel = inp.data.len();
+        let mut storage = self.dev.alloc_zeros_async::<f32>(numel)?;
+
+        let fwd_fn = self.dev.get_func(MODULE_NAME, FWD_FN_NAME).unwrap();
+        let cfg = LaunchConfig::for_num_elems(numel as u32);
+        let params = (
+            op.prob,           // const float prob,
+            numel,             // const size_t numel,
+            inp.data.as_ref(), // const float *inp,
+            &noise,            // const float *noise,
+            &mut storage,      // float *out
+        );
+        unsafe { fwd_fn.launch_async(cfg, params) }?;
+        Ok(CudaArray {
+            data: Arc::new(storage),
+            shape: inp.shape,
+            strides: inp.strides,
+        })
     }
     fn backward<S: Shape>(
         &self,
@@ -15,6 +61,23 @@ impl UnaryKernel<super::DropoutKernelOp, f32> for Cuda {
         grad_inp: &mut Self::Storage<S, f32>,
         grad_out: &Self::Storage<S, f32>,
     ) -> Result<(), Self::Err> {
-        todo!()
+        let noise = {
+            let mut rng = StdRng::seed_from_u64(op.seed);
+            let mut noise: Vec<f32> = Vec::with_capacity(inp.data.len());
+            noise.resize_with(inp.data.len(), || rng.sample(Standard));
+            self.dev.take_async(noise)
+        }?;
+        let bwd_fn = self.dev.get_func(MODULE_NAME, BWD_FN_NAME).unwrap();
+        let numel = inp.data.len();
+        let cfg = LaunchConfig::for_num_elems(numel as u32);
+        let params = (
+            op.prob,                           // const float prob,
+            numel,                             // const size_t numel,
+            &noise,                            // const float *noise,
+            Arc::make_mut(&mut grad_inp.data), // float *grad_inp,
+            grad_out.data.as_ref(),            // const float *grad_out
+        );
+        unsafe { bwd_fn.launch_async(cfg, params) }?;
+        Ok(())
     }
 }
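
Note how `forward` and `backward` never persist the dropout mask: each pass reseeds `StdRng` from `op.seed`, regenerates an identical noise buffer on the host, and uploads it with `take_async`. A minimal standalone sketch of the determinism this relies on (plain `rand`/`rand_distr`, not part of this diff; `noise_from_seed` is a made-up helper name):

```rust
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::Standard;

// Rebuild a noise buffer from a seed, exactly as both passes' host code does.
fn noise_from_seed(seed: u64, n: usize) -> Vec<f32> {
    let mut rng = StdRng::seed_from_u64(seed);
    (0..n).map(|_| rng.sample(Standard)).collect()
}

fn main() {
    let a = noise_from_seed(42, 8);
    let b = noise_from_seed(42, 8);
    // Same seed => identical sequence, so forward and backward agree on
    // which elements were dropped without ever storing the mask.
    assert_eq!(a, b);
}
```

The trade-off is that the backward pass regenerates and re-uploads `numel` host floats instead of keeping the mask resident in device memory.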
30 changes: 30 additions & 0 deletions src/tensor_ops/dropout/dropout.cu
@@ -0,0 +1,30 @@
+extern "C" __global__ void dropout_forward(
+    const float prob,
+    const size_t numel,
+    const float *inp,
+    const float *noise,
+    float *out
+) {
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= numel) {
+        return;
+    }
+
+    float scalar = (noise[i] < prob) ? 0.0 : (1.0 / (1.0 - prob));
+    out[i] = inp[i] * scalar;
+}
+
+extern "C" __global__ void dropout_backward(
+    const float prob,
+    const size_t numel,
+    const float *noise,
+    float *grad_inp,
+    const float *grad_out
+) {
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= numel) {
+        return;
+    }
+
+    grad_inp[i] += (noise[i] < prob) ? 0.0 : (grad_out[i] / (1.0 - prob));
+}
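
Both kernels implement inverted dropout: an element is zeroed when its noise sample falls below `prob`, and survivors are scaled by `1 / (1 - prob)`, so the output equals the input in expectation (`E[out] = (1 - prob) * inp / (1 - prob) = inp`) and no rescaling is needed at inference time. A CPU reference sketch mirroring the two kernels (illustrative only, not part of the PR):

```rust
// Host-side mirror of dropout_forward: zero where noise < prob, else scale.
fn dropout_forward_cpu(prob: f32, inp: &[f32], noise: &[f32], out: &mut [f32]) {
    for i in 0..inp.len() {
        let scalar = if noise[i] < prob { 0.0 } else { 1.0 / (1.0 - prob) };
        out[i] = inp[i] * scalar;
    }
}

// Host-side mirror of dropout_backward: note `+=` — gradients accumulate.
fn dropout_backward_cpu(prob: f32, noise: &[f32], grad_inp: &mut [f32], grad_out: &[f32]) {
    for i in 0..grad_inp.len() {
        grad_inp[i] += if noise[i] < prob { 0.0 } else { grad_out[i] / (1.0 - prob) };
    }
}
```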
11 changes: 5 additions & 6 deletions src/tensor_ops/dropout/mod.rs
@@ -6,6 +6,7 @@ mod cuda_kernel
 use super::ops::{try_unary_op, UnaryKernel};
 use crate::{gradients::Tape, shapes::*, tensor::Tensor};
 
+#[repr(C)]
 #[derive(Debug, Clone, Copy)]
 pub struct DropoutKernelOp {
     pub seed: u64,
@@ -52,9 +53,7 @@ impl<S: Shape, E: Dtype, D: UnaryKernel<DropoutKernelOp, E>, T: Tape<D>> Tensor<
 
 #[cfg(test)]
 mod tests {
-    use crate::tensor::*;
-    use crate::tensor_ops::*;
-    use crate::tests::{assert_close, TestDevice};
+    use crate::{tensor::*, tensor_ops::*, tests::*};
 
     #[test]
     fn test_dropout_all_0d() {
@@ -94,9 +93,9 @@ mod tests {
         assert_close(&r.array(), &[[0.125, 0.25, -0.5], [0.0, 0.0, 1.25]]);
         // NOTE: .exp() so we ensure result grad is used properly
         let g = r.exp().mean().backward();
-        assert_eq!(
-            g.get(&t).array(),
-            [[0.47214523, 0.5350107, 0.2527211], [0.0, 0.0, 1.4543099]]
+        assert_close(
+            &g.get(&t).array(),
+            &[[0.47214523, 0.5350107, 0.2527211], [0.0, 0.0, 1.4543099]],
         );
     }
 }
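
End-to-end, the op is used as in the tests above; here is a hypothetical usage sketch against the CUDA device (the `dfdx::prelude` import, `Cuda: Default`, and the exact tensor-construction calls are assumptions, not shown in this diff):

```rust
use dfdx::prelude::*;

fn main() {
    let dev: Cuda = Default::default();
    let t = dev.tensor([[1.0f32, 2.0, -4.0], [-1.0, 6.0, 10.0]]);
    // trace() attaches a gradient tape; dropout(0.5) zeroes elements whose
    // noise sample is < 0.5 and doubles the survivors (1 / (1 - 0.5) = 2).
    let r = t.trace().dropout(0.5);
    let grads = r.exp().mean().backward();
    let _g = grads.get(&t); // zero gradient wherever dropout masked the input
}
```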