
accurate-gelu #813

Merged
merged 16 commits on Jul 20, 2023

1 change: 1 addition & 0 deletions Cargo.toml
@@ -37,6 +37,7 @@ memmap2 = { version = "0.5", default-features = false, optional = true }
half = { version = "2.3.1", optional = true, features = ["num-traits", "rand_distr"] }
gemm = { version = "0.15.4", default-features = false, optional = true }
rayon = { version = "1.7.0", optional = true }
libm = "0.2.7"

[dev-dependencies]
tempfile = "3.3.0"
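The new `libm` dependency is presumably what supplies the `erf`/`erff` routines used by the CPU kernel further down (Rust's standard library has no error function). A minimal sketch of what it is pulled in for, not part of the diff:

```rust
// Hypothetical standalone sketch: libm provides the error function that std lacks.
fn main() {
    let x64: f64 = 0.7071067811865476; // 1 / sqrt(2)
    let x32: f32 = 0.70710677;

    println!("erf(x)  = {}", libm::erf(x64));  // f64 variant, ~0.6826895
    println!("erff(x) = {}", libm::erff(x32)); // f32 variant
}
```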
51 changes: 47 additions & 4 deletions src/nn/activations.rs
@@ -25,7 +25,12 @@ macro_rules! activation_impls {
}

activation_impls!(ReLU, try_relu, #[doc="Calls [relu()]."]);
activation_impls!(GeLU, try_gelu, #[doc="Calls [gelu()]."]);
activation_impls!(FastGeLU, try_fast_gelu, #[doc="Calls [fast_gelu()]. This corresponds to `torch.nn.GELU(approximate='tanh')` in pytorch."]);
activation_impls!(
AccurateGeLU,
try_accurate_gelu,
#[doc=r#"Calls [accurate_gelu()]. The GeLU is defined as x * Phi(x) where Phi is the cumulative distribution function of a standard Normal Distribution.
It is often implemented with a fast approximation using tanh (see [FastGeLU]). This corresponds to `torch.nn.GELU(approximate='none')` in pytorch."#]);
activation_impls!(Sin, try_sin, #[doc="Calls [sin()]."]);
activation_impls!(Cos, try_cos, #[doc="Calls [cos()]."]);
activation_impls!(Ln, try_ln, #[doc="Calls [ln()]."]);
@@ -38,6 +43,26 @@ activation_impls!(Abs, try_abs, #[doc="Calls [abs()]."]);
activation_impls!(Softmax, try_softmax, #[doc="Calls [softmax()]."]);
activation_impls!(LogSoftmax, try_log_softmax, #[doc="Calls [log_softmax()]."]);

/// Use [FastGeLU] instead
#[deprecated(since = "0.12.0", note = "please use `FastGeLU` instead")]
#[derive(Default, Debug, Clone, Copy)]
pub struct GeLU;
Comment on lines +47 to +49
Contributor: Needs to link to its non-deprecated counterpart.

Contributor (author): link is 3 lines up.


#[allow(deprecated)]
impl ZeroSizedModule for GeLU {}
#[allow(deprecated)]
impl NonMutableModule for GeLU {}

#[allow(deprecated)]
impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Module<Tensor<S, E, D, T>> for GeLU {
type Output = Tensor<S, E, D, T>;
type Error = D::Err;

fn try_forward(&self, input: Tensor<S, E, D, T>) -> Result<Self::Output, D::Err> {
input.try_fast_gelu()
}
}

/// Calls [prelu()] with constant value - defaults to 0.05
#[derive(Debug, Clone, Copy)]
pub struct LeakyReLU<E: Dtype>(pub E);
@@ -64,6 +89,9 @@ impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Module<Tensor<S, E, D, T>>
mod tests {
use crate::{nn::*, tests::TestDevice};

#[allow(deprecated)]
use super::GeLU;

use super::*;

#[test]
@@ -76,14 +104,29 @@ mod tests {
}

#[test]
fn test_nn_activations_gelu() {
fn test_nn_activations_accurate_gelu() {
let dev: TestDevice = Default::default();
let t = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
let r1 = GeLU.forward_mut(t.clone());
let r2 = gelu(t);
let r1 = AccurateGeLU.forward_mut(t.clone());
let r2 = accurate_gelu(t);
assert_eq!(r1.array(), r2.array());
}

#[test]
fn test_nn_activations_fast_gelu() {
let dev: TestDevice = Default::default();
let t = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
let r1 = FastGeLU.forward_mut(t.clone());
#[allow(deprecated)]
let r2 = GeLU.forward_mut(t.clone());
let r3 = fast_gelu(t.clone());
#[allow(deprecated)]
let r4 = gelu(t);
assert_eq!(r1.array(), r2.array());
assert_eq!(r1.array(), r3.array());
assert_eq!(r1.array(), r4.array());
}

#[test]
fn test_nn_activations_sin() {
let dev: TestDevice = Default::default();
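A usage sketch of the renamed layers, mirroring the tests above (assumes `AccurateGeLU`, `FastGeLU`, and the deprecated `GeLU` are re-exported through the dfdx prelude; not part of the diff):

```rust
use dfdx::prelude::*;

fn main() {
    let dev: Cpu = Default::default();
    let t = dev.tensor([-2.0f32, -1.0, 0.0, 1.0, 2.0]);

    // The exact erf-based GeLU and the tanh approximation are now separate layers.
    let exact = AccurateGeLU.forward(t.clone());
    let approx = FastGeLU.forward(t.clone());

    // The old layer still compiles, but is deprecated and forwards to fast_gelu.
    #[allow(deprecated)]
    let legacy = GeLU.forward(t);

    assert_eq!(approx.array(), legacy.array());
    println!("accurate: {:?}", exact.array());
    println!("fast:     {:?}", approx.array());
}
```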
47 changes: 47 additions & 0 deletions src/tensor_ops/accurate_gelu/accurate_gelu.cu
@@ -0,0 +1,47 @@
#include "unary_op_macros.cuh"
#define _USE_MATH_DEFINES
#include <math.h>

struct AccurateGeLUKernelOp {};

template <typename T> __device__ T accurate_gelu_fwd(T x) {
T one = 1.0;
T half = 0.5;
T alpha = M_SQRT1_2;
return half * x * (one + erfg(x * alpha));
}

template <typename T> __device__ T accurate_gelu_bwd(T x) {
T one = 1.0;
T half = 0.5;
T alpha = M_SQRT1_2;
T x_sq = x * x;
T normal_dist = M_2_SQRTPI * expg(-half * x_sq); // matches the CPU kernel's normal_dist term

T left = half * x;
T right = one + erfg(alpha * x);

T left_derivative = half * right;

T right_derivative = left * normal_dist;

return left_derivative + right_derivative;
}

UNARY_OP(__half, accurate_gelu_fwd_f16, accurate_gelu_bwd_f16,
AccurateGeLUKernelOp,
accurate_gelu_fwd(x),
accurate_gelu_bwd(x)
)

UNARY_OP(float, accurate_gelu_fwd_f32, accurate_gelu_bwd_f32,
AccurateGeLUKernelOp,
accurate_gelu_fwd(x),
accurate_gelu_bwd(x)
)

UNARY_OP(double, accurate_gelu_fwd_f64, accurate_gelu_bwd_f64,
AccurateGeLUKernelOp,
accurate_gelu_fwd(x),
accurate_gelu_bwd(x)
)
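For reference (standard calculus, not taken from the diff), the product rule behind the backward pass's left/right split, using `d/du erf(u) = (2 / sqrt(pi)) * exp(-u^2)`:

```text
d/dx [ 0.5 * x * (1 + erf(x / sqrt(2))) ]
    = 0.5 * (1 + erf(x / sqrt(2)))                               // Phi(x), the "left" term
    + 0.5 * x * (2 / sqrt(pi)) * exp(-x^2 / 2) * (1 / sqrt(2))   // x * phi(x), the "right" term
```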
55 changes: 55 additions & 0 deletions src/tensor_ops/accurate_gelu/cpu_kernel.rs
@@ -0,0 +1,55 @@
use crate::tensor_ops::cpu_kernels::UnaryDerivative;
#[cfg(feature = "f16")]
use half::f16;
use libm::{erf, erff};
use num_traits::{Float, FloatConst};

trait Erf {
fn erf(self) -> Self;
}

#[cfg(feature = "f16")]
impl Erf for f16 {
fn erf(self) -> Self {
f16::from_f32(erff(f16::to_f32(self)))
}
}

impl Erf for f64 {
fn erf(self) -> Self {
erf(self)
}
}

impl Erf for f32 {
fn erf(self) -> Self {
erff(self)
}
}

impl<F: Float + FloatConst + Erf> UnaryDerivative<F> for super::AccurateGeLUKernelOp {
const DF_USES_FX: bool = false;
const HAS_CONST_DF: bool = false;
#[inline(always)]
fn f(&self, &x: &F) -> F {
let alpha = F::FRAC_1_SQRT_2();
F::from(0.5).unwrap() * x * (F::one() + (x * alpha).erf())
}

#[inline(always)]
fn df(&self, &x: &F) -> F {
let half = F::from(0.5).unwrap();
let alpha = F::FRAC_1_SQRT_2();
let x_sq = x * x;
let normal_dist = F::FRAC_2_SQRT_PI() * (F::from(0.5).unwrap() * x_sq.neg()).exp();

let left = half * x;
let right = F::one() + (alpha * x).erf();

let left_derivative = half * right;

let right_derivative = left * normal_dist;

left_derivative + right_derivative
}
}
29 changes: 29 additions & 0 deletions src/tensor_ops/accurate_gelu/cuda_kernel.rs
@@ -0,0 +1,29 @@
use super::AccurateGeLUKernelOp;
use crate::tensor_ops::cuda_kernels::cuda_unary;

unsafe impl cudarc::driver::DeviceRepr for super::AccurateGeLUKernelOp {}

const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/accurate_gelu.ptx"));

#[cfg(feature = "f16")]
cuda_unary!(
AccurateGeLUKernelOp,
half::f16,
PTX,
"accurate_gelu_fwd_f16",
"accurate_gelu_bwd_f16"
);
cuda_unary!(
AccurateGeLUKernelOp,
f32,
PTX,
"accurate_gelu_fwd_f32",
"accurate_gelu_bwd_f32"
);
cuda_unary!(
AccurateGeLUKernelOp,
f64,
PTX,
"accurate_gelu_fwd_f64",
"accurate_gelu_bwd_f64"
);
75 changes: 75 additions & 0 deletions src/tensor_ops/accurate_gelu/mod.rs
@@ -0,0 +1,75 @@
mod cpu_kernel;

#[cfg(feature = "cuda")]
mod cuda_kernel;

use super::ops::{try_unary_op, UnaryKernel};
use crate::{shapes::*, tensor::*};

#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct AccurateGeLUKernelOp;

/// [Accurate Gaussian Error Linear Unit (GeLU)](https://paperswithcode.com/method/gelu). This is defined as `x * Phi(x)` where `Phi(x)` is the cumulative
/// distribution function of a standard normal distribution. This can be calculated via the Error
/// Function `erf(x)` using
/// ```text
/// 0.5 * x * (1.0 + erf(x / 2.0.sqrt()))
/// ```
/// As an accurate error function is [computationally expensive](https://en.wikipedia.org/wiki/Error_function#Numerical_approximations), it is
/// possible to approximate the Gaussian Error Linear Unit with the hyperbolic tangent function `tanh`:
///
/// ```text
/// GeLU(x) ~ 0.5 * x * (1.0 + tanh(sqrt(2.0 / π) * (x + 0.044715 * x^3)))
/// ```
///
/// See [fast_gelu](super::fast_gelu::fast_gelu) to use this approximation.
///
/// Examples:
/// ```rust
/// # use dfdx::prelude::*;
/// # let dev: Cpu = Default::default();
/// let t = dev.tensor([-1.0, 0.0, 1.0, 2.0]);
/// let r = t.accurate_gelu();
/// ```
pub fn accurate_gelu<S: Shape, E: Dtype, D: UnaryKernel<AccurateGeLUKernelOp, E>, T: Tape<E, D>>(
t: Tensor<S, E, D, T>,
) -> Tensor<S, E, D, T> {
t.accurate_gelu()
}

impl<S: Shape, E: Dtype, D: UnaryKernel<AccurateGeLUKernelOp, E>, T: Tape<E, D>>
Tensor<S, E, D, T>
{
/// See [accurate_gelu]
pub fn accurate_gelu(self) -> Self {
self.try_accurate_gelu().unwrap()
}
/// See [accurate_gelu]
pub fn try_accurate_gelu(self) -> Result<Self, D::Err> {
try_unary_op(AccurateGeLUKernelOp, self)
}
}

#[cfg(test)]
mod tests {
use crate::{tensor::*, tensor_ops::*, tests::*};

#[test]
fn test_accurate_gelu() {
let dev: TestDevice = Default::default();
let x = dev
.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
.to_dtype::<TestDtype>();
let r = x.leaky_trace().accurate_gelu();

assert_close_to_literal!(r, [-0.04550027, -0.15865525, 0.0, 0.84134471, 1.9544997,]);
// NOTE: call .exp() to make sure we cover cases where .accurate_gelu() uses the result's gradient
let g = r.exp().mean().backward();
assert_close_to_literal!(
g.get(&x),
[-0.024835737, -0.03132311, 0.1, 0.5490418, 1.59559]
);
}
}
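A minimal sketch of the two entry points added here, the free function and the fallible method (assumes they are re-exported through the dfdx prelude; not part of the diff):

```rust
use dfdx::prelude::*;

fn main() {
    let dev: Cpu = Default::default();
    let t = dev.tensor([-2.0f32, -1.0, 0.0, 1.0, 2.0]);

    // Free-function form; unwraps internally and panics if the kernel fails.
    let r = accurate_gelu(t.clone());
    println!("{:?}", r.array());

    // Fallible form; surfaces the device error instead of panicking.
    match t.try_accurate_gelu() {
        Ok(r) => println!("{:?}", r.array()),
        Err(e) => eprintln!("accurate_gelu kernel failed: {e:?}"),
    }
}
```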
@@ -1,7 +1,7 @@
use crate::tensor_ops::cpu_kernels::UnaryDerivative;
use num_traits::{Float, FloatConst};

impl<F: Float + FloatConst> UnaryDerivative<F> for super::GeLUKernelOp {
impl<F: Float + FloatConst> UnaryDerivative<F> for super::FastGeLUKernelOp {
const DF_USES_FX: bool = false;
const HAS_CONST_DF: bool = false;
#[inline(always)]
29 changes: 29 additions & 0 deletions src/tensor_ops/fast_gelu/cuda_kernel.rs
@@ -0,0 +1,29 @@
use super::FastGeLUKernelOp;
use crate::tensor_ops::cuda_kernels::cuda_unary;

unsafe impl cudarc::driver::DeviceRepr for super::FastGeLUKernelOp {}

const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/fast_gelu.ptx"));

#[cfg(feature = "f16")]
cuda_unary!(
FastGeLUKernelOp,
half::f16,
PTX,
"fast_gelu_fwd_f16",
"fast_gelu_bwd_f16"
);
cuda_unary!(
FastGeLUKernelOp,
f32,
PTX,
"fast_gelu_fwd_f32",
"fast_gelu_bwd_f32"
);
cuda_unary!(
FastGeLUKernelOp,
f64,
PTX,
"fast_gelu_fwd_f64",
"fast_gelu_bwd_f64"
);
@@ -2,10 +2,9 @@
#define _USE_MATH_DEFINES
#include <math.h>

struct GeLUKernelOp {};
struct FastGeLUKernelOp {};

template<typename T>
__device__ T gelu_fwd(T x) {
template <typename T> __device__ T fast_gelu_fwd(T x) {
T fastCoeff = 0.044715;
T one = 1.0;
T half = 0.5;
@@ -16,8 +15,7 @@ __device__ T gelu_fwd(T x) {
return half * x * (one + tanhg(beta * alpha));
}

template<typename T>
__device__ T gelu_bwd(T x) {
template <typename T> __device__ T fast_gelu_bwd(T x) {
T one = 1.0;
T three = 3.0;
T half = 0.5;
@@ -30,7 +28,7 @@ __device__ T gelu_bwd(T x) {

T left = half * x;
T right = one + tanh_inner;

T left_derivative = half * right;

T tanh_derivative = one - tanh_inner * tanh_inner;
@@ -39,17 +37,17 @@ __device__ T gelu_bwd(T x) {
return left_derivative + right_derivative;
}

UNARY_OP(__half, gelu_fwd_f16, gelu_bwd_f16, GeLUKernelOp,
gelu_fwd(x),
gelu_bwd(x)
UNARY_OP(__half, fast_gelu_fwd_f16, fast_gelu_bwd_f16, FastGeLUKernelOp,
fast_gelu_fwd(x),
fast_gelu_bwd(x)
)

UNARY_OP(float, gelu_fwd_f32, gelu_bwd_f32, GeLUKernelOp,
gelu_fwd(x),
gelu_bwd(x)
UNARY_OP(float, fast_gelu_fwd_f32, fast_gelu_bwd_f32, FastGeLUKernelOp,
fast_gelu_fwd(x),
fast_gelu_bwd(x)
)

UNARY_OP(double, gelu_fwd_f64, gelu_bwd_f64, GeLUKernelOp,
gelu_fwd(x),
gelu_bwd(x)
UNARY_OP(double, fast_gelu_fwd_f64, fast_gelu_bwd_f64, FastGeLUKernelOp,
fast_gelu_fwd(x),
fast_gelu_bwd(x)
)
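To show what "fast" versus "accurate" means numerically, a standalone sketch comparing the two formulas documented above, using plain `f64` math and `libm::erf` (illustrative only, not part of the diff):

```rust
// Exact erf-based GeLU.
fn accurate_gelu(x: f64) -> f64 {
    0.5 * x * (1.0 + libm::erf(x * std::f64::consts::FRAC_1_SQRT_2))
}

// Tanh approximation used by fast_gelu.
fn fast_gelu(x: f64) -> f64 {
    let c = (2.0 / std::f64::consts::PI).sqrt();
    0.5 * x * (1.0 + (c * (x + 0.044715 * x.powi(3))).tanh())
}

fn main() {
    for x in [-2.0, -1.0, 0.0, 1.0, 2.0] {
        println!("x = {x:4}: accurate = {:.6}, fast = {:.6}", accurate_gelu(x), fast_gelu(x));
    }
}
```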