coreylowman · coreylowman · Apr 2, 2023 · Apr 2, 2023 · Apr 2, 2023
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
@@ -2,8 +2,8 @@
 
 github: coreylowman
 patreon: dfdx
+ko_fi: coreylowman
 open_collective: # Replace with a single Open Collective username
-ko_fi: # Replace with a single Ko-fi username
 tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 liberapay: # Replace with a single Liberapay username

diff --git a/.github/workflows/cargo-test-nightly.yml b/.github/workflows/cargo-test-nightly.yml
@@ -0,0 +1,25 @@
+on: [pull_request]
+
+jobs:
+  cargo-test:
+    name: cargo-test-nightly
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: nightly
+          override: true
+      - name: test CPU f32
+        uses: actions-rs/cargo@v1
+        with:
+          command: test
+          args: --features nightly,safetensors,numpy
+      - name: test CPU f64
+        uses: actions-rs/cargo@v1
+        with:
+          command: test
+          args: --features nightly,test-f64,safetensors,numpy
diff --git a/.github/workflows/cargo-test.yml b/.github/workflows/cargo-test.yml
@@ -11,17 +11,9 @@ jobs:
           - ubuntu-latest
           - macOS-latest
           - windows-latest
-        toolchain:
-          - stable
-          - nightly
 
     steps:
       - uses: actions/checkout@v2
-      - uses: actions-rs/toolchain@v1
-        with:
-          profile: minimal
-          toolchain: ${{ matrix.toolchain }}
-          override: true
       - name: test CPU f32
         uses: actions-rs/cargo@v1
         with:
@@ -31,4 +23,4 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: test
-          args: --features test-f64,safetensors,numpy
+          args: --features test-f64,safetensors,numpy
diff --git a/src/nn/activations.rs b/src/nn/activations.rs
@@ -48,19 +48,16 @@ pub struct Softmax;
 impl ZeroSizedModule for Softmax {}
 impl NonMutableModule for Softmax {}
 
-impl<Ax: Axes, S, E: Dtype, D: Device<E>, T: Tape<E, D>> Module<Tensor<S, E, D, T>> for Softmax
-where
-    S: Shape<LastAxis = Ax> + ReduceShape<Ax>,
-{
+impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Module<Tensor<S, E, D, T>> for Softmax {
     type Output = Tensor<S, E, D, T>;
     type Error = D::Err;
 
     fn try_forward(&self, input: Tensor<S, E, D, T>) -> Result<Self::Output, D::Err> {
-        input.try_softmax::<Ax>()
+        input.try_softmax::<S::LastAxis>()
     }
 }
 
-/// Calls [prelu()] with constant value.
+/// Calls [prelu()] with constant value - defaults to 0.05
 #[derive(Debug, Clone, Copy)]
 pub struct LeakyReLU<E: Dtype>(pub E);
 
@@ -73,11 +70,7 @@ impl<E: Dtype> Default for LeakyReLU<E> {
 impl<E: Dtype> ZeroSizedModule for LeakyReLU<E> {}
 impl<E: Dtype> NonMutableModule for LeakyReLU<E> {}
 
-impl<S: ConstShape, E: Dtype, D: Device<E>, T: Tape<E, D>> Module<Tensor<S, E, D, T>>
-    for LeakyReLU<E>
-where
-    Tensor<S, E, D, T>: TryPReLU<E>,
-{
+impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Module<Tensor<S, E, D, T>> for LeakyReLU<E> {
     type Output = Tensor<S, E, D, T>;
     type Error = <Tensor<S, E, D, T> as HasErr>::Err;
 
@@ -116,16 +109,13 @@ pub struct PReLU<E: Dtype, D: Device<E>> {
 
 impl<E: Dtype, D: Device<E>> NonMutableModule for PReLU<E, D> {}
 
-impl<S: ConstShape, E: Dtype, D: Device<E>, T: Tape<E, D>> Module<Tensor<S, E, D, T>>
-    for PReLU<E, D>
-where
-    Tensor<S, E, D, T>: TryPReLU<Tensor<S, E, D, NoneTape>>,
-{
+impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Module<Tensor<S, E, D, T>> for PReLU<E, D> {
     type Output = Tensor<S, E, D, T>;
     type Error = <Tensor<S, E, D, T> as HasErr>::Err;
 
     fn try_forward(&self, input: Tensor<S, E, D, T>) -> Result<Self::Output, Self::Error> {
-        input.try_prelu(self.a.retaped().broadcast())
+        let scale = self.a.retaped::<T>().try_broadcast_like(&input.shape)?;
+        input.try_prelu(scale)
     }
 }
 
@@ -162,29 +152,25 @@ impl<C: ConstDim, E: Dtype, D: Device<E>> NonMutableModule for PReLU1D<C, E, D>
 
 impl<C: ConstDim, E: Dtype, D: Device<E>, T: Tape<E, D>> Module<Tensor<(C,), E, D, T>>
     for PReLU1D<C, E, D>
-where
-    Tensor<(C,), E, D, T>: TryPReLU<Tensor<(C,), E, D, NoneTape>>,
 {
     type Output = Tensor<(C,), E, D, T>;
-
     type Error = <Tensor<(C,), E, D, T> as HasErr>::Err;
 
     fn try_forward(&self, input: Tensor<(C,), E, D, T>) -> Result<Self::Output, Self::Error> {
-        input.try_prelu(self.a.retaped())
+        input.try_prelu(self.a.retaped::<T>())
     }
 }
 
 macro_rules! prelu1d {
     (($($InDims:tt),*), $Axes:ty) => {
-        impl<E: Dtype, D: Device<E>, T: Tape<E, D> + Merge<NoneTape>, $($InDims: ConstDim),*> Module<Tensor<($($InDims),*), E, D, T>> for PReLU1D<C,E, D>
+        impl<E: Dtype, D: Device<E>, T: Tape<E, D>, $($InDims: ConstDim),*> Module<Tensor<($($InDims),*), E, D, T>> for PReLU1D<C,E, D>
         where ($($InDims),*): ReduceShapeTo<(C,), $Axes>,
-        Tensor<($($InDims),*), E, D, T>: TryPReLU<Tensor<($($InDims),*), E, D, NoneTape>>,
         {
             type Output = Tensor<($($InDims),*), E, D, T>;
             type Error = <Tensor<($($InDims),*), E, D, T> as HasErr>::Err;
 
             fn try_forward(&self, input: Tensor<($($InDims),*), E, D, T>) -> Result<Self::Output, Self::Error> {
-                input.try_prelu(self.a.retaped().broadcast())
+                input.try_prelu(self.a.retaped::<T>().broadcast())
             }
         }
     };
@@ -401,7 +387,7 @@ mod tests {
         let out = model.forward(t);
         assert_close(
             &out.array(),
-            &[-0.04820138, -0.03807970, 0.0, 0.76159415, 0.96402758],
+            &[-0.04820138, -0.0380797, 0.0, 0.7615941, 0.9640275],
         )
     }
 }
diff --git a/src/nn/batchnorm2d.rs b/src/nn/batchnorm2d.rs
@@ -42,7 +42,10 @@ where
 
     let centered = x.try_sub(mean_chan.try_broadcast_like(&shape)?)?;
 
-    let var_chan = centered.retaped::<T>().square().mean::<Rank1<C>, _>();
+    let var_chan = centered
+        .retaped::<T>()
+        .try_square()?
+        .try_mean::<Rank1<C>, _>()?;
 
     // NOTE: uses unbiased variance in running estimate
     var.try_axpy(E::ONE - momentum, &var_chan, momentum * n / (n - E::ONE))?;
@@ -76,11 +79,10 @@ where
     let shape = *x.shape();
 
     // statistics for normalizing
-    let std = (var.clone() + epsilon).try_sqrt()?;
-    let mean = mean.clone();
+    let std = (var.clone().try_add(epsilon)?).try_sqrt()?;
 
     // normalize & affine
-    let x = x.try_sub(mean.try_broadcast_like(&shape)?)?;
+    let x = x.try_sub(mean.clone().try_broadcast_like(&shape)?)?;
     let x = x.try_div(std.try_broadcast_like(&shape)?)?;
     let x = x.try_mul(scale.clone().try_broadcast_like(&shape)?)?;
     x.try_add(bias.clone().try_broadcast_like(&shape)?)

diff --git a/src/tensor_ops/prelu/mod.rs → src/tensor_ops/prelu.rs b/src/tensor_ops/prelu/mod.rs → src/tensor_ops/prelu.rs
@@ -1,74 +1,82 @@
-use num_traits::Zero;
-
 use crate::{shapes::*, tensor::*};
 
-use super::{
-    cmp::{LtKernelOp, ScalarCmpKernel},
-    BroadcastTo, ChooseFrom, Device,
-};
+use super::{BroadcastTo, ChooseFrom, Device};
 
-/// [Parametric Rectified Linear Unit (PReLU)](https://pytorch.org/docs/stable/generated/torch.nn.PReLU.html). `max(0, t) + a*min(0, t)`
+/// [Parametric Rectified Linear Unit (PReLU)](https://pytorch.org/docs/stable/generated/torch.nn.PReLU.html). `max(0, lhs) + rhs*min(0, lhs)`
+///
+/// In other words, for each element i:
+/// - if lhs[i] < 0, use `lhs[i] * rhs[i]`
+/// - if lhs[i] >= 0, use `lhs[i]`
+///
 ///
 /// Examples:
 /// ```rust
 /// # use dfdx::prelude::*;
 /// # let dev: Cpu = Default::default();
 /// let t = dev.tensor([-1.0, 0.0, 1.0, 2.0]);
 /// let a = dev.tensor([0.05,0.05,0.05,0.05]);
-/// let r = prelu(t, a);
+/// let r = t.prelu(a);
 /// assert_eq!(r.array(), [-0.05, 0.0, 1.0, 2.0]);
 /// ```
-
 pub fn prelu<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D> + Merge<R>, R: Default>(
     lhs: Tensor<S, E, D, T>,
     rhs: Tensor<S, E, D, R>,
-) -> Tensor<S, E, D, T>
-where
-    Tensor<S, E, D, T>: TryPReLU<Tensor<S, E, D, R>>,
-{
+) -> Tensor<S, E, D, T> {
     lhs.prelu(rhs)
 }
 
+/// Computes `prelu`, but with a scalar `rhs` applied to every element. `max(0, lhs) + rhs*min(0, lhs)`
 pub fn leakyrelu<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>>(
     lhs: Tensor<S, E, D, T>,
     rhs: E,
-) -> Tensor<S, E, D, T>
-where
-    Tensor<S, E, D, T>: TryPReLU<E>,
-{
+) -> Tensor<S, E, D, T> {
     lhs.prelu(rhs)
 }
 
+/// [Parametric Rectified Linear Unit (PReLU)](https://pytorch.org/docs/stable/generated/torch.nn.PReLU.html). `max(0, lhs) + rhs*min(0, lhs)`
+///
+/// In other words, for each element i:
+/// - if lhs[i] < 0, use `lhs[i] * rhs[i]`
+/// - if lhs[i] >= 0, use `lhs[i]`
+///
+///
+/// Examples:
+/// ```rust
+/// # use dfdx::prelude::*;
+/// # let dev: Cpu = Default::default();
+/// let t = dev.tensor([-1.0, 0.0, 1.0, 2.0]);
+/// let a = dev.tensor([0.05,0.05,0.05,0.05]);
+/// let r = t.prelu(a);
+/// assert_eq!(r.array(), [-0.05, 0.0, 1.0, 2.0]);
+/// ```
 pub trait TryPReLU<T = Self>: HasErr {
-    fn try_prelu(self, rhs: T) -> Result<Self, Self::Err>;
-
     fn prelu(self, rhs: T) -> Self {
         self.try_prelu(rhs).unwrap()
     }
+
+    fn try_prelu(self, rhs: T) -> Result<Self, Self::Err>;
 }
 
-impl<S: Shape, E: Dtype + Zero, D, LhsTape: Tape<E, D>, R> TryPReLU<Tensor<S, E, D, R>>
+impl<S: Shape, E: Dtype, D, LhsTape: Tape<E, D>, R> TryPReLU<Tensor<S, E, D, R>>
     for Tensor<S, E, D, LhsTape>
 where
-    D: Device<E> + ScalarCmpKernel<LtKernelOp, E>,
+    D: Device<E>,
     LhsTape: Merge<R>,
 {
     /// See [prelu]
     fn try_prelu(self, rhs: Tensor<S, E, D, R>) -> Result<Self, Self::Err> {
         let scaled = self.with_empty_tape() * rhs;
-        Ok(self.scalar_lt(E::zero()).choose(scaled, self))
+        Ok(self.scalar_lt(E::default()).choose(scaled, self))
     }
 }
 
-impl<S: Shape, E: Dtype + Zero, D: Device<E> + ScalarCmpKernel<LtKernelOp, E>, T: Tape<E, D>>
-    TryPReLU<E> for Tensor<S, E, D, T>
-{
+impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> TryPReLU<E> for Tensor<S, E, D, T> {
     /// See [prelu]
     fn try_prelu(self, rhs: E) -> Result<Self, Self::Err> {
         let dev = self.device.clone();
         let scale = dev.tensor(rhs).retaped::<T>().broadcast_like(self.shape());
         let scaled = self.with_empty_tape() * scale;
-        Ok(self.scalar_lt(E::zero()).choose(scaled, self))
+        Ok(self.scalar_lt(E::default()).choose(scaled, self))
     }
 }
 

diff --git a/src/tensor_ops/utilities/device.rs b/src/tensor_ops/utilities/device.rs
@@ -62,6 +62,12 @@ pub trait Device<E: Dtype>:
 
     // boolean operations
     + super::super::boolean::BooleanKernel
+    + super::super::cmp::ScalarCmpKernel<super::super::cmp::EqKernelOp, E>
+    + super::super::cmp::ScalarCmpKernel<super::super::cmp::NeKernelOp, E>
+    + super::super::cmp::ScalarCmpKernel<super::super::cmp::GtKernelOp, E>
+    + super::super::cmp::ScalarCmpKernel<super::super::cmp::GeKernelOp, E>
+    + super::super::cmp::ScalarCmpKernel<super::super::cmp::LtKernelOp, E>
+    + super::super::cmp::ScalarCmpKernel<super::super::cmp::LeKernelOp, E>
 
     // unary
     + UnaryKernel<super::super::abs::AbsKernelOp, E>