Merged
32 changes: 19 additions & 13 deletions examples/01-tensor.rs
@@ -2,47 +2,53 @@
 
 use dfdx::{
     shapes::{Const, HasShape, Rank1, Rank2, Rank3},
-    tensor::{AsArray, Cpu, OnesTensor, SampleTensor, Tensor, TensorFrom, ZerosTensor},
+    tensor::{AsArray, OnesTensor, SampleTensor, Tensor, TensorFrom, ZerosTensor},
 };
 
+#[cfg(not(feature = "cuda"))]
+type Device = dfdx::tensor::Cpu;
+
+#[cfg(feature = "cuda")]
+type Device = dfdx::tensor::Cuda;
+
 fn main() {
     // a device is required to create & modify tensors.
     // we will use the Cpu device here for simplicity
-    let dev: Cpu = Default::default();
+    let dev = Device::default();
 
     // easily create tensors using the `TensorFromArray::tensor` method of devices
     // tensors are generic over the:
     // 1. Shape (in this case a rank 1 (1 dimension) array with 5 elements)
     // 2. Data type (in this case the default of `f32`)
     // 3. The device they are stored on (in this case the default of `Cpu`)
     // 4. A tape - see examples/04-gradients.rs
-    let _: Tensor<Rank1<5>, f32, Cpu> = dev.tensor([1.0, 2.0, 3.0, 4.0, 5.0]);
+    let _: Tensor<Rank1<5>, f32, Device> = dev.tensor([1.0, 2.0, 3.0, 4.0, 5.0]);
 
     // You can also use [ZerosTensor::zeros] and [OnesTensor::ones] to create tensors
     // filled with the corresponding values.
-    let _: Tensor<Rank2<2, 3>, f32, Cpu> = dev.zeros();
-    let _: Tensor<Rank3<1, 2, 3>, f32, Cpu> = dev.ones();
+    let _: Tensor<Rank2<2, 3>, f32, Device> = dev.zeros();
+    let _: Tensor<Rank3<1, 2, 3>, f32, Device> = dev.ones();
 
     // Dynamic size
-    let dynamic: Tensor<(usize, Const<3>, Const<4>), f32, Cpu> = dev.zeros_like(&(5, Const, Const));
+    let dynamic: Tensor<(usize, Const<3>, Const<4>), f32, _> = dev.zeros_like(&(5, Const, Const));
     println!("Dynamic shape={:?}", dynamic.shape());
 
     // each of the creation methods also supports specifying the shape on the function
     // note to change the dtype we specify the dtype as the 2nd generic parameter
-    let _: Tensor<Rank2<2, 3>, f64, Cpu> = dev.zeros();
-    let _: Tensor<Rank2<2, 3>, f32, Cpu> = dev.ones();
+    let _: Tensor<Rank2<2, 3>, f64, Device> = dev.zeros();
+    let _: Tensor<Rank2<2, 3>, f32, Device> = dev.ones();
 
     // we can also create tensors filled with random values
     // from a normal distribution
-    let _: Tensor<Rank3<2, 3, 4>, f32, Cpu> = dev.sample_normal();
+    let _: Tensor<Rank3<2, 3, 4>, f32, Device> = dev.sample_normal();
 
     // or a uniform distribution
-    let _: Tensor<Rank3<2, 3, 4>, f32, Cpu> = dev.sample_uniform();
+    let _: Tensor<Rank3<2, 3, 4>, f32, Device> = dev.sample_uniform();
 
-    // or whatever distributino you want to use!
-    let a: Tensor<Rank3<2, 3, 4>, f32, Cpu> = dev.sample(rand_distr::Uniform::new(-1.0, 1.0));
+    // or whatever distribution you want to use!
+    let a: Tensor<Rank3<2, 3, 4>, f32, Device> = dev.sample(rand_distr::Uniform::new(-1.0, 1.0));
 
-    // use `AsArray::as_array` to get acces to the data as an array
+    // use `AsArray::as_array` to get access to the data as an array
     let a_data: [[[f32; 4]; 3]; 2] = a.array();
     println!("a={a_data:?}");
 
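Every example in this PR adopts the same device-selection pattern: a cfg-gated type alias resolved at compile time by the `cuda` cargo feature, so downstream code only ever names `Device`. A minimal, dfdx-free sketch of the mechanism (the backend structs and the `cuda` feature declaration are illustrative stand-ins, not dfdx APIs):

```rust
// Stand-alone sketch of the cfg-gated alias. Assumes a Cargo.toml that
// declares `cuda = []` under [features]; both backend types here are
// hypothetical placeholders.
#[derive(Debug, Default)]
struct CpuBackend;

#[derive(Debug, Default)]
struct CudaBackend;

// Exactly one of these aliases is compiled in, so `Device` is unambiguous.
#[cfg(not(feature = "cuda"))]
type Device = CpuBackend;

#[cfg(feature = "cuda")]
type Device = CudaBackend;

fn main() {
    // prints "CpuBackend" by default, "CudaBackend" with `--features cuda`
    let dev = Device::default();
    println!("{dev:?}");
}
```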
22 changes: 20 additions & 2 deletions examples/02-ops.rs
@@ -2,12 +2,18 @@
 
 use dfdx::{
     shapes::{Rank0, Rank1, Rank2},
-    tensor::{AsArray, Cpu, SampleTensor, Tensor},
+    tensor::{AsArray, Cpu, SampleTensor, Tensor, ToDevice},
     tensor_ops::{MeanTo, TryMatMul},
 };
 
+#[cfg(not(feature = "cuda"))]
+type Device = Cpu;
+
+#[cfg(feature = "cuda")]
+type Device = dfdx::tensor::Cuda;
+
 fn main() {
-    let dev: Cpu = Default::default();
+    let dev = Device::default();
 
     let a: Tensor<Rank2<2, 3>, f32, _> = dev.sample_normal();
     dbg!(a.array());
@@ -53,4 +59,16 @@ fn main() {
     let b: Tensor<Rank1<7>, f32, _> = dev.sample_normal();
     let c = a.matmul(b);
     dbg!(c.array());
+
+    // these operations are equal across devices
+    #[cfg(feature = "cuda")]
+    {
+        let cpu = Cpu::default();
+
+        let a: Tensor<Rank1<3>, f32, _> = dev.sample_normal();
+        let b: Tensor<Rank1<7>, f32, _> = dev.sample_normal();
+        let a_cpu = a.to_device(&cpu);
+        let b_cpu = b.to_device(&cpu);
+        assert_eq!(a_cpu.matmul(b_cpu).array(), a.matmul(b).array());
+    }
 }
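The new cross-device check leans on the `ToDevice` trait added to the imports: `to_device` copies a tensor onto another device while keeping shape and dtype. A sketch of the round trip, under the assumption that the crate is built with the `cuda` feature and that `to_device` borrows its receiver (which the diff's `a.to_device(&cpu)` followed by a later use of `a` implies):

```rust
#[cfg(feature = "cuda")]
fn main() {
    use dfdx::{
        shapes::Rank1,
        tensor::{AsArray, Cpu, SampleTensor, Tensor, ToDevice},
    };

    let gpu = dfdx::tensor::Cuda::default();
    let cpu = Cpu::default();

    // sample on the GPU, copy to the CPU, copy back
    let on_gpu: Tensor<Rank1<3>, f32, _> = gpu.sample_normal();
    let on_cpu = on_gpu.to_device(&cpu);
    let round_trip = on_cpu.to_device(&gpu);

    // the data survives the round trip unchanged
    assert_eq!(on_cpu.array(), round_trip.to_device(&cpu).array());
}

#[cfg(not(feature = "cuda"))]
fn main() {
    // nothing to compare without a second device
}
```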
10 changes: 8 additions & 2 deletions examples/03-nn.rs
@@ -3,11 +3,17 @@
 use dfdx::{
     nn::{builders::*, BuildOnDevice, DeviceBuildExt, Module, ModuleMut, ResetParams},
     shapes::{Const, Rank1, Rank2},
-    tensor::{AsArray, Cpu, SampleTensor, Tensor, ZerosTensor},
+    tensor::{AsArray, SampleTensor, Tensor, ZerosTensor},
 };
 
+#[cfg(not(feature = "cuda"))]
+type Device = dfdx::tensor::Cpu;
+
+#[cfg(feature = "cuda")]
+type Device = dfdx::tensor::Cuda;
+
 fn main() {
-    let dev: Cpu = Default::default();
+    let dev = Device::default();
 
     // `nn::builders` exposes many different neural network types, like the Linear layer!
     type Model = Linear<4, 2>;
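With the alias in place, constructing a network on whichever backend was compiled in looks the same as the CPU-only code did. A sketch using the `DeviceBuildExt::build_module` call that also appears in examples/05-optim.rs below; the `Linear<4, 2>` model follows this file's own `type Model`:

```rust
use dfdx::{
    nn::{builders::Linear, DeviceBuildExt, Module},
    shapes::Rank1,
    tensor::{AsArray, SampleTensor, Tensor},
};

#[cfg(not(feature = "cuda"))]
type Device = dfdx::tensor::Cpu;

#[cfg(feature = "cuda")]
type Device = dfdx::tensor::Cuda;

fn main() {
    let dev = Device::default();

    // build a Linear<4, 2> with f32 parameters on the selected device
    type Model = Linear<4, 2>;
    let model = dev.build_module::<Model, f32>();

    // the model only accepts tensors living on the same device
    let x: Tensor<Rank1<4>, f32, _> = dev.sample_normal();
    let y = model.forward(x);
    println!("{:?}", y.array());
}
```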
22 changes: 14 additions & 8 deletions examples/04-gradients.rs
@@ -3,30 +3,36 @@
 use dfdx::{
     gradients::{Gradients, NoneTape, OwnedTape},
     shapes::{Rank0, Rank2},
-    tensor::{AsArray, Cpu, SampleTensor, Tensor},
+    tensor::{AsArray, SampleTensor, Tensor},
     tensor_ops::{Backward, MeanTo, TryMatMul},
 };
 
+#[cfg(not(feature = "cuda"))]
+type Device = dfdx::tensor::Cpu;
+
+#[cfg(feature = "cuda")]
+type Device = dfdx::tensor::Cuda;
+
 fn main() {
-    let dev: Cpu = Default::default();
+    let dev = Device::default();
 
     // tensors are actually generic over a fourth field: the tape
     // by default tensors are created with a `NoneTape`, which
     // means they don't currently **own** a tape.
-    let weight: Tensor<Rank2<4, 2>, f32, Cpu, NoneTape> = dev.sample_normal();
-    let a: Tensor<Rank2<3, 4>, f32, Cpu, NoneTape> = dev.sample_normal();
+    let weight: Tensor<Rank2<4, 2>, f32, _, NoneTape> = dev.sample_normal();
+    let a: Tensor<Rank2<3, 4>, f32, _, NoneTape> = dev.sample_normal();
 
     // the first step to tracing is to call .trace()
     // this sticks a gradient tape into the input tensor!
     // NOTE: the tape has changed from a `NoneTape` to an `OwnedTape`.
-    let b: Tensor<Rank2<3, 4>, _, _, OwnedTape<Cpu>> = a.trace();
+    let b: Tensor<Rank2<3, 4>, _, _, OwnedTape<Device>> = a.trace();
 
     // the tape will **automatically** be moved around as you perform ops
     // ie. the tapes on inputs to operations are moved to the output
     // of the operation.
-    let c: Tensor<Rank2<3, 2>, _, _, OwnedTape<_>> = b.matmul(weight.clone());
-    let d: Tensor<Rank2<3, 2>, _, _, OwnedTape<_>> = c.sin();
-    let e: Tensor<Rank0, _, _, OwnedTape<_>> = d.mean();
+    let c: Tensor<Rank2<3, 2>, _, _, OwnedTape<Device>> = b.matmul(weight.clone());
+    let d: Tensor<Rank2<3, 2>, _, _, OwnedTape<Device>> = c.sin();
+    let e: Tensor<Rank0, _, _, OwnedTape<Device>> = d.mean();
 
     // finally you can use .backward() to extract the gradients!
     // NOTE: that this method is only available on tensors that **own**
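The visible hunk stops just before the payoff; for completeness, a sketch of the whole trace, ops, `backward()` loop on the gated device. `Gradients::get` is how dfdx's examples read a parameter's gradient back out; treat that accessor as an assumption if your version differs:

```rust
use dfdx::{
    gradients::NoneTape,
    shapes::{Rank0, Rank2},
    tensor::{AsArray, SampleTensor, Tensor},
    tensor_ops::{Backward, MeanTo, TryMatMul},
};

#[cfg(not(feature = "cuda"))]
type Device = dfdx::tensor::Cpu;

#[cfg(feature = "cuda")]
type Device = dfdx::tensor::Cuda;

fn main() {
    let dev = Device::default();
    let weight: Tensor<Rank2<4, 2>, f32, _, NoneTape> = dev.sample_normal();
    let a: Tensor<Rank2<3, 4>, f32, _, NoneTape> = dev.sample_normal();

    // trace() attaches an OwnedTape; ops move it to their outputs
    let loss: Tensor<Rank0, f32, _, _> = a.trace().matmul(weight.clone()).mean();

    // backward() consumes the scalar and returns the recorded gradients
    let grads = loss.backward();
    dbg!(grads.get(&weight).array());
}
```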
10 changes: 8 additions & 2 deletions examples/05-optim.rs
@@ -5,10 +5,16 @@ use dfdx::{
     nn::{builders::*, DeviceBuildExt, ModuleMut},
     optim::{Momentum, Optimizer, Sgd, SgdConfig},
     shapes::Rank2,
-    tensor::{AsArray, Cpu, SampleTensor, Tensor},
+    tensor::{AsArray, SampleTensor, Tensor},
     tensor_ops::Backward,
 };
 
+#[cfg(not(feature = "cuda"))]
+type Device = dfdx::tensor::Cpu;
+
+#[cfg(feature = "cuda")]
+type Device = dfdx::tensor::Cuda;
+
 // first let's declare our neural network to optimze
 type Mlp = (
     (Linear<5, 32>, ReLU),
@@ -17,7 +23,7 @@ type Mlp = (
 );
 
 fn main() {
-    let dev: Cpu = Default::default();
+    let dev = Device::default();
 
     // First randomly initialize our model
     let mut mlp = dev.build_module::<Mlp, f32>();
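The imports above are everything one SGD step needs. A sketch of that step on the gated device; the two-layer `Mlp` and the config values are illustrative, and `Sgd::new(&module, config)` / `update(&mut module, grads)` follow dfdx's optim API of this vintage, which is an assumption worth checking against your version:

```rust
use dfdx::{
    losses::mse_loss,
    nn::{builders::*, DeviceBuildExt, ModuleMut},
    optim::{Momentum, Optimizer, Sgd, SgdConfig},
    shapes::Rank2,
    tensor::{SampleTensor, Tensor},
    tensor_ops::Backward,
};

#[cfg(not(feature = "cuda"))]
type Device = dfdx::tensor::Cpu;

#[cfg(feature = "cuda")]
type Device = dfdx::tensor::Cuda;

// illustrative network, not the Mlp from the example
type Mlp = ((Linear<5, 32>, ReLU), Linear<32, 2>);

fn main() {
    let dev = Device::default();
    let mut mlp = dev.build_module::<Mlp, f32>();

    // illustrative data: 10 samples, 5 features in, 2 targets out
    let x: Tensor<Rank2<10, 5>, f32, _> = dev.sample_normal();
    let y: Tensor<Rank2<10, 2>, f32, _> = dev.sample_normal();

    let mut sgd = Sgd::new(
        &mlp,
        SgdConfig {
            lr: 1e-2,
            momentum: Some(Momentum::Nesterov(0.9)),
            weight_decay: None,
        },
    );

    // forward_mut so stateful layers could update; then loss -> grads -> step
    let pred = mlp.forward_mut(x.trace());
    let loss = mse_loss(pred, y);
    let grads = loss.backward();
    sgd.update(&mut mlp, grads).expect("some params were unused");
}
```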
10 changes: 6 additions & 4 deletions examples/06-mnist.rs
@@ -17,18 +17,20 @@
 //! cargo run --example 06-mnist -- tmp/
 //! ```
 
-use dfdx::{data::SubsetIterator, losses::cross_entropy_with_logits_loss, optim::Adam, prelude::*};
+use std::time::Instant;
+
 use indicatif::ProgressBar;
 use mnist::*;
 use rand::prelude::{SeedableRng, StdRng};
-use std::time::Instant;
 
-#[cfg(feature = "cuda")]
-type Dev = Cuda;
+use dfdx::{data::SubsetIterator, losses::cross_entropy_with_logits_loss, optim::Adam, prelude::*};
 
 #[cfg(not(feature = "cuda"))]
 type Dev = Cpu;
 
+#[cfg(feature = "cuda")]
+type Dev = Cuda;
+
 struct MnistDataset {
     img: Vec<f32>,
     lbl: Vec<usize>,
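With the import and alias blocks reordered here, backend selection for the MNIST example is purely a cargo-feature switch: assuming a CUDA toolchain is available, the doc comment's invocation becomes `cargo run --features cuda --example 06-mnist -- tmp/` to train on the GPU, with no source changes.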
42 changes: 26 additions & 16 deletions examples/07-custom-module.rs
@@ -3,28 +3,33 @@
 use dfdx::{
     gradients::Tape,
     nn::{
-        self,
         modules::{Linear, ReLU},
         BuildModule, Module,
     },
     shapes::{Rank1, Rank2},
-    tensor::{Cpu, HasErr, SampleTensor, Tensor},
+    tensor::{HasErr, SampleTensor, Tensor},
 };
 
+#[cfg(not(feature = "cuda"))]
+type Device = dfdx::tensor::Cpu;
+
+#[cfg(feature = "cuda")]
+type Device = dfdx::tensor::Cuda;
+
 /// Custom model struct
 /// This case is trivial and should be done with a tuple of linears and relus,
 /// but it demonstrates how to build models with custom behavior
 struct Mlp<const IN: usize, const INNER: usize, const OUT: usize> {
-    l1: Linear<IN, INNER, f32, Cpu>,
-    l2: Linear<INNER, OUT, f32, Cpu>,
+    l1: Linear<IN, INNER, f32, Device>,
+    l2: Linear<INNER, OUT, f32, Device>,
     relu: ReLU,
 }
 
 // BuildModule lets you randomize a model's parameters
-impl<const IN: usize, const INNER: usize, const OUT: usize> nn::BuildModule<Cpu, f32>
+impl<const IN: usize, const INNER: usize, const OUT: usize> BuildModule<Device, f32>
     for Mlp<IN, INNER, OUT>
 {
-    fn try_build(device: &Cpu) -> Result<Self, <Cpu as HasErr>::Err> {
+    fn try_build(device: &Device) -> Result<Self, <Device as HasErr>::Err> {
         Ok(Self {
             l1: BuildModule::try_build(device)?,
             l2: BuildModule::try_build(device)?,
@@ -34,25 +39,30 @@ impl<const IN: usize, const INNER: usize, const OUT: usize> nn::BuildModule<Cpu,
 }
 
 // impl Module for single item
-impl<const IN: usize, const INNER: usize, const OUT: usize> nn::Module<Tensor<Rank1<IN>, f32, Cpu>>
+impl<const IN: usize, const INNER: usize, const OUT: usize> Module<Tensor<Rank1<IN>, f32, Device>>
     for Mlp<IN, INNER, OUT>
 {
-    type Output = Tensor<Rank1<OUT>, f32, Cpu>;
+    type Output = Tensor<Rank1<OUT>, f32, Device>;
 
-    fn forward(&self, x: Tensor<Rank1<IN>, f32, Cpu>) -> Self::Output {
+    fn forward(&self, x: Tensor<Rank1<IN>, f32, Device>) -> Self::Output {
         let x = self.l1.forward(x);
         let x = self.relu.forward(x);
         self.l2.forward(x)
     }
 }
 
 // impl Module for batch of items
-impl<const BATCH: usize, const IN: usize, const INNER: usize, const OUT: usize, T: Tape<Cpu>>
-    nn::Module<Tensor<Rank2<BATCH, IN>, f32, Cpu, T>> for Mlp<IN, INNER, OUT>
+impl<
+    const BATCH: usize,
+    const IN: usize,
+    const INNER: usize,
+    const OUT: usize,
+    T: Tape<Device>,
+> Module<Tensor<Rank2<BATCH, IN>, f32, Device, T>> for Mlp<IN, INNER, OUT>
 {
-    type Output = Tensor<Rank2<BATCH, OUT>, f32, Cpu, T>;
+    type Output = Tensor<Rank2<BATCH, OUT>, f32, Device, T>;
 
-    fn forward(&self, x: Tensor<Rank2<BATCH, IN>, f32, Cpu, T>) -> Self::Output {
+    fn forward(&self, x: Tensor<Rank2<BATCH, IN>, f32, Device, T>) -> Self::Output {
         let x = self.l1.forward(x);
         let x = self.relu.forward(x);
         self.l2.forward(x)
@@ -61,16 +71,16 @@ impl<const BATCH: usize, const IN: usize, const INNER: usize, const OUT: usize,
 
 fn main() {
     // Rng for generating model's params
-    let dev: Cpu = Default::default();
+    let dev = Device::default();
 
     // Construct model
     let model = Mlp::<10, 512, 20>::build(&dev);
 
     // Forward pass with a single sample
     let item: Tensor<Rank1<10>, f32, _> = dev.sample_normal();
-    let _: Tensor<Rank1<20>, f32, Cpu> = model.forward(item);
+    let _: Tensor<Rank1<20>, f32, _> = model.forward(item);
 
     // Forward pass with a batch of samples
     let batch: Tensor<Rank2<32, 10>, f32, _> = dev.sample_normal();
-    let _: Tensor<Rank2<32, 20>, f32, Cpu, _> = model.forward(batch.trace());
+    let _: Tensor<Rank2<32, 20>, f32, _, _> = model.forward(batch.trace());
 }
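Hard-wiring the alias into `Mlp`'s fields is the simplest migration, but the same struct could instead be generic over the device, which is how dfdx's own modules are written. A sketch of that alternative; the `D: Device<f32>` bound is dfdx's device trait as used by `modules::Linear`, and its exact path in this era is an assumption to verify:

```rust
use dfdx::{
    nn::modules::{Linear, ReLU},
    tensor_ops::Device,
};

// one definition usable on any backend; pick D at the use site
struct Mlp<const IN: usize, const INNER: usize, const OUT: usize, D: Device<f32>> {
    l1: Linear<IN, INNER, f32, D>,
    l2: Linear<INNER, OUT, f32, D>,
    relu: ReLU,
}
```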
14 changes: 11 additions & 3 deletions examples/08-tensor-broadcast-reduce.rs
@@ -3,13 +3,21 @@
 
 use dfdx::{
     shapes::{Axis, Rank2, Rank4},
-    tensor::{AsArray, Cpu, TensorFrom},
+    tensor::{AsArray, TensorFrom},
     tensor_ops::{BroadcastTo, MeanTo},
 };
 
+#[cfg(not(feature = "cuda"))]
+type Device = dfdx::tensor::Cpu;
+
+#[cfg(feature = "cuda")]
+type Device = dfdx::tensor::Cuda;
+
 fn main() {
-    let dev: Cpu = Default::default();
-    let a = dev.tensor([1.0, 2.0, 3.0]);
+    let dev = Device::default();
+    let a = dev.tensor([1.0f32, 2.0, 3.0]);
+    // NOTE: Cuda currently does not support broadcasting.
+    // Its usage results in errors and wrong outputs.
 
     // to broadcast, use `Broadcast::broadcast()` and specify
     // the output type. the axes that are broadcast are inferred for you!
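Broadcast and its inverse reduction in one place, per the comment above (annotate the output type, the axes are inferred). Pinned to `Cpu` here because of the Cuda caveat this diff adds:

```rust
use dfdx::{
    shapes::{Rank1, Rank2},
    tensor::{AsArray, Cpu, Tensor, TensorFrom},
    tensor_ops::{BroadcastTo, MeanTo},
};

fn main() {
    let dev = Cpu::default();
    let a = dev.tensor([1.0f32, 2.0, 3.0]);

    // [3] -> [2, 3]: the new axis is inferred from the target shape
    let b: Tensor<Rank2<2, 3>, f32, _> = a.broadcast();
    assert_eq!(b.array(), [[1.0, 2.0, 3.0]; 2]);

    // and back down: mean over the broadcast axis recovers the original
    let c: Tensor<Rank1<3>, f32, _> = b.mean();
    assert_eq!(c.array(), [1.0, 2.0, 3.0]);
}
```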
16 changes: 12 additions & 4 deletions examples/09-tensor-permute.rs
@@ -1,11 +1,19 @@
 //! Demonstrates how to re-order (permute/transpose) the axes of a tensor
 
-use dfdx::shapes::{Axes3, Rank3};
-use dfdx::tensor::{Cpu, Tensor, ZerosTensor};
-use dfdx::tensor_ops::PermuteTo;
+use dfdx::{
+    shapes::{Axes3, Rank3},
+    tensor::{Tensor, ZerosTensor},
+    tensor_ops::PermuteTo,
+};
 
+#[cfg(not(feature = "cuda"))]
+type Device = dfdx::tensor::Cpu;
+
+#[cfg(feature = "cuda")]
+type Device = dfdx::tensor::Cuda;
+
 fn main() {
-    let dev: Cpu = Default::default();
+    let dev = Device::default();
 
     let a: Tensor<Rank3<3, 5, 7>, f32, _> = dev.zeros();
 
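As with broadcasting, `PermuteTo` can infer the axis reordering from the annotated output shape when the mapping is unambiguous (all three dims here differ); the `Axes3` import covers explicit axis selection. A sketch following the example's own zeros tensor:

```rust
use dfdx::{
    shapes::{HasShape, Rank3},
    tensor::{Tensor, ZerosTensor},
    tensor_ops::PermuteTo,
};

#[cfg(not(feature = "cuda"))]
type Device = dfdx::tensor::Cpu;

#[cfg(feature = "cuda")]
type Device = dfdx::tensor::Cuda;

fn main() {
    let dev = Device::default();
    let a: Tensor<Rank3<3, 5, 7>, f32, _> = dev.zeros();

    // [3, 5, 7] -> [5, 7, 3]; the permutation is inferred from the target
    let b: Tensor<Rank3<5, 7, 3>, f32, _> = a.permute();
    println!("{:?}", b.shape());
}
```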
10 changes: 8 additions & 2 deletions examples/10-tensor-index.rs
@@ -2,12 +2,18 @@
 
 use dfdx::{
     shapes::Rank3,
-    tensor::{AsArray, Cpu, Tensor, TensorFrom},
+    tensor::{AsArray, Tensor, TensorFrom},
     tensor_ops::{GatherTo, SelectTo},
 };
 
+#[cfg(not(feature = "cuda"))]
+type Device = dfdx::tensor::Cpu;
+
+#[cfg(feature = "cuda")]
+type Device = dfdx::tensor::Cuda;
+
 fn main() {
-    let dev: Cpu = Default::default();
+    let dev = Device::default();
 
     let a: Tensor<Rank3<4, 2, 3>, f32, _> = dev.tensor([
         [[0.00, 0.01, 0.02], [0.10, 0.11, 0.12]],