diff --git a/R/CallbackSetLRScheduler.R b/R/CallbackSetLRScheduler.R
index b43cf8b3e..4670fc0ac 100644
--- a/R/CallbackSetLRScheduler.R
+++ b/R/CallbackSetLRScheduler.R
@@ -10,7 +10,7 @@
 #' * [torch::lr_cosine_annealing()]
 #' * [torch::lr_lambda()]
 #' * [torch::lr_multiplicative()]
-#' * [torch::lr_one_cycle()] (where the default values for `epochs` and `steps_per_epoch` are the number of training epochs and the number of batches per epoch)
+#' * [torch::lr_one_cycle()]
 #' * [torch::lr_reduce_on_plateau()]
 #' * [torch::lr_step()]
 #' * Custom schedulers defined with [torch::lr_scheduler()].
@@ -18,9 +18,7 @@
 #' @param .scheduler (`lr_scheduler_generator`)\cr
 #'   The `torch` scheduler generator (e.g. `torch::lr_step`).
 #' @param ... (any)\cr
-#'   The scheduler-specific initialization arguments.
-#' @param step_on_epoch (`logical(1)`)\cr
-#'   Whether the scheduler steps after every epoch (otherwise every batch).
+#'   The scheduler-specific arguments.
 #'
 #' @export
 CallbackSetLRScheduler = R6Class("CallbackSetLRScheduler",
@@ -35,13 +33,14 @@ CallbackSetLRScheduler = R6Class("CallbackSetLRScheduler",
     scheduler = NULL,
     #' @description
     #' Creates a new instance of this [R6][R6::R6Class] class.
+    #' @param step_on_epoch (`logical(1)`)\cr
+    #'   Whether the scheduler steps after every epoch (otherwise every batch).
     initialize = function(.scheduler, step_on_epoch, ...) {
       assert_class(.scheduler, "lr_scheduler_generator")
       assert_flag(step_on_epoch)
 
       self$scheduler_fn = .scheduler
       private$.scheduler_args = list(...)
-
       if (step_on_epoch) {
         self$on_epoch_end = function() self$scheduler$step()
       } else {
@@ -59,77 +58,6 @@ CallbackSetLRScheduler = R6Class("CallbackSetLRScheduler",
   )
 )
 
-#' @title OneCycle Learning Rate Scheduling Callback
-#'
-#' @name mlr_callback_set.lr_scheduler_one_cycle
-#'
-#' @description
-#' Changes the learning rate based on the 1cycle learning rate policy.
-#'
-#' Wraps [torch::lr_one_cycle()], where the default values for `epochs` and `steps_per_epoch` are the number of training epochs and the number of batches per epoch.
-#'
-#' @param ... (any)\cr
-#'   The scheduler-specific initialization arguments.
-#'
-#' @export
-CallbackSetLRSchedulerOneCycle = R6Class("CallbackSetLRSchedulerOneCycle",
-  inherit = CallbackSetLRScheduler,
-  lock_objects = FALSE,
-  public = list(
-    #' @description
-    #' Creates a new instance of this [R6][R6::R6Class] class.
-    initialize = function(...) {
-      super$initialize(
-        .scheduler = torch::lr_one_cycle,
-        step_on_epoch = FALSE,
-        ...
-        )
-    },
-    #' @description
-    #' Creates the scheduler using the optimizer from the context
-    on_begin = function() {
-      private$.scheduler_args = insert_named(
-        private$.scheduler_args,
-        list(epochs = self$ctx$total_epochs, steps_per_epoch = self$ctx$loader_train$.length())
-      )
-
-      self$scheduler = invoke(self$scheduler_fn, optimizer = self$ctx$optimizer, .args = private$.scheduler_args)
-    }
-  )
-)
-
-#' @title Reduce On Plateau Learning Rate Scheduler
-#'
-#' @name mlr_callback_set.lr_scheduler_reduce_on_plateau
-#'
-#' @description
-#' Reduces the learning rate when the first validation metric stops improving for `patience` epochs.
-#' Wraps [torch::lr_reduce_on_plateau()]
-#'
-#' @param ... (any)\cr
-#'   The scheduler-specific initialization arguments.
-#'
-#' @export
-CallbackSetLRSchedulerReduceOnPlateau = R6Class("CallbackSetLRSchedulerReduceOnPlateau",
-  inherit = CallbackSetLRScheduler,
-  lock_objects = FALSE,
-  public = list(
-    #' @description
-    #' Creates a new instance of this [R6][R6::R6Class] class.
-    initialize = function(...) {
-      super$initialize(
-        .scheduler = torch::lr_reduce_on_plateau,
-        step_on_epoch = TRUE,
-        ...
-      )
-
-      self$on_epoch_end = function() {
-        self$scheduler$step(self$ctx$last_scores_valid[[1L]])
-      }
-    }
-  )
-)
-
 # some of the schedulers accept lists
 # so they can treat different parameter groups differently
 check_class_or_list = function(x, classname) {
@@ -162,7 +90,7 @@ mlr3torch_callbacks$add("lr_lambda", function() {
       last_epoch = p_int(default = -1, tags = "train"),
       verbose = p_lgl(default = FALSE, tags = "train")
     ),
-    id = "lr_lambda",
+    id = "lr_scheduler",
     label = "Multiplication by Function LR Scheduler",
     man = "mlr3torch::mlr_callback_set.lr_scheduler",
     additional_args = list(.scheduler = torch::lr_lambda, step_on_epoch = TRUE)
@@ -188,7 +116,7 @@ mlr3torch_callbacks$add("lr_multiplicative", function() {
 #' @include TorchCallback.R
 mlr3torch_callbacks$add("lr_one_cycle", function() {
   TorchCallback$new(
-    callback_generator = CallbackSetLRSchedulerOneCycle,
+    callback_generator = CallbackSetLRScheduler,
     param_set = ps(
       max_lr = p_uty(tags = c("train", "required"), custom_check = function(x) check_class_or_list(x, "numeric")),
       total_steps = p_int(default = NULL, special_vals = list(NULL), tags = "train"),
@@ -204,15 +132,16 @@ mlr3torch_callbacks$add("lr_one_cycle", function() {
       verbose = p_lgl(default = FALSE, tags = "train")
     ),
     id = "lr_one_cycle",
-    label = "1cycle LR Scheduler",
-    man = "mlr3torch::mlr_callback_set.lr_scheduler"
+    label = "1cyle LR Scheduler",
+    man = "mlr3torch::mlr_callback_set.lr_scheduler",
+    additional_args = list(.scheduler = torch::lr_one_cycle, step_on_epoch = FALSE)
   )
 })
 
 #' @include TorchCallback.R
 mlr3torch_callbacks$add("lr_reduce_on_plateau", function() {
   TorchCallback$new(
-    callback_generator = CallbackSetLRSchedulerReduceOnPlateau,
+    callback_generator = CallbackSetLRScheduler,
     param_set = ps(
       mode = p_fct(default = "min", levels = c("min", "max"), tags = "train"),
       factor = p_dbl(default = 0.1, tags = "train"),
@@ -226,7 +155,8 @@ mlr3torch_callbacks$add("lr_reduce_on_plateau", function() {
     ),
     id = "lr_reduce_on_plateau",
     label = "Reduce on Plateau LR Scheduler",
-    man = "mlr3torch::mlr_callback_set.lr_scheduler"
+    man = "mlr3torch::mlr_callback_set.lr_scheduler",
+    additional_args = list(.scheduler = torch::lr_reduce_on_plateau, step_on_epoch = TRUE)
   )
 })
 
diff --git a/R/CallbackSetProgress.R b/R/CallbackSetProgress.R
index 2138b64c7..35b8bd41c 100644
--- a/R/CallbackSetProgress.R
+++ b/R/CallbackSetProgress.R
@@ -7,8 +7,6 @@
 #'
 #' @family Callback
 #' @include CallbackSet.R
-#' @param digits `integer(1)`\cr
-#'   The number of digits to print for the measures.
 #' @export
 #' @examplesIf torch::torch_is_installed()
 #' task = tsk("iris")
@@ -25,11 +23,6 @@ CallbackSetProgress = R6Class("CallbackSetProgress",
   inherit = CallbackSet,
   lock_objects = FALSE,
   public = list(
-    #' @description
-    #' Creates a new instance of this [R6][R6::R6Class] class.
-    initialize = function(digits = 2) {
-      self$digits = assert_int(digits, lower = 0)
-    },
     #' @description
     #' Initializes the progress bar for training.
     on_epoch_begin = function() {
@@ -48,7 +41,6 @@ CallbackSetProgress = R6Class("CallbackSetProgress",
     #' @description
     #' Creates the progress bar for validation.
     on_before_valid = function() {
-      catf("Validation for epoch %s started (%s)", self$ctx$epoch, format(Sys.time()))
       self$pb_valid = progress::progress_bar$new(
         total = length(self$ctx$loader_valid),
         format = "Validation: [:bar]"
@@ -77,7 +69,7 @@ CallbackSetProgress = R6Class("CallbackSetProgress",
         for (phase in names(scores)) {
           catf("Measures (%s):", capitalize(phase))
           curscore = scores[[phase]]
-          output = sprintf(paste0(" * %s = %.", self$digits, "f\n"), names(curscore), unlist(curscore))
+          output = sprintf(" * %s = %.2f\n", names(curscore), unlist(curscore))
           cat(paste(output, collapse = ""))
         }
       }
@@ -95,9 +87,7 @@ CallbackSetProgress = R6Class("CallbackSetProgress",
 mlr3torch_callbacks$add("progress", function() {
   TorchCallback$new(
     callback_generator = CallbackSetProgress,
-    param_set = ps(
-      digits = p_int(lower = 1, default = 2, tags = "train")
-    ),
+    param_set = ps(),
     id = "progress",
     label = "Progress",
     man = "mlr3torch::mlr_callback_set.progress",
diff --git a/R/CallbackSetTB.R b/R/CallbackSetTB.R
index a8a31edd7..ed4040096 100644
--- a/R/CallbackSetTB.R
+++ b/R/CallbackSetTB.R
@@ -28,15 +28,14 @@ CallbackSetTB = R6Class("CallbackSetTB",
         dir.create(path, recursive = TRUE)
       }
       self$log_train_loss = assert_flag(log_train_loss)
-      if (self$log_train_loss) {
-        self$on_batch_end = function() {
-          private$.log_train_loss()
-        }
-      }
     },
     #' @description
     #' Logs the training loss, training measures, and validation measures as TensorBoard events.
     on_epoch_end = function() {
+      if (self$log_train_loss) {
+        private$.log_train_loss()
+      }
+
       if (length(self$ctx$last_scores_train)) {
         walk(names(self$ctx$measures_train), private$.log_train_score)
       }
diff --git a/R/ContextTorch.R b/R/ContextTorch.R
index 922bb9346..ce507e402 100644
--- a/R/ContextTorch.R
+++ b/R/ContextTorch.R
@@ -112,9 +112,6 @@ ContextTorch = R6Class("ContextTorch",
     #' @field last_loss (`numeric(1)`)\cr
     #' The loss from the last trainings batch.
     last_loss = NULL,
-    #' @field y_hat (`torch_tensor`)\cr
-    #' The model's prediction for the current batch.
-    y_hat = NULL,
     #' @field epoch (`integer(1)`)\cr
     #'   The current epoch.
     epoch = NULL,
diff --git a/R/DataDescriptor.R b/R/DataDescriptor.R
index 81dad0a33..1bf3cd68d 100644
--- a/R/DataDescriptor.R
+++ b/R/DataDescriptor.R
@@ -84,7 +84,8 @@ DataDescriptor = R6Class("DataDescriptor",
         assert_true(length(graph$pipeops) >= 1L)
       }
       # no preprocessing, dataset returns only a single element (there we can infer a lot)
-      simple_case = length(graph$pipeops) == 1L && inherits(graph$pipeops[[1L]], "PipeOpNOP")
+      simple_case = length(graph$pipeops) == 1L && inherits(graph$pipeops[[1L]], "PipeOpNOP") &&
+        length(dataset_shapes) == 1L
 
       if (is.null(input_map) && nrow(graph$input) == 1L && length(dataset_shapes) == 1L) {
         input_map = names(dataset_shapes)
@@ -99,7 +100,7 @@ DataDescriptor = R6Class("DataDescriptor",
         assert_choice(pointer[[2]], graph$pipeops[[pointer[[1]]]]$output$name)
       }
       if (is.null(pointer_shape) && simple_case) {
-        pointer_shape = dataset_shapes[[input_map]]
+        pointer_shape = dataset_shapes[[1L]]
       } else {
         assert_shape(pointer_shape, null_ok = TRUE)
       }
@@ -224,7 +225,7 @@ infer_shapes_from_getbatch = function(ds) {
 }
 
 assert_compatible_shapes = function(shapes, dataset) {
-  shapes = assert_shapes(shapes, null_ok = TRUE, unknown_batch = TRUE, named = TRUE)
+  assert_shapes(shapes, null_ok = TRUE, unknown_batch = TRUE, named = TRUE)
 
   # prevent user from e.g. forgetting to wrap the return in a list
   example = if (is.null(dataset$.getbatch)) {
@@ -246,26 +247,12 @@ assert_compatible_shapes = function(shapes, dataset) {
   }
 
   iwalk(shapes, function(dataset_shape, name) {
-    if (is.null(dataset_shape)) {
-      return(NULL)
-    }
-    shape_specified = shapes[[name]]
-    shape_example = example[[name]]$shape
-    if (length(shape_specified) != length(shape_example)) {
-      stopf("The specified number of dimensions for element '%s' is %s, but the dataset returned %s",
-        name, length(shape_specified), length(shape_example))
-    }
-
-    if (all(is.na(shape_specified))) {
-      # compatible with any shape
-      return(NULL)
-    }
-
-    shape_example[is.na(shape_specified)] = NA
-    if (!test_equal(shape_specified, shape_example)) {
-      stopf(paste0("First example batch from dataset is incompatible with the provided shape of %s:\n",
-        "* Observed shape: %s.\n* Specified shape: %s."), name,
-        shape_to_str(example[[name]]$shape), shape_to_str(shape_specified))
+    if (!is.null(dataset_shape) && !test_equal(shapes[[name]][-1], example[[name]]$shape[-1L])) {
+      expected_shape = example[[name]]$shape
+      expected_shape[1] = NA
+      stopf(paste0("First batch from dataset is incompatible with the provided shape of %s:\n",
+        "* Provided shape: %s.\n* Expected shape: %s."), name,
+        shape_to_str(unname(shapes[name])), shape_to_str(list(expected_shape)))
     }
   })
 }
diff --git a/R/LearnerTorch.R b/R/LearnerTorch.R
index ddcdf9a46..865ac733e 100644
--- a/R/LearnerTorch.R
+++ b/R/LearnerTorch.R
@@ -29,21 +29,6 @@
 #' To do so, you just need to include `epochs = to_tune(upper = <upper>, internal = TRUE)` in the search space,
 #' where `<upper>` is the maximally allowed number of epochs, and configure the early stopping.
 #'
-#' @section Network Head and Target Encoding:
-#' Torch learners are expected to have the following output:
-#' * binary classification: `(batch_size, 1)`, representing the logits for the positive class.
-#' * multiclass classification: `(batch_size, n_classes)`, representing the logits for all classes.
-#' * regression: `(batch_size, 1)` representing the response prediction.
-#'
-#' Furthermore, the target encoding is expected to be as follows:
-#' * regression: The `numeric` target variable of a [`TaskRegr`][mlr3::TaskRegr] is encoded as a
-#'   [`torch_float`][torch::torch_float] with shape `c(batch_size, 1)`.
-#' * binary classification: The `factor` target variable of a [`TaskClassif`][mlr3::TaskClassif] is encoded as a
-#'   [`torch_float`][torch::torch_float] with shape `(batch_size, 1)` where the positive class (`Task$positive`, which
-#'   is also ensured to be the first factor level) is `1` and the negative class is `0`.
-#' * multi-class classification: The `factor` target variable of a [`TaskClassif`][mlr3::TaskClassif] is a label-encoded
-#'   [`torch_long`][torch::torch_long] with shape `(batch_size)` where the label-encoding goes from `1` to `n_classes`.
-#'
 #' @template param_id
 #' @template param_task_type
 #' @template param_param_vals
@@ -72,8 +57,6 @@
 #' @param callbacks (`list()` of [`TorchCallback`]s)\cr
 #'   The callbacks to use for training.
 #'   Defaults to an empty` list()`, i.e. no callbacks.
-#' @param jittable (`logical(1)`)\cr
-#'   Whether the model can be jit-traced. Default is `FALSE`.
 #'
 #' @section Model:
 #' The Model is a list of class `"learner_torch_model"` with the following elements:
@@ -92,35 +75,24 @@
 #' Instead, the `task_type` must be specified  as a construction argument.
 #' Currently, only classification and regression are supported.
 #'
-#' When inheriting from this class, one should overload the following methods:
+#' When inheriting from this class, one should overload two private methods:
 #'
 #' * `.network(task, param_vals)`\cr
 #'   ([`Task`][mlr3::Task], `list()`) -> [`nn_module`][torch::nn_module]\cr
 #'   Construct a [`torch::nn_module`] object for the given task and parameter values, i.e. the neural network that
 #'   is trained by the learner.
-#'   Note that a specific output shape is expected from the returned network, see section *Network Head and Target Encoding*.
-#'   You can use [`output_dim_for()`] to obtain the correct output dimension for a given task.
-#' * `.ingress_tokens(task, param_vals)`\cr
-#'   ([`Task`][mlr3::Task], `list()`) -> named `list()` with [`TorchIngressToken`]s\cr
-#'   Create the [`TorchIngressToken`]s that are passed to the [`task_dataset`] constructor.
-#'   The number of ingress tokens must correspond to the number of input parameters of the network.
-#'   If there is more than one input, the names must correspond to the inputs of the network.
-#'   See [`ingress_num`], [`ingress_categ`], and [`ingress_ltnsr`] on how to easily create the correct tokens.
-#'   For more flexibility, you can also directly implement the `.dataset(task, param_vals)` method,
-#'   see below.
+#'   For classification, the output of this network is expected to be the scores before the application of the
+#'   final softmax layer.
 #' * `.dataset(task, param_vals)`\cr
 #'   ([`Task`][mlr3::Task], `list()`) -> [`torch::dataset`]\cr
 #'   Create the dataset for the task.
-#'   Don't implement this if the `.ingress_tokens()` method is defined.
 #'   The dataset must return a named list where:
 #'   * `x` is a list of torch tensors that are the input to the network.
 #'     For networks with more than one input, the names must correspond to the inputs of the network.
 #'   * `y` is the target tensor.
 #'   * `.index` are the indices of the batch (`integer()` or a `torch_int()`).
 #'
-#'   For information on the expected target encoding of `y`, see section *Network Head and Target Encoding*.
 #'   Moreover, one needs to pay attention respect the row ids of the provided task.
-#'   It is recommended to relu on [`task_dataset`] for creating the [`dataset`][torch::dataset].
 #'
 #' It is also possible to overwrite the private `.dataloader()` method.
 #' This must respect the dataloader parameters from the [`ParamSet`][paradox::ParamSet].
@@ -128,9 +100,9 @@
 #' * `.dataloader(dataset, param_vals)`\cr
 #'   ([`Task`][mlr3::Task], `list()`) -> [`torch::dataloader`]\cr
 #'   Create a dataloader from the task.
-#'   Needs to respect at least `batch_size` and `shuffle` (otherwise predictions will be incorrectly ordered).
+#'   Needs to respect at least `batch_size` and `shuffle` (otherwise predictions can be permuted).
 #'
-#' To change the predict types, it is possible to overwrite the method below:
+#' To change the predict types, it is possible to overwrite the method below:
 #'
 #' * `.encode_prediction(predict_tensor, task)`\cr
 #'   ([`torch_tensor`][torch::torch_tensor], [`Task`][mlr3::Task]) -> `list()`\cr
@@ -144,9 +116,8 @@
 #' or `"cb."`, as these are preserved for the dynamically constructed parameters of the optimizer, the loss function,
 #' and the callbacks.
 #'
-#' To perform additional input checks on the task, the private `.check_train_task(task, param_vals)` and
-#' `.check_predict_task(task, param_vals)` can be overwritten.
-#' These should return `TRUE` if the input task is valid and otherwise a string with an error message.
+#' To perform additional input checks on the task, the private `.verify_train_task(task, param_vals)` and
+#' `.verify_predict_task(task, param_vals)` can be overwritten. They should signal an error if the task is invalid.
 #'
 #' For learners that have other construction arguments that should change the hash of a learner, it is required
 #' to implement the private `$.additional_phash_input()`.
@@ -157,9 +128,8 @@ LearnerTorch = R6Class("LearnerTorch",
   inherit = Learner,
   public = list(
     #' @description Creates a new instance of this [R6][R6::R6Class] class.
-    initialize = function(id, task_type, param_set, properties = character(), man, label, feature_types,
-      optimizer = NULL, loss = NULL, packages = character(), predict_types = NULL, callbacks = list(),
-      jittable = FALSE) {
+    initialize = function(id, task_type, param_set, properties, man, label, feature_types,
+      optimizer = NULL, loss = NULL, packages = character(), predict_types = NULL, callbacks = list()) {
       assert_choice(task_type, c("regr", "classif"))
 
       predict_types = predict_types %??% switch(task_type,
@@ -169,14 +139,11 @@ LearnerTorch = R6Class("LearnerTorch",
 
       assert_subset(properties, mlr_reflections$learner_properties[[task_type]])
       properties = union(properties, c("marshal", "validation", "internal_tuning"))
-      if (task_type == "classif") {
-        properties = union(properties, c("twoclass", "multiclass"))
-      }
       assert_subset(predict_types, names(mlr_reflections$learner_predict_types[[task_type]]))
       packages = assert_character(packages, any.missing = FALSE, min.chars = 1L)
       packages = union(c("mlr3", "mlr3torch"), packages)
 
-      private$.param_set_torch = paramset_torchlearner(task_type, jittable = jittable)
+      private$.param_set_torch = paramset_torchlearner(task_type)
 
       check_ps = function(param_set) {
         assert_param_set(param_set)
@@ -356,7 +323,7 @@ LearnerTorch = R6Class("LearnerTorch",
       self$state$internal_valid_scores
     },
     #' @field internal_tuned_values
-    #' When early stopping is active, this returns a named list with the early-stopped epochs,
+    #' When early stopping is active, this returns a named list with the early-stopped epochs,
     #' otherwise an empty list is returned.
     #' Returns `NULL` if learner is not trained yet.
     internal_tuned_values = function() {
@@ -468,10 +435,7 @@ LearnerTorch = R6Class("LearnerTorch",
             nm, paste0(train_shape, collapse = "x"), paste0(predict_shape, collapse = "x"))
         }
       })
-      msg = private$.check_train_task(task, param_vals)
-      if (!isTRUE(msg)) {
-        stopf("Training task '%s' is invalid for learner '%s': %s", task$id, self$id, msg)
-      }
+      private$.verify_train_task(task, param_vals)
 
       param_vals$device = auto_device(param_vals$device)
       if (identical(param_vals$seed, "random")) param_vals$seed = sample.int(.Machine$integer.max, 1)
@@ -495,10 +459,7 @@ LearnerTorch = R6Class("LearnerTorch",
       # Ideally we could rely on state$train_task, but there is this complication
       # https://github.com/mlr-org/mlr3/issues/947
       param_vals$device = auto_device(param_vals$device)
-      msg = private$.check_predict_task(task, param_vals)
-      if (!isTRUE(msg)) {
-        stopf("Prediction task '%s' is invalid for learner '%s': %s", task$id, self$id, msg)
-      }
+      private$.verify_predict_task(task, param_vals)
 
       with_torch_settings(seed = self$model$seed, num_threads = param_vals$num_threads,
         num_interop_threads = param_vals$num_interop_threads, expr = {
@@ -537,25 +498,15 @@ LearnerTorch = R6Class("LearnerTorch",
       param_vals_test = insert_named(param_vals, list(shuffle = FALSE, drop_last = FALSE))
       private$.dataloader(dataset, param_vals_test)
     },
-    .ingress_tokens = function(task, param_vals)  {
-      stopf("Private method `$.ingress_tokens()` must be implemented.")
-    },
     .dataset = function(task, param_vals) {
-      if (!is.null(private$.ingress_tokens)) {
-        task_dataset(
-          task = task,
-          feature_ingress_tokens = private$.ingress_tokens(task, param_vals),
-          target_batchgetter = get_target_batchgetter(task)
-        )
-      } else {
-        stopf("Private method `$.dataset()` or `$.ingress_tokens()` must be implemented.")
-      }
+      stopf(".dataset must be implemented.")
+
     },
     .optimizer = NULL,
     .loss = NULL,
     .callbacks = NULL,
-    .check_train_task = function(task, param_vals) TRUE,
-    .check_predict_task = function(task, param_vals) TRUE,
+    .verify_train_task = function(task, param_vals) NULL,
+    .verify_predict_task = function(task, param_vals) NULL,
     deep_clone = function(name, value) {
       private$.param_set = NULL # required to keep clone identical to original, otherwise tests get really ugly
       if (is.R6(value)) {
diff --git a/R/LearnerTorchFeatureless.R b/R/LearnerTorchFeatureless.R
index 112aaa2d3..66d512032 100644
--- a/R/LearnerTorchFeatureless.R
+++ b/R/LearnerTorchFeatureless.R
@@ -21,12 +21,16 @@ LearnerTorchFeatureless = R6Class("LearnerTorchFeatureless",
   public = list(
     #' @description Creates a new instance of this [R6][R6::R6Class] class.
     initialize = function(task_type, optimizer = NULL, loss = NULL, callbacks = list()) {
+      properties = switch(task_type,
+        classif = c("twoclass", "multiclass", "missings", "featureless", "marshal"),
+        regr = c("missings", "featureless", "marshal")
+      )
       super$initialize(
         id = paste0(task_type, ".torch_featureless"),
         task_type = task_type,
         label = "Featureless Torch Learner",
         param_set = ps(),
-        properties = c("missings", "featureless"),
+        properties = properties,
         feature_types = unname(mlr_reflections$task_feature_types),
         man = "mlr3torch::mlr_learners.torch_featureless",
         optimizer = optimizer,
@@ -37,7 +41,7 @@ LearnerTorchFeatureless = R6Class("LearnerTorchFeatureless",
   ),
   private = list(
     .network = function(task, param_vals) {
-      nn_featureless(nout = output_dim_for(task))
+      nn_featureless(nout = get_nout(task))
     },
     .dataset = function(task, dataset) {
       dataset_featureless(task)
@@ -48,7 +52,7 @@ LearnerTorchFeatureless = R6Class("LearnerTorchFeatureless",
 dataset_featureless = dataset(
   initialize = function(task) {
     self$task = task
-    self$target_batchgetter = get_target_batchgetter(task)
+    self$target_batchgetter = get_target_batchgetter(task$task_type)
   },
   .getbatch = function(index) {
     target = self$task$data(rows = self$task$row_ids[index], cols = self$task$target_names)
diff --git a/R/LearnerTorchImage.R b/R/LearnerTorchImage.R
index 3683f0aa3..95a2f711a 100644
--- a/R/LearnerTorchImage.R
+++ b/R/LearnerTorchImage.R
@@ -17,8 +17,6 @@
 #' @template param_properties
 #' @template param_label
 #' @template param_predict_types
-#' @param jittable (`logical(1)`)\cr
-#'   Whether the model can be jit-traced.
 #'
 #' @section Parameters:
 #' Parameters include those inherited from [`LearnerTorch`] and the `param_set` construction argument.
@@ -33,29 +31,41 @@ LearnerTorchImage = R6Class("LearnerTorchImage",
     #' @description
     #' Creates a new instance of this [R6][R6::R6Class] class.
     initialize = function(id, task_type, param_set = ps(), label, optimizer = NULL, loss = NULL,
-      callbacks = list(), packages, man, properties = NULL, predict_types = NULL, jittable = FALSE) {
+      callbacks = list(), packages = "torchvision", man, properties = NULL,
+      predict_types = NULL) {
+      properties = properties %??% switch(task_type,
+        regr = c(),
+        classif = c("twoclass", "multiclass")
+      )
       super$initialize(
         id = id,
         task_type = task_type,
         label = label,
         optimizer = optimizer,
+        properties = properties,
         loss = loss,
         param_set = param_set,
         packages = packages,
         callbacks = callbacks,
         predict_types = predict_types,
         feature_types = "lazy_tensor",
-        man = man,
-        jittable = jittable
+        man = man
       )
     }
   ),
   private = list(
-    .ingress_tokens = function(task, param_vals) {
-      if (task$n_features != 1L) {
-        stopf("Learner '%s' received task '%s' with %i features, but the learner expects exactly one feature.", self$id, task$id, length(task$feature_names))
+    .verify_train_task = function(task, param_vals) {
+      if (!isTRUE(all.equal(task$feature_types$type, "lazy_tensor"))) {
+        stopf("Must have exactly one feature of type lazy_tensor.")
       }
-      list(input = ingress_ltnsr(feature_name = task$feature_names))
+      assert_rgb_shape(c(
+        c(NA, materialize(task$data(task$row_ids[1L], task$feature_names)[[1L]])[[1L]]$shape))
+      )
+      return(TRUE)
+    },
+    .dataset = function(task, param_vals) { # creates the dataset for `task` via dataset_ltnsr()
+      param_vals$shape = "infer" # any `shape` value passed in is overridden (presumably the shape is then inferred - confirm)
+      dataset_ltnsr(task, param_vals)
+    }
   )
 )
diff --git a/R/LearnerTorchMLP.R b/R/LearnerTorchMLP.R
index 98d186619..8f03c7f25 100644
--- a/R/LearnerTorchMLP.R
+++ b/R/LearnerTorchMLP.R
@@ -41,10 +41,14 @@ LearnerTorchMLP = R6Class("LearnerTorchMLP",
     #' @description
     #' Creates a new instance of this [R6][R6::R6Class] class.
     initialize = function(task_type, optimizer = NULL, loss = NULL, callbacks = list()) {
-      check_activation = crate(function(x) check_class(x, "nn_module"))
-      check_activation_args = crate(function(x) check_list(x, names = "unique"))
-      check_neurons = crate(function(x) check_integerish(x, any.missing = FALSE, lower = 1))
-      check_shape = crate(function(x) check_shape(x, null_ok = TRUE, len = 2L))
+      check_activation = crate(function(x) check_class(x, "nn_module"),
+        .parent = topenv())
+      check_activation_args = crate(function(x) check_list(x, names = "unique"),
+        .parent = topenv())
+      check_neurons = crate(function(x) check_integerish(x, any.missing = FALSE, lower = 1),
+        .parent = topenv())
+      check_shape = crate(function(x) check_shape(x, null_ok = TRUE, len = 2L),
+        .parent = topenv(), check_shape)
 
       param_set = ps(
         neurons         = p_uty(tags = c("train", "predict"), custom_check = check_neurons),
@@ -61,48 +65,72 @@ LearnerTorchMLP = R6Class("LearnerTorchMLP",
         neurons = integer(0),
         p = 0.5
       )
+      properties = switch(task_type,
+        regr = character(0),
+        classif = c("twoclass", "multiclass")
+      )
 
       super$initialize(
         task_type = task_type,
         id = paste0(task_type, ".mlp"),
+        properties = properties,
         label = "Multi Layer Perceptron",
         param_set = param_set,
         optimizer = optimizer,
         callbacks = callbacks,
         loss = loss,
         man = "mlr3torch::mlr_learners.mlp",
-        feature_types = c("numeric", "integer", "lazy_tensor"),
-        jittable = TRUE
+        feature_types = c("numeric", "integer", "lazy_tensor")
       )
     }
   ),
   private = list(
     .network = function(task, param_vals) {
-      d_out = output_dim_for(task)
-      d_in = private$.ingress_tokens(task, param_vals)[[1L]]$shape[2L]
+      # verify_train_task was already called beforehand, so we can make some assumptions
+      d_out = get_nout(task)
+      d_in = if (single_lazy_tensor(task)) {
+        private$.get_input_shape(task, param_vals$shape)[2L]
+      } else {
+        length(task$feature_names)
+      }
       network = invoke(make_mlp, .args = param_vals, d_in = d_in, d_out = d_out)
       network
     },
-    .ingress_tokens = function(task, param_vals) {
-      token = if (single_lazy_tensor(task)) {
-        shape = param_vals$shape %??% lazy_shape(task$head(1L)[[task$feature_names]])
-        if (is.null(shape)) {
-          stopf("Learner '%s' received task '%s' with lazy tensor feature '%s' with unknown shape. Please specify the learner's `shape` parameter.", self$id, task$id, task$feature_names) # nolint
-        } else if (is.null(param_vals$shape)) {
-          msg = check_shape(shape, len = 2L)
-          if (!isTRUE(msg)) {
-            stopf("Learner '%s' received task '%s' with lazy_tensor column of shape '%s', but the learner expects an input shape of length 2.", self$id, task$id, shape_to_str(shape))
-          }
-        }
-        ingress_ltnsr(shape = shape)
+    .dataset = function(task, param_vals) {
+      if (single_lazy_tensor(task)) {
+        param_vals$shape = private$.get_input_shape(task, param_vals$shape)
+        dataset_ltnsr(task, param_vals)
       } else {
-        ingress_num(shape = c(NA, length(task$feature_names)))
+        dataset_num(task, param_vals)
       }
-      list(input = token)
+    },
+    .verify_train_task = function(task, param_vals) { # checks feature types and, for a lazy tensor, its shape
+      features = task$feature_types[, "type"][[1L]] # the `type` column of the task's feature types
+      lazy_tensor_input = identical(features, "lazy_tensor") # TRUE only for exactly one feature of type lazy_tensor
+      assert(check_true(lazy_tensor_input), check_false(some(features, function(x) x == "lazy_tensor")))
+      # the assert above passes for a single lazy_tensor or for no lazy_tensor at all (checkmate `assert()` or-combines)
+      if (lazy_tensor_input) {
+        shape = private$.get_input_shape(task, param_vals$shape) # combines the task's shape with the `shape` parameter
+        assert_shape(shape, len = 2L) # NOTE(review): presumably requires a shape of length 2 - confirm in assert_shape()
+      }
+    },
+    .get_input_shape = function(s1, s2) { # returns the single shape that `s1` and `s2` agree on; `s1` may be a Task
+      if (test_class(s1, "Task")) { # a Task is replaced by the `pointer_shape` read from its first row in use
+        assert_true(identical(s1$feature_types[, "type"][[1L]], "lazy_tensor")) # single lazy_tensor only
+        s1 = dd(s1$data(s1$row_roles$use[1L], s1$feature_names)[[1L]])$pointer_shape
+      }
+      assert_shape(s1, null_ok = TRUE) # NULL is accepted for either shape
+      assert_shape(s2, null_ok = TRUE)
+      s = unique(discard(list(s1, s2), is.null)) # drop NULL entries and collapse identical shapes
+      assert_true(length(s) == 1L) # errors if both shapes are NULL or if the two shapes differ
+      s[[1L]]
+    }
   )
 )
 
+single_lazy_tensor = function(task) { # TRUE iff the task's feature types are exactly one "lazy_tensor"
+  identical(task$feature_types[, "type"][[1L]], "lazy_tensor") # identical() also fails on any length mismatch
+}
 
 # shape is (NA, x) if preesnt
 make_mlp = function(task, d_in, d_out, activation, neurons = integer(0), p, activation_args, n_layers = NULL, ...) {
diff --git a/R/LearnerTorchModel.R b/R/LearnerTorchModel.R
index 9b186a7fb..b64b0ac47 100644
--- a/R/LearnerTorchModel.R
+++ b/R/LearnerTorchModel.R
@@ -84,7 +84,6 @@ LearnerTorchModel = R6Class("LearnerTorchModel",
         packages = packages,
         param_set = ps(),
         feature_types = feature_types,
-        jittable = TRUE,
         man = "mlr3torch::mlr_learners.torch_model"
       )
     }
@@ -94,13 +93,13 @@ LearnerTorchModel = R6Class("LearnerTorchModel",
     #' The ingress tokens. Must be non-`NULL` when calling `$train()`.
     ingress_tokens = function(rhs) {
       if (!missing(rhs)) {
-        private$.ingress_tokens_ = assert_list(rhs, types = "TorchIngressToken", min.len = 1L, names = "unique")
+        private$.ingress_tokens = assert_list(rhs, types = "TorchIngressToken", min.len = 1L, names = "unique")
       }
-      private$.ingress_tokens_
+      private$.ingress_tokens
     }
   ),
   private = list(
-    .ingress_tokens_ = NULL,
+    .ingress_tokens = NULL,
     deep_clone = function(name, value) {
       if (name == ".network_stored" && is.null(value) && !is.null(self$state)) {
         # the initial network state is lost after training a LearnerTorchModel
@@ -130,7 +129,7 @@ LearnerTorchModel = R6Class("LearnerTorchModel",
       dataset = task_dataset(
         task,
         feature_ingress_tokens = ingress_tokens,
-        target_batchgetter = get_target_batchgetter(task)
+        target_batchgetter = get_target_batchgetter(self$task_type)
       )
     },
     .network_stored = NULL,
diff --git a/R/LearnerTorchModule.R b/R/LearnerTorchModule.R
index 25def762d..5d845c85c 100644
--- a/R/LearnerTorchModule.R
+++ b/R/LearnerTorchModule.R
@@ -27,7 +27,6 @@
 #' @param properties (`NULL` or `character()`)\cr
 #'   The properties of the learner.
 #'   Defaults to all available properties for the given task type.
-#' @template param_predict_types
 # @section Parameters: See [`LearnerTorch`] and constructor argument `param_set`.
 #' @family Learner
 #' @include LearnerTorch.R
@@ -36,7 +35,7 @@
 #' nn_one_layer = nn_module("nn_one_layer",
 #'   initialize = function(task, size_hidden) {
 #'     self$first = nn_linear(task$n_features, size_hidden)
-#'     self$second = nn_linear(size_hidden, output_dim_for(task))
+#'     self$second = nn_linear(size_hidden, length(task$class_names))
 #'   },
 #'   # argument x corresponds to the ingress token x
 #'   forward = function(x) {
@@ -62,10 +61,7 @@ LearnerTorchModule = R6Class("LearnerTorchModule",
     #' Creates a new instance of this [R6][R6::R6Class] class.
     initialize = function(module_generator = NULL, param_set = NULL, ingress_tokens = NULL,
       task_type, properties = NULL, optimizer = NULL, loss = NULL, callbacks = list(),
-      packages = character(0), feature_types = NULL, predict_types = NULL) {
-      if (is.null(task_type)) {
-        stopf("task_type must be provided")
-      }
+      packages = character(0), feature_types = NULL) {
       assert(check_class(module_generator, "nn_module_generator"), check_function(module_generator))
       private$.module_generator = module_generator
       args = names(formals(module_generator))
@@ -73,7 +69,7 @@ LearnerTorchModule = R6Class("LearnerTorchModule",
         stopf("module_generator must have 'task' as a parameter")
       }
 
-      private$.ingress_tokens_ = assert_list(ingress_tokens, types = "TorchIngressToken", names = "unique", min.len = 1L)
+      private$.ingress_tokens = assert_list(ingress_tokens, types = "TorchIngressToken", names = "unique", min.len = 1L)
 
       if (is.null(feature_types)) {
         feature_types = unname(mlr_reflections$task_feature_types)
@@ -103,13 +99,12 @@ LearnerTorchModule = R6Class("LearnerTorchModule",
         packages = packages,
         param_set = param_set,
         feature_types = feature_types,
-        predict_types = predict_types,
         man = "mlr3torch::mlr_learners.module"
       )
     }
   ),
   private = list(
-    .ingress_tokens_ = NULL,
+    .ingress_tokens = NULL,
     .module_generator = NULL,
 
     .network = function(task, param_vals) {
@@ -117,8 +112,13 @@ LearnerTorchModule = R6Class("LearnerTorchModule",
       invoke(private$.module_generator, task = task, .args = module_params)
     },
 
-    .ingress_tokens = function(task, param_vals) {
-      private$.ingress_tokens_
+    .dataset = function(task, param_vals) {
+      ingress_tokens = private$.ingress_tokens
+      dataset = task_dataset(
+        task,
+        feature_ingress_tokens = ingress_tokens,
+        target_batchgetter = get_target_batchgetter(self$task_type)
+      )
     },
 
     .additional_phash_input = function() {
diff --git a/R/LearnerTorchTabResNet.R b/R/LearnerTorchTabResNet.R
index e06bfb461..d6c56073a 100644
--- a/R/LearnerTorchTabResNet.R
+++ b/R/LearnerTorchTabResNet.R
@@ -24,12 +24,10 @@
 #'   First dropout ratio.
 #' * `dropout2` :: `numeric(1)`\cr
 #'    Second dropout ratio.
-#' * `shape` :: `integer()` or `NULL`\cr
-#'   Shape of the input tensor. Only needs to be provided if the input is a lazy tensor with
-#'   unknown shape.
 #'
 #' @references
 #' `r format_bib("gorishniy2021revisiting")`
+#'
 #' @export
 LearnerTorchTabResNet = R6Class("LearnerTorchTabResNet",
   inherit = LearnerTorch,
@@ -39,55 +37,38 @@ LearnerTorchTabResNet = R6Class("LearnerTorchTabResNet",
     initialize = function(task_type, optimizer = NULL, loss = NULL, callbacks = list()) {
       private$.block = PipeOpTorchTabResNetBlock$new()
 
-      check_shape = crate(function(x) check_shape(x, null_ok = TRUE, len = 2L))
+      properties = switch(task_type,
+        regr = character(0),
+        classif = c("twoclass", "multiclass")
+      )
 
       private$.param_set_base =  ps(
-        n_blocks = p_int(0, tags = c("train", "required")),
-        d_block = p_int(1, tags = c("train", "required")),
-        shape = p_uty(tags = "train", custom_check = check_shape)
+        n_blocks = p_int(1, tags = c("train", "required")),
+        d_block = p_int(1, tags = c("train", "required"))
       )
       param_set = alist(private$.block$param_set, private$.param_set_base)
 
       super$initialize(
         task_type = task_type,
         id = paste0(task_type, ".tab_resnet"),
+        properties = properties,
         label = "Tabular ResNet",
         param_set = param_set,
         optimizer = optimizer,
         callbacks = callbacks,
         loss = loss,
         man = "mlr3torch::mlr_learners.tab_resnet",
-        feature_types = c("numeric", "integer", "lazy_tensor"),
-        jittable = TRUE
+        feature_types = c("numeric", "integer"),
       )
     }
   ),
   private = list(
     .block = NULL,
-    .ingress_tokens = function(task, param_vals) {
-      token = if (single_lazy_tensor(task)) {
-        shape = param_vals$shape %??% lazy_shape(task$head(1L)[[task$feature_names]])
-        if (is.null(shape)) {
-          stopf("Learner '%s' received task '%s' with lazy tensor feature '%s' with unknown shape. Please specify the learner's `shape` parameter.", self$id, task$id, task$feature_names) # nolint
-        } else if (is.null(param_vals$shape)) {
-          msg = check_shape(shape, len = 2L)
-          if (!isTRUE(msg)) {
-            stopf("Learner '%s' received task '%s' with lazy_tensor column of shape '%s', but the learner expects an input shape of length 2.", self$id, task$id, shape_to_str(shape))
-          }
-        }
-        ingress_ltnsr(shape = shape)
-      } else {
-        ingress_num(shape = c(NA, length(task$feature_names)))
-      }
-      list(input = token)
+    .dataset = function(task, param_vals) {
+      dataset_num(task, param_vals)
     },
     .network = function(task, param_vals) {
-      ingress = if (single_lazy_tensor(task)) {
-        po("torch_ingress_ltnsr", id = "num", shape = private$.ingress_tokens(task, param_vals)[[1L]]$shape)
-      } else {
-        po("torch_ingress_num", id = "num")
-      }
-      graph = ingress %>>%
+      graph = po("torch_ingress_num") %>>%
         po("nn_linear", out_features = param_vals$d_block) %>>%
         po("nn_block", private$.block, n_blocks = param_vals$n_blocks) %>>%
         po("nn_head")
@@ -139,7 +120,7 @@ nn_tab_resnet_block = nn_module("nn_tab_resnet_block",
   ) {
     assert_int(d_block, lower = 1L)
     if (is.null(d_hidden)) {
-      assert_numeric(d_hidden_multiplier, lower = 0)
+      assert_numeric(d_hidden_multiplier, lower = 0, null.ok = TRUE)
       d_hidden = as.integer(d_block * d_hidden_multiplier)
     } else {
       assert_int(d_hidden, lower = 1L)
diff --git a/R/LearnerTorchVision.R b/R/LearnerTorchVision.R
index 73217507a..42f8d607d 100644
--- a/R/LearnerTorchVision.R
+++ b/R/LearnerTorchVision.R
@@ -25,9 +25,7 @@
 #'   Function that generates the network.
 #' @param label (`character(1)`)\cr
 #'   The label of the network.
-#' @param jittable (`logical(1)`)\cr
-#'   Whether to use jitting.
-#' @references
+#' @references
 #' `r format_bib("krizhevsky2017imagenet")`
 #' `r format_bib("sandler2018mobilenetv2")`
 #' `r format_bib("he2016deep")`
@@ -38,8 +36,7 @@ LearnerTorchVision = R6Class("LearnerTorchVision",
   inherit = LearnerTorchImage,
   public = list(
     #' @description Creates a new instance of this [R6][R6::R6Class] class.
-    initialize = function(name, module_generator, label, optimizer = NULL, loss = NULL,
-      callbacks = list(), jittable = FALSE) { # nolint
+    initialize = function(name, module_generator, label, optimizer = NULL, loss = NULL, callbacks = list()) { # nolint
       task_type = "classif"
       private$.module_generator = module_generator
       param_set = ps(
@@ -54,15 +51,14 @@ LearnerTorchVision = R6Class("LearnerTorchVision",
         optimizer = optimizer,
         loss = loss,
         callbacks = callbacks,
-        label = label,
-        packages = "torchvision"
+        label = label
       )
     }
   ),
   private = list(
     .module_generator = NULL,
     .network = function(task, param_vals) {
-      nout = output_dim_for(task)
+      nout = get_nout(task)
       if (param_vals$pretrained) {
         network = replace_head(private$.module_generator(pretrained = TRUE), nout)
         return(network)
@@ -111,128 +107,128 @@ replace_head.VGG = function(network, d_out) {
 }
 
 #' @include aaa.R
-register_learner("classif.alexnet",
+register_learner("classif.alexnet", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("alexnet", torchvision::model_alexnet, "AlexNet",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-# register_learner("classif.inception_v3",
+# register_learner("classif.inception_v3", 
 #   function(loss = NULL, optimizer = NULL, callbacks = list()) {
 #     LearnerTorchVision$new("inception_v3", torchvision::model_inception_v3, "Inception V3",
 #     loss = loss, optimizer = optimizer, callbacks = callbacks)
 #   }
 # )
 
-register_learner("classif.mobilenet_v2",
+register_learner("classif.mobilenet_v2", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("mobilenet_v2", torchvision::model_mobilenet_v2, "Mobilenet V2",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.resnet18",
+register_learner("classif.resnet18", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("resnet18", torchvision::model_resnet18, "ResNet-18",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.resnet34",
+register_learner("classif.resnet34", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("resnet34", torchvision::model_resnet34, "ResNet-34",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.resnet50",
+register_learner("classif.resnet50", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("resnet50", torchvision::model_resnet50, "ResNet-50",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.resnet101",
+register_learner("classif.resnet101", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("resnet101", torchvision::model_resnet101, "ResNet-101",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.resnet152",
+register_learner("classif.resnet152", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("resnet152", torchvision::model_resnet152, "ResNet-152",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.resnext101_32x8d",
+register_learner("classif.resnext101_32x8d", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("resnext101_32x8d", torchvision::model_resnext101_32x8d, "ResNeXt-101 32x8d",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.resnext50_32x4d",
+register_learner("classif.resnext50_32x4d", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("resnext50_32x4d", torchvision::model_resnext50_32x4d, "ResNeXt-50 32x4d",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.vgg11",
+register_learner("classif.vgg11", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("vgg11", torchvision::model_vgg11, "VGG 11",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.vgg11_bn",
+register_learner("classif.vgg11_bn", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("vgg11_bn", torchvision::model_vgg11_bn, "VGG 11",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.vgg13",
+register_learner("classif.vgg13", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("vgg13", torchvision::model_vgg13, "VGG 13",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.vgg13_bn",
+register_learner("classif.vgg13_bn", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("vgg13_bn", torchvision::model_vgg13_bn, "VGG 13",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.vgg16",
+register_learner("classif.vgg16", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("vgg16", torchvision::model_vgg16, "VGG 16",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.vgg16_bn",
+register_learner("classif.vgg16_bn", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("vgg16_bn", torchvision::model_vgg16_bn, "VGG 16",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.vgg19",
+register_learner("classif.vgg19", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("vgg19", torchvision::model_vgg19, "VGG 19",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
 
-register_learner("classif.vgg19_bn",
+register_learner("classif.vgg19_bn", 
   function(loss = NULL, optimizer = NULL, callbacks = list()) {
     LearnerTorchVision$new("vgg19_bn", torchvision::model_vgg19_bn, "VGG 19",
-      loss = loss, optimizer = optimizer, callbacks = callbacks, jittable = TRUE)
+      loss = loss, optimizer = optimizer, callbacks = callbacks)
   }
 )
diff --git a/R/ModelDescriptor.R b/R/ModelDescriptor.R
index 946472d36..3ec362574 100644
--- a/R/ModelDescriptor.R
+++ b/R/ModelDescriptor.R
@@ -46,7 +46,6 @@
 #' @export
 ModelDescriptor = function(graph, ingress, task, optimizer = NULL, loss = NULL, callbacks = NULL, pointer = NULL,
   pointer_shape = NULL) {
-  graph = as_graph(graph)
   assert_r6(graph, "Graph")
   innames = graph$input$name  # graph$input$name access is slow
 
diff --git a/R/PipeOpTaskPreprocTorch.R b/R/PipeOpTaskPreprocTorch.R
index 232b63984..203b8afb7 100644
--- a/R/PipeOpTaskPreprocTorch.R
+++ b/R/PipeOpTaskPreprocTorch.R
@@ -404,9 +404,12 @@ create_ps = function(fn) {
 #' @param shapes_out (`function` or `NULL` or `"infer"`)\cr
 #'   The private `.shapes_out(shapes_in, param_vals, task)` method of [`PipeOpTaskPreprocTorch`]
 #'   (see section Inheriting).
-#'   Special values are `NULL` and `"infer"`:
+#'   Special values are `NULL` and `"infer"`:
 #'   If `NULL`, the output shapes are unknown.
-#'   Option `"infer"` uses [`infer_shapes`].
+#'   If `"infer"`, the output shape function is inferred and calculates the output shapes as follows:
+#'   For an input shape of `(NA, ...)`, a meta-tensor of shape `(1, ...)` is created and the preprocessing function is
+#'   applied. Afterwards, the batch dimension (1) is replaced with `NA` and the shape is returned.
+#'   If the first dimension is not `NA`, the output shape of applying the preprocessing function is returned.
 #'   Method `"infer"` should be correct in most cases, but might fail in some edge cases.
 #' @param param_set ([`ParamSet`][paradox::ParamSet] or `NULL`)\cr
 #'   The parameter set.
@@ -449,8 +452,31 @@ pipeop_preproc_torch = function(id, fn, shapes_out = NULL, param_set = NULL, pac
   # we e.g. want torchvision in suggests, so we cannot already access the function.
   if (identical(shapes_out, "infer")) {
     shapes_out = crate(function(shapes_in, param_vals, task) {
-      getFromNamespace("infer_shapes", "mlr3torch")(shapes_in = shapes_in, param_vals = param_vals, output_names = self$output$name, fn = self$fn, rowwise = self$rowwise, id = self$id) # nolint
-    })
+      sin = shapes_in[[1L]]
+      batch_dim = sin[1L]
+      batchdim_is_unknown = is.na(batch_dim)
+      if (batchdim_is_unknown) {
+        sin[1] = 1L
+      }
+      if (self$rowwise) {
+        sin = sin[-1L]
+      }
+      tensor_in = mlr3misc::invoke(torch_empty, .args = sin, device = torch_device("meta"))
+      tensor_out = tryCatch(mlr3misc::invoke(self$fn, tensor_in, .args = param_vals),
+        error = function(e) {
+          stopf("Input shape '%s' is invalid for PipeOp with id '%s'.", shape_to_str(list(sin)), self$id)
+        }
+      )
+      sout = dim(tensor_out)
+
+      if (self$rowwise) {
+        sout = c(batch_dim, sout)
+      } else if (batchdim_is_unknown) {
+        sout[1] = NA
+      }
+
+      list(sout)
+    }, .parent = topenv())
   } else if (is.function(shapes_out) || is.null(shapes_out)) {
     # nothing to do
   } else {
diff --git a/R/PipeOpTorch.R b/R/PipeOpTorch.R
index fce3f22a3..45bdfb55e 100644
--- a/R/PipeOpTorch.R
+++ b/R/PipeOpTorch.R
@@ -34,13 +34,13 @@
 #'
 #' * `.make_module(shapes_in, param_vals, task)`\cr
 #'   (`list()`, `list()`) -> `nn_module`\cr
-#'   This private method is called to generate the `nn_module` that is passed as argument `module` to
+#'   This private method is called to generate the `nn_module` that is passed as argument `module` to
 #'   [`PipeOpModule`]. It must be overwritten, when no `module_generator` is provided.
 #'   If left as is, it calls the provided `module_generator` with the arguments obtained by
 #'   the private method `.shape_dependent_params()`.
 #' * `.shapes_out(shapes_in, param_vals, task)`\cr
 #'   (`list()`, `list()`, [`Task`][mlr3::Task] or `NULL`) -> named `list()`\cr
-#'   This private method gets a list of `integer` vectors (`shapes_in`), the parameter values (`param_vals`),
+#'   This private method gets a list of `numeric` vectors (`shapes_in`), the parameter values (`param_vals`),
 #'   as well as an (optional) [`Task`][mlr3::Task].
 #    The `shapes_in` list indicates the shape of input tensors that will be fed to the module's `$forward()` function.
 #    The list has one item per input tensor, typically only one.
@@ -49,10 +49,6 @@
 #'   The output shapes must be in the same order as the output names of the `PipeOp`.
 #'   In case the output shapes depends on the task (as is the case for [`PipeOpTorchHead`]), the function should return
 #'   valid output shapes (possibly containing `NA`s) if the `task` argument is provided or not.
-#'   It is important to properly handle the presence of `NA`s in the input shapes.
-#'   By default (if construction argument `only_batch_unknown` is `TRUE`), only the batch dimension can be `NA`.
-#'   If you set this to `FALSE`, you need to take other unknown dimensions into account.
-#'   The method can also throw an error if the input shapes violate some assumptions.
 #' * `.shape_dependent_params(shapes_in, param_vals, task)`\cr
 #'   (`list()`, `list()`) -> named `list()`\cr
 #'   This private method has the same inputs as `.shapes_out`.
@@ -102,7 +98,7 @@
 #'     self$output = if (task$task_type == "regr") {
 #'       torch::nn_linear(d_hidden, 1)
 #'     } else if (task$task_type == "classif") {
-#'       torch::nn_linear(d_hidden, output_dim_for(task))
+#'       torch::nn_linear(d_hidden, length(task$class_names))
 #'     }
 #'   },
 #'   forward = function(x) {
@@ -204,7 +200,7 @@
 #'
 #' # the resulting graphs are identical
 #' identical(mds_out[[1L]]$graph, mds_out[[2L]]$graph)
-#' # note that as a side-effect, also one of the input graphs is modified in-place for efficiency
+#' # note that as a side-effect, also one of the input graphs is modified in-place for efficiency
 #' mds_in[[1L]]$graph$edges
 #'
 #' # The new task has both Sepal and Petal features
@@ -256,19 +252,14 @@ PipeOpTorch = R6Class("PipeOpTorch",
     #'   In case there is more than one output channel, the `nn_module` that is constructed by this
     #'   [`PipeOp`][mlr3pipelines::PipeOp] during training must return a named `list()`, where the names of the list are the
     #'   names out the output channels. The default is `"output"`.
-    #' @param only_batch_unknown (`logical(1)`)\cr
-    #'   Whether only the batch dimension can be missing in the input shapes or whether other
-    #'   dimensions can also be unknown.
-    #'   Default is `TRUE`.
     initialize = function(id, module_generator, param_set = ps(), param_vals = list(),
-      inname = "input", outname = "output", packages = "torch", tags = NULL, only_batch_unknown = TRUE) {
+      inname = "input", outname = "output", packages = "torch", tags = NULL) {
       self$module_generator = assert_class(module_generator, "nn_module_generator", null.ok = TRUE)
       assert_character(inname, .var.name = "input channel names")
       assert_character(outname, .var.name = "output channel names", min.len = 1L)
       assert_character(tags, null.ok = TRUE)
       assert_character(packages, any.missing = FALSE)
 
-      private$.only_batch_unknown = assert_flag(only_batch_unknown)
       packages = union(packages, c("mlr3torch", "torch"))
       input = data.table(name = inname, train = "ModelDescriptor", predict = "Task")
       output = data.table(name = outname, train = "ModelDescriptor", predict = "Task")
@@ -297,16 +288,17 @@ PipeOpTorch = R6Class("PipeOpTorch",
       assert_r6(task, "Task", null.ok = TRUE)
       if (is.numeric(shapes_in)) shapes_in = list(shapes_in)
       # batch dimension can be known or unknown
-      assert_shapes(shapes_in, unknown_batch = NULL, only_batch_unknown = private$.only_batch_unknown)
+      assert_shapes(shapes_in, unknown_batch = NULL)
       if ("..." %nin% self$input$name) {
         assert_true(length(shapes_in) == nrow(self$input),
           .var.name = "number of input shapes equal to number of input channels")
       }
       set_names(private$.shapes_out(shapes_in, self$param_set$get_values(), task = task), self$output$name)
     }
+
+    # TODO: printer that calls the nn_module's printer
   ),
   private = list(
-    .only_batch_unknown = TRUE,
     .shapes_out = function(shapes_in, param_vals, task) shapes_in,
     .shape_dependent_params = function(shapes_in, param_vals, task) param_vals,
     .make_module = function(shapes_in, param_vals, task) {
diff --git a/R/PipeOpTorchActivation.R b/R/PipeOpTorchActivation.R
index 57a2fd0a1..18e9c6152 100644
--- a/R/PipeOpTorchActivation.R
+++ b/R/PipeOpTorchActivation.R
@@ -139,8 +139,7 @@ PipeOpTorchHardTanh = R6Class("PipeOpTorchHardTanh",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_hardtanh,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -180,8 +179,7 @@ PipeOpTorchLeakyReLU = R6Class("PipeOpTorchLeakyReLU",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_leaky_relu,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -201,6 +199,7 @@ register_po("nn_leaky_relu", PipeOpTorchLeakyReLU)
 #' @template pipeop_torch
 #' @template pipeop_torch_example
 #'
+#'
 #' @export
 PipeOpTorchLogSigmoid = R6Class("PipeOpTorchLogSigmoid",
   inherit = PipeOpTorch,
@@ -215,8 +214,7 @@ PipeOpTorchLogSigmoid = R6Class("PipeOpTorchLogSigmoid",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_log_sigmoid,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -256,8 +254,7 @@ PipeOpTorchPReLU = R6Class("PipeOpTorchPReLU",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_prelu,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -277,6 +274,7 @@ register_po("nn_prelu", PipeOpTorchPReLU)
 #' @template pipeop_torch
 #' @template pipeop_torch_example
 #'
+#'
 #' @export
 PipeOpTorchReLU = R6Class("PipeOpTorchReLU",
   inherit = PipeOpTorch,
@@ -293,8 +291,7 @@ PipeOpTorchReLU = R6Class("PipeOpTorchReLU",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_relu,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -330,8 +327,7 @@ PipeOpTorchReLU6 = R6Class("PipeOpTorchReLU6",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_relu6,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -374,8 +370,7 @@ PipeOpTorchRReLU = R6Class("PipeOpTorchRReLU",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_rrelu,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -412,8 +407,7 @@ PipeOpTorchSELU = R6Class("PipeOpTorchSELU",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_selu,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -453,8 +447,7 @@ PipeOpTorchCELU = R6Class("PipeOpTorchCELU",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_celu,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -491,8 +484,7 @@ PipeOpTorchGELU = R6Class("PipeOpTorchGELU",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_gelu,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -526,8 +518,7 @@ PipeOpTorchSigmoid = R6Class("PipeOpTorchSigmoid",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_sigmoid,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -566,8 +557,7 @@ PipeOpTorchSoftPlus = R6Class("PipeOpTorchSoftPlus",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_softplus,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -604,8 +594,7 @@ PipeOpTorchSoftShrink = R6Class("PipeOpTorchSoftShrink",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_softshrink,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -638,8 +627,7 @@ PipeOpTorchSoftSign = R6Class("PipeOpTorchSoftSign",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_softsign,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -673,8 +661,7 @@ PipeOpTorchTanh = R6Class("PipeOpTorchTanh",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_tanh,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -708,8 +695,7 @@ PipeOpTorchTanhShrink = R6Class("PipeOpTorchTanhShrink",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_tanhshrink,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -753,8 +739,7 @@ PipeOpTorchThreshold = R6Class("PipeOpTorchThreshold",
         param_set = param_set,
         param_vals = param_vals,
         module_generator = nn_threshold,
-        tags = "activation",
-        only_batch_unknown = FALSE
+        tags = "activation"
       )
     }
   )
@@ -813,149 +798,3 @@ PipeOpTorchGLU = R6Class("PipeOpTorchGLU",
 )
 
 register_po("nn_glu", PipeOpTorchGLU)
-
-reglu = function(x) {
-  assert_true(last(x$shape, 1) %% 2 == 0)
-  chunked = x$chunk(2, dim = -1)
-  a = chunked[[1]]
-  b = chunked[[2]]
-  return(a * nnf_relu(b))
-}
-
-#' @title ReGLU Module
-#'
-#' @description
-#' Rectified Gated Linear Unit (ReGLU) module.
-#' Computes the output as \eqn{\text{ReGLU}(x, g) = x \cdot \text{ReLU}(g)}
-#' where \(x\) and \(g\) are created by splitting the input tensor in half along the last dimension.
-#'
-#' @export
-#' @references
-#' `r format_bib("shazeer2020glu")`
-#' @examplesIf torch::torch_is_installed()
-#' x = torch::torch_randn(10, 10)
-#' reglu = nn_reglu()
-#' reglu(x)
-nn_reglu = nn_module(
-  "nn_reglu",
-  forward = function(input) {
-    return(reglu(input))
-  }
-)
-
-#' @title ReGLU Activation Function
-#'
-#' @description
-#' Rectified Gated Linear Unit (ReGLU) activation function.
-#' See [`nn_reglu`] for details.
-#' @section Parameters:
-#' No parameters.
-#' @templateVar id nn_reglu
-#' @template pipeop_torch_channels_default
-#' @template pipeop_torch
-#' @template pipeop_torch_example
-#' @export
-PipeOpTorchReGLU = R6Class("PipeOpTorchReGLU",
-  inherit = PipeOpTorch,
-  public = list(
-    #' @description Creates a new instance of this [R6][R6::R6Class] class.
-    #' @template params_pipelines
-    initialize = function(id = "nn_reglu", param_vals = list()) {
-      param_set = ps()
-      super$initialize(
-        id = id,
-        param_set = param_set,
-        param_vals = param_vals,
-        module_generator = nn_reglu,
-        tags = "activation"
-      )
-    }
-  ),
-  private = list(
-    .shapes_out = function(shapes_in, param_vals, task) {
-      shape = shapes_in[[1L]]
-      d_new = last(shape, 1) / 2
-      if (test_integerish(d_new)) {
-        shape[length(shape)] = d_new
-        list(shape)
-      } else {
-        stopf("Last dimension of input tensor must be divisible by 2.")
-      }
-    }
-  )
-)
-
-register_po("nn_reglu", PipeOpTorchReGLU)
-
-geglu = function(x) {
-  assert_true(last(x$shape, 1) %% 2 == 0)
-  chunked = x$chunk(2, dim = -1)
-  a = chunked[[1]]
-  b = chunked[[2]]
-  return(a * nnf_gelu(b))
-}
-
-#' @title GeGLU Module
-#' @description
-#' This module implements the Gaussian Error Linear Unit Gated Linear Unit (GeGLU) activation function.
-#' It computes \eqn{\text{GeGLU}(x, g) = x \cdot \text{GELU}(g)}
-#' where \(x\) and \(g\) are created by splitting the input tensor in half along the last dimension.
-#'
-#' @export
-#' @references
-#' `r format_bib("shazeer2020glu")`
-#' @examplesIf torch::torch_is_installed()
-#' x = torch::torch_randn(10, 10)
-#' glu = nn_geglu()
-#' glu(x)
-nn_geglu = nn_module(
-  "nn_geglu",
-  forward = function(input) {
-    return(geglu(input))
-  }
-)
-
-#' @title GeGLU Activation Function
-#'
-#' @description
-#' Gaussian Error Linear Unit Gated Linear Unit (GeGLU) activation function, see
-#' [`nn_geglu`] for details.
-#' @section Parameters:
-#' No parameters.
-#' @templateVar id nn_geglu
-#' @template pipeop_torch_channels_default
-#' @template pipeop_torch
-#' @template pipeop_torch_example
-#'
-#' @export
-PipeOpTorchGeGLU = R6Class("PipeOpTorchGeGLU",
-  inherit = PipeOpTorch,
-  public = list(
-    #' @description Creates a new instance of this [R6][R6::R6Class] class.
-    #' @template params_pipelines
-    initialize = function(id = "nn_geglu", param_vals = list()) {
-      param_set = ps()
-      super$initialize(
-        id = id,
-        param_set = param_set,
-        param_vals = param_vals,
-        module_generator = nn_geglu,
-        tags = "activation"
-      )
-    }
-  ),
-  private = list(
-    .shapes_out = function(shapes_in, param_vals, task) {
-      shape = shapes_in[[1L]]
-      d_new = last(shape, 1) / 2
-      if (test_integerish(d_new)) {
-        shape[length(shape)] = d_new
-        list(shape)
-      } else {
-        stopf("Last dimension of input tensor must be divisible by 2.")
-      }
-    }
-  )
-)
-
-register_po("nn_geglu", PipeOpTorchGeGLU)
diff --git a/R/PipeOpTorchBlock.R b/R/PipeOpTorchBlock.R
index a80120142..bc5c50480 100644
--- a/R/PipeOpTorchBlock.R
+++ b/R/PipeOpTorchBlock.R
@@ -7,21 +7,9 @@
 #' `__<layer>`.
 #'
 #' @section Parameters:
-#' The parameters available for the provided `block`, as well as
+#' The parameters available for the block itself, as well as
 #' * `n_blocks` :: `integer(1)`\cr
 #'   How often to repeat the block.
-#' * `trafo` :: `function(i, param_vals, param_set) -> list()`\cr
-#'   A function that allows to transform the parameters vaues of each layer (`block`).
-#'   Here,
-#'   * `i` :: `integer(1)`\cr
-#'       is the index of the layer, ranging from `1` to `n_blocks`.
-#'   * `param_vals` :: named `list()`\cr
-#'       are the parameter values of the layer `i`.
-#'   * `param_set` :: [`ParamSet`][paradox::ParamSet]\cr
-#'       is the parameter set of the whole `PipeOpTorchBlock`.
-#'
-#'   The function must return the modified parameter values for the given layer.
-#'   This, e.g., allows for special behavior of the first or last layer.
 #' @section Input and Output Channels:
 #' The `PipeOp` sets its input and output channels to those from the `block` (Graph)
 #' it received during construction.
@@ -29,24 +17,20 @@
 #' @template pipeop_torch
 #' @export
 #' @examplesIf torch::torch_is_installed()
-#' # repeat a simple linear layer with ReLU activation 3 times, but set the bias for the last
-#' # layer to `FALSE`
-#' block = nn("linear") %>>% nn("relu")
+#' block = po("nn_linear") %>>% po("nn_relu")
+#' po_block = po("nn_block", block,
+#' nn_linear.out_features = 10L, n_blocks = 3)
+#' network = po("torch_ingress_num") %>>%
+#' po_block %>>%
+#' po("nn_head") %>>%
+#' po("torch_loss", t_loss("cross_entropy")) %>>%
+#' po("torch_optimizer", t_opt("adam")) %>>%
+#' po("torch_model_classif",
+#'   batch_size = 50,
+#'   epochs = 3)
 #'
-#' blocks = nn("block", block,
-#'   linear.out_features = 10L, linear.bias = TRUE, n_blocks = 3,
-#'   trafo = function(i, param_vals, param_set) {
-#'     if (i  == param_set$get_values()$n_blocks) {
-#'       param_vals$linear.bias = FALSE
-#'     }
-#'     param_vals
-#'   })
-#' graph = po("torch_ingress_num") %>>%
-#'   blocks %>>%
-#'   nn("head")
-#' md = graph$train(tsk("iris"))[[1L]]
-#' network = model_descriptor_to_module(md)
-#' network
+#' task = tsk("iris")
+#' network$train(task)
 PipeOpTorchBlock = R6Class("PipeOpTorchBlock",
   inherit = PipeOpTorch,
   public = list(
@@ -60,12 +44,8 @@ PipeOpTorchBlock = R6Class("PipeOpTorchBlock",
     initialize = function(block, id = "nn_block", param_vals = list()) {
       private$.block = as_graph(block)
       private$.param_set_base = ps(
-        n_blocks = p_int(lower = 0L, tags = c("train", "required")),
-        trafo = p_uty(tags = "train", custom_check = crate(function(x) {
-          check_function(x, args = c("i", "param_vals", "param_set"))
-        }))
+        n_blocks = p_int(lower = 1L, tags = c("train", "required"))
       )
-
       super$initialize(
         id = id,
         param_vals = param_vals,
@@ -88,18 +68,11 @@ PipeOpTorchBlock = R6Class("PipeOpTorchBlock",
   private = list(
     .block = NULL,
     .make_graph = function(block, n_blocks) {
-      trafo = self$param_set$get_values()$trafo
       graph = block
-      graphs = c(replicate(n_blocks, graph$clone(deep = TRUE)))
-      if (!is.null(trafo)) {
-        param_vals = map(graphs, function(graph) graph$param_set$get_values())
-        walk(seq_along(param_vals), function(i) {
-          vals = trafo(i = i, param_vals = param_vals[[i]], param_set = self$param_set)
-          graphs[[i]]$param_set$values = vals
-        })
-      }
+      graph$update_ids(prefix = paste0(self$id, "."))
+      graphs = c(list(graph), replicate(n_blocks - 1L, graph$clone(deep = TRUE)))
       lapply(seq_len(n_blocks), function(i) {
-        graphs[[i]]$update_ids(prefix = paste0(self$id, "."), postfix = paste0("__", i))
+        graphs[[i]]$update_ids(postfix = paste0("__", i))
       })
       Reduce(`%>>%`, graphs)
     },
@@ -139,10 +112,7 @@ PipeOpTorchBlock = R6Class("PipeOpTorchBlock",
       map(mdouts, "pointer_shape")
     },
     .train = function(inputs) {
-      param_vals = self$param_set$get_values()
-      if (param_vals$n_blocks == 0L) {
-        return(inputs)
-      }
+      param_vals = self$param_set$get_values(tags = "train")
       block = private$.block$clone(deep = TRUE)
       graph = private$.make_graph(block, param_vals$n_blocks)
       inputs = set_names(inputs, graph$input$name)
diff --git a/R/PipeOpTorchConv.R b/R/PipeOpTorchConv.R
index 017e4662a..f5afb7281 100644
--- a/R/PipeOpTorchConv.R
+++ b/R/PipeOpTorchConv.R
@@ -163,3 +163,4 @@ conv_output_shape = function(shape_in, conv_dim, padding, dilation, stride, kern
     (if (ceil_mode) base::ceiling else base::floor)((shape_tail + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1)
   )
 }
+
diff --git a/R/PipeOpTorchHead.R b/R/PipeOpTorchHead.R
index 3f363d704..73244b025 100644
--- a/R/PipeOpTorchHead.R
+++ b/R/PipeOpTorchHead.R
@@ -4,10 +4,6 @@
 #'
 #' @section nn_module:
 #' Calls [`torch::nn_linear()`] with the input and output features inferred from the input shape / task.
-#' For
-#' * binary classification, the output dimension is 1.
-#' * multiclass classification, the output dimension is the number of classes.
-#' * regression, the output dimension is 1.
 #'
 #' @section Parameters:
 #' * `bias` :: `logical(1)`\cr
@@ -43,13 +39,13 @@ PipeOpTorchHead = R6Class("PipeOpTorchHead",
       if (length(shapes_in[[1]]) != 2L) {
         stopf("PipeOpTorchHead expects 2D input, but got %s.", shape_to_str(shapes_in))
       }
-      d = output_dim_for(task)
+      d = get_nout(task)
       list(c(shapes_in[[1]][[1]], d))
     },
     .shape_dependent_params = function(shapes_in, param_vals, task) {
       param_vals$in_features = shapes_in[[1L]][2L]
 
-      param_vals$out_features = output_dim_for(task)
+      param_vals$out_features = get_nout(task)
 
       param_vals
     }
diff --git a/R/PipeOpTorchIngress.R b/R/PipeOpTorchIngress.R
index fec8a8175..8b9ab0b6e 100644
--- a/R/PipeOpTorchIngress.R
+++ b/R/PipeOpTorchIngress.R
@@ -119,7 +119,7 @@ PipeOpTorchIngress = R6Class("PipeOpTorchIngress",
 #'   the output of `Task$data(rows = batch_indices, cols = features)`
 #'   and it should produce a tensor of shape `shape_out`.
 #' @param shape (`integer`)\cr
-#'   Shape that `batchgetter` will produce. Batch dimension must be included as `NA` (but other dimensions can also be `NA`, i.e., unknown).
+#'   Shape that `batchgetter` will produce. Batch-dimension should be included as `NA`.
 #' @return `TorchIngressToken` object.
 #' @family Graph Network
 #' @export
@@ -165,15 +165,12 @@ TorchIngressToken = function(features, batchgetter, shape = NULL) {
 #' @description
 #' Represents an entry point representing a tensor containing all numeric (`integer()` and `double()`)
 #' features of a task.
-#' @param shape (`integer()` or `NULL`)\cr
-#'   Shape that `batchgetter` will produce. Batch-dimension should be included as `NA`.
 #' @return [`TorchIngressToken`]
 #' @export
-ingress_num = function(shape = NULL) {
+ingress_num = function() {
   TorchIngressToken(
     selector_type(c("numeric", "integer")),
-    batchgetter_num,
-    shape = shape
+    batchgetter_num
   )
 }
 
@@ -181,15 +178,12 @@ ingress_num = function(shape = NULL) {
 #' @description
 #' Represents an entry point representing a tensor containing all categorical (`factor()`, `ordered()`, `logical()`)
 #' features of a task.
-#' @param shape (`integer()` or `NULL`)\cr
-#'   Shape that `batchgetter` will produce. Batch-dimension should be included as `NA`.
 #' @return [`TorchIngressToken`]
 #' @export
-ingress_categ = function(shape = NULL) {
+ingress_categ = function() {
   TorchIngressToken(
     selector_type(c("factor", "ordered", "logical")),
-    batchgetter_categ,
-    shape = shape
+    batchgetter_categ
   )
 }
 
@@ -219,15 +213,12 @@ selector_ltnsr = function(feature_name = NULL) {
 #' Represents an entry point representing a tensor containing a single lazy tensor feature.
 #' @param feature_name (`character(1)`)\cr
 #'   Which lazy tensor feature to select if there is more than one.
-#' @param shape (`integer()` or `NULL`)\cr
-#'   Shape that `batchgetter` will produce. Batch-dimension should be included as `NA`.
 #' @return [`TorchIngressToken`]
 #' @export
-ingress_ltnsr = function(feature_name = NULL, shape = NULL) {
+ingress_ltnsr = function(feature_name = NULL) {
   TorchIngressToken(
     selector_ltnsr(feature_name),
-    batchgetter_lazy_tensor,
-    shape = shape
+    batchgetter_lazy_tensor
   )
 }
 
diff --git a/R/PipeOpTorchLinear.R b/R/PipeOpTorchLinear.R
index 0b5aa3d72..9e26bdd51 100644
--- a/R/PipeOpTorchLinear.R
+++ b/R/PipeOpTorchLinear.R
@@ -32,18 +32,13 @@ PipeOpTorchLinear = R6Class("PipeOpTorchLinear",
         id = id,
         param_set = param_set,
         param_vals = param_vals,
-        module_generator = nn_linear,
-        only_batch_unknown = FALSE
+        module_generator = nn_linear
       )
     }
   ),
   private = list(
     .shape_dependent_params = function(shapes_in, param_vals, task) {
-      d_in = tail(shapes_in[[1]], 1)
-      if (is.na(d_in)) {
-        stopf("PipeOpLinear received an input shape where the last dimension is unknown. Please provide an input with a known last dimension.")
-      }
-      c(param_vals, list(in_features = d_in))
+      c(param_vals, list(in_features = tail(shapes_in[[1]], 1)))
     },
     .shapes_out = function(shapes_in, param_vals, task) list(c(head(shapes_in[[1]], -1), param_vals$out_features))
   )
diff --git a/R/PipeOpTorchMerge.R b/R/PipeOpTorchMerge.R
index 0a582edda..187808b54 100644
--- a/R/PipeOpTorchMerge.R
+++ b/R/PipeOpTorchMerge.R
@@ -142,7 +142,7 @@ PipeOpTorchMergeProd = R6Class("PipeOpTorchMergeProd", inherit = PipeOpTorchMerg
 #' Calls [`nn_merge_cat()`] when trained.
 #' @section Parameters:
 #' * `dim` :: `integer(1)`\cr
-#'   The dimension along which to concatenate the tensors. The default is -1, i.e., the last dimension.
+#'   The dimension along which to concatenate the tensors.
 #' @templateVar id nn_merge_cat
 #' @template pipeop_torch_channels_default
 #' @template pipeop_torch
diff --git a/R/PipeOpTorchReshape.R b/R/PipeOpTorchReshape.R
index 67289de32..a689e7cf9 100644
--- a/R/PipeOpTorchReshape.R
+++ b/R/PipeOpTorchReshape.R
@@ -5,7 +5,7 @@
 #' This internally calls [`torch::torch_reshape()`] with the given `shape`.
 #' @section Parameters:
 #' * `shape` :: `integer(1)`\cr
-#'   The desired output shape. Unknown dimension (one at most) can either be specified as `-1`.
+#'   The desired output shape. Unknown dimension (one at most) can either be specified as `-1` or `NA`.
 #' @templateVar id nn_reshape
 #' @template pipeop_torch_channels_default
 #' @template pipeop_torch
diff --git a/R/PipeOpTorchTokenizer.R b/R/PipeOpTorchTokenizer.R
index 92d6e0948..6634c64d0 100644
--- a/R/PipeOpTorchTokenizer.R
+++ b/R/PipeOpTorchTokenizer.R
@@ -13,7 +13,7 @@ initialize_token_ = function(x, d, initialization) {
 #' @title Numeric Tokenizer
 #' @inherit nn_tokenizer_num description
 #' @section nn_module:
-#' Calls [`nn_tokenizer_num()`] when trained where the parameter `n_features` is inferred.
+#' Calls [`nn_tokenizer_numeric()`] when trained where the parameter `n_features` is inferred.
 #' The output shape is `(batch, n_features, d_token)`.
 #'
 #' @section Parameters:
@@ -47,7 +47,7 @@ PipeOpTorchTokenizerNum = R6Class("PipeOpTorchTokenizerNum",
         id = id,
         param_set = param_set,
         param_vals = param_vals,
-        module_generator = nn_tokenizer_num
+        module_generator = nn_tokenizer_numeric
       )
     }
   ),
@@ -68,7 +68,6 @@ PipeOpTorchTokenizerNum = R6Class("PipeOpTorchTokenizerNum",
 #' @name nn_tokenizer_num
 #' @description
 #' Tokenizes numeric features into a dense embedding.
-#' For an input of shape `(batch, n_features)` the output shape is `(batch, n_features, d_token)`.
 #' @param n_features (`integer(1)`)\cr
 #'   The number of features.
 #' @param d_token (`integer(1)`)\cr
@@ -82,7 +81,7 @@ PipeOpTorchTokenizerNum = R6Class("PipeOpTorchTokenizerNum",
 #' @references
 #' `r format_bib("gorishniy2021revisiting")`
 #' @export
-nn_tokenizer_num = nn_module(
+nn_tokenizer_numeric = nn_module(
   "nn_tokenizer_num",
   initialize = function(n_features, d_token, bias, initialization) {
     self$n_features = assert_int(n_features, lower = 1L)
@@ -120,7 +119,6 @@ nn_tokenizer_num = nn_module(
 #' @name nn_tokenizer_categ
 #' @description
 #' Tokenizes categorical features into a dense embedding.
-#' For an input of shape `(batch, n_features)` the output shape is `(batch, n_features, d_token)`.
 #' @param cardinalities (`integer()`)\cr
 #'   The number of categories for each feature.
 #' @param d_token (`integer(1)`)\cr
@@ -147,7 +145,7 @@ nn_tokenizer_categ = nn_module(
     cardinalities_cs = cumsum(cardinalities)
     category_offsets = torch_tensor(c(0, cardinalities_cs[-length(cardinalities_cs)]),
       dtype = torch_long())
-    self$category_offsets = nn_buffer(category_offsets)
+    self$register_buffer("category_offsets", category_offsets, persistent = FALSE)
     n_embeddings = cardinalities_cs[length(cardinalities_cs)]
 
     self$embeddings = nn_embedding(n_embeddings, d_token)
@@ -181,17 +179,7 @@ nn_tokenizer_categ = nn_module(
 #' @section nn_module:
 #' Calls [`nn_tokenizer_categ()`] when trained where the parameter `cardinalities` is inferred.
 #' The output shape is `(batch, n_features, d_token)`.
-#' @section Parameters:
-#' * `d_token` :: `integer(1)`\cr
-#'   The dimension of the embedding.
-#' * `bias` :: `logical(1)`\cr
-#'   Whether to use a bias. Is initialized to `TRUE`.
-#' * `initialization` :: `character(1)`\cr
-#'   The initialization method for the embedding weights. Possible values are `"uniform"` (default)
-#'   and `"normal"`.
-#' * `cardinalities` :: `integer()`\cr
-#'   The number of categories for each feature.
-#'   Only needs to be provided when working with [`lazy_tensor`] inputs.
+#' @inheritSection mlr_pipeops_nn_tokenizer_num Parameters
 #' @templateVar id nn_tokenizer_categ
 #' @template pipeop_torch_channels_default
 #' @templateVar param_vals d_token = 10
@@ -208,8 +196,7 @@ PipeOpTorchTokenizerCateg = R6Class("PipeOpTorchTokenizerCateg",
       param_set = ps(
         d_token = p_int(lower = 1, tags = c("train", "required")),
         bias = p_lgl(init = TRUE, tags = "train"),
-        initialization = p_fct(init = "uniform", levels = c("uniform", "normal"), tags = "train"),
-        cardinalities = p_int(lower = 1, tags = "train")
+        initialization = p_fct(init = "uniform", levels = c("uniform", "normal"), tags = "train")
       )
       super$initialize(
         id = id,
@@ -221,15 +208,6 @@ PipeOpTorchTokenizerCateg = R6Class("PipeOpTorchTokenizerCateg",
   ),
   private = list(
     .shape_dependent_params = function(shapes_in, param_vals, task) {
-      if ("lazy_tensor" %in% task$feature_types$type) {
-        if (!single_lazy_tensor(task)) {
-          stopf("Categorical tokenizer can only work with a single lazy tensor, but got %i", sum(task$feature_types$type == "lazy_tensor"))
-        }
-        if (is.null(param_vals$cardinalities)) {
-          stopf("Categorical tokenizer received a lazy tensor input, but no parameter 'cardinalities' was specified.")
-        }
-        return(param_vals)
-      }
       c(param_vals, list(cardinalities = lengths(task$levels(task$feature_names))))
     },
     .shapes_out = function(shapes_in, param_vals, task) {
diff --git a/R/TaskClassif_cifar.R b/R/TaskClassif_cifar.R
index d498a8053..798421e27 100644
--- a/R/TaskClassif_cifar.R
+++ b/R/TaskClassif_cifar.R
@@ -34,6 +34,8 @@
 #' @examples
 #' task_cifar10 = tsk("cifar10")
 #' task_cifar100 = tsk("cifar100")
+#' print(task_cifar10)
+#' print(task_cifar100)
 NULL
 
 cifar_ds_generator = torch::dataset(
diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R
index 248470326..39b84a475 100644
--- a/R/TaskClassif_melanoma.R
+++ b/R/TaskClassif_melanoma.R
@@ -39,6 +39,7 @@
 #' `r format_bib("melanoma2021")`
 #' @examples
 #' task = tsk("melanoma")
+#' task
 NULL
 
 # @param path (`character(1)`)\cr
diff --git a/R/TaskClassif_mnist.R b/R/TaskClassif_mnist.R
index 099152af9..a35e7ab4d 100644
--- a/R/TaskClassif_mnist.R
+++ b/R/TaskClassif_mnist.R
@@ -23,8 +23,9 @@
 #'
 #' @references
 #' `r format_bib("mnist")`
-#' @examples
+#' @examplesIf torch::torch_is_installed()
 #' task = tsk("mnist")
+#' task
 NULL
 
 # @param path (`character(1)`)\cr
diff --git a/R/TaskClassif_tiny_imagenet.R b/R/TaskClassif_tiny_imagenet.R
index c8c45ba21..431d10eff 100644
--- a/R/TaskClassif_tiny_imagenet.R
+++ b/R/TaskClassif_tiny_imagenet.R
@@ -24,8 +24,9 @@
 #'
 #' @references
 #' `r format_bib("imagenet2009")`
-#' @examples
+#' @examplesIf torch::torch_is_installed()
 #' task = tsk("tiny_imagenet")
+#' task
 NULL
 
 # @param path (`character(1)`)\cr
diff --git a/R/TorchDescriptor.R b/R/TorchDescriptor.R
index fb3cd7a13..e8170430a 100644
--- a/R/TorchDescriptor.R
+++ b/R/TorchDescriptor.R
@@ -57,7 +57,10 @@ TorchDescriptor = R6Class("TorchDescriptor",
           args = formalArgs(init)
         }
       }
-
+      if ("..." %nin% args && !test_subset(self$param_set$ids(), args)) {
+        missing = setdiff(self$param_set$ids(), args)
+        stopf("Parameter values with ids %s are missing in generator.", paste0("'", missing, "'", collapse = ", "))
+      }
       self$man = assert_string(man, null.ok = TRUE)
       self$id = assert_string(id %??% class(generator)[[1L]], min.chars = 1L)
       self$label = assert_string(label %??% self$id, min.chars = 1L)
diff --git a/R/TorchLoss.R b/R/TorchLoss.R
index 2d9f154a2..3fa657df2 100644
--- a/R/TorchLoss.R
+++ b/R/TorchLoss.R
@@ -92,9 +92,8 @@ TorchLoss = R6::R6Class("TorchLoss",
     task_types = NULL,
     #' @description
     #' Creates a new instance of this [R6][R6::R6Class] class.
-    #' @param torch_loss (`nn_loss` or `function`)\cr
-    #'   The loss module or function that generates the loss module.
-    #'   Can have arguments `task` that will be provided when the loss is instantiated.
+    #' @param torch_loss (`nn_loss`)\cr
+    #'   The loss module.
     #' @param task_types (`character()`)\cr
     #'   The task types supported by this loss.
     #' @param param_set ([`ParamSet`][paradox::ParamSet] or `NULL`)\cr
@@ -111,9 +110,8 @@ TorchLoss = R6::R6Class("TorchLoss",
       } else {
         c("classif", "regr")
       }
-      assert(check_class(torch_loss, "nn_module_generator"), check_class(torch_loss, "function"))
+      torch_loss = assert_class(torch_loss, "nn_module")
 
-      param_set = assert_r6(param_set, "ParamSet", null.ok = TRUE) %??% inferps(torch_loss, ignore = "task")
       super$initialize(
         generator = torch_loss,
         id = id,
@@ -130,20 +128,6 @@ TorchLoss = R6::R6Class("TorchLoss",
       super$print(...)
       catn(str_indent("* Task Types:", as_short_string(self$task_types, 1000L)))
       invisible(self)
-    },
-    #' @description
-    #' Instantiates the loss function.
-    #' @param task (`Task`)\cr
-    #'   The task. Must be provided if the loss function requires a task.
-    #' @return `torch_loss`
-    generate = function(task = NULL) {
-      require_namespaces(self$packages)
-      args = self$param_set$get_values()
-      if ("task" %in% formalArgs(self$generator)) {
-        assert_true(!is.null(task))
-        args = insert_named(args, list(task = task))
-      }
-      do.call(self$generator, args)
     }
   ),
   private = list(
@@ -275,67 +259,16 @@ mlr3torch_losses$add("l1", function() {
 
 mlr3torch_losses$add("cross_entropy", function() {
   p = ps(
-    class_weight = p_uty(default = NULL, tags = "train"),
+    weight = p_uty(default = NULL, tags = "train"),
     ignore_index = p_int(default = -100, tags = "train"),
     reduction = p_fct(levels = c("mean", "sum"), default = "mean", tags = "train")
   )
   TorchLoss$new(
-    torch_loss = function(task, ...) {
-      if (task$task_type != "classif") {
-        stopf("Cross entropy loss is only defined for classification tasks, but task is of type '%s'", task$task_type)
-      }
-      args = list(...)
-      is_binary = "twoclass" %in% task$properties
-      if (is_binary) {
-        if (!is.null(args$ignore_index)) {
-          stopf("ignore_index is not supported for binary cross entropy loss")
-        }
-        if (!is.null(args$class_weight)) {
-          args$pos_weight = args$class_weight
-          args$class_weight = NULL
-        }
-        return(invoke(nn_bce_with_logits_loss, .args = args))
-      }
-      if (!is.null(args$class_weight)) {
-        args$weight = args$class_weight
-        args$class_weight = NULL
-      }
-      invoke(nn_cross_entropy_loss, .args = args)
-    },
+    torch_loss = torch::nn_cross_entropy_loss,
     task_types = "classif",
     param_set = p,
     id = "cross_entropy",
     label = "Cross Entropy",
-    man = "mlr3torch::cross_entropy"
+    man = "torch::nn_cross_entropy_loss"
   )
 })
-
-#' @title Cross Entropy Loss
-#' @name cross_entropy
-#' @description
-#' The `cross_entropy` loss function selects the multi-class ([`nn_cross_entropy_loss`][torch::nn_cross_entropy_loss])
-#' or binary ([`nn_bce_with_logits_loss`][torch::nn_bce_with_logits_loss]) cross entropy
-#' loss based on the number of classes.
-#' Because of this, there is a slight reparameterization of the loss arguments, see *Parameters*.
-#' @section Parameters:
-#' * `class_weight`:: [`torch_tensor`][torch::torch_tensor]\cr
-#'    The class weights. For multi-class problems, this must be a `torch_tensor` of length `num_classes`
-#'    (and is passed as argument `weight` to [`nn_cross_entropy_loss`][torch::nn_cross_entropy_loss]).
-#'    For binary problems, this must be a scalar (and is passed as argument `pos_weight` to
-#'    [`nn_bce_with_logits_loss`][torch::nn_bce_with_logits_loss]).
-#' - `ignore_index`:: `integer(1)`\cr
-#'    Index of the class which to ignore and which does not contribute to the gradient.
-#'    This is only available for multi-class loss.
-#' - `reduction` :: `character(1)`\cr
-#'    The reduction to apply. Is either `"mean"` or `"sum"` and passed as argument `reduction`
-#'    to either loss function. The default is `"mean"`.
-#' @examplesIf torch::torch_is_installed()
-#' loss = t_loss("cross_entropy")
-#' # multi-class
-#' multi_ce = loss$generate(tsk("iris"))
-#' multi_ce
-#'
-#' # binary
-#' binary_ce = loss$generate(tsk("sonar"))
-#' binary_ce
-NULL
diff --git a/R/TorchOptimizer.R b/R/TorchOptimizer.R
index 0edb85a40..f183380f6 100644
--- a/R/TorchOptimizer.R
+++ b/R/TorchOptimizer.R
@@ -1,7 +1,3 @@
-single_param_group = function(params) {
-  return(params)
-}
-
 #' @title Convert to TorchOptimizer
 #'
 #' @description
@@ -45,11 +41,11 @@ as_torch_optimizer.character = function(x, clone = FALSE, ...) { # nolint
 #' @title Torch Optimizer
 #'
 #' @description
-#' This wraps a `torch::torch_optimizer_generator` and annotates it with metadata, most importantly a [`ParamSet`][paradox::ParamSet].
+#' This wraps a `torch::torch_optimizer_generator` and annotates it with metadata, most importantly a [`ParamSet`][paradox::ParamSet].
 #' The optimizer is created for the given parameter values by calling the `$generate()` method.
 #'
 #' This class is usually used to configure the optimizer of a torch learner, e.g.
-#' when constructing a learner or in a [`ModelDescriptor`].
+#' when constructing a learner or in a [`ModelDescriptor`].
 #'
 #' For a list of available optimizers, see [`mlr3torch_optimizers`].
 #' Items from this dictionary can be retrieved using [`t_opt()`].
@@ -57,12 +53,12 @@ as_torch_optimizer.character = function(x, clone = FALSE, ...) { # nolint
 #' @section Parameters:
 #' Defined by the constructor argument `param_set`.
 #' If no parameter set is provided during construction, the parameter set is constructed by creating a parameter
-#' for each argument of the wrapped loss function, where the parameters are then of type [`ParamUty`][paradox::Domain].
+#' for each argument of the wrapped optimizer, where the parameters are then of type [`ParamUty`][paradox::Domain].
 #'
 #' @family Torch Descriptor
 #' @export
 #' @examplesIf torch::torch_is_installed()
-#' # Create a new torch optimizer
+#' # Create a new torch optimizer
 #' torch_opt = TorchOptimizer$new(optim_ignite_adam, label = "adam")
 #' torch_opt
 #' # If the param set is not specified, parameters are inferred but are of class ParamUty
@@ -113,9 +109,6 @@ TorchOptimizer = R6::R6Class("TorchOptimizer",
       } else {
         param_set = inferps(torch_optimizer, ignore = "params")
       }
-      param_set = c(param_set,
-        ps(param_groups = p_uty(custom_check = check_function, tags = "train"))
-      )
       super$initialize(
         generator = torch_optimizer,
         id = id,
@@ -132,15 +125,7 @@ TorchOptimizer = R6::R6Class("TorchOptimizer",
     #' @return `torch_optimizer`
     generate = function(params) {
       require_namespaces(self$packages)
-      pvs = self$param_set$get_values()
-      param_groups_fn = pvs$param_groups
-
-      if (!is.null(param_groups_fn)) {
-        pvs$param_groups = NULL
-        params = param_groups_fn(params)
-      }
-
-      invoke(self$generator, .args = pvs, params = params)
+      invoke(self$generator, .args = self$param_set$get_values(), params = params)
     }
   ),
   private = list(
@@ -293,6 +278,7 @@ mlr3torch_optimizers$add("adam",
   }
 )
 
+
 mlr3torch_optimizers$add("sgd",
   function() {
     p = ps(
@@ -312,6 +298,8 @@ mlr3torch_optimizers$add("sgd",
   }
 )
 
+
+
 mlr3torch_optimizers$add("rmsprop",
   function() {
     p = ps(
@@ -332,6 +320,7 @@ mlr3torch_optimizers$add("rmsprop",
   }
 )
 
+
 mlr3torch_optimizers$add("adagrad",
   function() {
     p = ps(
diff --git a/R/bibentries.R b/R/bibentries.R
index 0d9f2513b..9a1c7667c 100644
--- a/R/bibentries.R
+++ b/R/bibentries.R
@@ -127,15 +127,6 @@ bibentries = c(# nolint start
     author = "Krizhevsky, Alex",
     journal= "Master's thesis, Department of Computer Science, University of Toronto",,
     year = "2009",
-  ),
-  shazeer2020glu = bibentry("misc",
-    title = "GLU Variants Improve Transformer",
-    author = "Noam Shazeer",
-    year = "2020",
-    eprint = "2002.05202",
-    archivePrefix = "arXiv",
-    primaryClass = "cs.LG",
-    url = "https://arxiv.org/abs/2002.05202"
   )
 ) # nolint end
 
diff --git a/R/lazy_tensor.R b/R/lazy_tensor.R
index caa59fd2c..d050f8545 100644
--- a/R/lazy_tensor.R
+++ b/R/lazy_tensor.R
@@ -231,7 +231,6 @@ is_lazy_tensor = function(x) {
 #'   Is not cloned, so should be cloned beforehand.
 #' @param shape (`integer()` or `NULL`)\cr
 #'   The shape of the lazy tensor.
-#'   `NA`s indicate dimensions where the shape is not known.
 #' @param shape_predict (`integer()` or `NULL`)\cr
 #'   The shape of the lazy tensor if it was applied during `$predict()`.
 #'
@@ -340,21 +339,3 @@ rep.lazy_tensor = function(x, ...) {
 rep_len.lazy_tensor = function(x, ...) {
   set_class(NextMethod(), c("lazy_tensor", "list"))
 }
-
-
-#' @title Shape of Lazy Tensor
-#' @description
-#' Shape of a lazy tensor. Might be `NULL` if the shapes is not known or varying between rows.
-#' Batch dimension is always `NA`.
-#' @param x ([`lazy_tensor`])\cr
-#'   Lazy tensor.
-#' @export
-#' @return (`integer()` or `NULL`)
-#' @examplesIf torch::torch_is_installed()
-#' lt = as_lazy_tensor(1:10)
-#' lazy_shape(lt)
-#' lt = as_lazy_tensor(matrix(1:10, nrow = 2))
-#' lazy_shape(lt)
-lazy_shape = function(x) {
-  dd(x)$pointer_shape
-}
diff --git a/R/learner_torch_methods.R b/R/learner_torch_methods.R
index 19d9b7881..913d9bf55 100644
--- a/R/learner_torch_methods.R
+++ b/R/learner_torch_methods.R
@@ -27,8 +27,9 @@ learner_torch_train = function(self, private, super, task, param_vals) {
     stopf("Training Dataloader of Learner '%s' has length 0", self$id)
   }
 
-  network = private$.network(task, param_vals)$to(device = param_vals$device)
-  if (isTRUE(param_vals$jit_trace) && !inherits(network, "script_module")) {
+  network = private$.network(task, param_vals)
+  network$to(device = param_vals$device)
+  if (param_vals$jit_trace && !inherits(network, "script_module")) {
     example = get_example_batch(loader_train)$x
     example = lapply(example, function(x) x$to(device = param_vals$device))
     # tracer requires arguments to be passed by name
@@ -42,7 +43,7 @@ learner_torch_train = function(self, private, super, task, param_vals) {
   if (is.null(self$optimizer)) stopf("Learner '%s' defines no optimizer", self$id)
   optimizer = self$optimizer$generate(network$parameters)
   if (is.null(self$loss)) stopf("Learner '%s' defines no loss", self$id)
-  loss_fn = self$loss$generate(task)
+  loss_fn = self$loss$generate()
   loss_fn$to(device = param_vals$device)
 
   measures_train = normalize_to_list(param_vals$measures_train)
@@ -134,6 +135,8 @@ train_loop = function(ctx, cbs) {
 
   ctx$network$train()
 
+  forward = get_forward(ctx$network)
+
   # if we increment epoch at the end of the loop it has the wrong value
   # during the final two callback stages
   ctx$epoch = 0L
@@ -145,6 +148,7 @@ train_loop = function(ctx, cbs) {
     indices = list()
     train_iterator = dataloader_make_iter(ctx$loader_train)
     ctx$step = 0L
+    eval_train = eval_train_in_epoch(ctx)
     while (ctx$step < length(ctx$loader_train)) {
       ctx$step = ctx$step + 1
       ctx$batch = dataloader_next(train_iterator)
@@ -155,26 +159,28 @@ train_loop = function(ctx, cbs) {
       call("on_batch_begin")
 
       if (length(ctx$batch$x) == 1L) {
-        ctx$y_hat = ctx$network(ctx$batch$x[[1L]])
+        y_hat = forward(ctx$batch$x[[1L]])
       } else {
-        ctx$y_hat = do.call(ctx$network, ctx$batch$x)
+        y_hat = do.call(forward, ctx$batch$x)
       }
 
-      loss = ctx$loss_fn(ctx$y_hat, ctx$batch$y)
+      loss = ctx$loss_fn(y_hat, ctx$batch$y)
 
       loss$backward()
 
       call("on_after_backward")
 
       ctx$last_loss = loss$item()
-      predictions[[length(predictions) + 1]] = ctx$y_hat$detach()
-      indices[[length(indices) + 1]] = as.integer(ctx$batch$.index$to(device = "cpu"))
+      if (eval_train) {
+        predictions[[length(predictions) + 1]] = y_hat$detach()
+        indices[[length(indices) + 1]] = as.integer(ctx$batch$.index$to(device = "cpu"))
+      }
       ctx$optimizer$step()
 
       call("on_batch_end")
     }
 
-    ctx$last_scores_train = if (eval_train_in_epoch(ctx)) {
+    ctx$last_scores_train = if (eval_train) {
       measure_prediction(
         pred_tensor = torch_cat(predictions, dim = 1L),
         measures = ctx$measures_train,
@@ -228,9 +234,6 @@ eval_valid_in_epoch = function(ctx) {
 }
 
 has_one_arg = function(network) {
-  if (inherits(network, "nn_graph")) {
-    return(length(network$input_map) == 1L)
-  }
   fargs = formalArgs(network)
   length(fargs) == 1L && !fargs == "..."
 }
@@ -241,14 +244,14 @@ torch_network_predict_valid = function(ctx, callback_receiver = function(step_na
   one_arg = has_one_arg(network)
   predictions = vector("list", length = length(loader))
   valid_iterator = dataloader_make_iter(loader)
-  ctx$step_valid = 0L
-  while (ctx$step_valid < length(loader)) {
-    ctx$step_valid = ctx$step_valid + 1L
+  ctx$step = 0L
+  while (ctx$step < length(loader)) {
+    ctx$step = ctx$step + 1L
     ctx$batch = dataloader_next(valid_iterator)
     ctx$batch$x = lapply(ctx$batch$x, function(x) x$to(device = ctx$device))
 
     callback_receiver("on_batch_valid_begin")
-    predictions[[ctx$step_valid]] = if (one_arg) {
+    predictions[[ctx$step]] = if (one_arg) {
       with_no_grad(network$forward(ctx$batch$x[[1L]]))
     } else {
       with_no_grad(invoke(network$forward, .args = ctx$batch$x))
@@ -287,7 +290,7 @@ encode_prediction_default = function(predict_tensor, predict_type, task) {
   # Currently this check is done in mlr3torch but should at some point be handled in mlr3 / mlr3pipelines
 
   response = prob = NULL
-  if (task$task_type == "classif" && "multiclass" %in% task$properties) {
+  if (task$task_type == "classif") {
     if (predict_type == "prob") {
       predict_tensor = with_no_grad(nnf_softmax(predict_tensor, dim = 2L))
     }
@@ -295,31 +298,15 @@ encode_prediction_default = function(predict_tensor, predict_type, task) {
     response = as.integer(with_no_grad(predict_tensor$argmax(dim = 2L))$to(device = "cpu"))
 
     predict_tensor = predict_tensor$to(device = "cpu")
-    prob = if (predict_type == "prob") {
+    if (predict_type == "prob") {
       prob = as.matrix(predict_tensor)
       colnames(prob) = task$class_names
-      prob
+    } else {
+      prob = NULL
     }
 
     class(response) = "factor"
     levels(response) = task$class_names
-    return(list(response = response, prob = prob))
-  } else if (task$task_type == "classif") {
-    # binary:
-    # (first factor level is positive class)
-    response = as.integer(with_no_grad(predict_tensor < 0)$to(device = "cpu") + 1)
-    class(response) = "factor"
-    levels(response) = task$class_names
-
-    prob = if (predict_type == "prob") {
-      # convert score to prob
-      predict_tensor = with_no_grad(nnf_sigmoid(predict_tensor))
-      prob = as.numeric(predict_tensor)
-      prob = as.matrix(data.frame(prob, 1 - prob))
-      colnames(prob) = task$class_names
-      prob
-    }
-
     return(list(response = response, prob = prob))
   } else if (task$task_type == "regr") {
     if (predict_type == "response") {
@@ -330,6 +317,7 @@ encode_prediction_default = function(predict_tensor, predict_type, task) {
   } else {
     stopf("Invalid task_type.")
   }
+
 }
 
 
@@ -339,16 +327,13 @@ measure_prediction = function(pred_tensor, measures, task, row_ids, prediction_e
   }
 
   prediction = prediction_encoder(predict_tensor = pred_tensor, task = task)
-  prediction = as_prediction_data(prediction, task = task, check = FALSE, row_ids = row_ids)
-  prediction = as_prediction(prediction, task = task, check = FALSE)
+  prediction = as_prediction_data(prediction, task = task, check = TRUE, row_ids = row_ids)
+  prediction = as_prediction(prediction, task = task)
 
   lapply(
     measures,
     function(measure) {
-      tryCatch(
-        measure$score(prediction, task = task, train_set = task$row_roles$use),
-        error = function(e) NaN
-      )
+      measure$score(prediction, task = task, train_set = task$row_roles$use)
     }
   )
 }
diff --git a/R/materialize.R b/R/materialize.R
index 1b1730d63..849024ad4 100644
--- a/R/materialize.R
+++ b/R/materialize.R
@@ -63,13 +63,6 @@ materialize.list = function(x, device = "cpu", rbind = FALSE, cache = "auto", ..
 
   map(x, function(col) {
     if (is_lazy_tensor(col)) {
-      if (length(col) == 0L) {
-        if (rbind) {
-          return(torch_empty(0L, device = device))
-        } else {
-          return(list())
-        }
-      }
       materialize_internal(col, device = device, cache = cache, rbind = rbind)
     } else {
       col
@@ -83,30 +76,16 @@ materialize.list = function(x, device = "cpu", rbind = FALSE, cache = "auto", ..
 #' @method materialize data.frame
 #' @export
 materialize.data.frame = function(x, device = "cpu", rbind = FALSE, cache = "auto", ...) { # nolint
-  if (nrow(x) == 0L) {
-    if (rbind) {
-      set_names(replicate(ncol(x), torch_empty(0L)), names(x))
-    } else {
-      set_names(replicate(ncol(x), list()), names(x))
-    }
-  }
   materialize(as.list(x), device = device, rbind = rbind, cache = cache)
 }
 
 
 #' @export
 materialize.lazy_tensor = function(x, device = "cpu", rbind = FALSE, ...) { # nolint
-  if (length(x) == 0L) {
-    if (rbind) {
-      return(torch_empty(0L))
-    } else {
-      return(list())
-    }
-  }
   materialize_internal(x = x, device = device, cache = NULL, rbind = rbind)
 }
 
-get_input = function(ds, ids, varying_shapes) {
+get_input = function(ds, ids, varying_shapes, rbind) {
   if (is.null(ds$.getbatch)) { # .getindex is never NULL but a function that errs if it was not defined
     x = map(ids, function(id) map(ds$.getitem(id), function(x) x$unsqueeze(1)))
     if (varying_shapes) {
@@ -175,13 +154,16 @@ get_output = function(input, graph, varying_shapes, rbind, device) {
 #' @return [`lazy_tensor()`]
 #' @keywords internal
 materialize_internal = function(x, device = "cpu", cache = NULL, rbind) {
+  if (!length(x)) {
+    stopf("Cannot materialize lazy tensor of length 0.")
+  }
   do_caching = !is.null(cache)
   ids = map_int(x, 1)
 
   data_descriptor = dd(x)
   ds = data_descriptor$dataset
   graph = data_descriptor$graph
-  varying_shapes = is.null(data_descriptor$dataset$.getbatch) && some(data_descriptor$dataset_shapes, function(x) is.null(x) || anyNA(x[-1]))
+  varying_shapes = some(data_descriptor$dataset_shapes, is.null)
 
   pointer_name = paste0(data_descriptor$pointer, collapse = ".")
   if (do_caching) {
@@ -201,7 +183,7 @@ materialize_internal = function(x, device = "cpu", cache = NULL, rbind) {
   }
 
   if (!do_caching || !input_hit) {
-    input = get_input(ds, ids, varying_shapes)
+    input = get_input(ds, ids, varying_shapes, rbind)
   }
 
   if (do_caching && !input_hit) {
diff --git a/R/nn.R b/R/nn.R
index e8efe02c6..1e97e6b15 100644
--- a/R/nn.R
+++ b/R/nn.R
@@ -11,9 +11,5 @@
 #' # is the same as:
 #' po2 = nn("linear")
 nn = function(.key, ...) {
-  args = list(...)
-  if (is.null(args$id)) {
-    args$id = .key
-  }
-  invoke(po, .obj = paste0("nn_", .key), .args = args)
+  invoke(po, .obj = paste0("nn_", .key), id = .key, ...)
 }
diff --git a/R/nn_graph.R b/R/nn_graph.R
index 7d0a98f1e..06f6257f5 100644
--- a/R/nn_graph.R
+++ b/R/nn_graph.R
@@ -1,23 +1,7 @@
 #' @title Graph Network
 #'
 #' @description
-#' Represents a neural network using a [`Graph`][mlr3pipelines::Graph] that contains mostly [`PipeOpModule`]s.
-#'
-#' @section Fields:
-#' * `graph` :: [`Graph`][mlr3pipelines::Graph]\cr
-#'   The graph (consisting primarily of [`PipeOpModule`]s) that is wrapped by the network.
-#' * `input_map` :: `character()`\cr
-#'   The names of the input arguments of the network.
-#' * `shapes_in` :: `list()`\cr
-#'   The shapes of the input tensors of the network.
-#' * `output_map` :: `character()`\cr
-#'   Which output elements of the graph are returned by the `$forward()` method.
-#' * `list_output` :: `logical(1)`\cr
-#'   Whether the output is a list of tensors.
-#' * `module_list` :: [`nn_module_list`][torch::nn_module_list]\cr
-#'   The list of modules in the network.
-#' * `list_output` :: `logical(1)`\cr
-#'   Whether the output is a list of tensors.
+#' Represents a neural network using a [`Graph`][mlr3pipelines::Graph] that usually contains mostly [`PipeOpModule`]s.
 #'
 #' @param graph ([`Graph`][mlr3pipelines::Graph])\cr
 #'   The [`Graph`][mlr3pipelines::Graph] to wrap. Is **not** cloned.
@@ -48,14 +32,14 @@ nn_graph = nn_module(
   "nn_graph",
   initialize = function(graph, shapes_in, output_map = graph$output$name, list_output = FALSE) {
     self$graph = as_graph(graph, clone = FALSE)
-    self$input_map = graph$input$name  # cache this, it is expensive
+    self$graph_input_name = graph$input$name  # cache this, it is expensive
 
     # we do NOT verify the input and type of the graph to be `"torch_tensor"`.
     # The reason for this is that the graph, when constructed with the PipeOpTorch Machinery, contains PipeOpNOPs,
     # which have input and output type *.
 
     self$list_output = assert_flag(list_output)
-    assert_names(names(shapes_in), permutation.of = self$input_map)
+    assert_names(names(shapes_in), permutation.of = self$graph_input_name)
     self$shapes_in = assert_list(shapes_in, types = "integerish")
     self$output_map = assert_subset(output_map, self$graph$output$name)
     if (!list_output && length(output_map) != 1) {
diff --git a/R/paramset_torchlearner.R b/R/paramset_torchlearner.R
index e0dac0b36..0af5d74d4 100644
--- a/R/paramset_torchlearner.R
+++ b/R/paramset_torchlearner.R
@@ -45,7 +45,7 @@ epochs_tune_fn = function(domain, param_vals) {
 }
 
 
-paramset_torchlearner = function(task_type, jittable = FALSE) {
+paramset_torchlearner = function(task_type) {
   check_measures = switch(task_type,
     regr = check_measures_regr,
     classif = check_measures_classif,
@@ -59,6 +59,7 @@ paramset_torchlearner = function(task_type, jittable = FALSE) {
     num_threads           = p_int(lower = 1L, tags = c("train", "predict", "required", "threads"), init = 1L),
     num_interop_threads   = p_int(lower = 1L, tags = c("train", "predict", "required"), init = 1L),
     seed                  = p_int(tags = c("train", "predict", "required"), special_vals = list("random", NULL), init = "random"),
+    jit_trace             = p_lgl(init = FALSE, tags = c("train", "required")),
     # evaluation
     eval_freq             = p_int(lower = 1L, tags = c("train", "required"), init = 1L),
     measures_train        = p_uty(tags = c("train", "required"), custom_check = check_measures, init = list()),
@@ -81,12 +82,6 @@ paramset_torchlearner = function(task_type, jittable = FALSE) {
     worker_packages       = p_uty(tags = c("train", "predict"), custom_check = check_character, special_vals = list(NULL)),
     tensor_dataset        = p_fct(levels = "device", init = FALSE, tags = c("train", "predict"), special_vals = list(FALSE, TRUE))
   )
-  if (jittable) {
-    param_set = c(
-      param_set,
-      ps(jit_trace = p_lgl(init = FALSE, tags = c("train", "required")))
-    )
-  }
   return(param_set)
 }
 
diff --git a/R/preprocess.R b/R/preprocess.R
index 418dd7cd4..637fcfdf5 100644
--- a/R/preprocess.R
+++ b/R/preprocess.R
@@ -7,9 +7,7 @@ NULL
 register_preproc("trafo_resize", torchvision::transform_resize,
   packages = "torchvision",
   param_set = ps(
-    size = p_uty(tags = c("train", "required"), custom_check = crate(function(x) {
-      check_integerish(x, min.len = 1L, max.len = 2L)
-    })),
+    size = p_uty(tags = c("train", "required")),
     interpolation = p_fct(levels = c("Undefined", "Bartlett", "Blackman", "Bohman", "Box", "Catrom", "Cosine", "Cubic",
       "Gaussian", "Hamming", "Hann", "Hanning", "Hermite", "Jinc", "Kaiser", "Lagrange", "Lanczos", "Lanczos2",
       "Lanczos2Sharp", "LanczosRadius", "LanczosSharp", "Mitchell", "Parzen", "Point", "Quadratic", "Robidoux",
@@ -19,14 +17,17 @@ register_preproc("trafo_resize", torchvision::transform_resize,
     )
   ),
   shapes_out = function(shapes_in, param_vals, task) {
-    assert_true(length(shapes_in[[1L]]) >= 2L)
-    size = rep(param_vals$size, length.out = 2L)
-    list(c(shapes_in[[1L]][seq_len(length(shapes_in[[1L]]) - 2)], size))
+    size = param_vals$size
+    shape = shapes_in[[1L]]
+    assert_true(length(shape) > 2)
+    height = shape[[length(shape) - 1L]]
+    width = shape[[length(shape)]]
+    s = torchvision::transform_resize(torch_ones(c(1, height, width), device = "meta"), size = size)$shape[2:3]
+    list(c(shape[seq_len(length(shape) - 2L)], s))
   },
   rowwise = FALSE
 )
 
-
 unchanged_shapes_rgb = function(shapes_in, param_vals, task) {
   assert_rgb_shape(shapes_in[[1L]])
   shapes_in
@@ -360,4 +361,4 @@ register_preproc("augment_random_choice", torchvision::transform_random_choice,
 #    transforms = p_uty(tags = c("train", "required"), custom_check = check_list),
 #    p = p_dbl(default = 0.5, lower = 0, upper = 1, tags = "train")
 #  )
-#)
\ No newline at end of file
+#)
diff --git a/R/shape.R b/R/shape.R
index 6aed4a202..d1fdda83d 100644
--- a/R/shape.R
+++ b/R/shape.R
@@ -13,12 +13,9 @@
 #'   If left `NULL` (default), the first dimension can be `NA` or not.
 #' @param len (`integer(1)`)\cr
 #'   The length of the shape.
-#' @param only_batch_unknown (`logical(1)`)\cr
-#'   Whether only the batch dimension can be `NA` in the input shapes or whether other
-#'   dimensions can also be unknown.
 #' @noRd
-assert_shape = function(shape, null_ok = FALSE, coerce = TRUE, unknown_batch = NULL, len = NULL, only_batch_unknown = FALSE) { # nolint
-  result = check_shape(shape, null_ok = null_ok, unknown_batch = unknown_batch, len = len, only_batch_unknown = only_batch_unknown) # nolint
+assert_shape = function(shape, null_ok = FALSE, coerce = TRUE, unknown_batch = NULL, len = NULL) {
+  result = check_shape(shape, null_ok = null_ok, unknown_batch = unknown_batch, len = len)
 
   if (!isTRUE(result)) stopf(result)
 
@@ -29,20 +26,19 @@ assert_shape = function(shape, null_ok = FALSE, coerce = TRUE, unknown_batch = N
 }
 
 
-test_shape = function(shape, null_ok = FALSE, unknown_batch = NULL, len = NULL, only_batch_unknown = FALSE) {
+test_shape = function(shape, null_ok = FALSE, unknown_batch = NULL, len = NULL) {
   if (is.null(shape) && null_ok) {
     return(TRUE)
   }
-  ok = test_integerish(shape, min.len = 1L, any.missing = TRUE, len = len)
+  ok = test_integerish(shape, min.len = 2L, all.missing = FALSE, any.missing = TRUE, len = len)
 
   if (!ok) {
     return(FALSE)
   }
 
-  if (only_batch_unknown && anyNA(shape[-1L])) {
+  if (anyNA(shape[-1L])) {
     return(FALSE)
   }
-
   if (is.null(unknown_batch)) {
     # first dim can be present or missing
     return(TRUE)
@@ -50,14 +46,14 @@ test_shape = function(shape, null_ok = FALSE, unknown_batch = NULL, len = NULL,
   return(is.na(shape[1L]) == unknown_batch)
 }
 
-check_shape = function(x, null_ok = FALSE, unknown_batch = NULL, len = NULL, only_batch_unknown = FALSE) {
-  if (test_shape(x, null_ok = null_ok, unknown_batch = unknown_batch, len = len, only_batch_unknown = only_batch_unknown)) { # nolint
+check_shape = function(x, null_ok = FALSE, unknown_batch = NULL, len = NULL) {
+  if (test_shape(x, null_ok = null_ok, unknown_batch = unknown_batch, len = len)) {
     return(TRUE)
   }
-  sprintf("Invalid shape: %s.", shape_to_str(x))
+  sprintf("Invalid shape: %s.", paste0(format(x), collapse = ", "))
 }
 
-assert_shapes = function(shapes, coerce = TRUE, named = FALSE, null_ok = FALSE, unknown_batch = NULL, only_batch_unknown = FALSE) { # nolint
+assert_shapes = function(shapes, coerce = TRUE, named = FALSE, null_ok = FALSE, unknown_batch = NULL) { # nolint
   ok = test_list(shapes, min.len = 1L)
   if (named) {
     assert_names(setdiff(names(shapes), "..."), type = "unique")
@@ -65,105 +61,16 @@ assert_shapes = function(shapes, coerce = TRUE, named = FALSE, null_ok = FALSE,
   if (!ok) {
     stopf("Invalid shape")
   }
-  map(shapes, assert_shape, coerce = coerce, null_ok = null_ok, unknown_batch = unknown_batch, only_batch_unknown = only_batch_unknown) # nolint
-}
-
-check_rgb_shape = function(shape) {
-  msg = check_shape(shape, len = 4L, null_ok = FALSE)
-  if (!isTRUE(msg)) {
-    return(msg)
-  }
-  if (shape[2L] != 3L) {
-    return("Second dimension must be 3 for RGB images.")
-  }
-  return(TRUE)
+  map(shapes, assert_shape, coerce = coerce, null_ok = null_ok, unknown_batch = unknown_batch)
 }
 
 assert_rgb_shape = function(shape) {
-  msg = check_rgb_shape(shape)
-  if (!isTRUE(msg)) {
-    stopf(msg)
-  }
-  shape
+  assert_shape(shape, len = 4L, null_ok = FALSE)
+  assert_true(shape[2L] == 3L)
 }
 
 # grayscale or rgb image
 assert_grayscale_or_rgb = function(shape) {
-  assert_shape(shape, len = 4L, null_ok = FALSE, only_batch_unknown = TRUE)
+  assert_shape(shape, len = 4L, null_ok = FALSE)
   assert_true(shape[2L] == 3L || shape[2L] == 1L)
 }
-
-#' @title Infer Shapes
-#' @description
-#' Infer the shapes of the output of a function based on the shapes of the input.
-#' This is done as follows:
-#' 1. All `NA`s are replaced with values `1`, `2`, `3`.
-#' 2. Three tensors are generated for the three shapes of step 1.
-#' 3. The function is called on these three tensors and the shapes are calculated.
-#' 4. If:
-#'    * the number of dimensions varies, an error is thrown.
-#'    * the number of dimensions is the same, values are set to `NA` if the dimension is varying
-#'      between the three tensors and otherwise set to the unique value.
-#'
-#' @param shapes_in (`list()`)\cr
-#'   A list of shapes of the input tensors.
-#' @param param_vals (`list()`)\cr
-#'   A list of named parameters for the function.
-#' @param output_names (`character()`)\cr
-#'   The names of the output tensors.
-#' @param fn (`function()`)\cr
-#'   The function to infer the shapes for.
-#' @param rowwise (`logical(1)`)\cr
-#'   Whether the function is rowwise.
-#' @param id (`character(1)`)\cr
-#'   The id of the PipeOp (for error messages).
-#' @return (`list()`)\cr
-#'   A list of shapes of the output tensors.
-#' @export
-infer_shapes = function(shapes_in, param_vals, output_names, fn, rowwise, id) {
-  assert_shapes(shapes_in)
-  assert_list(param_vals)
-  assert_names(output_names, type = "unique")
-  assert_function(fn)
-  assert_flag(rowwise)
-  assert_string(id)
-
-  infer_shapes_once = function(shapes) {
-    f = function(shapes, na_repl) {
-      if (rowwise) {
-        shapes = shapes[-1L]
-      }
-      shapes[is.na(shapes)] = na_repl
-      tensor_in = mlr3misc::invoke(torch_empty, .args = shapes, device = torch_device("cpu"))
-
-      fn_args = names(formals(fn))
-      filtered_params = param_vals[intersect(names(param_vals), fn_args)]
-
-      tensor_out = tryCatch(invoke(fn, tensor_in, .args = filtered_params),
-        error = function(e) {
-          stopf("Input shape '%s' is invalid for PipeOp with id '%s'.", shape_to_str(list(sin)), id)
-        }
-      )
-      dim(tensor_out)
-    }
-
-    shapes_out = lapply(1:3, f, shapes = shapes)
-
-    if (length(unique(lengths(shapes_out))) > 1L) {
-      stopf("Failed to infer shapes for PipeOp with id '%s', as the number of dimensions varies with different values filled in for the unknown dimensions.", id) # nolint
-    }
-    shapes_out = apply(do.call(rbind, shapes_out), 2, function(xs) {
-      if (length(unique(xs)) == 1L) {
-        return(xs[[1L]])
-      }
-      return(NA)
-    })
-
-    if (rowwise) {
-      shapes_out = c(shapes[[1L]], shapes_out)
-    }
-    as.integer(shapes_out)
-  }
-
-  set_names(lapply(shapes_in, infer_shapes_once), output_names)
-}
diff --git a/R/task_dataset.R b/R/task_dataset.R
index bd088d1bf..af52519ca 100644
--- a/R/task_dataset.R
+++ b/R/task_dataset.R
@@ -167,41 +167,41 @@ merge_compatible_lazy_tensor_graphs = function(lts) {
   })
 }
 
-dataset_ltnsr = function(task, param_vals, argname = "input") {
+dataset_ltnsr = function(task, param_vals) {
   po_ingress = po("torch_ingress_ltnsr", shape = param_vals$shape)
   md = po_ingress$train(list(task))[[1L]]
   ingress = md$ingress
-  names(ingress) = argname
+  names(ingress) = "input"
   task_dataset(
     task = task,
     feature_ingress_tokens = ingress,
-    target_batchgetter = get_target_batchgetter(task)
+    target_batchgetter = get_target_batchgetter(task$task_type)
   )
 }
 
-dataset_num = function(task, param_vals, argname = "input") {
+dataset_num = function(task, param_vals) {
   po_ingress = po("torch_ingress_num")
   md = po_ingress$train(list(task))[[1L]]
   ingress = md$ingress
-  names(ingress) = argname
+  names(ingress) = "input"
   task_dataset(
     task = task,
-    feature_ingress_tokens = ingress,
-    target_batchgetter = get_target_batchgetter(task)
+    feature_ingress_tokens = md$ingress,
+    target_batchgetter = get_target_batchgetter(task$task_type)
   )
 }
 
-dataset_num_categ = function(task, param_vals, argname_num = "num.input", argname_categ = "categ.input") {
+dataset_num_categ = function(task, param_vals) {
   features_num = task$feature_types[get("type") %in% c("numeric", "integer"), "id"][[1L]]
   features_categ = task$feature_types[get("type") %in% c("factor", "ordered", "logical"), "id"][[1L]]
 
   tokens = list()
 
   if (length(features_num)) {
-    tokens[[argname_num]] = TorchIngressToken(features_num, batchgetter_num, c(NA, length(features_num)))
+    tokens$input_num = TorchIngressToken(features_num, batchgetter_num, c(NA, length(features_num)))
   }
   if (length(features_categ)) {
-    tokens[[argname_categ]] = TorchIngressToken(features_categ, batchgetter_categ, c(NA, length(features_categ)))
+    tokens$input_categ = TorchIngressToken(features_categ, batchgetter_categ, c(NA, length(features_categ)))
   }
 
   assert_true(length(tokens) >= 1)
@@ -209,7 +209,7 @@ dataset_num_categ = function(task, param_vals, argname_num = "num.input", argnam
   task_dataset(
     task,
     feature_ingress_tokens = tokens,
-    target_batchgetter = get_target_batchgetter(task)
+    target_batchgetter = get_target_batchgetter(task$task_type)
   )
 }
 
@@ -251,27 +251,17 @@ batchgetter_categ = function(data, ...) {
   )
 }
 
-target_batchgetter_classif_multi = function(data) {
+target_batchgetter_classif = function(data) {
   torch_tensor(data = as.integer(data[[1L]]), dtype = torch_long())
 }
 
-target_batchgetter_classif_binary = function(data) {
-  torch_tensor(data[[1L]] == levels(data[[1L]])[1L], torch_float())$unsqueeze(2)
-}
-
 target_batchgetter_regr = function(data) {
   torch_tensor(data = data[[1L]], dtype = torch_float32())$unsqueeze(2)
 }
 
-get_target_batchgetter = function(task) {
-  task_type = task$task_type
+get_target_batchgetter = function(task_type) {
   switch(task_type,
-    classif = if ("twoclass" %in% task$properties) {
-      target_batchgetter_classif_binary
-    } else {
-      target_batchgetter_classif_multi
-    },
-    regr = target_batchgetter_regr,
-    stopf("Invalid task type: %s", task_type)
+    classif = target_batchgetter_classif,
+    regr = target_batchgetter_regr
   )
 }
diff --git a/R/utils.R b/R/utils.R
index f318013f7..8963d3e14 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -107,6 +107,14 @@ load_col_info = function(name) {
   readRDS(system.file("col_info", paste0(name, ".rds"), package = "mlr3torch"))
 }
 
+get_nout = function(task) {
+  switch(task$task_type,
+    regr = 1,
+    classif = length(task$class_names),
+    stopf("Unknown task type '%s'.", task$task_type)
+  )
+}
+
 
 test_equal_col_info = function(x, y) {
   nms = c("id", "type", "levels")
@@ -148,7 +156,7 @@ uniqueify = function(new, existing) {
 
 shape_to_str = function(x) {
   assert(test_list(x) || test_integerish(x) || is.null(x))
-  if (test_integerish(x)) { # single shape
+  if (is.numeric(x)) { # single shape
     return(sprintf("(%s)", paste0(x, collapse = ",")))
   }
   if (is.null(x)) {
@@ -190,10 +198,7 @@ list_to_batch = function(tensors) {
 }
 
 auto_cache_lazy_tensors = function(lts) {
-  if (length(lts) <= 1L) {
-    return(FALSE)
-  }
-  anyDuplicated(unlist(map_if(lts, function(x) length(x) > 0, function(x) dd(x)$dataset_hash))) > 0L
+  any(duplicated(map_chr(lts, function(x) dd(x)$dataset_hash)))
 }
 
 #' Replace the head of a network
@@ -212,10 +217,6 @@ check_nn_module = function(x) {
   check_class(x, "nn_module")
 }
 
-check_nn_module_generator = function(x) {
-  check_class(x, "nn_module_generator")
-}
-
 check_callbacks = function(x) {
   if (test_class(x, "TorchCallback")) {
     x = list(x)
@@ -275,54 +276,19 @@ order_named_args = function(f, l) {
   l2
 }
 
-
-#' @title Network Output Dimension
-#' @description
-#' Calculates the output dimension of a neural network for a given task that is expected by
-#' \pkg{mlr3torch}.
-#' For classification, this is the number of classes (unless it is a binary classification task,
-#' where it is 1). For regression, it is 1.
-#' @param x (any)\cr
-#'   The task.
-#' @param ... (any)\cr
-#'   Additional arguments. Not used yet.
-#' @export
-output_dim_for = function(x, ...) {
-  UseMethod("output_dim_for")
-}
-
-#' @export
-output_dim_for.TaskClassif = function(x, ...) {
-  if ("twoclass" %in% x$properties) {
-    return(1L)
+get_forward = function(net) {
+  if (inherits(net, "script_module")) {
+    is_training = net$is_training
+    trainforward = net$trainforward
+    evalforward = net$evalforward
+    function(...) {
+      if (is_training()) {
+        trainforward(...)
+      } else {
+        evalforward(...)
+      }
+    }
+  } else {
+    net$forward
   }
-  length(x$class_names)
-}
-
-#' @export
-output_dim_for.TaskRegr = function(x, ...) {
-  1L
-}
-
-all_or_none_ = function(...) {
-  args = list(...)
-  all_none = all(sapply(args, is.null))
-  all_not_none = all(!sapply(args, is.null))
-  return(all_none || all_not_none)
 }
-
-single_lazy_tensor = function(task) {
-  identical(task$feature_types[, "type"][[1L]], "lazy_tensor")
-}
-                              
-n_num_features = function(task) {
-  sum(task$feature_types$type %in% c("numeric", "integer"))
-}
-
-n_categ_features = function(task) {
-  sum(task$feature_types$type %in% c("factor", "ordered", "logical"))
-}
-
-n_ltnsr_features = function(task) {
-  sum(task$feature_types$type == "lazy_tensor")
-}
\ No newline at end of file
diff --git a/man/TorchIngressToken.Rd b/man/TorchIngressToken.Rd
index 3d458db03..c6613e5de 100644
--- a/man/TorchIngressToken.Rd
+++ b/man/TorchIngressToken.Rd
@@ -16,7 +16,7 @@ the output of \code{Task$data(rows = batch_indices, cols = features)}
 and it should produce a tensor of shape \code{shape_out}.}
 
 \item{shape}{(\code{integer})\cr
-Shape that \code{batchgetter} will produce. Batch dimension must be included as \code{NA} (but other dimensions can also be \code{NA}, i.e., unknown).}
+Shape that \code{batchgetter} will produce. Batch-dimension should be included as \code{NA}.}
 }
 \value{
 \code{TorchIngressToken} object.
diff --git a/man/TorchLoss.Rd b/man/TorchLoss.Rd
index eb3ce45a8..1b6e0ec4e 100644
--- a/man/TorchLoss.Rd
+++ b/man/TorchLoss.Rd
@@ -83,13 +83,13 @@ The task types this loss supports.}
 \itemize{
 \item \href{#method-TorchLoss-new}{\code{TorchLoss$new()}}
 \item \href{#method-TorchLoss-print}{\code{TorchLoss$print()}}
-\item \href{#method-TorchLoss-generate}{\code{TorchLoss$generate()}}
 \item \href{#method-TorchLoss-clone}{\code{TorchLoss$clone()}}
 }
 }
 \if{html}{\out{
 <details open><summary>Inherited methods</summary>
 <ul>
+<li><span class="pkg-link" data-pkg="mlr3torch" data-topic="TorchDescriptor" data-id="generate"><a href='../../mlr3torch/html/TorchDescriptor.html#method-TorchDescriptor-generate'><code>mlr3torch::TorchDescriptor$generate()</code></a></span></li>
 <li><span class="pkg-link" data-pkg="mlr3torch" data-topic="TorchDescriptor" data-id="help"><a href='../../mlr3torch/html/TorchDescriptor.html#method-TorchDescriptor-help'><code>mlr3torch::TorchDescriptor$help()</code></a></span></li>
 </ul>
 </details>
@@ -114,9 +114,8 @@ Creates a new instance of this \link[R6:R6Class]{R6} class.
 \subsection{Arguments}{
 \if{html}{\out{<div class="arguments">}}
 \describe{
-\item{\code{torch_loss}}{(\code{nn_loss} or \code{function})\cr
-The loss module or function that generates the loss module.
-Can have arguments \code{task} that will be provided when the loss is instantiated.}
+\item{\code{torch_loss}}{(\code{nn_loss})\cr
+The loss module.}
 
 \item{\code{task_types}}{(\code{character()})\cr
 The task types supported by this loss.}
@@ -158,27 +157,6 @@ Prints the object
 }
 }
 \if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-TorchLoss-generate"></a>}}
-\if{latex}{\out{\hypertarget{method-TorchLoss-generate}{}}}
-\subsection{Method \code{generate()}}{
-Instantiates the loss function.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TorchLoss$generate(task = NULL)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{task}}{(\code{Task})\cr
-The task. Must be provided if the loss function requires a task.}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-\code{torch_loss}
-}
-}
-\if{html}{\out{<hr>}}
 \if{html}{\out{<a id="method-TorchLoss-clone"></a>}}
 \if{latex}{\out{\hypertarget{method-TorchLoss-clone}{}}}
 \subsection{Method \code{clone()}}{
diff --git a/man/TorchOptimizer.Rd b/man/TorchOptimizer.Rd
index 1663ba327..38fd29d3f 100644
--- a/man/TorchOptimizer.Rd
+++ b/man/TorchOptimizer.Rd
@@ -4,11 +4,11 @@
 \alias{TorchOptimizer}
 \title{Torch Optimizer}
 \description{
-This wraps a \code{torch::torch_optimizer_generator} and annotates it with metadata, most importantly a \code{\link[paradox:ParamSet]{ParamSet}}.
+This wraps a \code{torch::torch_optimizer_generator} and annotates it with metadata, most importantly a \code{\link[paradox:ParamSet]{ParamSet}}.
 The optimizer is created for the given parameter values by calling the \verb{$generate()} method.
 
 This class is usually used to configure the optimizer of a torch learner, e.g.
-when constructing a learner or in a \code{\link{ModelDescriptor}}.
+when constructing a learner or in a \code{\link{ModelDescriptor}}.
 
 For a list of available optimizers, see \code{\link{mlr3torch_optimizers}}.
 Items from this dictionary can be retrieved using \code{\link[=t_opt]{t_opt()}}.
@@ -17,12 +17,12 @@ Items from this dictionary can be retrieved using \code{\link[=t_opt]{t_opt()}}.
 
 Defined by the constructor argument \code{param_set}.
 If no parameter set is provided during construction, the parameter set is constructed by creating a parameter
-for each argument of the wrapped loss function, where the parameters are then of type \code{\link[paradox:Domain]{ParamUty}}.
+for each argument of the wrapped loss function, where the parameters are then of type \code{\link[paradox:Domain]{ParamUty}}.
 }
 
 \examples{
 \dontshow{if (torch::torch_is_installed()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-# Create a new torch optimizer
+# Create a new torch optimizer
 torch_opt = TorchOptimizer$new(optim_ignite_adam, label = "adam")
 torch_opt
 # If the param set is not specified, parameters are inferred but are of class ParamUty
diff --git a/man/ingress_categ.Rd b/man/ingress_categ.Rd
index 714d4160d..ffe5e29a2 100644
--- a/man/ingress_categ.Rd
+++ b/man/ingress_categ.Rd
@@ -4,11 +4,7 @@
 \alias{ingress_categ}
 \title{Ingress Token for Categorical Features}
 \usage{
-ingress_categ(shape = NULL)
-}
-\arguments{
-\item{shape}{(\code{integer()} or \code{NULL})\cr
-Shape that \code{batchgetter} will produce. Batch-dimension should be included as \code{NA}.}
+ingress_categ()
 }
 \value{
 \code{\link{TorchIngressToken}}
diff --git a/man/ingress_ltnsr.Rd b/man/ingress_ltnsr.Rd
index dc65f4164..d6e91dc54 100644
--- a/man/ingress_ltnsr.Rd
+++ b/man/ingress_ltnsr.Rd
@@ -4,14 +4,11 @@
 \alias{ingress_ltnsr}
 \title{Ingress Token for Lazy Tensor Feature}
 \usage{
-ingress_ltnsr(feature_name = NULL, shape = NULL)
+ingress_ltnsr(feature_name = NULL)
 }
 \arguments{
 \item{feature_name}{(\code{character(1)})\cr
 Which lazy tensor feature to select if there is more than one.}
-
-\item{shape}{(\code{integer()} or \code{NULL})\cr
-Shape that \code{batchgetter} will produce. Batch-dimension should be included as \code{NA}.}
 }
 \value{
 \code{\link{TorchIngressToken}}
diff --git a/man/ingress_num.Rd b/man/ingress_num.Rd
index 22256e31a..3ea996bfb 100644
--- a/man/ingress_num.Rd
+++ b/man/ingress_num.Rd
@@ -4,11 +4,7 @@
 \alias{ingress_num}
 \title{Ingress Token for Numeric Features}
 \usage{
-ingress_num(shape = NULL)
-}
-\arguments{
-\item{shape}{(\code{integer()} or \code{NULL})\cr
-Shape that \code{batchgetter} will produce. Batch-dimension should be included as \code{NA}.}
+ingress_num()
 }
 \value{
 \code{\link{TorchIngressToken}}
diff --git a/man/mlr_callback_set.lr_scheduler.Rd b/man/mlr_callback_set.lr_scheduler.Rd
index 7ba473ec2..806ee4833 100644
--- a/man/mlr_callback_set.lr_scheduler.Rd
+++ b/man/mlr_callback_set.lr_scheduler.Rd
@@ -12,7 +12,7 @@ As of this writing, the following are available:
 \item \code{\link[torch:lr_cosine_annealing]{torch::lr_cosine_annealing()}}
 \item \code{\link[torch:lr_lambda]{torch::lr_lambda()}}
 \item \code{\link[torch:lr_multiplicative]{torch::lr_multiplicative()}}
-\item \code{\link[torch:lr_one_cycle]{torch::lr_one_cycle()}} (where the default values for \code{epochs} and \code{steps_per_epoch} are the number of training epochs and the number of batches per epoch)
+\item \code{\link[torch:lr_one_cycle]{torch::lr_one_cycle()}}
 \item \code{\link[torch:lr_reduce_on_plateau]{torch::lr_reduce_on_plateau()}}
 \item \code{\link[torch:lr_step]{torch::lr_step()}}
 \item Custom schedulers defined with \code{\link[torch:lr_scheduler]{torch::lr_scheduler()}}.
@@ -68,7 +68,7 @@ The \code{torch} scheduler generator (e.g. \code{torch::lr_step}).}
 Whether the scheduler steps after every epoch (otherwise every batch).}
 
 \item{\code{...}}{(any)\cr
-The scheduler-specific initialization arguments.}
+The scheduler-specific arguments.}
 }
 \if{html}{\out{</div>}}
 }
diff --git a/man/mlr_callback_set.progress.Rd b/man/mlr_callback_set.progress.Rd
index f27112c84..661e09be4 100644
--- a/man/mlr_callback_set.progress.Rd
+++ b/man/mlr_callback_set.progress.Rd
@@ -43,7 +43,6 @@ Other Callback:
 \section{Methods}{
 \subsection{Public methods}{
 \itemize{
-\item \href{#method-CallbackSetProgress-new}{\code{CallbackSetProgress$new()}}
 \item \href{#method-CallbackSetProgress-on_epoch_begin}{\code{CallbackSetProgress$on_epoch_begin()}}
 \item \href{#method-CallbackSetProgress-on_batch_end}{\code{CallbackSetProgress$on_batch_end()}}
 \item \href{#method-CallbackSetProgress-on_before_valid}{\code{CallbackSetProgress$on_before_valid()}}
@@ -63,24 +62,6 @@ Other Callback:
 </details>
 }}
 \if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-CallbackSetProgress-new"></a>}}
-\if{latex}{\out{\hypertarget{method-CallbackSetProgress-new}{}}}
-\subsection{Method \code{new()}}{
-Creates a new instance of this \link[R6:R6Class]{R6} class.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{CallbackSetProgress$new(digits = 2)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{digits}}{\code{integer(1)}\cr
-The number of digits to print for the measures.}
-}
-\if{html}{\out{</div>}}
-}
-}
-\if{html}{\out{<hr>}}
 \if{html}{\out{<a id="method-CallbackSetProgress-on_epoch_begin"></a>}}
 \if{latex}{\out{\hypertarget{method-CallbackSetProgress-on_epoch_begin}{}}}
 \subsection{Method \code{on_epoch_begin()}}{
diff --git a/man/mlr_context_torch.Rd b/man/mlr_context_torch.Rd
index ae71a3081..606eec109 100644
--- a/man/mlr_context_torch.Rd
+++ b/man/mlr_context_torch.Rd
@@ -74,9 +74,6 @@ that don't evaluate the model.}
 \item{\code{last_loss}}{(\code{numeric(1)})\cr
 The loss from the last training batch.}
 
-\item{\code{y_hat}}{(\code{torch_tensor})\cr
-The model's prediction for the current batch.}
-
 \item{\code{epoch}}{(\code{integer(1)})\cr
 The current epoch.}
 
diff --git a/man/mlr_learners.mlp.Rd b/man/mlr_learners.mlp.Rd
index d77802802..d10765604 100644
--- a/man/mlr_learners.mlp.Rd
+++ b/man/mlr_learners.mlp.Rd
@@ -86,7 +86,6 @@ Gorishniy Y, Rubachev I, Khrulkov V, Babenko A (2021).
 }
 \seealso{
 Other Learner: 
-\code{\link{mlr_learners.ft_transformer}},
 \code{\link{mlr_learners.module}},
 \code{\link{mlr_learners.tab_resnet}},
 \code{\link{mlr_learners.torch_featureless}},
diff --git a/man/mlr_learners.module.Rd b/man/mlr_learners.module.Rd
index 19c460b3e..6b3a43681 100644
--- a/man/mlr_learners.module.Rd
+++ b/man/mlr_learners.module.Rd
@@ -35,7 +35,7 @@ lrn("regr.module", ...)
 nn_one_layer = nn_module("nn_one_layer",
   initialize = function(task, size_hidden) {
     self$first = nn_linear(task$n_features, size_hidden)
-    self$second = nn_linear(size_hidden, output_dim_for(task))
+    self$second = nn_linear(size_hidden, length(task$class_names))
   },
   # argument x corresponds to the ingress token x
   forward = function(x) {
@@ -58,7 +58,6 @@ learner$network
 }
 \seealso{
 Other Learner: 
-\code{\link{mlr_learners.ft_transformer}},
 \code{\link{mlr_learners.mlp}},
 \code{\link{mlr_learners.tab_resnet}},
 \code{\link{mlr_learners.torch_featureless}},
@@ -67,7 +66,6 @@ Other Learner:
 \code{\link{mlr_learners_torch_model}}
 
 Other Learner: 
-\code{\link{mlr_learners.ft_transformer}},
 \code{\link{mlr_learners.mlp}},
 \code{\link{mlr_learners.tab_resnet}},
 \code{\link{mlr_learners.torch_featureless}},
@@ -122,8 +120,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class.
   loss = NULL,
   callbacks = list(),
   packages = character(0),
-  feature_types = NULL,
-  predict_types = NULL
+  feature_types = NULL
 )}\if{html}{\out{</div>}}
 }
 
@@ -171,10 +168,6 @@ The R packages this object depends on.}
 
 \item{\code{feature_types}}{(\code{NULL} or \code{character()})\cr
 The feature types. Defaults to all available feature types.}
-
-\item{\code{predict_types}}{(\code{character()})\cr
-The predict types.
-See \code{\link[mlr3:mlr_reflections]{mlr_reflections$learner_predict_types}} for available values.}
 }
 \if{html}{\out{</div>}}
 }
diff --git a/man/mlr_learners.tab_resnet.Rd b/man/mlr_learners.tab_resnet.Rd
index 8308acc1c..9cc3a4fb3 100644
--- a/man/mlr_learners.tab_resnet.Rd
+++ b/man/mlr_learners.tab_resnet.Rd
@@ -25,7 +25,7 @@ lrn("regr.tab_resnet", ...)
 \item classif: 'response', 'prob'
 \item regr: 'response'
 }
-\item Feature Types: \dQuote{integer}, \dQuote{numeric}, \dQuote{lazy_tensor}
+\item Feature Types: \dQuote{integer}, \dQuote{numeric}
 \item Required Packages: \CRANpkg{mlr3}, \CRANpkg{mlr3torch}, \CRANpkg{torch}
 }
 }
@@ -46,9 +46,6 @@ Alternative way to specify the latent dimension as \code{d_block * d_hidden_mult
 First dropout ratio.
 \item \code{dropout2} :: \code{numeric(1)}\cr
 Second dropout ratio.
-\item \code{shape} :: \code{integer()} or \code{NULL}\cr
-Shape of the input tensor. Only needs to be provided if the input is a lazy tensor with
-unknown shape.
 }
 }
 
@@ -84,7 +81,6 @@ Gorishniy Y, Rubachev I, Khrulkov V, Babenko A (2021).
 }
 \seealso{
 Other Learner: 
-\code{\link{mlr_learners.ft_transformer}},
 \code{\link{mlr_learners.mlp}},
 \code{\link{mlr_learners.module}},
 \code{\link{mlr_learners.torch_featureless}},
diff --git a/man/mlr_learners.torch_featureless.Rd b/man/mlr_learners.torch_featureless.Rd
index ad9803324..c922ea9f1 100644
--- a/man/mlr_learners.torch_featureless.Rd
+++ b/man/mlr_learners.torch_featureless.Rd
@@ -65,7 +65,6 @@ predictions$score()
 }
 \seealso{
 Other Learner: 
-\code{\link{mlr_learners.ft_transformer}},
 \code{\link{mlr_learners.mlp}},
 \code{\link{mlr_learners.module}},
 \code{\link{mlr_learners.tab_resnet}},
diff --git a/man/mlr_learners.torchvision.Rd b/man/mlr_learners.torchvision.Rd
index 7bb251217..8f6a9b551 100644
--- a/man/mlr_learners.torchvision.Rd
+++ b/man/mlr_learners.torchvision.Rd
@@ -28,20 +28,6 @@ number of classes inferred from the \code{\link[mlr3:Task]{Task}}.
 }
 }
 
-\references{
-Krizhevsky, Alex, Sutskever, Ilya, Hinton, E. G (2017).
-\dQuote{Imagenet classification with deep convolutional neural networks.}
-\emph{Communications of the ACM}, \bold{60}(6), 84--90.
-Sandler, Mark, Howard, Andrew, Zhu, Menglong, Zhmoginov, Andrey, Chen, Liang-Chieh (2018).
-\dQuote{Mobilenetv2: Inverted residuals and linear bottlenecks.}
-In \emph{Proceedings of the IEEE conference on computer vision and pattern recognition}, 4510--4520.
-He, Kaiming, Zhang, Xiangyu, Ren, Shaoqing, Sun, Jian (2016).
-\dQuote{Deep residual learning for image recognition.}
-In \emph{Proceedings of the IEEE conference on computer vision and pattern recognition}, 770--778.
-Simonyan, Karen, Zisserman, Andrew (2014).
-\dQuote{Very deep convolutional networks for large-scale image recognition.}
-\emph{arXiv preprint arXiv:1409.1556}.
-}
 \section{Super classes}{
 \code{\link[mlr3:Learner]{mlr3::Learner}} -> \code{\link[mlr3torch:LearnerTorch]{mlr3torch::LearnerTorch}} -> \code{\link[mlr3torch:LearnerTorchImage]{mlr3torch::LearnerTorchImage}} -> \code{LearnerTorchVision}
 }
@@ -84,8 +70,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class.
   label,
   optimizer = NULL,
   loss = NULL,
-  callbacks = list(),
-  jittable = FALSE
+  callbacks = list()
 )}\if{html}{\out{</div>}}
 }
 
@@ -99,7 +84,20 @@ The name of the network.}
 Function that generates the network.}
 
 \item{\code{label}}{(\code{character(1)})\cr
-The label of the network.}
+The label of the network.
+References:
+Krizhevsky, Alex, Sutskever, Ilya, Hinton, E. G (2017).
+\dQuote{Imagenet classification with deep convolutional neural networks.}
+\emph{Communications of the ACM}, \bold{60}(6), 84--90.
+Sandler, Mark, Howard, Andrew, Zhu, Menglong, Zhmoginov, Andrey, Chen, Liang-Chieh (2018).
+\dQuote{Mobilenetv2: Inverted residuals and linear bottlenecks.}
+In \emph{Proceedings of the IEEE conference on computer vision and pattern recognition}, 4510--4520.
+He, Kaiming, Zhang, Xiangyu, Ren, Shaoqing, Sun, Jian (2016).
+\dQuote{Deep residual learning for image recognition.}
+In \emph{Proceedings of the IEEE conference on computer vision and pattern recognition}, 770--778.
+Simonyan, Karen, Zisserman, Andrew (2014).
+\dQuote{Very deep convolutional networks for large-scale image recognition.}
+\emph{arXiv preprint arXiv:1409.1556}.}
 
 \item{\code{optimizer}}{(\code{\link{TorchOptimizer}})\cr
 The optimizer to use for training.
@@ -111,9 +109,6 @@ Per default, \emph{mse} is used for regression and \emph{cross_entropy} for clas
 
 \item{\code{callbacks}}{(\code{list()} of \code{\link{TorchCallback}}s)\cr
 The callbacks. Must have unique ids.}
-
-\item{\code{jittable}}{(\code{logical(1)})\cr
-Whether to use jitting.}
 }
 \if{html}{\out{</div>}}
 }
diff --git a/man/mlr_learners_torch.Rd b/man/mlr_learners_torch.Rd
index bb2feec28..3f1d3e6d0 100644
--- a/man/mlr_learners_torch.Rd
+++ b/man/mlr_learners_torch.Rd
@@ -39,27 +39,6 @@ To do so, you just need to include \verb{epochs = to_tune(upper = <upper>, inter
 where \verb{<upper>} is the maximally allowed number of epochs, and configure the early stopping.
 }
 
-\section{Network Head and Target Encoding}{
-
-Torch learners are expected to have the following output:
-\itemize{
-\item binary classification: \verb{(batch_size, 1)}, representing the logits for the positive class.
-\item multiclass classification: \verb{(batch_size, n_classes)}, representing the logits for all classes.
-\item regression: \verb{(batch_size, 1)} representing the response prediction.
-}
-
-Furthermore, the target encoding is expected to be as follows:
-\itemize{
-\item regression: The \code{numeric} target variable of a \code{\link[mlr3:TaskRegr]{TaskRegr}} is encoded as a
-\code{\link[torch:torch_dtype]{torch_float}} with shape \code{c(batch_size, 1)}.
-\item binary classification: The \code{factor} target variable of a \code{\link[mlr3:TaskClassif]{TaskClassif}} is encoded as a
-\code{\link[torch:torch_dtype]{torch_float}} with shape \verb{(batch_size, 1)} where the positive class (\code{Task$positive}, which
-is also ensured to be the first factor level) is \code{1} and the negative class is \code{0}.
-\item multi-class classification: The \code{factor} target variable of a \code{\link[mlr3:TaskClassif]{TaskClassif}} is a label-encoded
-\code{\link[torch:torch_dtype]{torch_long}} with shape \code{(batch_size)} where the label-encoding goes from \code{1} to \code{n_classes}.
-}
-}
-
 \section{Model}{
 
 The Model is a list of class \code{"learner_torch_model"} with the following elements:
@@ -184,26 +163,17 @@ There are no seperate classes for classification and regression to inherit from.
 Instead, the \code{task_type} must be specified  as a construction argument.
 Currently, only classification and regression are supported.
 
-When inheriting from this class, one should overload the following methods:
+When inheriting from this class, one should overload two private methods:
 \itemize{
 \item \code{.network(task, param_vals)}\cr
 (\code{\link[mlr3:Task]{Task}}, \code{list()}) -> \code{\link[torch:nn_module]{nn_module}}\cr
 Construct a \code{\link[torch:nn_module]{torch::nn_module}} object for the given task and parameter values, i.e. the neural network that
 is trained by the learner.
-Note that a specific output shape is expected from the returned network, see section \emph{Network Head and Target Encoding}.
-You can use \code{\link[=output_dim_for]{output_dim_for()}} to obtain the correct output dimension for a given task.
-\item \code{.ingress_tokens(task, param_vals)}\cr
-(\code{\link[mlr3:Task]{Task}}, \code{list()}) -> named \code{list()} with \code{\link{TorchIngressToken}}s\cr
-Create the \code{\link{TorchIngressToken}}s that are passed to the \code{\link{task_dataset}} constructor.
-The number of ingress tokens must correspond to the number of input parameters of the network.
-If there is more than one input, the names must correspond to the inputs of the network.
-See \code{\link{ingress_num}}, \code{\link{ingress_categ}}, and \code{\link{ingress_ltnsr}} on how to easily create the correct tokens.
-For more flexibility, you can also directly implement the \code{.dataset(task, param_vals)} method,
-see below.
+For classification, the output of this network is expected to be the scores before the application of the
+final softmax layer.
 \item \code{.dataset(task, param_vals)}\cr
 (\code{\link[mlr3:Task]{Task}}, \code{list()}) -> \code{\link[torch:dataset]{torch::dataset}}\cr
 Create the dataset for the task.
-Don't implement this if the \code{.ingress_tokens()} method is defined.
 The dataset must return a named list where:
 \itemize{
 \item \code{x} is a list of torch tensors that are the input to the network.
@@ -212,9 +182,7 @@ For networks with more than one input, the names must correspond to the inputs o
 \item \code{.index} are the indices of the batch (\code{integer()} or a \code{torch_int()}).
 }
 
-For information on the expected target encoding of \code{y}, see section \emph{Network Head and Target Encoding}.
 Moreover, one needs to pay attention to respect the row ids of the provided task.
-It is recommended to relu on \code{\link{task_dataset}} for creating the \code{\link[torch:dataset]{dataset}}.
 }
 
 It is also possible to overwrite the private \code{.dataloader()} method.
@@ -223,10 +191,10 @@ This must respect the dataloader parameters from the \code{\link[paradox:ParamSe
 \item \code{.dataloader(dataset, param_vals)}\cr
 (\code{\link[mlr3:Task]{Task}}, \code{list()}) -> \code{\link[torch:dataloader]{torch::dataloader}}\cr
 Create a dataloader from the task.
-Needs to respect at least \code{batch_size} and \code{shuffle} (otherwise predictions will be incorrectly ordered).
+Needs to respect at least \code{batch_size} and \code{shuffle} (otherwise predictions can be permuted).
 }
 
-To change the predict types, it is possible to overwrite the method below:
+To change the predict types, it is possible to overwrite the method below:
 \itemize{
 \item \code{.encode_prediction(predict_tensor, task)}\cr
 (\code{\link[torch:torch_tensor]{torch_tensor}}, \code{\link[mlr3:Task]{Task}}) -> \code{list()}\cr
@@ -240,9 +208,8 @@ not possible to remove existing parameters, i.e. those listed in section \emph{P
 None of the parameters provided in \code{param_set} can have an id that starts with \code{"loss."}, \code{"opt."}, or \code{"cb."}, as these are reserved for the dynamically constructed parameters of the optimizer, the loss function,
 and the callbacks.
 
-To perform additional input checks on the task, the private \code{.check_train_task(task, param_vals)} and
-\code{.check_predict_task(task, param_vals)} can be overwritten.
-These should return \code{TRUE} if the input task is valid and otherwise a string with an error message.
+To perform additional input checks on the task, the private \code{.verify_train_task(task, param_vals)} and
+\code{.verify_predict_task(task, param_vals)} can be overwritten.
 
 For learners that have other construction arguments that should change the hash of a learner, it is required
 to implement the private \verb{$.additional_phash_input()}.
@@ -250,7 +217,6 @@ to implement the private \verb{$.additional_phash_input()}.
 
 \seealso{
 Other Learner: 
-\code{\link{mlr_learners.ft_transformer}},
 \code{\link{mlr_learners.mlp}},
 \code{\link{mlr_learners.module}},
 \code{\link{mlr_learners.tab_resnet}},
@@ -282,7 +248,7 @@ The ids will be set as the names.}
 Specify the \verb{$validate} field and the \code{measures_valid} parameter to configure this.
 Returns \code{NULL} if learner is not trained yet.}
 
-\item{\code{internal_tuned_values}}{When early stopping is active, this returns a named list with the early-stopped epochs,
+\item{\code{internal_tuned_values}}{When early stopping is active, this returns a named list with the early-stopped epochs,
 otherwise an empty list is returned.
 Returns \code{NULL} if learner is not trained yet.}
 
@@ -341,7 +307,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class.
   id,
   task_type,
   param_set,
-  properties = character(),
+  properties,
   man,
   label,
   feature_types,
@@ -349,8 +315,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class.
   loss = NULL,
   packages = character(),
   predict_types = NULL,
-  callbacks = list(),
-  jittable = FALSE
+  callbacks = list()
 )}\if{html}{\out{</div>}}
 }
 
@@ -406,9 +371,6 @@ method, see section \emph{Inheriting}.}
 \item{\code{callbacks}}{(\code{list()} of \code{\link{TorchCallback}}s)\cr
 The callbacks to use for training.
 Defaults to an empty \code{list()}, i.e. no callbacks.}
-
-\item{\code{jittable}}{(\code{logical(1)})\cr
-Whether the model can be jit-traced. Default is \code{FALSE}.}
 }
 \if{html}{\out{</div>}}
 }
diff --git a/man/mlr_learners_torch_image.Rd b/man/mlr_learners_torch_image.Rd
index fd9e0b901..4bf5f1a22 100644
--- a/man/mlr_learners_torch_image.Rd
+++ b/man/mlr_learners_torch_image.Rd
@@ -15,7 +15,6 @@ Parameters include those inherited from \code{\link{LearnerTorch}} and the \code
 
 \seealso{
 Other Learner: 
-\code{\link{mlr_learners.ft_transformer}},
 \code{\link{mlr_learners.mlp}},
 \code{\link{mlr_learners.module}},
 \code{\link{mlr_learners.tab_resnet}},
@@ -68,11 +67,10 @@ Creates a new instance of this \link[R6:R6Class]{R6} class.
   optimizer = NULL,
   loss = NULL,
   callbacks = list(),
-  packages,
+  packages = "torchvision",
   man,
   properties = NULL,
-  predict_types = NULL,
-  jittable = FALSE
+  predict_types = NULL
 )}\if{html}{\out{</div>}}
 }
 
@@ -116,9 +114,6 @@ See \code{\link[mlr3:mlr_reflections]{mlr_reflections$learner_properties}} for a
 \item{\code{predict_types}}{(\code{character()})\cr
 The predict types.
 See \code{\link[mlr3:mlr_reflections]{mlr_reflections$learner_predict_types}} for available values.}
-
-\item{\code{jittable}}{(\code{logical(1)})\cr
-Whether the model can be jit-traced.}
 }
 \if{html}{\out{</div>}}
 }
diff --git a/man/mlr_learners_torch_model.Rd b/man/mlr_learners_torch_model.Rd
index bfc3b3d60..0de631c9d 100644
--- a/man/mlr_learners_torch_model.Rd
+++ b/man/mlr_learners_torch_model.Rd
@@ -44,7 +44,6 @@ learner$predict(task, ids$test)
 }
 \seealso{
 Other Learner: 
-\code{\link{mlr_learners.ft_transformer}},
 \code{\link{mlr_learners.mlp}},
 \code{\link{mlr_learners.module}},
 \code{\link{mlr_learners.tab_resnet}},
diff --git a/man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd b/man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd
index 46d765fd4..8150ea062 100644
--- a/man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd
+++ b/man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd
@@ -62,9 +62,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -84,7 +81,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_adaptive_avg_pool2d.Rd b/man/mlr_pipeops_nn_adaptive_avg_pool2d.Rd
index 0f4b8f44f..9d07c0b1a 100644
--- a/man/mlr_pipeops_nn_adaptive_avg_pool2d.Rd
+++ b/man/mlr_pipeops_nn_adaptive_avg_pool2d.Rd
@@ -62,9 +62,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -84,7 +81,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_adaptive_avg_pool3d.Rd b/man/mlr_pipeops_nn_adaptive_avg_pool3d.Rd
index c9e2a56bd..5382a3c06 100644
--- a/man/mlr_pipeops_nn_adaptive_avg_pool3d.Rd
+++ b/man/mlr_pipeops_nn_adaptive_avg_pool3d.Rd
@@ -62,9 +62,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -84,7 +81,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_avg_pool1d.Rd b/man/mlr_pipeops_nn_avg_pool1d.Rd
index 7034ce215..d7b64b063 100644
--- a/man/mlr_pipeops_nn_avg_pool1d.Rd
+++ b/man/mlr_pipeops_nn_avg_pool1d.Rd
@@ -73,9 +73,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -95,7 +92,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_avg_pool2d.Rd b/man/mlr_pipeops_nn_avg_pool2d.Rd
index 537c93776..4ead02b30 100644
--- a/man/mlr_pipeops_nn_avg_pool2d.Rd
+++ b/man/mlr_pipeops_nn_avg_pool2d.Rd
@@ -74,9 +74,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -96,7 +93,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_avg_pool3d.Rd b/man/mlr_pipeops_nn_avg_pool3d.Rd
index 3e0946f73..1f53ad6b9 100644
--- a/man/mlr_pipeops_nn_avg_pool3d.Rd
+++ b/man/mlr_pipeops_nn_avg_pool3d.Rd
@@ -74,9 +74,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -96,7 +93,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_batch_norm1d.Rd b/man/mlr_pipeops_nn_batch_norm1d.Rd
index 562dbbdec..50940e548 100644
--- a/man/mlr_pipeops_nn_batch_norm1d.Rd
+++ b/man/mlr_pipeops_nn_batch_norm1d.Rd
@@ -71,9 +71,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -93,7 +90,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_batch_norm2d.Rd b/man/mlr_pipeops_nn_batch_norm2d.Rd
index a678628a7..9ab8773bf 100644
--- a/man/mlr_pipeops_nn_batch_norm2d.Rd
+++ b/man/mlr_pipeops_nn_batch_norm2d.Rd
@@ -71,9 +71,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -93,7 +90,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_batch_norm3d.Rd b/man/mlr_pipeops_nn_batch_norm3d.Rd
index bad2b1fe5..2974695d5 100644
--- a/man/mlr_pipeops_nn_batch_norm3d.Rd
+++ b/man/mlr_pipeops_nn_batch_norm3d.Rd
@@ -71,9 +71,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -93,7 +90,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_block.Rd b/man/mlr_pipeops_nn_block.Rd
index d54abc6a6..408e2958c 100644
--- a/man/mlr_pipeops_nn_block.Rd
+++ b/man/mlr_pipeops_nn_block.Rd
@@ -16,24 +16,10 @@ IDs of the \code{n_blocks} layers with the ID of the \code{PipeOpTorchBlock} and
 
 \section{Parameters}{
 
-The parameters available for the provided \code{block}, as well as
+The parameters available for the block itself, as well as
 \itemize{
 \item \code{n_blocks} :: \code{integer(1)}\cr
 How often to repeat the block.
-\item \code{trafo} :: \verb{function(i, param_vals, param_set) -> list()}\cr
-A function that allows to transform the parameters vaues of each layer (\code{block}).
-Here,
-\itemize{
-\item \code{i} :: \code{integer(1)}\cr
-is the index of the layer, ranging from \code{1} to \code{n_blocks}.
-\item \code{param_vals} :: named \code{list()}\cr
-are the parameter values of the layer \code{i}.
-\item \code{param_set} :: \code{\link[paradox:ParamSet]{ParamSet}}\cr
-is the parameter set of the whole \code{PipeOpTorchBlock}.
-}
-
-The function must return the modified parameter values for the given layer.
-This, e.g., allows for special behavior of the first or last layer.
 }
 }
 
@@ -50,24 +36,20 @@ The state is the value calculated by the public method \verb{$shapes_out()}.
 
 \examples{
 \dontshow{if (torch::torch_is_installed()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-# repeat a simple linear layer with ReLU activation 3 times, but set the bias for the last
-# layer to `FALSE`
-block = nn("linear") \%>>\% nn("relu")
+block = po("nn_linear") \%>>\% po("nn_relu")
+po_block = po("nn_block", block,
+nn_linear.out_features = 10L, n_blocks = 3)
+network = po("torch_ingress_num") \%>>\%
+po_block \%>>\%
+po("nn_head") \%>>\%
+po("torch_loss", t_loss("cross_entropy")) \%>>\%
+po("torch_optimizer", t_opt("adam")) \%>>\%
+po("torch_model_classif",
+  batch_size = 50,
+  epochs = 3)
 
-blocks = nn("block", block,
-  linear.out_features = 10L, linear.bias = TRUE, n_blocks = 3,
-  trafo = function(i, param_vals, param_set) {
-    if (i  == param_set$get_values()$n_blocks) {
-      param_vals$linear.bias = FALSE
-    }
-    param_vals
-  })
-graph = po("torch_ingress_num") \%>>\%
-  blocks \%>>\%
-  nn("head")
-md = graph$train(tsk("iris"))[[1L]]
-network = model_descriptor_to_module(md)
-network
+task = tsk("iris")
+network$train(task)
 \dontshow{\}) # examplesIf}
 }
 \seealso{
@@ -91,9 +73,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -113,7 +92,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_celu.Rd b/man/mlr_pipeops_nn_celu.Rd
index f9abb2f2c..929a1c80f 100644
--- a/man/mlr_pipeops_nn_celu.Rd
+++ b/man/mlr_pipeops_nn_celu.Rd
@@ -62,9 +62,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -84,7 +81,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_conv1d.Rd b/man/mlr_pipeops_nn_conv1d.Rd
index 1f881b57a..1aecea727 100644
--- a/man/mlr_pipeops_nn_conv1d.Rd
+++ b/man/mlr_pipeops_nn_conv1d.Rd
@@ -77,9 +77,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -99,7 +96,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_conv2d.Rd b/man/mlr_pipeops_nn_conv2d.Rd
index 90ee8effe..15aeba474 100644
--- a/man/mlr_pipeops_nn_conv2d.Rd
+++ b/man/mlr_pipeops_nn_conv2d.Rd
@@ -77,9 +77,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -99,7 +96,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_conv3d.Rd b/man/mlr_pipeops_nn_conv3d.Rd
index 31de57fd0..21a4d8194 100644
--- a/man/mlr_pipeops_nn_conv3d.Rd
+++ b/man/mlr_pipeops_nn_conv3d.Rd
@@ -77,9 +77,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -99,7 +96,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_conv_transpose1d.Rd b/man/mlr_pipeops_nn_conv_transpose1d.Rd
index 72ac28cf4..97bd2af68 100644
--- a/man/mlr_pipeops_nn_conv_transpose1d.Rd
+++ b/man/mlr_pipeops_nn_conv_transpose1d.Rd
@@ -79,9 +79,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -101,7 +98,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_conv_transpose2d.Rd b/man/mlr_pipeops_nn_conv_transpose2d.Rd
index 224a19934..025567485 100644
--- a/man/mlr_pipeops_nn_conv_transpose2d.Rd
+++ b/man/mlr_pipeops_nn_conv_transpose2d.Rd
@@ -79,9 +79,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -101,7 +98,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_conv_transpose3d.Rd b/man/mlr_pipeops_nn_conv_transpose3d.Rd
index b4a115b7b..8bac95050 100644
--- a/man/mlr_pipeops_nn_conv_transpose3d.Rd
+++ b/man/mlr_pipeops_nn_conv_transpose3d.Rd
@@ -79,9 +79,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -101,7 +98,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_dropout.Rd b/man/mlr_pipeops_nn_dropout.Rd
index ee1cb876e..ebcae728b 100644
--- a/man/mlr_pipeops_nn_dropout.Rd
+++ b/man/mlr_pipeops_nn_dropout.Rd
@@ -65,9 +65,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_conv_transpose3d}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -87,7 +84,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_elu.Rd b/man/mlr_pipeops_nn_elu.Rd
index 20f5a3eb4..60d99d06a 100644
--- a/man/mlr_pipeops_nn_elu.Rd
+++ b/man/mlr_pipeops_nn_elu.Rd
@@ -63,9 +63,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_conv_transpose3d}},
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -85,7 +82,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_flatten.Rd b/man/mlr_pipeops_nn_flatten.Rd
index 3ae7e5a63..2b69333ca 100644
--- a/man/mlr_pipeops_nn_flatten.Rd
+++ b/man/mlr_pipeops_nn_flatten.Rd
@@ -61,9 +61,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_conv_transpose3d}},
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -83,7 +80,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_gelu.Rd b/man/mlr_pipeops_nn_gelu.Rd
index 3d7ee7c02..2ad203760 100644
--- a/man/mlr_pipeops_nn_gelu.Rd
+++ b/man/mlr_pipeops_nn_gelu.Rd
@@ -61,9 +61,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
 \code{\link{mlr_pipeops_nn_hardsigmoid}},
@@ -82,7 +79,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_glu.Rd b/man/mlr_pipeops_nn_glu.Rd
index 3458b4c2b..254d0fc2c 100644
--- a/man/mlr_pipeops_nn_glu.Rd
+++ b/man/mlr_pipeops_nn_glu.Rd
@@ -61,9 +61,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
 \code{\link{mlr_pipeops_nn_hardsigmoid}},
@@ -82,7 +79,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_hardshrink.Rd b/man/mlr_pipeops_nn_hardshrink.Rd
index e8f60ee4f..80260201a 100644
--- a/man/mlr_pipeops_nn_hardshrink.Rd
+++ b/man/mlr_pipeops_nn_hardshrink.Rd
@@ -61,9 +61,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardsigmoid}},
@@ -82,7 +79,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_hardsigmoid.Rd b/man/mlr_pipeops_nn_hardsigmoid.Rd
index f97d44431..bb50604d7 100644
--- a/man/mlr_pipeops_nn_hardsigmoid.Rd
+++ b/man/mlr_pipeops_nn_hardsigmoid.Rd
@@ -58,9 +58,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -79,7 +76,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_hardtanh.Rd b/man/mlr_pipeops_nn_hardtanh.Rd
index 69da3b34f..6499b6213 100644
--- a/man/mlr_pipeops_nn_hardtanh.Rd
+++ b/man/mlr_pipeops_nn_hardtanh.Rd
@@ -59,9 +59,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -80,7 +77,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_head.Rd b/man/mlr_pipeops_nn_head.Rd
index 8d91cd167..e64ef2149 100644
--- a/man/mlr_pipeops_nn_head.Rd
+++ b/man/mlr_pipeops_nn_head.Rd
@@ -14,12 +14,6 @@ When this \code{\link[mlr3pipelines:PipeOp]{PipeOp}} is trained however, the mod
 \section{nn_module}{
 
 Calls \code{\link[torch:nn_linear]{torch::nn_linear()}} with the input and output features inferred from the input shape / task.
-For
-\itemize{
-\item binary classification, the output dimension is 1.
-\item multiclass classification, the output dimension is the number of classes.
-\item regression, the output dimension is 1.
-}
 }
 
 \section{Parameters}{
@@ -72,9 +66,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -93,7 +84,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_identity.Rd b/man/mlr_pipeops_nn_identity.Rd
index 04ae4912e..2f52c7968 100644
--- a/man/mlr_pipeops_nn_identity.Rd
+++ b/man/mlr_pipeops_nn_identity.Rd
@@ -54,9 +54,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -75,7 +72,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_layer_norm.Rd b/man/mlr_pipeops_nn_layer_norm.Rd
index 3f93788b5..95523b04d 100644
--- a/man/mlr_pipeops_nn_layer_norm.Rd
+++ b/man/mlr_pipeops_nn_layer_norm.Rd
@@ -68,9 +68,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -89,7 +86,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_leaky_relu.Rd b/man/mlr_pipeops_nn_leaky_relu.Rd
index 5bbe27330..d66473e27 100644
--- a/man/mlr_pipeops_nn_leaky_relu.Rd
+++ b/man/mlr_pipeops_nn_leaky_relu.Rd
@@ -64,9 +64,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -85,7 +82,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_linear.Rd b/man/mlr_pipeops_nn_linear.Rd
index edf4305fe..7803e4c7e 100644
--- a/man/mlr_pipeops_nn_linear.Rd
+++ b/man/mlr_pipeops_nn_linear.Rd
@@ -66,9 +66,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -87,7 +84,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_log_sigmoid.Rd b/man/mlr_pipeops_nn_log_sigmoid.Rd
index 10dcd896a..e4ef0d705 100644
--- a/man/mlr_pipeops_nn_log_sigmoid.Rd
+++ b/man/mlr_pipeops_nn_log_sigmoid.Rd
@@ -58,9 +58,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -79,7 +76,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_max_pool1d.Rd b/man/mlr_pipeops_nn_max_pool1d.Rd
index 37a6a548d..73398cbdb 100644
--- a/man/mlr_pipeops_nn_max_pool1d.Rd
+++ b/man/mlr_pipeops_nn_max_pool1d.Rd
@@ -72,9 +72,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -93,7 +90,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_max_pool2d.Rd b/man/mlr_pipeops_nn_max_pool2d.Rd
index dc90df8b7..51069f201 100644
--- a/man/mlr_pipeops_nn_max_pool2d.Rd
+++ b/man/mlr_pipeops_nn_max_pool2d.Rd
@@ -72,9 +72,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -93,7 +90,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_max_pool3d.Rd b/man/mlr_pipeops_nn_max_pool3d.Rd
index f86b42b04..1ee0dd63b 100644
--- a/man/mlr_pipeops_nn_max_pool3d.Rd
+++ b/man/mlr_pipeops_nn_max_pool3d.Rd
@@ -72,9 +72,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -93,7 +90,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_merge.Rd b/man/mlr_pipeops_nn_merge.Rd
index 70ce6eb91..697942de6 100644
--- a/man/mlr_pipeops_nn_merge.Rd
+++ b/man/mlr_pipeops_nn_merge.Rd
@@ -57,9 +57,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -78,7 +75,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_merge_cat.Rd b/man/mlr_pipeops_nn_merge_cat.Rd
index c9403d79e..4f3b400ad 100644
--- a/man/mlr_pipeops_nn_merge_cat.Rd
+++ b/man/mlr_pipeops_nn_merge_cat.Rd
@@ -17,7 +17,7 @@ Calls \code{\link[=nn_merge_cat]{nn_merge_cat()}} when trained.
 
 \itemize{
 \item \code{dim} :: \code{integer(1)}\cr
-The dimension along which to concatenate the tensors. The default is -1, i.e., the last dimension.
+The dimension along which to concatenate the tensors.
 }
 }
 
@@ -68,9 +68,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -89,7 +86,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_merge_prod.Rd b/man/mlr_pipeops_nn_merge_prod.Rd
index b775d5db6..9c5ea7257 100644
--- a/man/mlr_pipeops_nn_merge_prod.Rd
+++ b/man/mlr_pipeops_nn_merge_prod.Rd
@@ -64,9 +64,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -85,7 +82,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_cat}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_merge_sum.Rd b/man/mlr_pipeops_nn_merge_sum.Rd
index 31cf6ad17..bec5f75cb 100644
--- a/man/mlr_pipeops_nn_merge_sum.Rd
+++ b/man/mlr_pipeops_nn_merge_sum.Rd
@@ -64,9 +64,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -85,7 +82,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_cat}},
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
@@ -133,9 +129,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -154,7 +147,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_cat}},
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_prelu.Rd b/man/mlr_pipeops_nn_prelu.Rd
index af5f747cb..8bf9f0894 100644
--- a/man/mlr_pipeops_nn_prelu.Rd
+++ b/man/mlr_pipeops_nn_prelu.Rd
@@ -66,9 +66,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -87,7 +84,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_cat}},
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_relu.Rd b/man/mlr_pipeops_nn_relu.Rd
index dba158009..e3fda5178 100644
--- a/man/mlr_pipeops_nn_relu.Rd
+++ b/man/mlr_pipeops_nn_relu.Rd
@@ -61,9 +61,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -83,7 +80,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
 \code{\link{mlr_pipeops_nn_rrelu}},
diff --git a/man/mlr_pipeops_nn_relu6.Rd b/man/mlr_pipeops_nn_relu6.Rd
index 866f2f1ce..aa4c0b188 100644
--- a/man/mlr_pipeops_nn_relu6.Rd
+++ b/man/mlr_pipeops_nn_relu6.Rd
@@ -61,9 +61,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -83,7 +80,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_reshape}},
 \code{\link{mlr_pipeops_nn_rrelu}},
diff --git a/man/mlr_pipeops_nn_reshape.Rd b/man/mlr_pipeops_nn_reshape.Rd
index 4f11a59a6..0f7d4cd76 100644
--- a/man/mlr_pipeops_nn_reshape.Rd
+++ b/man/mlr_pipeops_nn_reshape.Rd
@@ -17,7 +17,7 @@ This internally calls \code{\link[torch:torch_reshape]{torch::torch_reshape()}}
 
 \itemize{
 \item \code{shape} :: \code{integer(1)}\cr
-The desired output shape. Unknown dimension (one at most) can either be specified as \code{-1}.
+The desired output shape. An unknown dimension (one at most) can be specified as either \code{-1} or \code{NA}.
 }
 }
 
@@ -63,9 +63,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -85,7 +82,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_rrelu}},
diff --git a/man/mlr_pipeops_nn_rrelu.Rd b/man/mlr_pipeops_nn_rrelu.Rd
index f7f31eb66..185cbf054 100644
--- a/man/mlr_pipeops_nn_rrelu.Rd
+++ b/man/mlr_pipeops_nn_rrelu.Rd
@@ -65,9 +65,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -87,7 +84,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_selu.Rd b/man/mlr_pipeops_nn_selu.Rd
index 65b9bb316..91d4e0b1c 100644
--- a/man/mlr_pipeops_nn_selu.Rd
+++ b/man/mlr_pipeops_nn_selu.Rd
@@ -64,9 +64,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -86,7 +83,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_sigmoid.Rd b/man/mlr_pipeops_nn_sigmoid.Rd
index f5a286738..c8b2c0f4c 100644
--- a/man/mlr_pipeops_nn_sigmoid.Rd
+++ b/man/mlr_pipeops_nn_sigmoid.Rd
@@ -58,9 +58,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -80,7 +77,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_softmax.Rd b/man/mlr_pipeops_nn_softmax.Rd
index fbc2c3dd5..8bc699ac0 100644
--- a/man/mlr_pipeops_nn_softmax.Rd
+++ b/man/mlr_pipeops_nn_softmax.Rd
@@ -62,9 +62,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -84,7 +81,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_softplus.Rd b/man/mlr_pipeops_nn_softplus.Rd
index 298138d74..01856f5d8 100644
--- a/man/mlr_pipeops_nn_softplus.Rd
+++ b/man/mlr_pipeops_nn_softplus.Rd
@@ -63,9 +63,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -85,7 +82,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_softshrink.Rd b/man/mlr_pipeops_nn_softshrink.Rd
index d17b87afa..8b45caec4 100644
--- a/man/mlr_pipeops_nn_softshrink.Rd
+++ b/man/mlr_pipeops_nn_softshrink.Rd
@@ -61,9 +61,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -83,7 +80,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_softsign.Rd b/man/mlr_pipeops_nn_softsign.Rd
index 623c9c2f6..20185e67f 100644
--- a/man/mlr_pipeops_nn_softsign.Rd
+++ b/man/mlr_pipeops_nn_softsign.Rd
@@ -58,9 +58,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -80,7 +77,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_squeeze.Rd b/man/mlr_pipeops_nn_squeeze.Rd
index 15a693ace..d9f80184b 100644
--- a/man/mlr_pipeops_nn_squeeze.Rd
+++ b/man/mlr_pipeops_nn_squeeze.Rd
@@ -63,9 +63,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -85,7 +82,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_tanh.Rd b/man/mlr_pipeops_nn_tanh.Rd
index 931e5f91e..e79d2e397 100644
--- a/man/mlr_pipeops_nn_tanh.Rd
+++ b/man/mlr_pipeops_nn_tanh.Rd
@@ -58,9 +58,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -80,7 +77,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_tanhshrink.Rd b/man/mlr_pipeops_nn_tanhshrink.Rd
index 28c5b36bf..f478da836 100644
--- a/man/mlr_pipeops_nn_tanhshrink.Rd
+++ b/man/mlr_pipeops_nn_tanhshrink.Rd
@@ -58,9 +58,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -80,7 +77,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_threshold.Rd b/man/mlr_pipeops_nn_threshold.Rd
index f284bd88d..0099babf3 100644
--- a/man/mlr_pipeops_nn_threshold.Rd
+++ b/man/mlr_pipeops_nn_threshold.Rd
@@ -65,9 +65,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -87,7 +84,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_tokenizer_categ.Rd b/man/mlr_pipeops_nn_tokenizer_categ.Rd
index 3b611f836..0be05c68c 100644
--- a/man/mlr_pipeops_nn_tokenizer_categ.Rd
+++ b/man/mlr_pipeops_nn_tokenizer_categ.Rd
@@ -6,7 +6,6 @@
 \title{Categorical Tokenizer}
 \description{
 Tokenizes categorical features into a dense embedding.
-For an input of shape \verb{(batch, n_features)} the output shape is \verb{(batch, n_features, d_token)}.
 }
 \section{nn_module}{
 
@@ -14,6 +13,17 @@ Calls \code{\link[=nn_tokenizer_categ]{nn_tokenizer_categ()}} when trained where
 The output shape is \verb{(batch, n_features, d_token)}.
 }
 
+\section{Input and Output Channels}{
+
+One input channel called \code{"input"} and one output channel called \code{"output"}.
+For an explanation see \code{\link{PipeOpTorch}}.
+}
+
+\section{State}{
+
+The state is the value calculated by the public method \verb{$shapes_out()}.
+}
+
 \section{Parameters}{
 
 \itemize{
@@ -24,23 +34,9 @@ Whether to use a bias. Is initialized to \code{TRUE}.
 \item \code{initialization} :: \code{character(1)}\cr
 The initialization method for the embedding weights. Possible values are \code{"uniform"} (default)
 and \code{"normal"}.
-\item \code{cardinalities} :: \code{integer()}\cr
-The number of categories for each feature.
-Only needs to be provided when working with \code{\link{lazy_tensor}} inputs.
 }
 }
 
-\section{Input and Output Channels}{
-
-One input channel called \code{"input"} and one output channel called \code{"output"}.
-For an explanation see \code{\link{PipeOpTorch}}.
-}
-
-\section{State}{
-
-The state is the value calculated by the public method \verb{$shapes_out()}.
-}
-
 \examples{
 \dontshow{if (torch::torch_is_installed()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 # Construct the PipeOp
@@ -72,9 +68,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -94,7 +87,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_tokenizer_num.Rd b/man/mlr_pipeops_nn_tokenizer_num.Rd
index 2a6936953..d3641c77e 100644
--- a/man/mlr_pipeops_nn_tokenizer_num.Rd
+++ b/man/mlr_pipeops_nn_tokenizer_num.Rd
@@ -6,11 +6,10 @@
 \title{Numeric Tokenizer}
 \description{
 Tokenizes numeric features into a dense embedding.
-For an input of shape \verb{(batch, n_features)} the output shape is \verb{(batch, n_features, d_token)}.
 }
 \section{nn_module}{
 
-Calls \code{\link[=nn_tokenizer_num]{nn_tokenizer_num()}} when trained where the parameter \code{n_features} is inferred.
+Calls \code{\link[=nn_tokenizer_numeric]{nn_tokenizer_numeric()}} when trained where the parameter \code{n_features} is inferred.
 The output shape is \verb{(batch, n_features, d_token)}.
 }
 
@@ -69,9 +68,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -91,7 +87,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_nn_unsqueeze.Rd b/man/mlr_pipeops_nn_unsqueeze.Rd
index 3f6386116..cd7d8565c 100644
--- a/man/mlr_pipeops_nn_unsqueeze.Rd
+++ b/man/mlr_pipeops_nn_unsqueeze.Rd
@@ -63,9 +63,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -85,7 +82,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_torch.Rd b/man/mlr_pipeops_torch.Rd
index ad1f66934..3c25023cc 100644
--- a/man/mlr_pipeops_torch.Rd
+++ b/man/mlr_pipeops_torch.Rd
@@ -38,22 +38,18 @@ When inheriting from this class, one should overload either the \code{private$.s
 \itemize{
 \item \code{.make_module(shapes_in, param_vals, task)}\cr
 (\code{list()}, \code{list()}) -> \code{nn_module}\cr
-This private method is called to generate the \code{nn_module} that is passed as argument \code{module} to
+This private method is called to generate the \code{nn_module} that is passed as argument \code{module} to
 \code{\link{PipeOpModule}}. It must be overwritten, when no \code{module_generator} is provided.
 If left as is, it calls the provided \code{module_generator} with the arguments obtained by
 the private method \code{.shape_dependent_params()}.
 \item \code{.shapes_out(shapes_in, param_vals, task)}\cr
 (\code{list()}, \code{list()}, \code{\link[mlr3:Task]{Task}} or \code{NULL}) -> named \code{list()}\cr
-This private method gets a list of \code{integer} vectors (\code{shapes_in}), the parameter values (\code{param_vals}),
+This private method gets a list of \code{numeric} vectors (\code{shapes_in}), the parameter values (\code{param_vals}),
 as well as an (optional) \code{\link[mlr3:Task]{Task}}.
 The \code{shapes_in} can be assumed to be in the same order as the input names of the \code{PipeOp}.
 The output shapes must be in the same order as the output names of the \code{PipeOp}.
 In case the output shapes depends on the task (as is the case for \code{\link{PipeOpTorchHead}}), the function should return
 valid output shapes (possibly containing \code{NA}s) if the \code{task} argument is provided or not.
-It is important to properly handle the presence of \code{NA}s in the input shapes.
-By default (if construction argument \code{only_batch_unknown} is \code{TRUE}), only the batch dimension can be \code{NA}.
-If you set this to \code{FALSE}, you need to take other unknown dimensions into account.
-The method can also throw an error if the input shapes violate some assumptions.
 \item \code{.shape_dependent_params(shapes_in, param_vals, task)}\cr
 (\code{list()}, \code{list()}) -> named \code{list()}\cr
 This private method has the same inputs as \code{.shapes_out}.
@@ -111,7 +107,7 @@ network_generator = torch::nn_module(
     self$output = if (task$task_type == "regr") {
       torch::nn_linear(d_hidden, 1)
     } else if (task$task_type == "classif") {
-      torch::nn_linear(d_hidden, output_dim_for(task))
+      torch::nn_linear(d_hidden, length(task$class_names))
     }
   },
   forward = function(x) {
@@ -213,7 +209,7 @@ po_torch$state
 
 # the resulting graphs are identical
 identical(mds_out[[1L]]$graph, mds_out[[2L]]$graph)
-# note that as a side-effect, also one of the input graphs is modified in-place for efficiency
+# note that as a side-effect, also one of the input graphs is modified in-place for efficiency
 mds_in[[1L]]$graph$edges
 
 # The new task has both Sepal and Petal features
@@ -298,8 +294,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class.
   inname = "input",
   outname = "output",
   packages = "torch",
-  tags = NULL,
-  only_batch_unknown = TRUE
+  tags = NULL
 )}\if{html}{\out{</div>}}
 }
 
@@ -340,11 +335,6 @@ The R packages this object depends on.}
 
 \item{\code{tags}}{(\code{character()})\cr
 The tags of the \code{\link[mlr3pipelines:PipeOp]{PipeOp}}. The tags \code{"torch"} is always added.}
-
-\item{\code{only_batch_unknown}}{(\code{logical(1)})\cr
-Whether only the batch dimension can be missing in the input shapes or whether other
-dimensions can also be unknown.
-Default is \code{TRUE}.}
 }
 \if{html}{\out{</div>}}
 }
diff --git a/man/mlr_pipeops_torch_ingress.Rd b/man/mlr_pipeops_torch_ingress.Rd
index 329e7682c..97b317882 100644
--- a/man/mlr_pipeops_torch_ingress.Rd
+++ b/man/mlr_pipeops_torch_ingress.Rd
@@ -54,9 +54,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -76,7 +73,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_torch_ingress_categ.Rd b/man/mlr_pipeops_torch_ingress_categ.Rd
index 820bc7529..044db7e57 100644
--- a/man/mlr_pipeops_torch_ingress_categ.Rd
+++ b/man/mlr_pipeops_torch_ingress_categ.Rd
@@ -65,9 +65,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -87,7 +84,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_torch_ingress_ltnsr.Rd b/man/mlr_pipeops_torch_ingress_ltnsr.Rd
index d6e338611..8c00c22f1 100644
--- a/man/mlr_pipeops_torch_ingress_ltnsr.Rd
+++ b/man/mlr_pipeops_torch_ingress_ltnsr.Rd
@@ -100,9 +100,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -122,7 +119,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_torch_ingress_num.Rd b/man/mlr_pipeops_torch_ingress_num.Rd
index d07249889..0db5489a8 100644
--- a/man/mlr_pipeops_torch_ingress_num.Rd
+++ b/man/mlr_pipeops_torch_ingress_num.Rd
@@ -70,9 +70,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -92,7 +89,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_torch_loss.Rd b/man/mlr_pipeops_torch_loss.Rd
index 92797609b..dda11a573 100644
--- a/man/mlr_pipeops_torch_loss.Rd
+++ b/man/mlr_pipeops_torch_loss.Rd
@@ -60,9 +60,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -82,7 +79,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_torch_model.Rd b/man/mlr_pipeops_torch_model.Rd
index 8d3c6e47f..ec70d1a90 100644
--- a/man/mlr_pipeops_torch_model.Rd
+++ b/man/mlr_pipeops_torch_model.Rd
@@ -154,9 +154,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -176,7 +173,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_torch_model_classif.Rd b/man/mlr_pipeops_torch_model_classif.Rd
index 24e004bdd..f2e5f8d2f 100644
--- a/man/mlr_pipeops_torch_model_classif.Rd
+++ b/man/mlr_pipeops_torch_model_classif.Rd
@@ -71,9 +71,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -93,7 +90,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_pipeops_torch_model_regr.Rd b/man/mlr_pipeops_torch_model_regr.Rd
index 184482440..b7b40325f 100644
--- a/man/mlr_pipeops_torch_model_regr.Rd
+++ b/man/mlr_pipeops_torch_model_regr.Rd
@@ -71,9 +71,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_dropout}},
 \code{\link{mlr_pipeops_nn_elu}},
 \code{\link{mlr_pipeops_nn_flatten}},
-\code{\link{mlr_pipeops_nn_ft_cls}},
-\code{\link{mlr_pipeops_nn_ft_transformer_block}},
-\code{\link{mlr_pipeops_nn_geglu}},
 \code{\link{mlr_pipeops_nn_gelu}},
 \code{\link{mlr_pipeops_nn_glu}},
 \code{\link{mlr_pipeops_nn_hardshrink}},
@@ -93,7 +90,6 @@ Other PipeOps:
 \code{\link{mlr_pipeops_nn_merge_prod}},
 \code{\link{mlr_pipeops_nn_merge_sum}},
 \code{\link{mlr_pipeops_nn_prelu}},
-\code{\link{mlr_pipeops_nn_reglu}},
 \code{\link{mlr_pipeops_nn_relu}},
 \code{\link{mlr_pipeops_nn_relu6}},
 \code{\link{mlr_pipeops_nn_reshape}},
diff --git a/man/mlr_tasks_cifar.Rd b/man/mlr_tasks_cifar.Rd
index 838c8cf71..30a1200dd 100644
--- a/man/mlr_tasks_cifar.Rd
+++ b/man/mlr_tasks_cifar.Rd
@@ -51,6 +51,8 @@ as the cache directory.
 \examples{
 task_cifar10 = tsk("cifar10")
 task_cifar100 = tsk("cifar100")
+print(task_cifar10)
+print(task_cifar100)
 }
 \references{
 Krizhevsky, Alex (2009).
diff --git a/man/mlr_tasks_melanoma.Rd b/man/mlr_tasks_melanoma.Rd
index 9af5f3116..5701a0226 100644
--- a/man/mlr_tasks_melanoma.Rd
+++ b/man/mlr_tasks_melanoma.Rd
@@ -59,6 +59,7 @@ as the cache directory.
 
 \examples{
 task = tsk("melanoma")
+task
 }
 \references{
 Rotemberg, V., Kurtansky, N., Betz-Stablein, B., Caffery, L., Chousakos, E., Codella, N., Combalia, M., Dusza, S., Guitera, P., Gutman, D., Halpern, A., Helba, B., Kittler, H., Kose, K., Langer, S., Lioprys, K., Malvehy, J., Musthaq, S., Nanda, J., Reiter, O., Shih, G., Stratigos, A., Tschandl, P., Weber, J., Soyer, P. (2021).
diff --git a/man/mlr_tasks_mnist.Rd b/man/mlr_tasks_mnist.Rd
index cf75bee14..032b1b657 100644
--- a/man/mlr_tasks_mnist.Rd
+++ b/man/mlr_tasks_mnist.Rd
@@ -42,7 +42,10 @@ as the cache directory.
 }
 
 \examples{
+\dontshow{if (torch::torch_is_installed()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 task = tsk("mnist")
+task
+\dontshow{\}) # examplesIf}
 }
 \references{
 Lecun, Y., Bottou, L., Bengio, Y., Haffner, P. (1998).
diff --git a/man/mlr_tasks_tiny_imagenet.Rd b/man/mlr_tasks_tiny_imagenet.Rd
index 26b91c064..001f78ec8 100644
--- a/man/mlr_tasks_tiny_imagenet.Rd
+++ b/man/mlr_tasks_tiny_imagenet.Rd
@@ -41,7 +41,10 @@ as the cache directory.
 }
 
 \examples{
+\dontshow{if (torch::torch_is_installed()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 task = tsk("tiny_imagenet")
+task
+\dontshow{\}) # examplesIf}
 }
 \references{
 Deng, Jia, Dong, Wei, Socher, Richard, Li, Li-Jia, Li, Kai, Fei-Fei, Li (2009).
diff --git a/man/nn_graph.Rd b/man/nn_graph.Rd
index 67e261b3b..978bd6e27 100644
--- a/man/nn_graph.Rd
+++ b/man/nn_graph.Rd
@@ -23,28 +23,8 @@ Whether output should be a list of tensors. If \code{FALSE} (default), then \cod
 \code{\link{nn_graph}}
 }
 \description{
-Represents a neural network using a \code{\link[mlr3pipelines:Graph]{Graph}} that contains mostly \code{\link{PipeOpModule}}s.
+Represents a neural network using a \code{\link[mlr3pipelines:Graph]{Graph}} that usually contains mostly \code{\link{PipeOpModule}}s.
 }
-\section{Fields}{
-
-\itemize{
-\item \code{graph} :: \code{\link[mlr3pipelines:Graph]{Graph}}\cr
-The graph (consisting primarily of \code{\link{PipeOpModule}}s) that is wrapped by the network.
-\item \code{input_map} :: \code{character()}\cr
-The names of the input arguments of the network.
-\item \code{shapes_in} :: \code{list()}\cr
-The shapes of the input tensors of the network.
-\item \code{output_map} :: \code{character()}\cr
-Which output elements of the graph are returned by the \verb{$forward()} method.
-\item \code{list_output} :: \code{logical(1)}\cr
-Whether the output is a list of tensors.
-\item \code{module_list} :: \code{\link[torch:nn_module_list]{nn_module_list}}\cr
-The list of modules in the network.
-\item \code{list_output} :: \code{logical(1)}\cr
-Whether the output is a list of tensors.
-}
-}
-
 \examples{
 \dontshow{if (torch::torch_is_installed()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 graph = mlr3pipelines::Graph$new()
diff --git a/man/nn_tokenizer_categ.Rd b/man/nn_tokenizer_categ.Rd
index 24935c419..f13a29d05 100644
--- a/man/nn_tokenizer_categ.Rd
+++ b/man/nn_tokenizer_categ.Rd
@@ -22,7 +22,6 @@ and \code{"normal"}.}
 }
 \description{
 Tokenizes categorical features into a dense embedding.
-For an input of shape \verb{(batch, n_features)} the output shape is \verb{(batch, n_features, d_token)}.
 }
 \references{
 Gorishniy Y, Rubachev I, Khrulkov V, Babenko A (2021).
diff --git a/man/nn_tokenizer_num.Rd b/man/nn_tokenizer_num.Rd
index acc1e6d70..406644baa 100644
--- a/man/nn_tokenizer_num.Rd
+++ b/man/nn_tokenizer_num.Rd
@@ -2,9 +2,10 @@
 % Please edit documentation in R/PipeOpTorchTokenizer.R
 \name{nn_tokenizer_num}
 \alias{nn_tokenizer_num}
+\alias{nn_tokenizer_numeric}
 \title{Numeric Tokenizer}
 \usage{
-nn_tokenizer_num(n_features, d_token, bias, initialization)
+nn_tokenizer_numeric(n_features, d_token, bias, initialization)
 }
 \arguments{
 \item{n_features}{(\code{integer(1)})\cr
@@ -22,7 +23,6 @@ and \code{"normal"}.}
 }
 \description{
 Tokenizes numeric features into a dense embedding.
-For an input of shape \verb{(batch, n_features)} the output shape is \verb{(batch, n_features, d_token)}.
 }
 \references{
 Gorishniy Y, Rubachev I, Khrulkov V, Babenko A (2021).
diff --git a/man/pipeop_preproc_torch.Rd b/man/pipeop_preproc_torch.Rd
index 2e5470c79..7296bc726 100644
--- a/man/pipeop_preproc_torch.Rd
+++ b/man/pipeop_preproc_torch.Rd
@@ -26,9 +26,12 @@ The preprocessing function.}
 \item{shapes_out}{(\code{function} or \code{NULL} or \code{"infer"})\cr
 The private \code{.shapes_out(shapes_in, param_vals, task)} method of \code{\link{PipeOpTaskPreprocTorch}}
 (see section Inheriting).
-Special values are \code{NULL} and \code{"infer"}:
+Special values are \code{NULL} and \code{"infer"}:
 If \code{NULL}, the output shapes are unknown.
-Option \code{"infer"} uses \code{\link{infer_shapes}}.
+If \code{"infer"}, the output shape function is inferred and calculates the output shapes as follows:
+For an input shape of \verb{(NA, ...)} a meta-tensor of shape \verb{(1, ...)} is created and the preprocessing function is
+applied. Afterwards the batch dimension (\code{1}) is replaced with \code{NA} and the shape is returned.
+If the first dimension is not \code{NA}, the output shape of applying the preprocessing function is returned.
 Method \code{"infer"} should be correct in most cases, but might fail in some edge cases.}
 
 \item{param_set}{(\code{\link[paradox:ParamSet]{ParamSet}} or \code{NULL})\cr