From 579b80ce8df466ba4ab22722eb1ba37c41876a63 Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 21 May 2025 08:15:39 +0200 Subject: [PATCH 01/21] name chunks --- book/chapters/appendices/solutions.qmd | 266 +++++++++--------- .../solutions_large-scale_benchmarking.qmd | 38 +-- book/chapters/appendices/tasks.qmd | 2 +- .../chapter1/introduction_and_overview.qmd | 22 +- .../advanced_technical_aspects_of_mlr3.qmd | 92 +++--- .../chapter11/large-scale_benchmarking.qmd | 128 ++++----- .../chapter12/model_interpretation.qmd | 78 ++--- .../beyond_regression_and_classification.qmd | 94 +++---- .../chapter14/algorithmic_fairness.qmd | 16 +- .../chapter15/predsets_valid_inttune.qmd | 72 ++--- .../chapter2/data_and_basic_modeling.qmd | 160 +++++------ .../chapter3/evaluation_and_benchmarking.qmd | 108 +++---- .../chapter4/hyperparameter_optimization.qmd | 114 ++++---- ...ing_methods_and_black_box_optimization.qmd | 86 +++--- book/chapters/chapter6/feature_selection.qmd | 62 ++-- .../chapter7/sequential_pipelines.qmd | 44 +-- .../non-sequential_pipelines_and_tuning.qmd | 106 +++---- book/chapters/chapter9/preprocessing.qmd | 52 ++-- book/index.qmd | 2 +- 19 files changed, 771 insertions(+), 771 deletions(-) diff --git a/book/chapters/appendices/solutions.qmd b/book/chapters/appendices/solutions.qmd index a7c212f1e..4f8ccd1a3 100644 --- a/book/chapters/appendices/solutions.qmd +++ b/book/chapters/appendices/solutions.qmd @@ -19,7 +19,7 @@ aliases: (Note that the data set has NAs in its features. You can either rely on `rpart`'s capability to handle them internally ('surrogate splits') or remove them from the initial `data.frame` by using `na.omit`). -```{r} +```{r solutions-001} set.seed(1) data(PimaIndiansDiabetes2, package = "mlbench") @@ -44,7 +44,7 @@ prediction$score(measure) Try to solve this in two ways: (a) Using `mlr3measures`-predefined measure objects, and (b) without using `mlr3` tools by directly working on the ground truth and prediction vectors. Compare the results. -```{r} +```{r solutions-002} # true positive rate prediction$score(msr("classif.tpr")) @@ -58,7 +58,7 @@ prediction$score(msr("classif.tnr")) prediction$score(msr("classif.fnr")) ``` -```{r} +```{r solutions-003} # true positives TP = sum(prediction$truth == "pos" & prediction$response == "pos") @@ -89,7 +89,7 @@ The results are the same. 3. Change the threshold of the model from Exercise 1 such that the false negative rate is lower. What is one reason you might do this in practice? -```{r} +```{r solutions-004} # confusion matrix with threshold 0.5 prediction$confusion @@ -118,7 +118,7 @@ Finally, calculate the aggregated performance score. 
We start by instantiating our task and learner as usual: -```{r} +```{r solutions-005} set.seed(3) task = tsk("mtcars") learner = lrn("regr.rpart") @@ -126,7 +126,7 @@ learner = lrn("regr.rpart") We can instantiate a temporary resampling on the task to illustrate how it assigns observations across the 5 repeats (column `rep`) and 3 folds: -```{r} +```{r solutions-006} resampling = rsmp("repeated_cv", repeats = 5, folds = 3) resampling$instantiate(task) resampling$instance @@ -134,14 +134,14 @@ resampling$instance Note instantiating manually is not necessary when using `resample()`, as it automatically instantiates the resampling for us, so we pass it a new resampling which has not been instantiated: -```{r} +```{r solutions-007} resampling = rsmp("repeated_cv", repeats = 5, folds = 3) rr = resample(task, learner, resampling) ``` Now we can `$score()` the resampling with the MSE measure across each of the 5x3 resampling iterations: -```{r} +```{r solutions-008} scores = rr$score(msr("regr.mse")) scores ``` @@ -149,7 +149,7 @@ scores We can manually calculate these scores since `rr` contains all the individual predictions. The `$predictions()` method returns a list of predictions for each iteration, which we can use to calculate the MSE for the first iteration: -```{r} +```{r solutions-009} preds = rr$predictions() pred_1 = as.data.table(preds[[1]]) pred_1[, list(rmse = mean((truth - response)^2))] @@ -157,7 +157,7 @@ pred_1[, list(rmse = mean((truth - response)^2))] To visualize the results, we can use `ggplot2` directly on the `scores` object, which behaves like any other `data.table`: -```{r} +```{r solutions-010} library(ggplot2) # Barchart of the per-iteration scores ggplot(scores, aes(x = iteration, y = regr.mse)) + @@ -174,14 +174,14 @@ ggplot(scores, aes(x = regr.mse)) + Alternatively, the `autoplot()` function provides defaults for the `ResampleResult` object. Note that it internally scores the resampling using the MSE for regression tasks per default. -```{r} +```{r solutions-011} autoplot(rr) autoplot(rr, type = "histogram") ``` The aggregate score is the mean of the MSE scores across all iterations, which we can calculate using `$aggregate()` or by manually averaging the scores we stored before: -```{r} +```{r solutions-012} mean(scores$regr.mse) rr$aggregate(msr("regr.mse")) ``` @@ -195,7 +195,7 @@ What can be done to improve this? First we instantiate our learners with their initial parameters, setting the `predict_type = "prob"` once for all of them using `lrns()`. We then set the `nrounds` parameter for XGBoost to 100 and construct a resampling object for 5-fold CV: -```{r} +```{r solutions-013} set.seed(3) task = tsk("spam") @@ -207,7 +207,7 @@ resampling = rsmp("cv", folds = 5) We could have alternatively instantiated the learners like this, but would have needed to repeat the `predict_type = "prob"` argument multiple times. 
-```{r, eva=FALSE} +```{r solutions-014, eva=FALSE} learners = list( lrn("classif.ranger", predict_type = "prob"), lrn("classif.log_reg", predict_type = "prob"), @@ -217,7 +217,7 @@ learners = list( Next we can construct a benchmark design grid with the instantiated objects using `benchmark_grid()`: -```{r} +```{r solutions-015} design = benchmark_grid( tasks = task, learners = learners, @@ -228,14 +228,14 @@ design To perform the benchmark, we use the aptly named `benchmark()` function: -```{r, warning=FALSE} +```{r solutions-016, warning=FALSE} bmr = benchmark(design) bmr ``` And visualize the results as a boxplot: -```{r} +```{r solutions-017} autoplot(bmr, measure = msr("classif.auc")) ``` @@ -244,7 +244,7 @@ Naturally this is only a visual inspection of the results --- proper statistical When we re-run the same experiment with a different seed, we get a slightly different result. -```{r, warning=FALSE} +```{r solutions-018, warning=FALSE} set.seed(3235) resampling = rsmp("cv", folds = 5) design = benchmark_grid( @@ -270,14 +270,14 @@ See if you can reproduce their results. We make use of the `custom_cv` resampling strategy here: -```{r} +```{r solutions-019} task = tsk("penguins_simple") rsmp_cv = rsmp("custom_cv") ``` We apply the rule to assign resampling folds we were provided with: Every third observation is assigned to the same fold: -```{r} +```{r solutions-020} rsmp_cv$instantiate(task = task, f = factor(task$row_ids %% 3)) str(rsmp_cv$instance) @@ -285,7 +285,7 @@ str(rsmp_cv$instance) We are now ready to conduct the resampling and aggregate results: -```{r} +```{r solutions-021} rr = resample( task = task, learner = lrn("classif.rpart"), @@ -306,7 +306,7 @@ These are set as thresholds with the `$set_threshold()` method of the `Predictio This way we construct the ROC curve by iteratively calculating its x and y values, after which we can use `geom_step()` to draw a step function. Note that we do not need to re-train the learner, we merely adjust the threshold applied to the predictions we made at the top of the function -```{r} +```{r solutions-022} my_roc_plot = function(task, learner, train_indices, test_indices) { # Train learner, predict once. learner$train(task, train_indices) @@ -355,7 +355,7 @@ my_roc_plot = function(task, learner, train_indices, test_indices) { We try our function using `tsk("sonar")` and `lrn("classif.ranger")` learner with 100 trees. We set `predict_type = "prob"` since we need probability predictions to apply thresholds, rather than hard class predictions. -```{r} +```{r solutions-023} set.seed(3) # Setting up example task and learner for testing @@ -368,7 +368,7 @@ my_roc_plot(task, learner, split$train, split$test) We can compare it with the pre-built plot function in `mlr3viz`: -```{r} +```{r solutions-024} learner$train(task, split$train) pred = learner$predict(task, split$test) autoplot(pred, type = "roc") @@ -385,7 +385,7 @@ There are different approaches to drawing ROC curves, and our implementation abo Evaluate with a three-fold CV and the root mean squared error. Visualize the effects that each hyperparameter has on the performance via simple marginal plots, which plot a single hyperparameter versus the cross-validated MSE. -```{r} +```{r solutions-025} set.seed(1) task = tsk("mtcars") @@ -427,7 +427,7 @@ autoplot(instance, type = "marginal", cols_x = "num.trees") 2. Evaluate the performance of the model created in Exercise 1 with nested resampling. 
Use a holdout validation for the inner resampling and a three-fold CV for the outer resampling. -```{r} +```{r solutions-026} set.seed(1) task = tsk("mtcars") @@ -457,7 +457,7 @@ We see that the performance estimated while tuning overestimates the true perfor 3. Tune and benchmark an XGBoost model against a logistic regression (without tuning the latter) and determine which has the best Brier score. Use `mlr3tuningspaces` and nested resampling, try to pick appropriate inner and outer resampling strategies that balance computational efficiency vs. stability of the results. -```{r} +```{r solutions-027} #| warning: false set.seed(1) @@ -502,7 +502,7 @@ The learner is wrapped in an `r ref("auto_tuner()")`, which is then benchmarked Ensure that this `new_range` respects the initial bound of the original `search_space` by taking the `max()` of the new and old lower bound, and the `min()` of the new and the old upper bound ("clipping"). (4) Iterate the previous steps `random_search_stages` times and at the end return the best configuration you have ever evaluated. -```{r} +```{r solutions-028} library(mlr3misc) focus_search = function(task, learner, search_space, resampling, measure, random_search_stages, random_search_size) { @@ -565,7 +565,7 @@ focus_search( As a stretch goal, look into `mlr3tuning`'s internal source code and turn your function into an R6 class inheriting from the `TunerBatch` class -- test it out on a learner of your choice. -```{r} +```{r solutions-029} library(R6) library(mlr3tuning) @@ -662,7 +662,7 @@ Plot the progress of performance over iterations and visualize the spatial distr We first construct the learner, task, resampling, measure and terminator and then the instance. -```{r} +```{r solutions-030} library(mlr3mbo) library(bbotk) library(data.table) @@ -691,14 +691,14 @@ instance_rs = ti( Using a random search results in the following final performance: -```{r, warning = FALSE} +```{r solutions-031, warning = FALSE} tuner = tnr("random_search", batch_size = 50) tuner$optimize(instance_rs) ``` We then construct a new instance and optimize it via Bayesian Optimization (BO) using `tnr("mbo")` in its default configuration (see also `r ref("mbo_defaults")`): -```{r, warning = FALSE} +```{r solutions-032, warning = FALSE} instance_bo = ti( learner = learner, task = task, @@ -712,7 +712,7 @@ tuner$optimize(instance_bo) We then add relevant information to the archives of the instances so that we can combine their data and use this data for generating the desired plots. -```{r} +```{r solutions-033} instance_rs$archive$data[, iteration := seq_len(.N)] instance_rs$archive$data[, best_rmse := cummin(regr.rmse)] instance_rs$archive$data[, method := "Random Search"] @@ -738,7 +738,7 @@ We could even use the same first few random samples as the initial design in BO To visualize the spatial distribution of the evaluated hyperparameter configurations we will plot for each evaluated configuration the number of trees on the x-axis and the sample fraction on the y-axis. The label of each point corresponds to the mtry parameter directly. -```{r} +```{r solutions-034} relevant_columns = c("mtry", "sample.fraction", "num.trees", "iteration", "method") plot_data_sampling = rbind( instance_rs$archive$data[, ..relevant_columns, with = FALSE], @@ -766,7 +766,7 @@ Use a budget of 40 function evaluations. Run this with both the "default" Gaussian process surrogate model with Matérn 5/2 kernel, and the "default" random forest surrogate model. 
Compare their anytime performance (similarly as in @fig-bayesian-sinusoidal_bo_rs). You can construct the surrogate models with default settings using: -```{r} +```{r solutions-035} surrogate_gp = srlrn(default_gp()) surrogate_rf = srlrn(default_rf()) ``` @@ -774,7 +774,7 @@ surrogate_rf = srlrn(default_rf()) We first construct the function, making use of efficient evaluation operating on a `data.table` directly. We then wrap this function in the corresponding `r ref("ObjectiveRFunDt")` objective class and construct the instance. -```{r} +```{r solutions-036} rastrigin = function(xdt) { D = ncol(xdt) y = 10 * D + rowSums(xdt^2 - (10 * cos(2 * pi * xdt))) @@ -795,7 +795,7 @@ instance = OptimInstanceSingleCrit$new( We then construct the surrogates as well as the acquisition function and acquisition function optimizer (we will terminate the acquisition function optimization once optimization process stagnates by `1e-5` over the last 100 iterations) and construct the two BO optimizers. -```{r} +```{r solutions-037} surrogate_gp = srlrn(default_gp()) surrogate_rf = srlrn(default_rf()) @@ -820,7 +820,7 @@ optimizer_rf = opt("mbo", We will use the following initial design for both optimizers: -```{r} +```{r solutions-038} initial_design = data.table( x1 = c(-3.95, 1.16, 3.72, -1.39, -0.11, 5.00, -2.67, 2.44), x2 = c(1.18, -3.93, 3.74, -1.37, 5.02, -0.09, -2.65, 2.46) @@ -830,7 +830,7 @@ instance$eval_batch(initial_design) We then proceed to optimize the instance with each of the two optimizers and make sure to extract the relevant data from the archive of the instance. -```{r, warning = FALSE} +```{r solutions-039, warning = FALSE} optimizer_gp$optimize(instance) gp_data = instance$archive$data @@ -839,7 +839,7 @@ gp_data[, iteration := seq_len(.N)] gp_data[, surrogate := "Gaussian Process"] ``` -```{r, warning = FALSE} +```{r solutions-040, warning = FALSE} instance$archive$clear() instance$eval_batch(initial_design) @@ -854,7 +854,7 @@ rf_data[, surrogate := "Random forest"] We then combine the data and use it to generate the desired plot: -```{r} +```{r solutions-041} plot_data = rbind(gp_data, rf_data) ggplot(aes(x = iteration, y = y_min, colour = surrogate), data = plot_data) + geom_step() + @@ -876,7 +876,7 @@ Plot the resulting Pareto front and compare it to the analytical solution, $y_2 We first construct the function, wrap it in the objective and then create the instance. -```{r} +```{r solutions-042} fun = function(xss) { evaluations = lapply(xss, FUN = function(xs) { list(y1 = xs$x ^ 2, y2 = (xs$x - 2)^2) @@ -900,7 +900,7 @@ As a surrogate we will use a random forest. ParEGO is a scalarization based multi-objective BO algorithm and therefore we use the Expected Improvement as acquisition function. We will use the same acquisition functon optimizer as earlier. -```{r} +```{r solutions-043} surrogate = srlrn(default_rf()) acq_function = acqf("ei") @@ -918,12 +918,12 @@ optimizer = opt("mbo", We then optimize the instance: -```{r, warning = FALSE} +```{r solutions-044, warning = FALSE} optimizer$optimize(instance) ``` Finally, we visualize the resulting Pareto front (in black) and its analytical counterpart (in darkgrey). -```{r} +```{r solutions-045} true_pareto = data.table(y1 = seq(from = 0, to = 4, length.out = 1001)) true_pareto[, y2 := (sqrt(y1) - 2) ^2] @@ -942,7 +942,7 @@ ggplot(aes(x = y1, y = y2), data = instance$archive$best()) + NB: Here, we have performed the feature filtering outside of CV, which is generally not a good idea as it biases the CV performance estimation. 
To do this properly, filtering should be embedded inside the CV via pipelines -- try to come back to this exercise after you read @sec-pipelines-nonseq to implement this with less bias. -```{r} +```{r solutions-046} set.seed(1) task = tsk("mtcars") @@ -975,7 +975,7 @@ The `"mse"` is much lower on the filtered task. Do the selected features differ? Which feature selection method reports a higher classification accuracy in its `$result`? -```{r} +```{r solutions-047} set.seed(1) fselector_sbs = fs("sequential", strategy = "sbs") @@ -1020,7 +1020,7 @@ The sequential backward search reports a higher classification accuracy. Change the process by applying forward feature selection with `auto_fselector()`. Compare the performance to backward feature selection from Exercise 2 using nested resampling. -```{r} +```{r solutions-048} set.seed(1) afs_sfs = auto_fselector( @@ -1055,7 +1055,7 @@ Now the sequential forward search selects yields a slightly higher classificatio Features with high filter scores should be added with higher probability. Start by coding a stand-alone R method for this search (based on a learner, task, resampling, performance measure and some control settings). -```{r} +```{r solutions-049} library(mlr3verse) library(data.table) @@ -1119,7 +1119,7 @@ filter_forward_selection_search(task, learner, resampling, measure, filter, n, m Then, as a stretch goal, see if you can implement this as an R6 class inheriting from `FSelectorBatch`. -```{r} +```{r solutions-050} library(R6) library(checkmate) library(mlr3verse) @@ -1211,7 +1211,7 @@ instance = fselect( 1. Concatenate the PipeOps named in the exercise by using `%>>%`. The resulting `r ref("Graph")` can then be converted to a `r ref("Learner")` by using `r ref("as_learner()")`. -```{r pipelines-001} +```{r solutions-051} library(mlr3pipelines) library(mlr3learners) @@ -1222,7 +1222,7 @@ graph_learner = as_learner(graph) 2. The `r ref("GraphLearner")` can be trained like any other `Learner` object, thereby filling in its `$model` field. It is possible to access the `$state` of any `PipeOp` through this field: the states are named after the `PipeOp`'s `$id`. The logistic regression model can then be extracted from the state of the `po("learner")` that contains the `lrn("classif.log_reg")`. -```{r pipelines-002-0} +```{r solutions-052} graph_learner$train(tsk("pima")) # access the state of the po("learner") to get the model @@ -1230,13 +1230,13 @@ model = graph_learner$model$classif.log_reg$model coef(model) ``` Alternatively, the underlying `lrn("classif.log_reg")` can be accessed through the `$base_learner()` method: -```{r pipelines-002-1} +```{r solutions-053} model = graph_learner$base_learner()$model coef(model) ``` As a third option, the trained `PipeOp` can be accessed through the `$graph_model` field of the `GraphLearner`. The trained `PipeOp` has a `$learner_model` field, which contains the trained `Learner` object, which contains the model. -```{r pipelines-002-2} +```{r solutions-054} pipeop = graph_learner$graph_model$pipeops$classif.log_reg model = pipeop$learner_model$model coef(model) @@ -1245,7 +1245,7 @@ coef(model) 3. Set the `$keep_results` flag of the Graph to `TRUE` to keep the results of the individual PipeOps. Afterwards, the input of the `lrn("classif.log_reg")` can be accessed through the `$.result` field of its predecessor, the `po("scale")`. Note that the `$.result` is a `list`, we want to access its only element, named `$output`. 
-```{r pipelines-003} +```{r solutions-055} graph_learner$graph$keep_results = TRUE graph_learner$train(tsk("pima")) @@ -1269,7 +1269,7 @@ sd(age_column) To restrict this operator to only columns without missing values, the `affect_columns` with a fitting `r ref("Selector")` can be used: The `selector_missing()`, which selects columns *with* missing values, combined with `selector_invert()`, which inverts the selection. Since `po("pca")` only operates on numeric columns, it is not necessary to use a `Selector` to select numeric columns. -```{r pipelines-004-0} +```{r solutions-056} graph = as_graph(po("pca", affect_columns = selector_invert(selector_missing())) ) @@ -1289,7 +1289,7 @@ Another `po("select")` can be used to select all the other columns. It is put in parallel with the first `po("select")` using `gunion()`. It is necessary to use different `$id` values for both `po("select")` to avoid a name clash in the `Graph`. To combine the output from both paths, `po("featureunion")` can be used. -```{r pipelines-004-1} +```{r solutions-057} path1 = po("select", id = "select_non_missing", selector = selector_invert(selector_missing())) %>>% po("pca") @@ -1304,7 +1304,7 @@ graph_result[[1]]$feature_names 2. First, observe the feature names produced by the level 0 learners when applied to the `tsk("wine")` task: -```{r pipelines-005-0} +```{r solutions-058} lrn_rpart = lrn("classif.rpart", predict_type = "prob") po_rpart_cv = po("learner_cv", learner = lrn_rpart, resampling.folds = 2, id = "rpart_cv" @@ -1328,7 +1328,7 @@ gr_combined$train(tsk("wine"))[[1]]$head() To use `po("select")` to *remove*, instead of *keep*, a feature based on a pattern, use `r ref("selector_invert")` together with `r ref("selector_grep")`. To remove the "`1`" class columns, i.e. all columns with names that end in "1", the following `po("select")` could be used: -```{r pipelines-005-1} +```{r solutions-059} drop_one = po("select", selector = selector_invert(selector_grep("\\.1$"))) # Train it on the wine task with lrn("classif.multinom"): @@ -1345,7 +1345,7 @@ glrn_stack$base_learner()$model 3. A solution that does not need to specify the target classes at all is to use a custom `r ref("Selector")`, as was shown in @sec-pipelines-bagging: -```{r pipelines-005} +```{r solutions-060} selector_remove_one_prob_column = function(task) { class_removing = task$class_names[[1]] selector_use = selector_invert(selector_grep(paste0("\\.", class_removing ,"$"))) @@ -1354,7 +1354,7 @@ selector_remove_one_prob_column = function(task) { ``` Using this selector in @sec-pipelines-stack, one could use the resulting stacking learner on any classification task with arbitrary target classes. It can be used as an alternative to the `Selector` used in exercise 2: -```{r pipelines-005-2} +```{r solutions-061} drop_one_alt = po("select", selector = selector_remove_one_prob_column) # The same as above: @@ -1369,7 +1369,7 @@ glrn_stack$base_learner()$model 4. We choose to use the following options for imputation, factor encoding, and model training. Note the use of `pos()` and `lrns()`, which return lists of `PipeOp` and `Learner` objects, respectively. -```{r pipelines-005-3} +```{r solutions-062} imputing = pos(c("imputeoor", "imputesample")) factor_encoding = pos(c("encode", "encodeimpact")) @@ -1379,7 +1379,7 @@ models = lrns(c("classif.rpart", "classif.log_reg", "classif.svm")) Use the `ppl("branch")` pipeline to get `Graphs` with alternative path branching, controlled by its own hyperparameter. 
We need to give the `po("branch")` operators that are created here individual prefixes to avoid nameclashes when we put everything together. -```{r pipelines-005-4} +```{r solutions-063} full_graph = ppl("branch", prefix_branchops = "impute_", graphs = imputing ) %>>% ppl("branch", @@ -1394,7 +1394,7 @@ full_graph$plot() The easiest way to set up the search space for this pipeline is to use `to_tune()`. It is necessary to record the dependencies of the hyperparameters of the preprocessing and model `PipeOps` on the branch hyperparameters. For this, `to_tune()` needs to be applied to a `Domain` object -- `p_dbl()`, `p_fct()`, etc. -- that has its dependency declared using the `depends` argument. -```{r pipelines-005-5} +```{r solutions-064} library("paradox") full_graph$param_set$set_values( impute_branch.selection = to_tune(), @@ -1417,7 +1417,7 @@ full_graph$param_set$set_values( We also set a few SVM kernel hyperparameters record their dependency on the model selection branch hyperparameter. We could record these dependencies in the `Graph`, using the `$add_dep()` method of the `r ref("ParamSet")`, but here we use the simpler approach of adding a single item search space component. -```{r pipelines-005-5-1} +```{r solutions-065} full_graph$param_set$set_values( classif.svm.type = to_tune(p_fct("C-classification", depends = model_branch.selection == "classif.svm")), @@ -1428,7 +1428,7 @@ full_graph$param_set$set_values( To turn this `Graph` into an AutoML-system, we use an `AutoTuner`. Here we use random search, but any other `Tuner` could be used. -```{r pipelines-005-6} +```{r solutions-066} library("mlr3tuning") automl_at = auto_tuner( tuner = tnr("random_search"), @@ -1440,7 +1440,7 @@ automl_at = auto_tuner( ``` We can now benchmark this `AutoTuner` on a few tasks and compare it with the untuned random forest with out-of-range (OOR) imputation: -```{r pipelines-005-7} +```{r solutions-067} #| warning: false learners = list( automl_at, @@ -1472,7 +1472,7 @@ We will consider a prediction problem similar to the one from this chapter, but To evaluate the models, we again use 10-fold CV, mean absolute error and `lrn("regr.glmnet")`. For now we will ignore the `date` column and simply remove it: -```{r} +```{r solutions-068} set.seed(1) library("mlr3data") @@ -1483,14 +1483,14 @@ task$select(setdiff(task$feature_names, "date")) 1. Have a look at the features, are there any features which might be problematic? If so, change or remove them. Check the dataset and learner properties to understand which preprocessing steps you need to do. -```{r} +```{r solutions-069} summary(task) ``` The `zipcode` should not be interpreted as a numeric value, so we cast it to a factor. We could argue to remove `lat` and `long` as handling them as linear effects is not necessarily a suitable, but we will keep them since `glmnet` performs internal feature selection anyways. -```{r, warning=FALSE, message=FALSE} +```{r solutions-070, warning=FALSE, message=FALSE} zipencode = po("mutate", mutation = list(zipcode = ~ as.factor(zipcode)), id = "zipencode") ``` @@ -1498,11 +1498,11 @@ zipencode = po("mutate", mutation = list(zipcode = ~ as.factor(zipcode)), id = " Construct a new `glmnet` model with `ppl("robustify")`. Compare the two pipelines in a benchmark experiment. 
-```{r, warning=FALSE, message=FALSE} +```{r solutions-071, warning=FALSE, message=FALSE} lrn_glmnet = lrn("regr.glmnet") ``` -```{r, warning=FALSE, message=FALSE} +```{r solutions-072, warning=FALSE, message=FALSE} graph_preproc = zipencode %>>% po("fixfactors") %>>% @@ -1523,7 +1523,7 @@ First we fix the factor levels to ensure that all 70 zipcodes are fixed. We can consider 70 levels high cardinality, so we use impact encoding. We use the same imputation strategy as in @sec-preprocessing. -```{r, warning=FALSE, message=FALSE} +```{r solutions-073, warning=FALSE, message=FALSE} graph_robustify = pipeline_robustify(task = task, learner = lrn_glmnet) %>>% lrn_glmnet @@ -1531,7 +1531,7 @@ graph_robustify = graph_robustify$plot() ``` -```{r} +```{r solutions-074} glrn_preproc = as_learner(graph_preproc, id = "glmnet_preproc") glrn_robustify = as_learner(graph_robustify, id = "glmnet_robustify") @@ -1554,7 +1554,7 @@ Our preprocessing pipeline performs slightly better than the robustified one. Can you extract an additional feature from the lat / long coordinates? (Hint: Downtown Seattle has lat/long coordinates `47.605`/`122.334`). -```{r, warning=FALSE, message=FALSE} +```{r solutions-075, warning=FALSE, message=FALSE} task = tsk("kc_housing") graph_mutate = @@ -1591,7 +1591,7 @@ This improves the average error of our model by a further 1400$. 1. Consider the following example where you resample a learner (debug learner, sleeps for 3 seconds during `$train()`) on 4 workers using the multisession backend: -```{r technical-050} +```{r solutions-076} task = tsk("penguins") learner = lrn("classif.debug", sleep_train = function() 3) resampling = rsmp("cv", folds = 6) @@ -1609,7 +1609,7 @@ All 4 of them are occupied for the first 4 iterations of the cross-validation. The 5th iteration, however, only runs in parallel to the 6th fold, leaving 2 cores idle. This is supported by the elapsed time of roughly 6 seconds for 6 jobs compared to also roughly 6 seconds for 8 jobs: -```{r solutions-022} +```{r solutions-077} task = tsk("penguins") learner = lrn("classif.debug", sleep_train = function() 3) @@ -1631,7 +1631,7 @@ Therefore, a simple adaptation either increases the number of folds for improved The rules can easily be translated to R code where we first convert select the predicted probabilities for the positive class, 0-1 encode the truth vector and then calculate the mean absolute error between the two vectors. -```{r solutions-023} +```{r solutions-078} mae_prob = function(truth, prob, task) { # retrieve positive class from task positive = task$positive @@ -1646,7 +1646,7 @@ mae_prob = function(truth, prob, task) { This function can be embedded in the `Measure` class accordingly. -```{r solutions-024} +```{r solutions-079} MeasureMaeProb = R6::R6Class("MeasureMaeProb", inherit = mlr3::MeasureClassif, # classification measure public = list( @@ -1673,7 +1673,7 @@ MeasureMaeProb = R6::R6Class("MeasureMaeProb", Because this is a custom class that is not available in the `mlr_measures` dictionary, we have to create a new instance using the `$new()` constructor. -```{r} +```{r solutions-080} msr_mae_prob = MeasureMaeProb$new() msr_mae_prob ``` @@ -1681,7 +1681,7 @@ msr_mae_prob To try this measure, we resample a logistic regression on the sonar task using five-fold cross-validation. 
-```{r} +```{r solutions-081} # predict_type is set to "prob", as otherwise our measure does not work learner = lrn("classif.log_reg", predict_type = "prob") task = tsk("sonar") @@ -1690,13 +1690,13 @@ rr = resample(task, learner, rsmp("cv", folds = 5)) We now score the resample result using our custom measure and `msr("classif.acc")`. -```{r} +```{r solutions-082} score = rr$score(list(msr_mae_prob, msr("classif.acc"))) ``` In this case, there is a clear relationship between the classification accuracy and our custom measure, i.e. the higher the accuracy, the lower the mean absolute error of the predicted probabilities. -```{r} +```{r solutions-083} cor(score$mae_prob, score$classif.acc) ``` @@ -1709,7 +1709,7 @@ cor(score$mae_prob, score$classif.acc) First, we create the learner that we want to tune, mark the relevant parameter for tuning and set the fallback learner to a classification tree. -```{r} +```{r solutions-084} lrn_debug = lrn("classif.debug", error_train = to_tune(0, 1) ) @@ -1722,7 +1722,7 @@ Nonetheless it serves as a good example to illustrate the effects of training er We proceed with optimizing the classification accuracy of the learner on the penguins task. -```{r} +```{r solutions-085} instance = tune( learner = lrn_debug, task = tsk("penguins"), @@ -1736,14 +1736,14 @@ instance To find out which evaluations resulted in an error, we can inspect the `$archive` slot of the instance, which we convert to a `data.table` for easier filtering. -```{r} +```{r solutions-086} archive = as.data.table(instance$archive) archive[, c("error_train", "classif.acc", "errors")] ``` Below, we visualize the relationship between the error probabilty and the classification accuracy. -```{r} +```{r solutions-087} ggplot(data = archive, aes(x = error_train, y = classif.acc, color = errors)) + geom_point() + theme_minimal() @@ -1753,13 +1753,13 @@ Higher values for `error_train` lead to more resampling iterations using the cla Therefore, the best found hyperparameter configurations will tend to have values of `error_train` close to 1. When multiple parameter configurations have the same test performance, the first one is chosen by `$result_learner_param_vals`. -```{r} +```{r solutions-088} instance$result_learner_param_vals ``` We repeat the same experiment for the tuning interval from 0.3 to 0.7. -```{r} +```{r solutions-089} lrn_debug$param_set$set_values( error_train = to_tune(0.3, 0.7) ) @@ -1779,7 +1779,7 @@ instance2 As before, higher error probabilities during training lead to higher classification accuracies. -```{r} +```{r solutions-090} ggplot(data = archive2, aes(x = error_train, y = classif.acc, color = errors)) + geom_point() + theme_minimal() @@ -1787,7 +1787,7 @@ ggplot(data = archive2, aes(x = error_train, y = classif.acc, color = errors)) + However, the best found configurations for the `error_train` parameter, now tend to be close to 0.7 instead of 1 as before. -```{r} +```{r solutions-091} instance2$result_learner_param_vals ``` @@ -1804,7 +1804,7 @@ Note that in most real-world scenarios, the fallback learner performs worse than 1. Prepare a `mlr3` regression task for `fifa` data. Select only variables describing the age and skills of footballers. Train any predictive model for this task, e.g. `lrn("regr.ranger")`. 
-```{r solutions-044, warning=FALSE, message=FALSE} +```{r solutions-092, warning=FALSE, message=FALSE} library(DALEX) library(ggplot2) data("fifa", package = "DALEX") @@ -1828,7 +1828,7 @@ learner$model **With `iml`** -```{r solutions-045, warning=FALSE, message=FALSE} +```{r solutions-093, warning=FALSE, message=FALSE} library(iml) model = Predictor$new(learner, data = fifa20, @@ -1841,7 +1841,7 @@ effect$plot() **With `DALEX`** -```{r solutions-046, warning=FALSE, message=FALSE} +```{r solutions-094, warning=FALSE, message=FALSE} library(DALEX) ranger_exp = DALEX::explain(learner, data = fifa20[, setdiff(names(fifa20), "value_eur")], @@ -1858,7 +1858,7 @@ plot(ranger_effect) **With `iml`** -```{r solutions-047, warning=FALSE, message=FALSE} +```{r solutions-095, warning=FALSE, message=FALSE} impfeat = c("skill_ball_control") effect = FeatureEffects$new(model, features = impfeat) @@ -1867,7 +1867,7 @@ plot(effect) **With `DALEX`** -```{r solutions-048, warning=FALSE, message=FALSE} +```{r solutions-096, warning=FALSE, message=FALSE} impfeat = c("skill_ball_control") ranger_profiles = model_profile(ranger_exp, variables = impfeat) @@ -1876,20 +1876,20 @@ plot(ranger_profiles) 4. Choose Robert Lewandowski as a specific example and calculate and plot the Shapley values. Which feature is locally the most important and has the strongest influence on his valuation as a soccer player? -```{r solutions-049, warning=FALSE, message=FALSE} +```{r solutions-097, warning=FALSE, message=FALSE} player_1 = fifa20["R. Lewandowski",] ``` **With `iml`** -```{r solutions-050, warning=FALSE, message=FALSE} +```{r solutions-098, warning=FALSE, message=FALSE} shapley = Shapley$new(model, x.interest = player_1) plot(shapley) ``` **With `DALEX`** -```{r solutions-051, warning=FALSE, message=FALSE} +```{r solutions-099, warning=FALSE, message=FALSE} ranger_shap = predict_parts(ranger_exp, new_observation = player_1, type = "shap", B = 1) @@ -1900,7 +1900,7 @@ plot(ranger_shap, show_boxplots = FALSE) 1. Run a benchmark experiment on `tsk("german_credit")` with `lrn("classif.featureless")`, `lrn("classif.log_reg")`, and `lrn("classif.ranger")`. Tune the prediction thresholds of all learners by encapsulating them in a `po("learner_cv")` (with two-fold CV), followed by a `po("tunethreshold")`. Use `msr("classif.costs", costs = costs)`, where the `costs` matrix is as follows: true positive is `-10`, true negative is `-1`, false positive is `2`, and false negative is `3`. Use this measure in `po("tunethreshold")` and when evaluating your benchmark experiment. -```{r, message=FALSE, warning=FALSE} +```{r solutions-100, message=FALSE, warning=FALSE} set.seed(1) # Load task and learners tsk_german = tsk("german_credit") @@ -1916,7 +1916,7 @@ costs Our cost matrix is as expected so we can plug it into our measure and setup our pipeline. -```{r results='hide'} +```{r solutions-101='hide'} # Create measure meas_costs = msr("classif.costs", costs = costs) @@ -1935,7 +1935,7 @@ bmr = benchmark(design)$aggregate(meas_costs) Now exploring our results... -```{r} +```{r solutions-102} bmr[, .(learner_id, classif.costs)] ``` @@ -1943,7 +1943,7 @@ Based on these results, the logistic regression performs the best with the great 2. Train and test a survival forest using `lrn("surv.rfsrc")` (from `mlr3extralearners`). Run this experiment using `tsk("rats")` and `partition()`. Evaluate your model with the RCLL measure. 
-```{r} +```{r solutions-103} # Get learners library(mlr3extralearners) # Get survival models @@ -1960,7 +1960,7 @@ prediction$score(msr("surv.rcll")) The right-censored logloss provides a measure of predictive accuracy, but it is quite hard to interpret it without comparison to another model. To yield a more informative value, we could either compute the RCLL for an uninformed baseline like the Kaplan-Meier estimator, or we could use the `ERV` (explained residual variation) parameter in the measure, which returns the RCLL as a percentage increase in performance compared to an uninformed baseline (in this case the Kaplan-Meier estimator): -```{r} +```{r solutions-104} lrn("surv.kaplan")$ train(tsk_rats, splits$train)$ predict(tsk_rats, splits$test)$ @@ -1974,7 +1974,7 @@ Now we can see that our model is only marginally better than the Kaplan-Meier ba 3. Estimate the density of the "precip" task from the `mlr3proba` package using `lrn("dens.hist")`, evaluate your estimation with the logloss measure. As a stretch goal, look into the documentation of `distr6` to learn how to analyse your estimated distribution further. -```{r} +```{r solutions-105} # Get density models library(mlr3proba) set.seed(1) @@ -1988,7 +1988,7 @@ prediction$score(msr("dens.logloss")) As before the logloss is not too informative by itself but as the Histogram is itself a baseline, we can use this value for comparison to more sophisticated models. To learn more about our predicted distribution, we could use `distr6` to summarise the distribution and to compute values such as the pdf and cdf: -```{r} +```{r solutions-106} prediction$distr$summary() # pdf evaluated at `50` prediction$distr$pdf(50) @@ -1996,7 +1996,7 @@ prediction$distr$pdf(50) 4. Run a benchmark clustering experiment on the "wine" dataset without a label column. Compare the performance of k-means learner with `k` equal to `2`, `3` and `4` using the silhouette measure and the insample resampling technique. What value of `k` would you choose based on the silhouette scores? -```{r, messages=FALSE, warnings=FALSE} +```{r solutions-107, messages=FALSE, warnings=FALSE} set.seed(1) # Load clustering models and tasks library(mlr3cluster) @@ -2024,7 +2024,7 @@ We can see that we get the silhouette closest to `1` with `K=2` so we might use For now we simply load the data and look at the data. -```{r} +```{r solutions-108} library(mlr3) library(mlr3fairness) set.seed(8) @@ -2036,7 +2036,7 @@ tsk_adult_train We can now train a simple model, e.g., a decision tree and evaluate for accuracy. -```{r} +```{r solutions-109} learner = lrn("classif.rpart") learner$train(tsk_adult_train) prediction = learner$predict(tsk_adult_test) @@ -2046,7 +2046,7 @@ prediction$score() The *false omission rate parity* metric is available via the key `"fairness.fomr"`. Note, that evaluating our prediction now requires that we also provide the task. -```{r} +```{r solutions-110} msr_1 = msr("fairness.fomr") prediction$score(msr_1, tsk_adult_test) ``` @@ -2054,14 +2054,14 @@ prediction$score(msr_1, tsk_adult_test) In addition, we can look at false omission rates in each group. The `groupwise_metrics` function creates a metric for each group specified in the `pta` column role: -```{r} +```{r solutions-111} tsk_adult_test$col_roles$pta ``` We can then use this metric to evaluate our model again. This gives us the false omission rates for male and female individuals separately. 
-```{r} +```{r solutions-112} msr_2 = groupwise_metrics(base_measure = msr("classif.fomr"), task = tsk_adult_test) prediction$score(msr_2, tsk_adult_test) ``` @@ -2069,7 +2069,7 @@ prediction$score(msr_2, tsk_adult_test) 2. Improve your model by employing pipelines that use pre- or post-processing methods for fairness. Evaluate your model along the two metrics and visualize the resulting metrics. Compare the different models using an appropriate visualization. First we can again construct the learners above. -```{r} +```{r solutions-113} library(mlr3pipelines) lrn_1 = po("reweighing_wts") %>>% lrn("classif.rpart") lrn_2 = po("learner_cv", lrn("classif.rpart")) %>>% @@ -2078,7 +2078,7 @@ lrn_2 = po("learner_cv", lrn("classif.rpart")) %>>% And run the benchmark again. Note, that we use three-fold CV this time for comparison. -```{r} +```{r solutions-114} learners = list(learner, lrn_1, lrn_2) design = benchmark_grid(tsk_adult_train, learners, rsmp("cv", folds = 3L)) bmr = benchmark(design) @@ -2087,7 +2087,7 @@ bmr$aggregate(msrs(c("classif.acc", "fairness.fomr"))) We can now again visualize the result. -```{r} +```{r solutions-115} library(ggplot2) fairness_accuracy_tradeoff(bmr, msr("fairness.fomr")) + scale_color_viridis_d("Learner") + @@ -2105,12 +2105,12 @@ We can notice two main results: This can be achieved by adding "race" to the `"pta"` col_role. -```{r} +```{r solutions-116} tsk_adult_train$set_col_roles("race", add_to = "pta") tsk_adult_train ``` -```{r} +```{r solutions-117} tsk_adult_test$set_col_roles("race", add_to = "pta") prediction$score(msr_1, tsk_adult_test) ``` @@ -2120,12 +2120,12 @@ Note, that the metric by default computes the maximum discrepancy between all me If we now compute the `groupwise_metrics`, we will get a metric for the intersection of each group. -```{r} +```{r solutions-118} msr_3 = groupwise_metrics(msr("classif.fomr"), tsk_adult_train) unname(sapply(msr_3, function(x) x$id)) ``` -```{r} +```{r solutions-119} prediction$score(msr_3, tsk_adult_test) ``` @@ -2143,7 +2143,7 @@ We'll go through them one by one to deepen our understanding: We can investigate this further by looking at actual counts: -```{r} +```{r solutions-120} table(tsk_adult_test$data(cols = c("race", "sex", "target"))) ``` @@ -2155,7 +2155,7 @@ We'll go through them one by one to deepen our understanding: First, we create a subset of only `sex`: `Female` and `race`: `"Black", "White`. -```{r} +```{r solutions-121} adult_subset = tsk_adult_test$clone() df = adult_subset$data() rows = seq_len(nrow(df))[df$race %in% c("Black", "White") & df$sex %in% c("Female")] @@ -2164,7 +2164,7 @@ adult_subset$set_col_roles("race", add_to = "pta") ``` And evaluate our measure again: -```{r} +```{r solutions-122} prediction$score(msr_3, adult_subset) ``` @@ -2181,7 +2181,7 @@ We can see, that between women there is an even bigger discrepancy compared to m We start by loading the packages and creating the task. -```{r} +```{r solutions-123} library(mlr3) library(mlr3extralearners) library(mlr3pipelines) @@ -2192,14 +2192,14 @@ tsk_pima Below, we see that the task has five features with missing values. -```{r} +```{r solutions-124} tsk_pima$missings() ``` Next, we create the LightGBM classifier, but don't specify the validation data yet. We handle the missing values using a simple median imputation. 
-```{r} +```{r solutions-125} lrn_lgbm = lrn("classif.lightgbm", num_iterations = 1000, early_stopping_rounds = 10, @@ -2216,7 +2216,7 @@ The call below sets the `$validate` field of the LightGBM pipeop to `"predefined Recall that only the graphlearner itself can specify *how* the validation data is generated. The individual pipeops can either use it (`"predefined"`) or not (`NULL`). -```{r} +```{r solutions-126} set_validate(glrn, validate = 0.3, ids = "classif.lightgbm") glrn$validate glrn$graph$pipeops$classif.lightgbm$validate @@ -2224,7 +2224,7 @@ glrn$graph$pipeops$classif.lightgbm$validate Finally, we train the learner and inspect the validation scores and internally tuned parameters. -```{r} +```{r solutions-127} glrn$train(tsk_pima) glrn$internal_tuned_values @@ -2240,7 +2240,7 @@ glrn$internal_valid_scores We start by setting the number of boosting iterations to an internal tune token where the maximum number of boosting iterations is 1000 and the aggregation function the maximum. Note that the input to the aggregation function is a list of integer values (the early stopped values for the different resampling iterations), so we need to `unlist()` it first before taking the maximum. -```{r} +```{r solutions-128} library(mlr3tuning) glrn$param_set$set_values( @@ -2252,14 +2252,14 @@ glrn$param_set$set_values( Now, we change the validation data from `0.3` to `"test"`, where we can omit the `ids` specification as LightGBM is the base learner. -```{r} +```{r solutions-129} set_validate(glrn, validate = "test") ``` Next, we create the autotuner using the configuration given in the instructions. As the internal validation measures are calculated by `lightgbm` and not `mlr3`, we need to specify whether the metric should be minimized. -```{r} +```{r solutions-130} at_lgbm = auto_tuner( learner = glrn, tuner = tnr("internal"), @@ -2272,7 +2272,7 @@ at_lgbm$id = "at_lgbm" Finally, we set up the benchmark design, run it, and evaluate the learners in terms of their classification accuracy. -```{r} +```{r solutions-131} design = benchmark_grid( task = tsk_pima, learners = list(at_lgbm, lrn("classif.rpart")), @@ -2286,7 +2286,7 @@ bmr$aggregate(msr("classif.acc")) 3. Consider the code below: - ```{r} + ```{r solutions-132} branch_lrn = as_learner( ppl("branch", list( lrn("classif.ranger"), @@ -2349,7 +2349,7 @@ Note that we would normally recommend setting the validation data to `"test"` wh 4. Look at the (failing) code below: - ```{r, error = TRUE} + ```{r solutions-133, error = TRUE} tsk_sonar = tsk("sonar") glrn = as_learner( po("pca") %>>% lrn("classif.xgboost", validate = 0.3) diff --git a/book/chapters/appendices/solutions_large-scale_benchmarking.qmd b/book/chapters/appendices/solutions_large-scale_benchmarking.qmd index 51d344f0c..5daf0dae7 100644 --- a/book/chapters/appendices/solutions_large-scale_benchmarking.qmd +++ b/book/chapters/appendices/solutions_large-scale_benchmarking.qmd @@ -1,4 +1,4 @@ -```{r solutions-026} +```{r solutions_large-scale_benchmarking-001} #| cache: false #| include: false library(mlr3verse) @@ -20,12 +20,12 @@ options(mlr3oml.cache = here::here("book", "openml", "cache")) We access the AutoML benchmark suite with ID 269 using the `r ref("mlr3oml::ocl()")` function. 
-```{r} +```{r solutions_large-scale_benchmarking-002} #| include: false path_automl_suite = here::here("book", "openml", "manual", "automl_suite.rds") ``` -```{r solutions-028, eval = !file.exists(path_automl_suite)} +```{r solutions_large-scale_benchmarking-003, eval = !file.exists(path_automl_suite)} library(mlr3oml) automl_suite = ocl(id = 269) automl_suite$task_ids @@ -33,7 +33,7 @@ automl_suite$task_ids To create a summary of the underlying datasets, we pass their IDs to `r ref("mlr3oml::list_oml_data()")`. -```{r} +```{r solutions_large-scale_benchmarking-004} #| include: false if (file.exists(path_automl_suite)) { automl_suite = readRDS(path_automl_suite) @@ -44,31 +44,31 @@ if (file.exists(path_automl_suite)) { } ``` -```{r} +```{r solutions_large-scale_benchmarking-005} data_tbl = list_oml_data(automl_suite$data_ids) data_tbl[, c("data_id", "name", "NumberOfInstances")] ``` To find those datasets with up to 4000 observations, we can simply filter the table. -```{r} +```{r solutions_large-scale_benchmarking-006} data_tbl = data_tbl[NumberOfInstances < 4000, ] ``` Alternatively, the `r ref("mlr3oml::list_oml_tasks()")` also allows to filter OpenML tasks by their characteristics. -```{r} +```{r solutions_large-scale_benchmarking-007} #| include: false path_automl_table = here::here("book", "openml", "manual", "automl_table.rds") ``` -```{r solutions-030, eval = !file.exists(path_automl_table)} +```{r solutions_large-scale_benchmarking-008, eval = !file.exists(path_automl_table)} task_tbl = list_oml_tasks( task_id = automl_suite$task_ids, number_instances = c(0, 4000) ) ``` -```{r} +```{r solutions_large-scale_benchmarking-009} #| include: false if (file.exists(path_automl_table)) { task_tbl = readRDS(path_automl_table) @@ -79,13 +79,13 @@ if (file.exists(path_automl_table)) { The resulting table contains matching OpenML tasks from the AutoML benchmark suite. -```{r solutions-032} +```{r solutions_large-scale_benchmarking-010} task_tbl[, .(task_id, data_id, name, NumberOfInstances)] ``` We create `mlr3` tasks from these OpenML IDs using `tsk("oml")`. -```{r solutions-033} +```{r solutions_large-scale_benchmarking-011} tasks = lapply(task_tbl$task_id, function(id) tsk("oml", task_id = id)) tasks[[1]] @@ -98,7 +98,7 @@ tasks[[1]] Use default hyperparameter settings and do not perform any tuning to keep the experiments simple. -```{r solutions-034} +```{r solutions_large-scale_benchmarking-012} lrn_ranger = as_learner( ppl("robustify", learner = lrn("regr.ranger")) %>>% po("learner", lrn("regr.ranger")) @@ -118,7 +118,7 @@ learners = list(lrn_ranger, lrn_rpart) We set a seed before calling `r ref("benchmark_grid()")` as this instantiates the resamplings, which is stochastic. -```{r solutions-035} +```{r solutions_large-scale_benchmarking-013} set.seed(123) resampling = rsmp("cv", folds = 3) design = benchmark_grid(tasks, learners, resampling) @@ -128,7 +128,7 @@ design To execute this benchmark design using `r ref_pkg("mlr3batchmark")` we start by creating and configuring an experiment registry. We set `file.dir = NA` to use a temporary directory for the registry. -```{r solutions-036} +```{r solutions_large-scale_benchmarking-014} #| cache: false library(mlr3batchmark) library(batchtools) @@ -143,7 +143,7 @@ reg = makeExperimentRegistry( The next two steps are to populate the registry with the experiments using `r ref("mlr3batchmark::batchmark()")` and to submit them. 
By specifying no IDs in `r ref("batchtools::submitJobs()")`, all jobs returned by `r ref("batchtools::findNotSubmitted()")` are queued, which in this case are all existing jobs. -```{r solutions-038} +```{r solutions_large-scale_benchmarking-015} #| output: false batchmark(design, reg = reg) submitJobs(reg = reg) @@ -152,7 +152,7 @@ waitForJobs(reg = reg) After the execution of the experiment finished we can load the results as a `r ref("BenchmarkResult")`. -```{r solutions-040} +```{r solutions_large-scale_benchmarking-016} bmr = reduceResultsBatchmark(reg = reg) bmr$aggregate(msr("regr.mse")) ``` @@ -162,7 +162,7 @@ bmr$aggregate(msr("regr.mse")) First, we load the `r ref_pkg("mlr3benchmark")` package and create a `r ref("mlr3benchmark::BenchmarkAggr")` from the benchmark result using `msr("regr.mse")`. -```{r solutions-041} +```{r solutions_large-scale_benchmarking-017} library(mlr3benchmark) bma = as_benchmark_aggr(bmr, measures = msr("regr.mse")) bma @@ -170,14 +170,14 @@ bma We can also visualize this result using the `r ref("mlr3benchmark::autoplot()")` function. -```{r} +```{r solutions_large-scale_benchmarking-018} autoplot(bma) ``` Below, we conduct a global Friedman test. Note that a post-hoc test is not needed because we are only comparing two algorithms. -```{r solutions-042} +```{r solutions_large-scale_benchmarking-019} bma$friedman_test() ``` diff --git a/book/chapters/appendices/tasks.qmd b/book/chapters/appendices/tasks.qmd index 8c5594fcd..2aba75489 100644 --- a/book/chapters/appendices/tasks.qmd +++ b/book/chapters/appendices/tasks.qmd @@ -8,7 +8,7 @@ aliases: {{< include ../../common/_setup.qmd >}} -```{r include=FALSE} +```{r tasks-001=FALSE} library(mlr3verse) library(mlr3proba) library(mlr3spatiotempcv) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index 8d7b45026..ccf4778ef 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -23,7 +23,7 @@ While `tidymodels` in particular makes it very easy to perform simple ML tasks, Before we can show you the full power of `mlr3`, we recommend installing the `r mlr3verse` package, which will install several, important packages in the `mlr3` ecosystem. -```{r C0 install mlr3verse, eval = FALSE} +```{r introduction_and_overview-001, eval = FALSE} install.packages("mlr3verse") ``` @@ -38,13 +38,13 @@ To do this, install `r ref_pkg("usethis")` and run the following: [^runiverse]: R-universe is an alternative package repository to CRAN. The bit of code below tells R to look at both R-universe and CRAN when trying to install packages. R will always install the latest version of a package. -```{r universe1, eval = FALSE} +```{r introduction_and_overview-002, eval = FALSE} usethis::edit_r_profile() ``` In the file that opens add or change the `repos` argument in `options` so it looks something like the code below (you might need to add the full code block below or just edit the existing `options` function). -```{r universe2, eval = FALSE} +```{r introduction_and_overview-003, eval = FALSE} options(repos = c( mlrorg = "https://mlr-org.r-universe.dev", CRAN = "https://cloud.r-project.org/" @@ -55,7 +55,7 @@ Save the file, restart your R session, and you are ready to go! 
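To double-check that the change was picked up, you can inspect the repositories R is now using; the exact output depends on your configuration, but the mlr-org R-universe entry should appear alongside CRAN:

```{r introduction_and_overview-repos-check, eval = FALSE}
# list the currently configured package repositories
getOption("repos")
```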
If you want the latest development version of any of our packages, run -```{r remotes, eval = FALSE} +```{r introduction_and_overview-004, eval = FALSE} remotes::install_github("mlr-org/{pkg}") ``` @@ -105,7 +105,7 @@ Now let us see this in practice with our first example. The `mlr3` universe includes a wide range of tools taking you from basic ML to complex experiments. To get started, here is an example of the simplest functionality -- training a model and making predictions. -```{r C0 egBasic} +```{r introduction_and_overview-005} library(mlr3) task = tsk("penguins") split = partition(task) @@ -125,7 +125,7 @@ In @sec-basics we will break this down in more detail. The `mlr3` interface also lets you run more complicated experiments in just a few lines of code: -```{r C0 egHard, eval = FALSE} +```{r introduction_and_overview-006, eval = FALSE} library(mlr3verse) tasks = tsks(c("breast_cancer", "sonar")) @@ -149,7 +149,7 @@ bmr = benchmark(benchmark_grid(tasks, learners, rsmp("cv", folds = 3))) bmr$aggregate(msr("classif.acc")) ``` -```{r, output = FALSE, echo = FALSE} +```{r introduction_and_overview-007, output = FALSE, echo = FALSE} library(mlr3verse) library(mlr3pipelines) library(mlr3benchmark) @@ -173,7 +173,7 @@ learners = c(glrn_rf_tuned, glrn_stack) bmr = benchmark(benchmark_grid(tasks, learners, rsmp("holdout"))) ``` -```{r C0 egHardOut, echo = FALSE} +```{r introduction_and_overview-008, echo = FALSE} aggr = bmr$aggregate(msr("classif.acc"))[, c("task_id", "learner_id", "classif.acc")] aggr$learner_id = rep(c("RF", "Stack"), 2) @@ -192,7 +192,7 @@ Throughout this book, we often refer to `mlr3`, which may refer to the single `r The `r mlr3` *package* provides the base functionality that the rest of the ecosystem depends on for building more advanced machine learning tools. @fig-mlr3verse shows the packages in our ecosystem that extend `r mlr3` with capabilities for preprocessing, pipelining, visualizations, additional learners, additional task types, and much more. -```{r intro-001, echo = FALSE, out.width = "100%"} +```{r introduction_and_overview-009, echo = FALSE, out.width = "100%"} #| label: fig-mlr3verse #| fig-cap: Overview of the `mlr3` ecosystem, the packages with gray dashed lines are still in development, all others have a stable interface. #| fig-alt: "Mindmap showing the packages of the mlr3verse and their relationship. Center `mlr3`, immediately connected to that are 'Learners', 'Evaluation', 'Tuning', 'Feature Selection', 'Utilities', 'Special Tasks', 'Data', and 'Pipelines'. Within each group is: Learners: `mlr3learners`, `mlr3extralearners`, `mlr3torch`; Evaluation: `mlr3measures`, `mlr3benchmark`; Tuning: `mlr3tuning`, `miesmuschel`, `mlr3hyperband`, `mlr3mbo`, `bbotk`, `mlr3tuningspaces`; Feature Selection: `mlr3filters`, `mlr3fselect`; Utilities: `mlr3misc`, `mlr3viz`, `mlr3verse`, `mlr3batchmark`, `paradox`; Special Tasks: `mlr3spatiotempcv`, `mlr3spatial`, `mlr3proba`, `mlr3cluster`, `mlr3fda`, `mlr3fairness`; Data: `mlr3db`, `mlr3oml`, `mlr3data`; Pipelines: `mlr3pipelines`. `mlr3fda` and `mlr3torch` are connected by gray dashed lines." 
@@ -254,7 +254,7 @@ We use `r ref_pkg("data.table")` because it is blazingly fast and scales well to As with `data.frame`, `data.table`s can be constructed with `r ref("data.table()")` or `r ref("as.data.table()")`: -```{r intro-002.table-001.table-002} +```{r introduction_and_overview-010.table-001.table-002} library(data.table) # converting a matrix with as.data.table as.data.table(matrix(runif(4), 2, 2)) @@ -267,7 +267,7 @@ dt `data.table`s can be used much like `data.frame`s, but they provide additional functionality that makes complex operations easier. For example, data can be summarized by groups with a `by` argument in the `[` operator and they can be modified in-place with the `:=` operator. -```{r intro-004.table-003.table-004} +```{r introduction_and_overview-011.table-003.table-004} # mean of x column in groups given by y dt[, mean(x), by = "y"] # adding a new column with := diff --git a/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd b/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd index 0fbece769..7d76f27f2 100644 --- a/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd +++ b/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd @@ -61,7 +61,7 @@ It is possible to control the `r index("granularity", aside = TRUE)` of the para For example, we could reduce the overhead of parallelizing a `for`-loop with 1000 iterations on four CPU cores by `r index('chunking')` the work of the 1000 jobs into four computational jobs performing 250 iterations each, resulting in four big jobs and not 1000 small ones. This effect is illustrated in the following code chunk using a `r index('socket cluster')` with the `r ref_pkg("parallel")` package, which has a `chunk.size` option so we do not need to manually create chunks: -```{r technical-001, eval = TRUE} +```{r advanced_technical_aspects_of_mlr3-001, eval = TRUE} # set up a socket cluster with 4 workers on the local machine library(parallel) cores = 4 @@ -124,7 +124,7 @@ Therefore, threading can conflict with certain parallel backends, leading the sy For this reason, we introduced the convention that threading parallelization is turned off by default. Hyperparameters that control the number of threads are tagged with the label `"threads"`: -```{r technical-002} +```{r advanced_technical_aspects_of_mlr3-002} lrn_ranger = lrn("classif.ranger") # show all hyperparameters tagged with "threads" @@ -135,7 +135,7 @@ lrn_ranger$param_set$values$num.threads ``` To enable the parallelization for this learner, `mlr3` provides the helper function `r ref("set_threads()")`, which automatically adjusts the hyperparameters associated with builtin learner parallelization: -```{r technical-003} +```{r advanced_technical_aspects_of_mlr3-003} # use four CPUs set_threads(lrn_ranger, n = 4) ``` @@ -147,14 +147,14 @@ This heuristic is not always ideal (interested readers might want to look up "Am * You are on a multi-user system and want to spare some resources for other users. * You have linked R to a threaded `r index('BLAS', lower = FALSE)` implementation like OpenBLAS and your learners make heavy use of linear algebra. 
-```{r} +```{r advanced_technical_aspects_of_mlr3-004} # auto-detect cores on the local machine set_threads(lrn_ranger) ``` To control how many cores are set, we recommend manually setting the number of CPUs in your system's `.Rprofile` file: -```{r technical-004, eval = FALSE} +```{r advanced_technical_aspects_of_mlr3-005, eval = FALSE} options(mc.cores = 4) ``` @@ -174,7 +174,7 @@ By definition, resampling is performed by aggregating over independent repetitio By example, we will look at parallelizing three-fold CV for a decision tree on the sonar task (@fig-parallel-overview). We use the `r ref("future::multisession")` plan (which internally uses socket clusters from the `parallel` package) that should work on all operating systems. -```{r technical-005} +```{r advanced_technical_aspects_of_mlr3-006} library(future) # select the multisession backend to use @@ -203,7 +203,7 @@ There are two `mlr3` options to control the execution and granularity: Tuning the chunk size can help in some rare cases to mitigate the parallelization overhead but is unlikely to be useful in larger problems or longer runtimes. -```{r large_benchmarking-051, echo = FALSE} +```{r advanced_technical_aspects_of_mlr3-007_benchmarking-051, echo = FALSE} #| label: fig-parallel-overview #| fig-cap: "Parallelization of a resampling using three-fold CV. The main process calls the `resample()` function, which starts the parallelization process and the computational task is split into three parts for three-fold CV. The folds are passed to three workers, each fitting a model on the respective subset of the task and predicting on the left-out observations. The predictions (and trained models) are communicated back to the main process which combines them into a `ResampleResult`." #| fig-alt: "Flow chart starting with a rectangular box that says 'Main', with an arrow to a diamond that says 'resample()'. This has three arrows to 'Worker 1-3' respectively, each arrow is labeled 'Fold 1-3' respectively. Each of the worker boxes points to the same diamond that says 'ResampleResult' and each arrow is labeled 'Prediction 1-3' respectively." @@ -230,7 +230,7 @@ Therefore we could either: Therefore, there is no need to decide whether you want to parallelize the tuning *or* the resampling, you always parallelize both. This approach makes the computation fine-grained and allows the `future` backend to group the jobs into chunks of suitable size (depending on the number of workers), it also makes the procedure identical to parallelizing resampling: -```{r technical-006} +```{r advanced_technical_aspects_of_mlr3-008} # simple benchmark design design = benchmark_grid(tsks(c("sonar", "penguins")), lrns(c("classif.featureless", "classif.rpart")), rsmp_cv3) @@ -267,7 +267,7 @@ The following code shows a parallelized execution of random search with the term The batch size, set to a multiple of the number of workers, ensures that available resources are used efficiently. However, note that the tuning only terminates after a multiple of the given batch size, in this case after 24 evaluations. 
-```{r} +```{r advanced_technical_aspects_of_mlr3-009} future::plan("multisession", workers = 4) instance = tune( @@ -297,7 +297,7 @@ Nested resampling can conceptually be parallelized at three different levels, ea This is demonstrated in the pseudocode below, which is a simplified form of Algorithm 3 from @hpo_practical: -```{r, eval = FALSE} +```{r advanced_technical_aspects_of_mlr3-010, eval = FALSE} # outer resampling, level 1: for (i in seq_len(n_outer_splits)) { # tuning instance, in this example mainly represents the archive @@ -338,7 +338,7 @@ A five-fold CV is used for our outer resampling. For the sake of simplicity, we will also ignore the final model fit the `AutoTuner` performs after tuning. Below, we run the example sequentially without parallelization: -```{r technical-007} +```{r advanced_technical_aspects_of_mlr3-011} library(mlr3tuning) # reset to default sequential plan future::plan("sequential") @@ -356,7 +356,7 @@ We can now either opt to parallelize the outer CV or the inner benchmarking. Let us assume we have a single CPU with four cores (C1 - C4) available and each inner holdout evaluation during tuning takes four seconds. If we parallelize the outer five-fold CV (@fig-parallel-outer), each of the four cores would run one outer resampling first, the computation of the fifth iteration has to wait as there are no more available cores. -```{r technical-009, eval = FALSE} +```{r advanced_technical_aspects_of_mlr3-012, eval = FALSE} # Parallelize outer loop future::plan(list("multisession", "sequential")) @@ -372,12 +372,12 @@ During the computation of the fifth outer resampling iteration, only C1 is busy, In contrast, if we parallelize the inner benchmark (@fig-parallel-inner) then the outer resampling runs sequentially: the five inner benchmarks are scheduled one after the other, each of which runs its two holdout evaluations in parallel on two cores; meanwhile, C3 and C4 are idle. -```{r technical-010, eval = FALSE} +```{r advanced_technical_aspects_of_mlr3-013, eval = FALSE} # Parallelize inner loop future::plan(list("sequential", "multisession")) ``` -```{r, cpu-utilization-1} +```{r advanced_technical_aspects_of_mlr3-014, cpu-utilization-1} #| echo: false #| fig-width: 5.5 #| label: fig-parallel-outer @@ -386,7 +386,7 @@ include_multi_graphics("cpu_utilization_1") ``` -```{r, include = FALSE} +```{r advanced_technical_aspects_of_mlr3-015, include = FALSE} #| fig-responsive: false #gantt # title CPU Utilization @@ -415,7 +415,7 @@ include_multi_graphics("cpu_utilization_1") ``` -```{r, cpu-utilization-2} +```{r advanced_technical_aspects_of_mlr3-016, cpu-utilization-2} #| echo: false #| fig-width: 5.5 #| label: fig-parallel-inner @@ -423,7 +423,7 @@ include_multi_graphics("cpu_utilization_1") include_multi_graphics("cpu_utilization_2") ``` -```{r, include = FALSE} +```{r advanced_technical_aspects_of_mlr3-017, include = FALSE} #| fig-responsive: false #gantt # title CPU Utilization @@ -456,7 +456,7 @@ With parallelization of the outer loop, all results are computed after 16 second `mlr3` and `future` make it possible to enable parallelization for both loops for nested parallelization, even on different parallelization backends, which can be useful in some distributed computing setups. 
Note that the detection of available cores does not work for such a nested parallelization and the number of workers must be manually set instead: -```{r technical-011, eval = FALSE} +```{r advanced_technical_aspects_of_mlr3-018, eval = FALSE} # Runs both loops in parallel future::plan(list( tweak("multisession", workers = 2), @@ -477,7 +477,7 @@ To predict in parallel, the test data is first split into multiple groups and th The resulting predictions are then combined internally in a second step. To avoid predicting in parallel accidentally, parallel predictions must be enabled in the learner via the `parallel_predict` field: -```{r technical-012} +```{r advanced_technical_aspects_of_mlr3-019} # train random forest on sonar task tsk_sonar = tsk("sonar") lrn_rpart = lrn("classif.rpart") @@ -503,7 +503,7 @@ In this section, we will discuss how to prevent these errors from causing the pr For illustration (and internal testing) of error handling, `mlr3` ships with `lrn("classif.debug")` and `lrn("regr.debug")`: -```{r technical-013} +```{r advanced_technical_aspects_of_mlr3-020} tsk_penguins = tsk("penguins") lrn_debug = lrn("classif.debug") lrn_debug @@ -515,7 +515,7 @@ It can be configured to stochastically trigger warnings, errors, and even segfau With the learner's default settings, the learner will remember a random label and constantly predict this label without signaling any conditions. In the following code we tell the learner to signal an error during the training step: -```{r technical-016, error = TRUE} +```{r advanced_technical_aspects_of_mlr3-021, error = TRUE} # set probability to signal an error to `1` lrn_debug$param_set$values$error_train = 1 lrn_debug$train(tsk_penguins) @@ -533,7 +533,7 @@ In @sec-fallback, we will discuss fallback learners to replace missing models an Each `r ref("Learner")` has the method `r index("$encapsulate()", parent = "Learner", aside = TRUE, code = TRUE)` to control how the train or predict steps are wrapped. The first way to encapsulate the execution is provided by the package `r ref_pkg("evaluate")`, which evaluates R expressions and captures and tracks conditions (outputs, messages, warnings or errors) without letting them stop the process (see documentation of `r ref("mlr3misc::encapsulate()")` for full details): -```{r technical-017} +```{r advanced_technical_aspects_of_mlr3-022} # trigger warning and error in training lrn_debug = lrn("classif.debug", warning_train = 1, error_train = 1) @@ -548,7 +548,7 @@ In this case, use the encapsulation method `"try"` instead, which catches signal After training the learner, one can access the log via the fields `log`, `warnings` and `errors`: -```{r technical-018} +```{r advanced_technical_aspects_of_mlr3-023} lrn_debug$log lrn_debug$warnings lrn_debug$errors @@ -559,7 +559,7 @@ In contrast to `evaluate`, the computation is handled in a separate R process. This guards the calling session against segmentation faults which otherwise would tear down the complete main R session (if we demonstrate that here we would break our book). On the downside, starting new processes comes with comparably more computational overhead. 
-```{r technical-019} +```{r advanced_technical_aspects_of_mlr3-024} lrn_debug$encapsulate("callr", fallback = lrn("classif.featureless")) # set segfault_train and remove warning_train and error_train lrn_debug$param_set$values = list(segfault_train = 1) @@ -571,7 +571,7 @@ This works most reliably when using `callr` encapsulation, since the `evaluate` If learners are interrupted, then this is logged as an error by the encapsulation process. Again, the timeout can be set separately for training and prediction: -```{r technical-020} +```{r advanced_technical_aspects_of_mlr3-025} # near instant timeout for training, no timeout for predict lrn_debug$timeout = c(train = 1e-5, predict = Inf) lrn_debug$train(task = tsk_penguins)$errors @@ -582,7 +582,7 @@ With these methods, we can now catch all conditions and post hoc analyze message Unfortunately, catching errors and ensuring an upper time limit is only half the battle. If there are errors during training then we will not have a trained model to query, or if there are errors during predicting, then we will not have predictions to analyze: -```{r technical-021, error = TRUE} +```{r advanced_technical_aspects_of_mlr3-026, error = TRUE} # no saved model as there was an error during training lrn("classif.debug", error_train = 1)$train(tsk_penguins)$model # saved model @@ -613,7 +613,7 @@ We strongly recommend the final option, which is statistically sound and can be To make this procedure convenient during resampling and benchmarking, we support fitting a baseline (though in theory you could use any `Learner`) as a `r index('fallback learner')` by passing a `r ref("Learner")` to `r index('$encapsulate()', parent = "Learner", aside = TRUE, code = TRUE)`. In the next example, we add a classification baseline to our debug learner, so that when the debug learner errors, `mlr3` falls back to the predictions of the featureless learner internally. -```{r technical-022} +```{r advanced_technical_aspects_of_mlr3-027} lrn_debug = lrn("classif.debug", error_train = 1) lrn_debug$encapsulate("evaluate", fallback = lrn("classif.featureless")) @@ -623,7 +623,7 @@ lrn_debug The learner's log contains the captured error, and although no model is stored as the error was in training, we can still obtain predictions from our fallback: -```{r technical-023} +```{r advanced_technical_aspects_of_mlr3-028} lrn_debug$log lrn_debug$model prediction = lrn_debug$predict(tsk_penguins) @@ -633,7 +633,7 @@ prediction$score() In the following snippet, we compare the debug learner with a simple classification tree. We re-parametrize the debug learner to fail in roughly 50% of the resampling iterations during the training step: -```{r technical-024} +```{r advanced_technical_aspects_of_mlr3-029} lrn_debug = lrn("classif.debug", error_train = 0.5) lrn_debug$encapsulate("evaluate", fallback = lrn("classif.featureless")) @@ -648,7 +648,7 @@ Even though the debug learner occasionally failed to provide predictions, we sti It is also possible to split the benchmark up into separate `r ref("ResampleResult")` objects which sometimes helps to get more context. 
E.g., if we only want to have a closer look into the debug learner, we can extract the errors from the corresponding resample results: -```{r technical-025} +```{r advanced_technical_aspects_of_mlr3-030} rr = aggr[learner_id == "classif.debug"]$resample_result[[1L]] rr$errors[1:2] ``` @@ -670,13 +670,13 @@ The default log level in `mlr3` is `"info"`, this means that messages are only d To change the logging threshold you need to retrieve the `R6` logger object from `lgr`, and then call `$set_threshold()`, for example, to lower the logging threshold to enable debugging messaging we would change the threshold to `"debug"`: -```{r technical-027, eval = FALSE} +```{r advanced_technical_aspects_of_mlr3-031, eval = FALSE} lgr::get_logger("mlr3")$set_threshold("debug") ``` Or to suppress all messaging except warnings: -```{r, eval = FALSE} +```{r advanced_technical_aspects_of_mlr3-032, eval = FALSE} lgr::get_logger("mlr3")$set_threshold("warn") ``` @@ -685,14 +685,14 @@ lgr::get_logger("mlr3")$set_threshold("warn") The packages in `mlr3` that make use of optimization, i.e., `r mlr3tuning` or `r mlr3fselect`, use the logger of their base package `r ref_pkg("bbotk")`. This means you could disable "info"-logging from the `mlr3` logger, but keep the output from `mlr3tuning`: -```{r technical-031, eval=FALSE} +```{r advanced_technical_aspects_of_mlr3-033, eval=FALSE} lgr::get_logger("mlr3")$set_threshold("warn") lgr::get_logger("bbotk")$set_threshold("info") ``` By default, output from `lgr` is printed in the console, however, you could choose to redirect this to a file in various formats, for example to a JSON file: -```{r technical-032} +```{r advanced_technical_aspects_of_mlr3-034} tf = tempfile("mlr3log_", fileext = ".json") # get the logger as R6 object @@ -746,7 +746,7 @@ In the following, we will show how to work with each of these choices using `r r To demonstrate `r ref("mlr3db::DataBackendDplyr")` we use the (pretty big) NYC flights dataset from the `r ref_pkg("nycflights13")` package and move it into a `r index("SQLite", lower = FALSE)` database. Although `r ref("mlr3db::as_sqlite_backend()")` provides a convenient function to perform this step, we construct the database manually here. -```{r technical-034, message = FALSE} +```{r advanced_technical_aspects_of_mlr3-035, message = FALSE} # load data requireNamespace("DBI") requireNamespace("RSQLite") @@ -769,7 +769,7 @@ rm(flights) With the SQLite database stored in file `path`, we now re-establish a connection and switch to `r ref_pkg("dplyr")`/`r ref_pkg("dbplyr")` for some essential preprocessing. -```{r technical-035, message = FALSE} +```{r advanced_technical_aspects_of_mlr3-036, message = FALSE} # establish connection con = DBI::dbConnect(RSQLite::SQLite(), path) @@ -787,7 +787,7 @@ Therefore, we build up an SQL query in a step-wise fashion using `dplyr` verbs a 3. Filter the data to only use every second row (to reduce example runtime); and 4. Merge factor levels of the feature `carrier` so infrequent carriers are replaced by level "other". -```{r technical-036} +```{r advanced_technical_aspects_of_mlr3-037} # 1. 
subset columns keep = c("row_id", "year", "month", "day", "hour", "minute", "dep_time", "arr_time", "carrier", "flight", "air_time", "distance", "arr_delay") @@ -808,7 +808,7 @@ tbl = mutate(tbl, carrier = case_when( Having prepared our data, we can now create a `r ref("mlr3db::DataBackendDplyr")` and can then query basic information from our new `r ref("DataBackend")`: -```{r technical-040} +```{r advanced_technical_aspects_of_mlr3-038} library(mlr3db) backend_flights = as_data_backend(tbl, primary_key = "row_id") c(nrow = backend_flights$nrow, ncol = backend_flights$ncol) @@ -819,7 +819,7 @@ Note that the `DataBackendDplyr` can only operate on the data we provided, so do With a backend constructed, we can now use the standard `mlr3` API: -```{r technical-042} +```{r advanced_technical_aspects_of_mlr3-039} tsk_flights = as_task_regr(backend_flights, id = "flights_sqlite", target = "arr_delay") rsmp_sub002 = rsmp("subsampling", ratio = 0.02, repeats = 3) @@ -828,7 +828,7 @@ rsmp_sub002 = rsmp("subsampling", ratio = 0.02, repeats = 3) Above we created a regression task by passing a backend as the first argument and then created a resampling strategy where we will subsample 2% of the observations three times. In each resampling iteration, only the required subset of the data is queried from the SQLite database and passed to our learner: -```{r technical-043, message=FALSE} +```{r advanced_technical_aspects_of_mlr3-040, message=FALSE} rr = resample(tsk_flights, lrn("regr.rpart"), rsmp_sub002) measures = msrs(c("regr.rmse", "time_train", "time_predict")) rr$aggregate(measures) @@ -836,7 +836,7 @@ rr$aggregate(measures) As we have finished our experiment we can now close our connection, which we can do by removing the `tbl` object referencing the connection and then closing it. -```{r technical-044} +```{r advanced_technical_aspects_of_mlr3-041} rm(tbl) DBI::dbDisconnect(con) ``` @@ -849,7 +849,7 @@ DBI::dbDisconnect(con) Converting a `data.frame` to DuckDB is possible by passing the `data.frame` to convert and the `path` to store the data to `r ref("mlr3db::as_duckdb_backend()")`. By example, below we first query the location of an example dataset in a Parquet file shipped with `mlr3db` and then convert the resulting `r ref("DataBackendDuckDB")` object into a classification task, all without loading the dataset into memory: -```{r technical-045} +```{r advanced_technical_aspects_of_mlr3-042} path = system.file(file.path("extdata", "spam.parquet"), package = "mlr3db") backend = as_duckdb_backend(path) @@ -872,7 +872,7 @@ As an example, let us consider a regression measure that scores a prediction as In maths this would be defined as $f(y, \hat{y}) = \frac{1}{n} \sum_{i=1}^n \mathbb{I}(|y_i - \hat{y}_i| < \sigma_y)$, where $\sigma_y$ is the standard deviation of the truth and $\mathbb{I}$ is the indicator function. In code, this measure may be written as: -```{r technical-047} +```{r advanced_technical_aspects_of_mlr3-043} threshold_acc = function(truth, response) { mean(ifelse(abs(truth - response) < sd(truth), 1, 0)) } @@ -886,7 +886,7 @@ To use this measure in `mlr3`, we need to create a new `r ref("R6::R6Class")`, w The code for this new measure is in the snippet below, with an explanation following it. This code chunk can be used as a template for the majority of performance measures. 
-```{r technical-048} +```{r advanced_technical_aspects_of_mlr3-044} MeasureRegrThresholdAcc = R6::R6Class("MeasureRegrThresholdAcc", inherit = mlr3::MeasureRegr, # regression measure public = list( @@ -927,7 +927,7 @@ You can also consult the manual page of the `Measure` for an overview of other p Once you have defined your measure you can load it with the `R6` constructor (`$new()`), or make it available to be constructed with the `msr()` sugar function by adding it to the `r ref("mlr_measures")` dictionary: -```{r technical-049} +```{r advanced_technical_aspects_of_mlr3-045} tsk_mtcars = tsk("mtcars") split = partition(tsk_mtcars) lrn_featureless = lrn("regr.featureless")$train(tsk_mtcars, split$train) @@ -976,7 +976,7 @@ For an overview of available DBMS in R, see the CRAN task view on databases at ` ## Exercises 1. Consider the following example where you resample a learner (debug learner, sleeps for three seconds during train) on four workers using the multisession backend: -```{r technical-050, eval = FALSE} +```{r advanced_technical_aspects_of_mlr3-046, eval = FALSE} tsk_penguins = tsk("penguins") lrn_debug = lrn("classif.debug", sleep_train = function() 3) rsmp_cv6 = rsmp("cv", folds = 6) diff --git a/book/chapters/chapter11/large-scale_benchmarking.qmd b/book/chapters/chapter11/large-scale_benchmarking.qmd index 812f05935..5277ed08a 100644 --- a/book/chapters/chapter11/large-scale_benchmarking.qmd +++ b/book/chapters/chapter11/large-scale_benchmarking.qmd @@ -10,7 +10,7 @@ aliases: `r chapter = "Large-Scale Benchmarking"` `r authors(chapter)` -```{r large_benchmarking-001} +```{r large-scale_benchmarking-001_benchmarking-001} #| include: false #| cache: false lgr::get_logger("mlr3oml")$set_threshold("off") @@ -41,7 +41,7 @@ We will also assume that you have read @sec-pipelines and @sec-technical. We make use of `ppl("robustify")` (@sec-prepro-robustify) for automating common preprocessing steps. We also set a featureless baseline as a fallback learner (@sec-fallback) and set `"try"` as our encapsulation method (@sec-encapsulation), which logs errors/warnings to an external file that can be read by `batchtools` (we will return to this in @sec-batchtools-monitoring). -```{r large_benchmarking-002, warning = FALSE} +```{r large-scale_benchmarking-002_benchmarking-002, warning = FALSE} # featureless baseline lrn_baseline = lrn("classif.featureless", id = "featureless") @@ -62,7 +62,7 @@ learners = list(lrn_lr, lrn_rf, lrn_baseline) As a starting example, we will compare our learners across three classification tasks using accuracy and three-fold CV. -```{r} +```{r large-scale_benchmarking-003} #| warning: false design = benchmark_grid(tsks(c("german_credit", "sonar", "pima")), learners, rsmp("cv", folds = 10)) @@ -89,12 +89,12 @@ In particular, we will discuss OpenML datasets, tasks, and task collections, but Finding data from OpenML is possible via the website or its REST API that `mlr3oml` interfaces. 
`r ref("list_oml_data()")` can be used to filter datasets for specific properties, for example by number of features, rows, or number of classes in a classification problem: -```{r} +```{r large-scale_benchmarking-004} #| include: false path_odatasets = here::here("book", "openml", "manual", "odatasets_filter.rds") ``` -```{r large_benchmarking-014, eval = !file.exists(path_odatasets)} +```{r large-scale_benchmarking-005_benchmarking-014, eval = !file.exists(path_odatasets)} library(mlr3oml) odatasets = list_oml_data( @@ -104,7 +104,7 @@ odatasets = list_oml_data( ) ``` -```{r} +```{r large-scale_benchmarking-006} #| include: false if (file.exists(path_odatasets)) { odatasets = readRDS(path_odatasets) @@ -114,7 +114,7 @@ if (file.exists(path_odatasets)) { ``` -```{r} +```{r large-scale_benchmarking-007} odatasets[NumberOfFeatures < 16, c("data_id", "name", "NumberOfFeatures", "NumberOfInstances")] ``` @@ -125,7 +125,7 @@ We can see that some datasets have duplicated names, which is why each dataset a By example, let us consider the 'adult' dataset with ID 1590. Metadata for the dataset is loaded with `r ref("odt()", aside = TRUE)`, which returns an object of class `r ref("OMLData")`. -```{r large_benchmarking-003} +```{r large-scale_benchmarking-008_benchmarking-003} odata = odt(id = 1590) odata ``` @@ -133,14 +133,14 @@ odata The `OMLData` object contains metadata about the dataset but importantly does not (yet) contain the data. This means that information about the dataset can be queried without having to load the entire data into memory, for example, the license and dimension of the data: -```{r large_benchmarking-004} +```{r large-scale_benchmarking-009_benchmarking-004} odata$license c(nrow = odata$nrow, ncol = odata$ncol) ``` If we want to work with the actual data, then accessing the `$data` field will download the data, import it into R, and then store the `data.frame` in the `OMLData` object: -```{r large_benchmarking-005} +```{r large-scale_benchmarking-010_benchmarking-005} # first 5 rows and columns odata$data[1:5, 1:5] ``` @@ -154,7 +154,7 @@ Additionally, many objects can be permanently cached on the local file system by Data can then be converted into `mlr3` backends (see @sec-backends) with the `r ref("as_data_backend()")` function and then into tasks: -```{r large_benchmarking-006} +```{r large-scale_benchmarking-011_benchmarking-006} backend = as_data_backend(odata) tsk_adult = as_task_classif(backend, target = "class") tsk_adult @@ -164,7 +164,7 @@ Some datasets on OpenML contain columns that should neither be used as a feature The column names that are usually included as features are accessible through the field `$feature_names`, and we assign them to the `mlr3` task accordingly. Note that for the dataset at hand, this would not have been necessary, as all non-target columns are to be treated as predictors, but we include it for clarity. -```{r} +```{r large-scale_benchmarking-012} tsk_adult$col_roles$feature = odata$feature_names tsk_adult ``` @@ -177,17 +177,17 @@ Similarly to `mlr3`, OpenML has different types of tasks, such as regression and Analogously to filtering datasets, tasks can be filtered with `r ref("list_oml_tasks()")`. 
To find a task that makes use of the data we have been using, we would pass the data ID to the `data_id` argument: -```{r} +```{r large-scale_benchmarking-013} #| include: false path_adult_tasks = here::here("book", "openml", "manual", "adult_tasks.rds") ``` -```{r, eval = !file.exists(path_adult_tasks)} +```{r large-scale_benchmarking-014, eval = !file.exists(path_adult_tasks)} # tasks making use of the adult data adult_tasks = list_oml_tasks(data_id = 1590) ``` -```{r} +```{r large-scale_benchmarking-015} #| include: false if (file.exists(path_adult_tasks)) { adult_tasks = readRDS(path_adult_tasks) @@ -196,34 +196,34 @@ if (file.exists(path_adult_tasks)) { } ``` -```{r} +```{r large-scale_benchmarking-016} adult_tasks[task_type == "Supervised Classification", task_id] ``` From these tasks, we randomly select the task with ID 359983. We can load the object using `r ref("otsk()", aside = TRUE)`, which returns an `r ref("OMLTask")` object. -```{r large_benchmarking-009} +```{r large-scale_benchmarking-017_benchmarking-009} otask = otsk(id = 359983) otask ``` The `OMLData` object associated with the underlying dataset can be accessed through the `$data` field. -```{r large_benchmarking-010} +```{r large-scale_benchmarking-018_benchmarking-010} otask$data ``` The data splits associated with the estimation procedure are accessible through the field `$task_splits`. In `mlr3` terms, these are the instantiation of a `r ref("mlr3::Resampling")` on a specific `r ref("mlr3::Task")`. -```{r large_benchmarking-011} +```{r large-scale_benchmarking-019_benchmarking-011} otask$task_splits ``` The OpenML task can be converted to both an `mlr3::Task` and `r ref("mlr3::ResamplingCustom")` instantiated on the task using `r ref("as_task()")` and `r ref("as_resampling()")`, respectively: -```{r large_benchmarking-012} +```{r large-scale_benchmarking-020_benchmarking-012} tsk_adult = as_task(otask) tsk_adult @@ -233,7 +233,7 @@ resampling `mlr3oml` also allows direct construction of `mlr3` tasks and resamplings with the standard `r ref("tsk()")` and `r ref("rsmp()")` constructors, e.g.: -```{r} +```{r large-scale_benchmarking-021} tsk("oml", task_id = 359983) ``` @@ -245,16 +245,16 @@ This allows for the creation of `r index("benchmark suites")`, which are curated Examples include the OpenML CC-18 benchmark suite [@bischl2021openml], the AutoML benchmark [@amlb2022] and the benchmark for tabular deep learning [@grinsztajn2022why]. 
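Before turning to collections, note that the same sugar mentioned above also covers resamplings; the sketch below assumes the `"oml"` key that `mlr3oml` registers with `rsmp()`:

```{r, eval = FALSE}
# resampling splits of OpenML task 359983, analogous to tsk("oml", ...)
rsmp("oml", task_id = 359983)
```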
`r ref("OMLCollection")` objects are loaded with `r ref("ocl()", aside = TRUE)`, by example we will look at CC-18, which has ID 99: -```{r} +```{r large-scale_benchmarking-022} #| include: false path_otask_collection = here::here("book", "openml", "manual", "otask_collection99.rds") ``` -```{r large_benchmarking-017, eval = !file.exists(path_otask_collection)} +```{r large-scale_benchmarking-023_benchmarking-017, eval = !file.exists(path_otask_collection)} otask_collection = ocl(id = 99) ``` -```{r} +```{r large-scale_benchmarking-024} #| include: false if (file.exists(path_otask_collection)) { otask_collection = readRDS(path_otask_collection) @@ -265,20 +265,20 @@ if (file.exists(path_otask_collection)) { } ``` -```{r large_benchmarking-019} +```{r large-scale_benchmarking-025_benchmarking-019} otask_collection ``` The task includes 72 classification tasks on different datasets that can be accessed through `$task_ids`: -```{r large_benchmarking-020} +```{r large-scale_benchmarking-026_benchmarking-020} otask_collection$task_ids[1:5] # first 5 tasks in the collection ``` Task collections can be used to quickly define benchmark experiments in `mlr3`. To easily construct all tasks and resamplings from the benchmarking suite, you can use `r ref("as_tasks()", index = TRUE)` and `r ref("as_resamplings()", index = TRUE)` respectively: -```{r, eval = FALSE} +```{r large-scale_benchmarking-027, eval = FALSE} tasks = as_tasks(otask_collection) resamplings = as_resamplings(otask_collection) ``` @@ -286,12 +286,12 @@ resamplings = as_resamplings(otask_collection) Alternatively, if we wanted to filter the collection further, say to a binary classification experiment with six tasks, we could run `r ref("list_oml_tasks()")` with the task IDs from the CC-18 collection as argument `task_id`. We can either use the `list_oml_tasks()` argument to request the number of classes to be `2`, or we can make use of the fact that the result of `list_oml_tasks()` is a `data.table` and subset the resulting table. -```{r} +```{r large-scale_benchmarking-028} #| include: false path_binary_cc18 = here::here("book", "openml", "manual", "binary_cc18.rds") ``` -```{r large_benchmarking-021, eval = !file.exists(path_binary_cc18)} +```{r large-scale_benchmarking-029_benchmarking-021, eval = !file.exists(path_binary_cc18)} binary_cc18 = list_oml_tasks( limit = 6, task_id = otask_collection$task_ids, @@ -299,7 +299,7 @@ binary_cc18 = list_oml_tasks( ) ``` -```{r} +```{r large-scale_benchmarking-030} #| include: false if (!file.exists(path_binary_cc18)) { saveRDS(binary_cc18, path_binary_cc18) @@ -311,7 +311,7 @@ if (!file.exists(path_binary_cc18)) { We now define the tasks and resamplings which we will use for comparing the logistic regression with the random forest learner. Note that all resamplings in this collection consist of exactly 10 iterations. -```{r large_benchmarking-024} +```{r large-scale_benchmarking-031_benchmarking-024} # load tasks as a list otasks = lapply(binary_cc18$task_id, otsk) @@ -322,7 +322,7 @@ resamplings = as_resamplings(otasks) To define the design table, we use `r ref("benchmark_grid()")` and set `paired` to `TRUE`, which is used in situations where each resampling is instantiated on a corresponding task (therefore the `tasks` and `resamplings` below must have the same length) and each learner should be evaluated on every resampled task. 

-```{r large_benchmarking-025}
+```{r large-scale_benchmarking-032_benchmarking-025}
 large_design = benchmark_grid(tasks, learners, resamplings,
   paired = TRUE)
 large_design[1:6] # first 6 rows
@@ -357,7 +357,7 @@ The scheduling system controls when these computational jobs are executed.

 For the rest of this section, we will look at how to use `batchtools` and `r mlr3batchmark` for submitting jobs, adapting jobs to clusters, ensuring reproducibility, querying job status, and debugging failures.

-```{r large_benchmarking-026, echo = FALSE}
+```{r large-scale_benchmarking-033_benchmarking-026, echo = FALSE}
 #| label: fig-hpc
 #| fig-cap: "Illustration of an HPC cluster architecture."
 #| fig-alt: "Flow diagram of objects. Left is a laptop with an arrow to an object that says 'Head Node - Scheduler', the arrow has text 'SSH'. The scheduler has a bidirectional arrow with text 'Submit' to 'Queue' that has an arrow to 'Computing Nodes'. The scheduler also has an arrow to 'File System' which has a double arrow connecting it to/from the 'Computing Nodes' object with text 'Data'."
@@ -379,14 +379,14 @@ Among other things, the experiment registry stores the algorithms, problems, and
 Below, we create a registry in a subdirectory of our working directory -- on a real cluster, make sure that this folder is stored on a shared network filesystem, otherwise, the nodes cannot access it.
 We also set the registry's `seed` to `1` and the `packages` to `"mlr3verse"`, which will make these packages available in all our experiments.

-```{r include = FALSE}
+```{r large-scale_benchmarking-034, include = FALSE}
 #| cache: false
 if (dir.exists("experiments")) {
   unlink("experiments", recursive = TRUE)
 }
 ```

-```{r large_benchmarking-027, message=FALSE, warning=FALSE}
+```{r large-scale_benchmarking-035_benchmarking-027, message=FALSE, warning=FALSE}
 #| cache: false
 library(batchtools)

@@ -401,7 +401,7 @@ reg = makeExperimentRegistry(
 Once the registry has been created, we need to populate it with problems and algorithms to form the jobs, this is most easily carried out with `mlr3batchmark`\index{\texttt{mlr3batchmark}}, although finer control is possible with `batchtools` and will be explored in @sec-custom-experiments.
 `r ref("batchmark()", aside = TRUE)` converts `mlr3` tasks and resamplings to `batchtools` problems, and converts `mlr3` learners to `batchtools` algorithms; jobs are then created for all resampling iterations.

-```{r large_benchmarking-029}
+```{r large-scale_benchmarking-036_benchmarking-029}
 #| cache: false
 #| output: false
 library(mlr3batchmark)
@@ -412,7 +412,7 @@ Now the registry includes six problems, one for each resampled task, and $180$ j

 The single algorithm in the registry is because `mlr3batchmark` specifies a single algorithm that is parametrized with the learner IDs.

-```{r large_benchmarking-030}
+```{r large-scale_benchmarking-037_benchmarking-030}
 reg
 ```

@@ -420,7 +420,7 @@ By default, the "Interactive" cluster function (see `r ref("makeClusterFunctions
 `r ref("getJobTable()")` can be used to get more detailed information about the jobs.
 Here, we only show a few selected columns for readability and unpack the list columns `algo.pars` and `prob.pars` using `r ref("unwrap()")`.

-```{r large_benchmarking-032}
+```{r large-scale_benchmarking-038_benchmarking-032}
 job_table = getJobTable(reg = reg)
 job_table = unwrap(job_table)
 job_table = job_table[,
@@ -438,7 +438,7 @@ With the experiments defined, we can now submit them to the cluster. 
However, it is best practice to first test each algorithm individually using `r ref("testJob()", aside = TRUE)`. By example, we will only test the first job (`id = 1`) and will use an external R session (`external = TRUE`). -```{r large_benchmarking-033} +```{r large-scale_benchmarking-039_benchmarking-033} #| eval: false result = testJob(1, external = TRUE, reg = reg) ``` @@ -451,10 +451,10 @@ A template file is a shell script with placeholders filled in by `batchtools` an The exemplary template should work on many Slurm installations out-of-the-box, but you might have to modify it for your cluster -- it can be customized to work with more advanced configurations. -```{r large_benchmarking-034} +```{r large-scale_benchmarking-040_benchmarking-034} cf = makeClusterFunctionsSlurm(template = "slurm-simple") ``` -```{r} +```{r large-scale_benchmarking-041} #| include: false #| cache: false cf = makeClusterFunctionsInteractive() @@ -463,7 +463,7 @@ cf = makeClusterFunctionsInteractive() To proceed with the examples on a local machine, we recommend setting the cluster function to a Socket backend with `r ref("makeClusterFunctionsSocket()")`. The chosen cluster function can be saved to the registry by passing it to the `$cluster.functions` field. -```{r large_benchmarking-036, output = FALSE} +```{r large-scale_benchmarking-042_benchmarking-036, output = FALSE} #| cache: false reg$cluster.functions = cf saveRegistry(reg = reg) @@ -473,7 +473,7 @@ With the registry setup, we can now decide if we want to run the experiments in For this example, we will use `r ref("chunk()", aside = TRUE)` to `r index('chunk')` the jobs such that five iterations of one resample experiment are run sequentially in one computational job -- in practice the optimal grouping will be highly dependent on your experiment (@sec-parallelization). -```{r large_benchmarking-037} +```{r large-scale_benchmarking-043_benchmarking-037} ids = job_table$job.id chunks = data.table( job.id = ids, chunk = chunk(ids, chunk.size = 5, shuffle = FALSE) @@ -487,14 +487,14 @@ If you are unsure about the resource requirements, you can start a subset of job Measured runtimes and memory usage can later be queried with `r ref("getJobTable()")` and used to better estimate the required resources for the remaining jobs. In this example we will set the number of CPUs per job to `1`, the walltime (time limit before jobs are stopped by the scheduler) to one hour (`3600` seconds), and the RAM limit (memory limit before jobs are stopped by the scheduler) to `8000` megabytes. -```{r} +```{r large-scale_benchmarking-044} resources = list(ncpus = 1, walltime = 3600, memory = 8000) ``` With all the elements in place, we can now submit our jobs. -```{r large_benchmarking-038, output = FALSE} +```{r large-scale_benchmarking-045_benchmarking-038, output = FALSE} #| cache: false submitJobs(ids = chunks, resources = resources, reg = reg) @@ -515,7 +515,7 @@ This means that you can also submit jobs from an interactive R session, terminat Once jobs have been submitted, they can then be queried with `r ref("getStatus()", aside = TRUE)` to find their current status and the results (or errors) can be investigated. If you terminated your R sessions after job submission, you can load the experiment registry with `r ref("loadRegistry()", aside = TRUE)`. 
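A minimal sketch of re-attaching to the registry from a fresh session, assuming the `file.dir` used above (`writeable = TRUE` is only needed if you intend to submit or modify jobs afterwards):

```{r, eval = FALSE}
# reconnect to the experiment registry created earlier
reg = loadRegistry("./experiments", writeable = TRUE)
```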

-```{r large_benchmarking-041}
+```{r large-scale_benchmarking-046_benchmarking-041}
 getStatus(reg = reg)
 ```

@@ -529,7 +529,7 @@ In these situations, it is important to quickly determine what went wrong and to
 To see `r index('debugging')` in practice we will use the debug learner (see @sec-error-handling) with a 50% probability of erroring in training.
 When calling `r ref("batchmark()")` again, the new experiments will be added to the registry on top of the existing jobs.

-```{r, output = FALSE}
+```{r large-scale_benchmarking-047, output = FALSE}
 #| cache: false
 extra_design = benchmark_grid(tasks,
   lrn("classif.debug", error_train = 0.5), resamplings, paired = TRUE)
@@ -546,19 +546,19 @@ We pass it explicitly in this section for clarity.

 Now we can get the IDs of the new jobs (which have not been submitted yet) and submit them by passing their IDs.

-```{r, output = FALSE}
+```{r large-scale_benchmarking-048, output = FALSE}
 #| cache: false
 ids = findNotSubmitted(reg = reg)
 submitJobs(ids, reg = reg)
 ```

-```{r, include=FALSE}
+```{r large-scale_benchmarking-049, include=FALSE}
 waitForJobs(reg = reg)
 ```

 After these jobs have terminated, we can get a summary of those that failed:

-```{r}
+```{r large-scale_benchmarking-050}
 getStatus(reg = reg)

 error_ids = findErrors(reg = reg)
@@ -571,7 +571,7 @@ In a real experiment, we would now investigate the debug learner further to unde
 Assuming learners have been debugged (or we are happy to ignore them), we can then collect the results of our experiment with `r ref("mlr3batchmark::reduceResultsBatchmark()")`, which constructs a `r ref("BenchmarkResult")` from the results.
 Below we filter out results from the debug learner.

-```{r}
+```{r large-scale_benchmarking-051}
 ids = findExperiments(algo.pars = learner_id != "classif.debug",
   reg = reg)
 bmr = reduceResultsBatchmark(ids, reg = reg)
@@ -585,14 +585,14 @@ In general, we recommend using `mlr3batchmark` for scheduling simpler `mlr3` jobs

 Again we start by creating an experiment registry.

-```{r include = FALSE}
+```{r large-scale_benchmarking-052, include = FALSE}
 #| cache: false
 if (dir.exists("experiments-custom")) {
   unlink("experiments-custom", recursive = TRUE)
 }
 ```

-```{r large_benchmarking-046, message = FALSE}
+```{r large-scale_benchmarking-053_benchmarking-046, message = FALSE}
 #| cache: false
 reg = makeExperimentRegistry(
   file.dir = "./experiments-custom",
@@ -608,7 +608,7 @@ Finally, we pass a function (`fun`, dynamic problem part) that takes in the stat
 The `fun` shown below is the default behavior and could be omitted, we show it here for clarity.
 This function could be more complex and take further parameters to modify the problem instance dynamically.

-```{r large_benchmarking-047, output = FALSE}
+```{r large-scale_benchmarking-054_benchmarking-047, output = FALSE}
 #| cache: false
 for (i in seq_along(tasks)) {
   addProblem(
@@ -621,7 +621,7 @@ for (i in seq_along(tasks)) {
 }
 ```

-```{r large_benchmarking-051, echo = FALSE}
+```{r large-scale_benchmarking-055_benchmarking-051, echo = FALSE}
 #| label: fig-batchtools-illustration
 #| fig-cap: "Illustration of a batchtools problem, algorithm, and experiment. "
 #| fig-alt: "The diagram shows a rectangle that says 'static problem part, data', with an arrow pointing to 'dynamic problem function, fun(data, ...)' and 'algorithm function, fun(data, instance, ...)'. 
A box that says 'problem design, (addExperiments)' also has an arrow to the 'dynamic...' box. The 'dynamic...' box then has an arrow with text 'instance' that points to the 'algorithm function' box. A box that says 'algorithm design, (addExperiments)' also points to the 'algorithm function' box. Finally the 'algorithm function' box points to 'result'." @@ -634,7 +634,7 @@ Algorithms are again specified with a unique `name`, as well as a function to de Here, we define one job to represent a complete resample experiment. In general, algorithms in `batchtools` may return arbitrary objects -- those are simply stored on the file system and can be processed with a custom function while collecting the results. -```{r large_benchmarking-048, message=FALSE} +```{r large-scale_benchmarking-056_benchmarking-048, message=FALSE} #| cache: false addAlgorithm( "run_learner", @@ -652,29 +652,29 @@ By leaving `prob.designs` unspecified, experiments for all existing problems are We set the `learner` parameter of our algorithm (`"run_learner"`) to be the three learners from our `large_design` object. Note that whenever an experiment is added, the current seed is assigned to the experiment and then incremented. -```{r large_benchmarking-049, eval = FALSE} +```{r large-scale_benchmarking-057_benchmarking-049, eval = FALSE} alg_des = list(run_learner = data.table(learner = learners)) addExperiments(algo.designs = alg_des, reg = reg) summarizeExperiments() ``` -```{r, include = FALSE, output = FALSE} +```{r large-scale_benchmarking-058, include = FALSE, output = FALSE} #| cache: false alg_des = list(run_learner = data.table(learner = learners)) addExperiments(algo.designs = alg_des, reg = reg) ``` -```{r, include = FALSE, output = TRUE} +```{r large-scale_benchmarking-059, include = FALSE, output = TRUE} #| cache: false summarizeExperiments() ``` Our jobs can now be submitted to the cluster; by not specifying specific job IDs, *all* experiments are submitted. -```{r, output = FALSE} +```{r large-scale_benchmarking-060, output = FALSE} #| cache: false submitJobs(reg = reg) ``` -```{r, include = FALSE} +```{r large-scale_benchmarking-061, include = FALSE} #| cache: false waitForJobs(reg = reg) ``` @@ -683,7 +683,7 @@ We can retrieve the job results using `r ref("loadResult()")`, which outputs the To retrieve all results at once, we can use `r ref("reduceResults()")` to create a single `r ref("BenchmarkResult")`. For this, we use the combine function `c()` which can combine multiple objects of type `ResampleResult` or `BenchmarkResult` to a single `BenchmarkResult`. -```{r large_benchmarking-054} +```{r large-scale_benchmarking-062_benchmarking-054} rr = loadResult(1, reg = reg) as.data.table(rr)[1:5] @@ -701,7 +701,7 @@ As a first step, we recommend performing a pairwise comparison of learners using This method first performs a global comparison to see if any learner is statistically better than another. To use these methods we first convert the benchmark result to a `r ref("BenchmarkAggr")` object using `r ref("as_benchmark_aggr()", aside = TRUE)`. -```{r large_benchmarking-056} +```{r large-scale_benchmarking-063_benchmarking-056} library(mlr3benchmark) bma = as_benchmark_aggr(bmr, measures = msr("classif.ce")) bma$friedman_posthoc() @@ -710,7 +710,7 @@ bma$friedman_posthoc() These results indicate a statistically significant difference between the `"featureless"` learner and `"ranger"` (assuming $p\leq0.05$ is significant). 
This table can be visualized in a critical difference plot (@fig-lsb-cd), which typically shows the mean rank of a learning algorithm on the x-axis along with a thick horizontal line that connects learners that are pairwise not significantly different (while correcting for multiple tests). -```{r large_benchmarking-057, fig.height = 1.5} +```{r large-scale_benchmarking-064_benchmarking-057, fig.height = 1.5} #| label: fig-lsb-cd #| fig-cap: "Critical difference diagram comparing the random forest, logistic regression, and featureless baseline. The critical difference of 1.35 in the title refers to the difference in mean rank required to conclude that one learner performs statistically different to another." #| fig-alt: "Figure shows a one-axis diagram ranging from 0 to 4, above the diagram is a thick black line with text 'Critical Difference = 1.35'. Diagram shows 'ranger' on the far left just to the right of '1', then 'logreg' just to the left of '2', then 'featureless' just under '3'. There is a thick, black line connecting 'ranger' and 'logreg', as well as a thick, black line connecting 'logreg' and 'featureless'." diff --git a/book/chapters/chapter12/model_interpretation.qmd b/book/chapters/chapter12/model_interpretation.qmd index eb93d71d6..0c59d480c 100644 --- a/book/chapters/chapter12/model_interpretation.qmd +++ b/book/chapters/chapter12/model_interpretation.qmd @@ -35,7 +35,7 @@ As a running example throughout this chapter, we will consider a gradient boosti In practice, we would tune the hyperparameters of GBM as discussed in @sec-optimization and perform feature selection as discussed in @sec-feature-selection to select the most relevant features. However, for the sake of simplicity, we utilize an untuned GBM in these examples as it exhibited satisfactory performance even without fine-tuning. -```{r interpretation-003, results = 'hide'} +```{r model_interpretation-001, results = 'hide'} library(mlr3verse) tsk_german = tsk("german_credit")$select( cols = c("duration", "amount", "age", "status", "savings", "purpose", @@ -59,7 +59,7 @@ However, the differences in interpretation between training and test data are le This object contains the prediction model as well as the data used for analyzing the model and producing the desired explanation. We construct the `Predictor` object using our trained learner and heldout test data: -```{r iml-Predictor} +```{r model_interpretation-002} #| cache: false library(iml) @@ -101,13 +101,13 @@ The default number of repetitions when constructing a `FeatureImp` object is `5` However, the number of repetitions should be increased if you want to obtain useful error bars from the resulting plot. ::: -```{r iml-007} +```{r model_interpretation-003} #| cache: false importance = FeatureImp$new(predictor, loss = "ce", n.repetitions = 100) importance$plot() ``` -```{r} +```{r model_interpretation-004} #| fig-height: 3 #| label: fig-iml-pfi #| fig-cap: Permutation feature importance (PFI). Points indicate the median and bars the 5% and 95% quantiles of the PFI over five repetitions of the permutation process. @@ -154,7 +154,7 @@ Below we initialize an object of class `r ref("iml::FeatureEffect")` by passing We recommend always plotting PD and ICE curves together as PD plots on their own could mask heterogeneous effects. We use `$plot()` to visualize the results (@fig-iml-pdice). 
-```{r iml-pdp} +```{r model_interpretation-005} #| fig-height: 3 #| label: fig-iml-pdice #| fig-cap: Partial dependence (PD) plot (yellow) and individual conditional expectation (ICE) curves (black) that show how the credit amount affects the predicted credit risk. @@ -188,7 +188,7 @@ The augmented data can then be used to train an interpretable model that capture This class extracts the decision rules created by the tree surrogate and the `$plot()` method visualizes the distribution of the predicted outcomes from each terminal node. Below, we pass `maxdepth = 2` to the constructor to build a tree with two binary splits, yielding four terminal nodes. -```{r iml-globalsurrogate,message=FALSE} +```{r model_interpretation-006,message=FALSE} #| cache: false tree_surrogate = TreeSurrogate$new(predictor, maxdepth = 2L) ``` @@ -196,7 +196,7 @@ tree_surrogate = TreeSurrogate$new(predictor, maxdepth = 2L) Before inspecting this model, we need to first check if the surrogate model approximates the prediction model accurately, which we can assess by comparing the predictions of the tree surrogate and the predictions of the black box model. For example, we could quantify the number of matching predictions and measure the accuracy of the surrogate in predicting the predictions of the black box GBM model: -```{r iml-crosstable} +```{r model_interpretation-007} #| cache: false pred_surrogate = tree_surrogate$predict(credit_x, type = "class")$.class pred_surrogate = factor(pred_surrogate, levels = c("good", "bad")) @@ -208,7 +208,7 @@ confusion This shows an accuracy of around `r round(confusion$measures[["acc"]] * 100)`% in predictions from the surrogate compared to the black box model, which is good enough for us to use our surrogate for further interpretation, for example by plotting the splits in the terminal node: -```{r iml-globalsurrogate-plot,message=FALSE} +```{r model_interpretation-008,message=FALSE} #| fig-cap: Distribution of the predicted outcomes for each terminal node identified by the tree surrogate. The top two nodes consist of applications with a positive balance in the account (`status`is either `"0 <= ... < 200 DM"`, `"... >= 200 DM"` or `"salary for at least 1 year"`) and either a duration of less or equal than 42 months (top left), or more than 42 months (top right). The bottom nodes contain applicants that either have no checking account or a negative balance (`status`) and either a duration of less than or equal to 36 months (bottom left) or more than 36 months (bottom right). #| fig-alt: Four barplots with 'count' on the y-axis and '.class' on the x-axis. Top left shows 150 'good' credit predictions and around 1 'bad' prediction. Top right shows around 10 'good' predictions and 1 'bad' one. Bottom left shows around 120 'good' predictions and 40 'bad' ones. Bottom right shows about 23 'bad' predictions and around 5 'good' ones. #| label: fig-iml-surro @@ -217,7 +217,7 @@ tree_surrogate$plot() Or we could access the trained tree surrogate via the `$tree` field of the `TreeSurrogate` object and then have access to all methods in `r ref_pkg("partykit")`: -```{r iml-globalsurrogate-tree} +```{r model_interpretation-009} partykit::print.party(tree_surrogate$tree) ``` @@ -237,7 +237,7 @@ Local surrogate models are constructed as follows: To illustrate this, we will select a random data point to explain. 
As we are dealing with people, we will name our observation "Charlie" and first look at the black box predictions: -```{r Charlie, asis='results'} +```{r model_interpretation-010, asis='results'} Charlie = tsk_german$data(rows = 127L, cols = tsk_german$feature_names) gbm_predict = predictor$predict(Charlie) gbm_predict @@ -248,7 +248,7 @@ The underlying surrogate model is a locally weighted L1-penalized linear regress We can also set the parameter `gower.power` which specifies the size of the neighborhood for the local model (default is `gower.power = 1`), the smaller the value, the more the model will focus on points closer to the point of interest, below we set `gower.power = 0.1`. This implementation is very closely related to Local Interpretable Model-agnostic Explanations (`r index('LIME', lower = FALSE)`) [@Ribeiro2016lime], the differences are outlined in the documentation of `iml::LocalModel`. -```{r iml-local_surrogate,message=FALSE,warning=FALSE} +```{r model_interpretation-011_surrogate,message=FALSE,warning=FALSE} predictor$class = "good" # explain the 'good' class local_surrogate = LocalModel$new(predictor, Charlie, gower.power = 0.1, k = 2) @@ -258,7 +258,7 @@ If the prediction of the local model and the prediction of the black box GBM mod These parameters can be considered as hyperparameters of the local surrogate model, which should be tuned to obtain an accurate local surrogate. First, we check if the predictions for Charlie match: -```{r} +```{r model_interpretation-012} c(gbm = gbm_predict[[1]], local = local_surrogate$predict()[[1]]) ``` @@ -266,7 +266,7 @@ Ideally, we should assess the fidelity of the surrogate model in the local neigh A practical approach to assess this local model fidelity involves generating artificial data points within Charlie's local neighborhood (and potentially applying distance-based weighting) or selecting the $k$ nearest neighbors from the original data. For illustration purposes, we now quantify the approximation error using the mean absolute error calculated from the 10 nearest neighbors (including Charlie) according to the Gower distance [@gower1971general]: -```{r} +```{r model_interpretation-013} ind_10nn = gower::gower_topn(Charlie, credit_x, n = 10)$index[, 1] Charlie_10nn = credit_x[ind_10nn, ] @@ -277,10 +277,10 @@ mean(abs(gbm_pred_10nn - local_pred_10nn)) As we see good agreement between the local and black box model (on average, the predictions of both the local surrogate and the black box model for Charlie's 10 nearest neighbors differ only by `r round(mean(abs(gbm_pred_10nn - local_pred_10nn)), 3)`), we can move on to look at the most influential features for Charlie's predictions: -```{r, eval = FALSE} +```{r model_interpretation-014, eval = FALSE} local_surrogate$results[, c("feature.value", "effect")] ``` -```{r, echo = FALSE} +```{r model_interpretation-015, echo = FALSE} x = local_surrogate$results[, c("feature.value", "effect")] rownames(x) = NULL x @@ -309,7 +309,7 @@ The exact computation of Shapley values is time consuming, as it involves taking Therefore, the estimation of Shapley values is often approximated. The `sample.size` argument (default is `sample.size = 100`) can be increased to obtain a more accurate approximation of exact Shapley values. -```{r iml-006} +```{r model_interpretation-016} #| fig-height: 3 #| fig-cap: Shapley values for Charlie. 
The actual prediction (0.63) displays the prediction of the model for the observation we are interested in, the average prediction (0.71) displays the average prediction over the given test dataset. Each horizontal bar is the Shapley value (phi) for the given feature. #| fig-alt: 10 bar plots of Shapley values, one for each feature. x-axis says 'phi' and ranges from -0.1 to 0.05. The strongest positive contributions are from the `duration`, `purpose` and `property` variables. The strongest negative contributions are `status`, `amount`, and `savings`. @@ -334,7 +334,7 @@ Counterfactual explanations can have many applications in different areas such a For example, a counterfactual explanation could be used to suggest lifestyle changes to a patient to reduce their risk of developing a particular disease, or to suggest actions that would increase the chance of a credit being approved. For our `tsk("german_credit")` example, we might consider what changes in features would turn a 'bad' credit prediction into a 'good' one (@fig-counterfactuals-ill). -```{r interpretation-counterfactuals-fig, echo=FALSE} +```{r model_interpretation-017, echo=FALSE} #| label: fig-counterfactuals-ill #| out-width: 50% #| fig-cap: Illustration of a counterfactual explanation. The real observation (blue, right dot) is predicted to have 'bad' credit. The brown (left) dot is one possible counterfactual that would result in a 'good' credit prediction. @@ -370,7 +370,7 @@ We initialize a `r ref("counterfactuals::WhatIfClassif")` object with our `Predi The `$find_counterfactuals()` method generates a counterfactual of class `r ref("counterfactuals::Counterfactuals")`, below we set our desired predicted probability to be between `0.75` and `1` (`desired_prob = c(0.75, 1)`). The `$evaluate(show_diff = TRUE)` method tells us how features need to be changed to generate our desired class. -```{r interpretation-whatif} +```{r model_interpretation-018} library(counterfactuals) whatif = WhatIfClassif$new(predictor, n_counterfactuals = 1L) cfe = whatif$find_counterfactuals(Charlie, @@ -387,7 +387,7 @@ We set the `epsilon` parameter to 0 to penalize counterfactuals in the optimizat With MOC, we can also prohibit changes in specific features via the `fixed_features` argument, below we restrict changes in the 'age' variable. For illustrative purposes, we only run the multi-objective optimizer for 30 generations. -```{r interpretation-mocmulti,message=FALSE} +```{r model_interpretation-019,message=FALSE} moc = MOCClassif$new(predictor, epsilon = 0, n_generations = 30L, fixed_features = "age") cfe_multi = moc$find_counterfactuals(Charlie, @@ -395,7 +395,7 @@ cfe_multi = moc$find_counterfactuals(Charlie, ``` The multi-objective approach does not guarantee that all counterfactuals have the desired prediction so we use `$subset_to_valid()` to restrict counterfactuals to those we are interested in: -```{r interpretation-mocmulti-subset} +```{r model_interpretation-020} cfe_multi$subset_to_valid() cfe_multi ``` @@ -403,7 +403,7 @@ cfe_multi This method generated `r nrow(cfe_multi$data)` counterfactuals but as these are artificially generated they are not necessarily equal to actual observations in the underlying dataset. For a concise overview of the required feature changes, we can use the `plot_freq_of_feature_changes()` method, which visualizes the frequency of feature changes across all returned counterfactuals. 
-```{r interpretation-mocfreq} +```{r model_interpretation-021} #| fig-height: 3.5 #| fig-cap: Barplots of the relative frequency of feature changes of the counterfactuals found by MOC. #| fig-alt: x-axis says 'relative frequency' and ranges from 0 to just over 0.3. Changed features were 'status' (in 35% of the counterfactuals), 'savings' (35%), 'purpose' (10%), 'employment_duration' (10%), 'duration' (10%), and 'amount' (10%). @@ -414,7 +414,7 @@ cfe_multi$plot_freq_of_feature_changes() We can see that 'status' and 'savings' were changed most frequently in the counterfactuals. To see *how* the features were changed, we can visualize the counterfactuals for two features on a two-dimensional ICE plot. -```{r interpretation-mocsurface} +```{r model_interpretation-022} #| fig-height: 3.5 #| fig-cap: Two-dimensional surface plot for the 'status' and 'savings' variables, higher predictions are lighter. The colors and contour lines indicate the predicted value of the model when 'status' and 'savings' differ while all other features are set to the true (Charlie's) values. The white point displays the true prediction (Charlie), and the black points are the counterfactuals that only propose changes in the two features. #| fig-alt: Surface plot that is primarily light blue when status is positive and dark blue when status is negative. y-axis is the 'savings' variable and x-axis is the 'status' variable. There is a white dot in the bottom left corner at (status = 'no checking account', savings = unknown/no savings account'). Two black dots are in a straight line above the white dot and two black dots are in a roughly straight line to the right of the white dot. @@ -442,7 +442,7 @@ This `r index('explanatory model analysis', aside = TRUE)` (EMA) process can foc @fig-dalex-fig-plot-01 visualizes an overview of the key functions in these two scenarios that we will discuss in this section. An in-depth description of this methodology can be found in @biecek_burzykowski_2021. -```{r interpretation-012, echo=FALSE} +```{r model_interpretation-023, echo=FALSE} #| label: fig-dalex-fig-plot-01 #| out-width: 92% #| fig-cap: Taxonomy of methods for model exploration presented in this section. The left side shows global analysis methods and the right shows local analysis methods. Methods increase in analysis complexity from top to bottom. @@ -453,7 +453,7 @@ knitr::include_graphics("Figures/DALEX_ema_process.png") As with `iml`, `DALEX` also implements a wrapper that enables a unified interface to its functionality. For models created with the `mlr3` package, we would use `r ref("DALEXtra::explain_mlr3()")`, which creates an S3 `explainer` object, which is a list containing at least: the model object, the dataset that will be used for calculation of explanations, the predict function, the function that calculates residuals, name/label of the model name and other additional information about the model. -```{r interpretation-019, eval=FALSE} +```{r model_interpretation-024, eval=FALSE} library(DALEX) library(DALEXtra) @@ -465,7 +465,7 @@ gbm_exp = DALEXtra::explain_mlr3(lrn_gbm, gbm_exp ``` -```{r, results='hide', echo=FALSE, include=FALSE} +```{r model_interpretation-025, results='hide', echo=FALSE, include=FALSE} #| cache: false library(DALEX) library(DALEXtra) @@ -478,7 +478,7 @@ gbm_exp = DALEXtra::explain_mlr3(lrn_gbm, gbm_exp ``` -```{r, echo=FALSE} +```{r model_interpretation-026, echo=FALSE} gbm_exp ``` @@ -490,19 +490,19 @@ In `DALEX`, functions for global level analysis are prefixed with `model_`. 
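For orientation, the three global-level functions used in the remainder of this section all take the `gbm_exp` explainer created above as their first argument. The chunk below is only an illustrative summary of the calls that are discussed individually in the following subsections and is not evaluated:

```{r model_interpretation-026b, eval = FALSE}
perf_credit  = model_performance(gbm_exp) # performance measures and plots
gbm_effect   = model_parts(gbm_exp)       # permutation feature importance
gbm_profiles = model_profile(gbm_exp)     # partial dependence (feature effects)
```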
The model exploration process starts (@fig-dalex-fig-plot-01) by evaluating the performance of a model. `r ref("DALEX::model_performance()")` detects the task type and selects the most appropriate measure, as we are using binary classification the function automatically suggests recall, precision, F1-score, accuracy, and AUC; similarly the default plotting method is selected based on the task type, below ROC is selected. -```{r interpretation-020a} +```{r model_interpretation-027} #| cache: false perf_credit = model_performance(gbm_exp) perf_credit ``` -```{r} +```{r model_interpretation-028} #| cache: false old_theme = set_theme_dalex("ema") plot(perf_credit, geom = "roc") ``` -```{r} +```{r model_interpretation-029} #| fig-height: 6 #| fig-width: 5 #| label: fig-dalex-roc @@ -526,18 +526,18 @@ For the credit risk task, the LIFT curve is a popular graphical summary. Feature importance methods can be calculated with `r ref("DALEX::model_parts()")` and then plotted. -```{r interpretation-021} +```{r model_interpretation-030} #| cache: false gbm_effect = model_parts(gbm_exp) head(gbm_effect) ``` -```{r} +```{r model_interpretation-031} #| cache: false plot(gbm_effect, show_boxplots = FALSE) ``` -```{r} +```{r model_interpretation-032} #| fig-height: 4 #| fig-width: 8 #| out-width: 90% @@ -559,20 +559,20 @@ The `type` argument in the `model_parts` function allows you to specify how the Feature effects can be calculated with `r ref("DALEX::model_profile()")` and by default are plotted as PD plots. -```{r interpretation-024, warning=FALSE} +```{r model_interpretation-033, warning=FALSE} #| cache: false gbm_profiles = model_profile(gbm_exp) gbm_profiles ``` -```{r} +```{r model_interpretation-034} #| cache: false plot(gbm_profiles) + theme(legend.position = "top") + ggtitle("Partial Dependence for GBM Credit model","") ``` -```{r} +```{r model_interpretation-035} #| fig-height: 5 #| fig-width: 8 #| out-width: 90% @@ -602,14 +602,14 @@ We will carry out the following examples using Charlie again. Local analysis starts with the calculation of a model prediction (@fig-dalex-fig-plot-01). -```{r interpretation-025} +```{r model_interpretation-036} predict(gbm_exp, Charlie) ``` As a next step, we might consider break-down plots, which decompose the model's prediction into contributions that can be attributed to different explanatory variables (see the *Break-down Plots for Additive Attributions* chapter in @biecek_burzykowski_2021 for more on this method). 
These are calculated with `r ref("DALEX::predict_parts()")`: -```{r interpretation-027} +```{r model_interpretation-037} #| fig-height: 4.5 #| fig-width: 8 #| out-width: 90% @@ -630,7 +630,7 @@ This is a useful option when the features have some relative conditional importa The `predict_parts()` function can also be used to plot Shapley values with the SHAP algorithm [@Lundberg2019] by setting `type = "shap"`: -```{r interpretation-028} +```{r model_interpretation-038} #| fig-height: 4.5 #| fig-width: 8 #| out-width: 90% @@ -653,7 +653,7 @@ The parameters `B` and `N` can be used to tune this trade-off, where `N` is the Finally, we can plot ICE curves using `r ref("DALEX::predict_profile()")`: -```{r interpretation-029, warning=FALSE} +```{r model_interpretation-039, warning=FALSE} #| fig-height: 5 #| fig-width: 8 #| out-width: 90% diff --git a/book/chapters/chapter13/beyond_regression_and_classification.qmd b/book/chapters/chapter13/beyond_regression_and_classification.qmd index 2cbf9cde7..58919c52c 100644 --- a/book/chapters/chapter13/beyond_regression_and_classification.qmd +++ b/book/chapters/chapter13/beyond_regression_and_classification.qmd @@ -45,7 +45,7 @@ This is a deterministic classification problem where we are predicting whether s Now let us consider some potential costs associated with each prediction and the eventual truth. As cost-sensitive classification is a minimization problem, we assume lower costs correspond to higher profits/positive outcomes, hence we write profits as negative values and losses as positive values: -```{r special-002} +```{r beyond_regression_and_classification-001} costs = matrix(c(-1, 0, 5, 0), nrow = 2, dimnames = list("Predicted Credit" = c("good", "bad"), Truth = c("good", "bad"))) @@ -62,7 +62,7 @@ We will now see how to implement a more nuanced approach to classification error This measure takes one argument, which is a matrix with row and column names corresponding to the class labels in the task of interest. Let us put our insurance example into practice, notice that we have already named the cost matrix as required for the measure: -```{r special-003} +```{r beyond_regression_and_classification-002} library(mlr3verse) tsk_german = tsk("german_credit") @@ -88,7 +88,7 @@ Currently in our running example, the models above will predict a customer has g Here, this might not be a sensible approach as we would likely act more conservatively and reject more credit applications with a higher threshold due to the non-uniform costs. This is highlighted in the `"threshold"` `autoplot` (@fig-costsens-threshold), which plots `msr("classif.costs")` over all possible thresholds. -```{r special-004} +```{r beyond_regression_and_classification-003} #| output: false #| cache: false prediction = lrn("classif.log_reg", @@ -96,7 +96,7 @@ prediction = lrn("classif.log_reg", autoplot(prediction, type = "threshold", measure = msr_costs) ``` -```{r} +```{r beyond_regression_and_classification-004} #| echo: false #| warning: false #| message: false @@ -113,7 +113,7 @@ As expected, the optimal threshold is greater than 0.5 which means the optimal m The optimal threshold can be automated by making use of `r mlr3tuning` (@sec-optimization) and `r mlr3pipelines` (@sec-pipelines) to tune `po("tunethreshold")`. 
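For comparison, a threshold can also be fixed manually on the prediction object from above before any tuning takes place. The following sketch is purely illustrative (0.7 is an arbitrary value, not an optimized choice) and is not evaluated here:

```{r beyond_regression_and_classification-004b, eval = FALSE}
# fix the classification threshold by hand and re-score with the cost measure
prediction$set_threshold(0.7)
prediction$score(msr_costs)
```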
Continuing the same example: -```{r special-005} +```{r beyond_regression_and_classification-005} po_cv = po("learner_cv", lrn("classif.log_reg", predict_type = "prob")) graph = po_cv %>>% po("tunethreshold", measure = msr_costs) @@ -142,7 +142,7 @@ This means that unlike classification and regression, learners are trained on tw Relating to our example above, the runner's outcome would then be $(T = 4, \Delta = 1)$ or $(T = 2, \Delta = 0)$. Another example is in the code below, where we randomly generate six survival times and six event indicators, an outcome with a `+` indicates the outcome is censored, otherwise, the event of interest occurred. -```{r special-009} +```{r beyond_regression_and_classification-006} library(survival) Surv(runif(6), rbinom(6, 1, 0.5)) ``` @@ -153,7 +153,7 @@ For a good introduction to survival analysis see @Collett2014 or for machine lea For the remainder of this section, we will look at how `r mlr3proba` [@mlr3proba] extends the building blocks of `mlr3` for survival analysis. We will begin by looking at objects used to construct machine learning tasks for survival analysis, then we will turn to the learners we have implemented to solve these tasks, before looking at measures for evaluating survival analysis predictions, and then finally we will consider how to transform prediction types. -```{r special-010, echo=FALSE} +```{r beyond_regression_and_classification-007, echo=FALSE} #| fig-alt: Figure shows give horizontal lines at 1,2,3,4,5 on the y-axis and a vertical line at 8 on the x-axis. Top line (subject 5) has a circle at x=8 and a diamond at x=9, second line (subject 4) has a circle at x=1 and a diamond at x=9, subject 3 has a circle at x=4 and a diamond at x=6, subject 2 has a diamond at x=8, and subject 1 has a diamond at x=7. #| fig-cap: Plot illustrating different censoring types. Dead and censored subjects (y-axis) over time (x-axis). Black diamonds indicate true death times and white circles indicate censoring times. Vertical line is the study end time. Subjects 1 and 2 die in the study time. Subject 3 is censored in the study and (unknown) dies within the study time. Subject 4 is censored in the study and (unknown) dies after the study. Subject 5 dies after the end of the study. Figure and caption from @Sonabend2021b. #| label: fig-censoring @@ -191,7 +191,7 @@ Note this has more arguments than `r ref("as_task_regr()")` to reflect multiple In this section we will use the `rats` dataset as a running example, this dataset looks at predicting if a drug treatment was successful in preventing 150 rats from developing tumors. The dataset, by its own admission, is not perfect and should generally be treated as 'dummy' data, which is good for examples but not real-world analysis. -```{r special-011, warning=FALSE, message=FALSE} +```{r beyond_regression_and_classification-008, warning=FALSE, message=FALSE} library(mlr3verse) library(mlr3proba) library(survival) @@ -204,10 +204,10 @@ tsk_rats$head() Plotting the task with `autoplot` results in a `r index('Kaplan-Meier', lower = FALSE)` plot (@fig-autokm) which is a non-parametric estimator of the probability of survival for the average observation in the training set. 
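As a brief reminder, the Kaplan-Meier estimate of the survival function is $\hat{S}(t) = \prod_{t_i \leq t} (1 - d_i/n_i)$, where the product runs over the distinct observed event times $t_i \leq t$, $d_i$ is the number of events at $t_i$, and $n_i$ is the number of observations still at risk just before $t_i$.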
-```{r, warning=FALSE, message=FALSE, output = FALSE, cache = FALSE} +```{r beyond_regression_and_classification-009, warning=FALSE, message=FALSE, output = FALSE, cache = FALSE} autoplot(tsk_rats) ``` -```{r, warning=FALSE, message=FALSE, echo = FALSE} +```{r beyond_regression_and_classification-010, warning=FALSE, message=FALSE, echo = FALSE} #| fig-cap: Kaplan-Meier plot of `tsk("rats")`. x-axis is time variable and y-axis is survival function, S(T), defined by $1 -$ F(T) where F is the cumulative distribution function. Crosses indicate points where censoring takes place. #| fig-alt: Figure shows a line plot with "Time" on the x-axis from 0 to 100 and "Survival" on the y-axis from 0.80 to 1.00. The line plot is a black line from (0, 1) to (25, 1) then starts to drop slowly and then quickly down to (100, 0.80). #| label: fig-autokm @@ -218,7 +218,7 @@ p As well as creating your own tasks, you can load any of the tasks shipped with `mlr3proba`: -```{r special-013} +```{r beyond_regression_and_classification-011} as.data.table(mlr_tasks)[task_type == "surv"] ``` @@ -239,7 +239,7 @@ In survival analysis, the following predictions can be made: We will go through each of these prediction types in more detail and with examples to make them less abstract. We will use `lrn("surv.coxph")`\index{Cox Proportional Hazards} trained on `tsk("rats")` as a running example, since for this model, all predict types except `response` can be computed. -```{r special-015} +```{r beyond_regression_and_classification-012} tsk_rats = tsk("rats") split = partition(tsk_rats) prediction_cph = lrn("surv.coxph")$train(tsk_rats, split$train)$ @@ -257,7 +257,7 @@ This is illustrated in the code below. In the example below we train and predict from a survival SVM\index{support vector machine!survival} (`lrn("surv.svm")`), note we use `type = "regression"` to select the algorithm that optimizes survival time predictions and `gamma.mu = 1e-3` is selected arbitrarily as this is a required parameter (this parameter should usually be tuned). We then compare the predictions from the model to the true data. -```{r special-016} +```{r beyond_regression_and_classification-013} library(mlr3extralearners) prediction_svm = lrn("surv.svm", type = "regression", gamma = 1e-3)$ train(tsk_rats, split$train)$predict(tsk_rats, split$test) @@ -276,7 +276,7 @@ You will therefore find that the majority of survival models in `mlr3proba` will These predictions are implemented using the `r ref_pkg("alan-turing-institute/distr6")` package, which allows visualization and evaluation of survival curves (defined as $1 -$ cumulative distribution function). Below we extract the first three `$distr` predictions from our running example and calculate the probability of survival at $t = 77$. -```{r special-017} +```{r beyond_regression_and_classification-014} prediction_cph$distr[1:3]$survival(77) ``` @@ -299,7 +299,7 @@ However, sometimes risk is defined as $\exp(-\eta)$, and sometimes it can be an To prevent this confusion in `mlr3proba`, we define the predict type `crank`, which stands for **c**ontinuous **rank**ing. This is best explained by example; continuing from the previous we output the first three `crank` predictions. -```{r special-018} +```{r beyond_regression_and_classification-015} prediction_cph$crank[1:3] ``` @@ -325,7 +325,7 @@ In general survival measures can be grouped into the following: 2. 
Calibration measures -- Quantify if the average prediction is close to the truth (all definitions of calibration are unfortunately vague in a survival context). Evaluate `crank` and/or `lp` predictions. 3. Scoring rules -- Quantify if probabilistic predictions are close to true values. Evaluate `distr` predictions. -```{r special-020} +```{r beyond_regression_and_classification-016} as.data.table(mlr_measures)[ task_type == "surv", c("key", "predict_type")][1:5] ``` @@ -335,7 +335,7 @@ We recommend ISBS (Integrated Survival Brier Score) (`msr("surv.graf")`) to eval Using these measures, we can now evaluate our predictions from the previous example. -```{r} +```{r beyond_regression_and_classification-017} prediction_cph$score(msrs(c("surv.graf", "surv.cindex", "surv.dcalib"))) ``` @@ -376,7 +376,7 @@ We construct the `distrcompositor` pipeline around a survival XGBoost Accelerate In the pipeline, we specify that we will estimate the baseline distribution with a `r index("Kaplan-Meier", lower = FALSE)` estimator (`estimator = "kaplan"`) and that we want to assume an AFT form for our estimated distribution (`form = "aft"`). We then train and predict in the usual way and in our output we can now see a `distr` prediction. -```{r special-021, warning=FALSE} +```{r beyond_regression_and_classification-018, warning=FALSE} library(mlr3verse) library(mlr3extralearners) @@ -417,7 +417,7 @@ We first load `tsk("grace")` (which only has numeric features) and sample 500 ro We then select the ISBS, D-Calibration, and C-index to evaluate predictions, set up the same pipeline we used in the previous experiment, and load a Cox PH and Kaplan-Meier estimator. We run our experiment with three-fold CV and aggregate the results. -```{r special-022, warning=FALSE} +```{r beyond_regression_and_classification-019, warning=FALSE} set.seed(42) library(mlr3extralearners) @@ -465,14 +465,14 @@ We will consider each in turn. As density estimation is an unsupervised task, there is no target for prediction. In the code below we construct a density task using `r ref("as_task_dens()")` which takes one argument, a `data.frame` type object with exactly one column (which we will use to estimate the underlying distribution). -```{r special-023} +```{r beyond_regression_and_classification-020} tsk_dens = as_task_dens(data.table(x = rnorm(1000))) tsk_dens ``` As with other tasks, we have included a couple of tasks that come shipped with `mlr3proba`: -```{r} +```{r beyond_regression_and_classification-021} as.data.table(mlr_tasks)[task_type == "dens", c(1:2, 4:5)] ``` @@ -488,7 +488,7 @@ All learners will return a `distr` and `pdf` prediction but only some can make ` Again, the `distr` predict type is implemented using `distr6`. In the code below we train and 'predict' with a histogram learner and then plot the estimated probability density function (@fig-dens-hist), which closely matches the underlying Normally-distributed data. -```{r special-024} +```{r beyond_regression_and_classification-022} #| label: fig-dens-hist #| fig-cap: Predicted density from the histogram learner, which closely resembles the underlying N(0, 1) data. #| fig-alt: Image shows a line plot with x-axis between (-2,2) and y-axis between (0,0.4). The plot closely resembles a Normal(0, 1) distribution with a peak at 0.4. 
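# train the histogram density learner and plot the estimated probability density function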
@@ -501,7 +501,7 @@ ggplot(df, aes(x = x, y = y)) + geom_line() + theme_minimal() The `pdf` and `cdf` predict types are simply wrappers around `distr$pdf` and `distr$cdf` respectively: -```{r special-025} +```{r beyond_regression_and_classification-023} prediction = lrn_hist$train(tsk_dens, 1:10)$predict(tsk_dens, 11:13) # pdf and cdf columns in output prediction @@ -515,7 +515,7 @@ cbind(prediction$distr$cdf(tsk_dens$data()$x[11:13]), At the time of publication, the only measure implemented in `mlr3proba` for density estimation is logloss, which is defined in the same way as in classification, $L(y) = -\log(\hat{f}_Y(y))$, where $\hat{f}_Y$ is our estimated probability density function. Putting this together with the above we are now ready to train a density learner, estimate a distribution, and evaluate our estimation: -```{r special-026} +```{r beyond_regression_and_classification-024} msr_logloss = msr("dens.logloss") msr_logloss @@ -524,7 +524,7 @@ prediction$score(msr_logloss) This output is most easily interpreted when compared to other learners in a benchmark experiment, so let us put everything together to conduct a small benchmark study on `tsk("faithful")` task using some of the integrated density learners: -```{r special-028, message=FALSE, warning=FALSE, results='hide'} +```{r beyond_regression_and_classification-025, message=FALSE, warning=FALSE, results='hide'} library(mlr3extralearners) tsk_faithful = tsk("faithful") learners = lrns(c("dens.hist", "dens.pen", "dens.kde")) @@ -534,13 +534,13 @@ bmr = benchmark(benchmark_grid(tsk_faithful, learners, bmr$aggregate(measure) ``` -```{r} +```{r beyond_regression_and_classification-026} #| output: false #| cache: false autoplot(bmr, measure = measure) ``` -```{r} +```{r beyond_regression_and_classification-027} #| echo: false #| warning: false #| message: false @@ -576,7 +576,7 @@ Similarly to density estimation (@sec-density), there is no target for predictio By example, we will look at the `r ref("cluster::ruspini")` dataset, which has 75 rows and two columns and was first introduced in @Ruspini1970 to illustrate different clustering techniques. The observations in the dataset form four natural clusters (@fig-beyond-clust-ruspini). In the code below we construct a cluster task using `r ref("as_task_clust()")` which only takes one argument, a `data.frame` type object. -```{r special-030, warning=FALSE, message=FALSE} +```{r beyond_regression_and_classification-028, warning=FALSE, message=FALSE} #| label: fig-beyond-clust-ruspini #| fig-cap: Distribution of the `ruspini` dataset. #| fig-alt: "Four grids. Top-left shows a curve increasing sharply between (0,0.003) and (30,0.012) then decreasing to (120, 0.003). Top-right just says 'Corr: 0.065'. Bottom-left shows four distinct clusters of points. Bottom-right increases from (0, 50) to (140, 150) then decreases to (155, 100)." @@ -591,7 +591,7 @@ autoplot(tsk_ruspini) Technically, we did not need to create a new task for the `ruspini` dataset since it is already included in the package, along with one other task: -```{r special-031} +```{r beyond_regression_and_classification-029} as.data.table(mlr_tasks)[task_type == "clust", c(1:2, 4:5)] ``` @@ -607,7 +607,7 @@ Similarly to classification, prediction types of clustering learners are either Below we construct a C-Means clustering learner with `"prob"` prediction type and three clusters (`centers = 3`), train it on the `ruspini` dataset and then return the cluster assignments (`$assignments`) for six random observations. 
-```{r special-032} +```{r beyond_regression_and_classification-030} lrn_cmeans = lrn("clust.cmeans", predict_type = "prob", centers = 3) lrn_cmeans @@ -617,7 +617,7 @@ lrn_cmeans$assignments[sample(tsk_ruspini$nrow, 6)] As clustering is unsupervised, it often does not make sense to use `predict` for new data however this is still possible using the `mlr3` interface. -```{r special-033, warning = FALSE, message = FALSE} +```{r beyond_regression_and_classification-031, warning = FALSE, message = FALSE} #| label: fig-beyond-clust-ruspini-estimated #| fig-cap: Distribution of the estimated clusters. #| fig-alt: "Four grids. Top-left shows three overlapping curves in purple (cluster 1), blue (cluster 2) and green (cluster 3). The purple and blue curves are zero in most places but then peak at (30, 120) and (60, 120) respectively. The green curve starts at (0,0) then increases slowly to (40, 120) then decreases bumpily to (120, 60). Top-right says '-0.78' in green (cluster 3), '0' in blue (cluster 2), and '-0.05' in purple (cluster 1). Bottom-left shows four distinct clusters of points, two clusters are green, one (bottom) is blue, one (bottom left) is purple. Bottom-right: line graphs that show a similar but inverted shape as top-left." @@ -635,7 +635,7 @@ In hierarchical clustering, the goal is to build a hierarchy of nested clusters The final result is a tree or `r index('dendrogram')` which can change if a new data point is added. For consistency, `mlr3cluster` offers a `predict` method for hierarchical clusters but with a warning: -```{r special-034} +```{r beyond_regression_and_classification-032} #| label: fig-beyond-clust-dend #| fig-cap: Dendrogram representing hierarchical clustering of the `ruspini` dataset. y-axis is similarity of points such that the lower observations (x-axis) are connected, the greater their similarity. The top split represents the separation of the two clusters. #| fig-alt: "Plot shows a horizontal line that connects two vertical lines. Each vertical line connects to another horizontal line that splits into two more vertical lines, which continues for up to nine breaks." @@ -660,7 +660,7 @@ Note that the silhouette measure in `mlr3cluster` returns the mean silhouette sc Putting this together with the above we can now score our cluster estimation (note we must pass the `task` to `$score`): -```{r special-036} +```{r beyond_regression_and_classification-033} measures = msrs(c("clust.wss", "clust.silhouette")) prediction$score(measures, task = tsk_ruspini) @@ -679,7 +679,7 @@ As clustering is an unsupervised task, visualization can be essential not just f It is easy to rely on clustering measures to assess the quality of clustering however this should be done with care as choosing between models may come down to other decisions such as how clusters are formed. By example, consider data generated by `r ref("mlbench::mlbench.spirals")`, which results in two individual lines that spiral around each other (@fig-beyond-clust-spirals). -```{r special-037} +```{r beyond_regression_and_classification-034} #| label: fig-beyond-clust-spirals #| fig-cap: Distribution of `spirals` data. #| fig-alt: "Grid of four plots. Top-left: line increasing from (-1,0.1) to (0,0.5) then decreasing to (1,0.1). Top-right: text that says 'Corr: -0.145'. Bottom-left: two lines of dots that are in tight, non-overlapping spirals around each other. Bottom-right: same shape as top-left." 
@@ -690,7 +690,7 @@ autoplot(tsk_spirals) Now let us see what happens when fit two clustering learners on this data: -```{r special-038} +```{r beyond_regression_and_classification-035} learners = list( lrn("clust.kmeans"), lrn("clust.dbscan", eps = 0.1) @@ -703,7 +703,7 @@ bmr$aggregate(msr("clust.silhouette"))[, c(4, 7)] We can see that K-means clustering gives us a higher average silhouette score and so we might conclude that a K-means learner with two centroids is a better choice than the DBSCAN method. However, now take a look at the cluster assignment plots in @fig-beyond-clust-spirals-pred (`autoplot.PredictionClust` is available but we do not use it here so we can highlight two particular plots). -```{r special-039, message=FALSE, warning=FALSE} +```{r beyond_regression_and_classification-036, message=FALSE, warning=FALSE} #| label: fig-beyond-clust-spirals-pred #| fig-cap: Comparing estimated clusters from `lrn("clust.kmeans")` and `lrn("clust.dbscan")`. Both create two distinct clusters that are separated in different ways. #| fig-alt: "Two plots of the same spirals as in the previous plot. Left (K-means): points above the line x=y are purple (cluster 1) and other points are green (cluster 2). Right (DBSCAN): One of the spirals is purple and the other is green." @@ -739,7 +739,7 @@ Since our running example only has two features, PCA does not make sense to visu So we will use a task based on the `USArrests` dataset instead. By plotting the result of PCA (@fig-beyond-clust-usarrests), we see that our model has done a good job of separating observations into two clusters along the first two principal components. -```{r special-040, message=FALSE, warning=FALSE} +```{r beyond_regression_and_classification-037, message=FALSE, warning=FALSE} #| label: fig-beyond-clust-usarrests #| fig-cap: First two principal components using PCA on `tsk("usarrests")`. #| fig-alt: "Scatter plot of green (cluster 2) and purple (cluster 1) points. x-axis: PC1 (96.55%) between -0.3 and 0.2. y-axis: PC2 (2.78%) between -0.3 and 0.2. Points are cleanly separated into two clusters by color." @@ -756,7 +756,7 @@ If the average silhouette value for a given cluster is below the average silhoue Continuing with our new example, we find (@fig-beyond-clust-sil) that a lot of observations are actually below the average line and close to zero, and therefore the quality of our cluster assignments is not very good, meaning that many observations are likely assigned to the wrong cluster. -```{r special-041, message=FALSE, warning=FALSE} +```{r beyond_regression_and_classification-038, message=FALSE, warning=FALSE} #| label: fig-beyond-clust-sil #| fig-cap: 'Silhouette plot from predictions made by `lrn("clust.kmeans")` on `tsk("usarrests")`.' #| fig-alt: "Horizontal barplot with 'Silhouette Values' on x-axis between 0 and 1; y-axis is 'Observations' between 0 and 50. Observations between 0-20 are all colored purple (cluster 1) and observations between 21-50 are colored green (cluster 2). A dashed vertical line passes through x=0.59. The majority of bars finish before this line." 
@@ -767,7 +767,7 @@ autoplot(prediction, tsk_usarrests, type = "sil") Finally, we conduct a small benchmark study using `tsk("usarrests")` and a few integrated cluster learners: -```{r special-042, message=FALSE, warning=FALSE} +```{r beyond_regression_and_classification-039, message=FALSE, warning=FALSE} tsk_usarrests = tsk("usarrests") learners = list( lrn("clust.featureless"), @@ -798,7 +798,7 @@ Outbreaks radiate outwards from an epicenter and therefore countries closer to G Thus, looking at the data spatially shows clear signs of autocorrelation across nearby observations. Note in this example the autocorrelation is radial but in practice, this will not always be the case. -```{r special-043, warning=FALSE, echo=FALSE, message=FALSE, out.width = "60%", out.height = "60%", fig.height = 5} +```{r beyond_regression_and_classification-040, warning=FALSE, echo=FALSE, message=FALSE, out.width = "60%", out.height = "60%", fig.height = 5} #| label: fig-autocorrelation #| fig-cap: Heatmaps where darker countries indicate higher number of cases and lighter countries indicate lower number of cases of imaginary Disease X with epicenter in Germany. The top map imagines a world in which there is no spatial autocorrelation and the number of cases of a disease is randomly distributed. The bottom map shows a more accurate world in which the number of cases radiate outwards from the epicenter (Germany). #| fig-alt: Image shows two separate maps of Europe. Top map has a random distribution of colors from white to dark gray. Bottom map shows darkest color (dark gray) at Germany with increasing lightness as the countries are increasingly further away. @@ -845,7 +845,7 @@ Throughout this section we will use the `r ref("mlr3spatiotempcv::ecuador")` dat To make use of spatial resampling methods, we have implemented two extensions of `r ref("TaskClassif")` and `r ref("TaskRegr")` to accommodate spatial data, `r ref("TaskClassifST")` and `r ref("TaskRegrST")` respectively. Below we only show classification examples but regression follows trivially. -```{r special-044, message=FALSE, warning=FALSE} +```{r beyond_regression_and_classification-041, message=FALSE, warning=FALSE} library(mlr3spatial) library(mlr3spatiotempcv) @@ -863,7 +863,7 @@ tsk_ecuador Once a task is created, you can train and predict as normal. -```{r special-045} +```{r beyond_regression_and_classification-042} lrn("classif.rpart")$train(tsk_ecuador)$predict(tsk_ecuador) ``` @@ -875,7 +875,7 @@ Before we look at the spatial resampling methods implemented in `mlr3spatiotempc Below we benchmark a decision tree on `tsk("ecuador")` using two different repeated cross-validation resampling methods, the first ("NSpCV" (non-spatial cross-validation)) is a non-spatial resampling method from `mlr3`, the second ("SpCV" (spatial cross-validation)) is from `mlr3spatiotempcv` and is optimized for spatial data. The example highlights how "NSpCV" makes it appear as if the decision tree is performing better than it is with considerably higher estimated performance, however, this is an overconfident prediction due to the autocorrelation in the data. -```{r special-046, warning=FALSE,message=FALSE} +```{r beyond_regression_and_classification-043, warning=FALSE,message=FALSE} lrn_rpart = lrn("classif.rpart", predict_type = "prob") rsmp_nsp = rsmp("repeated_cv", folds = 3, repeats = 2, id = "NSpCV") rsmp_sp = rsmp("repeated_spcv_coords", folds = 3, repeats = 2, @@ -893,7 +893,7 @@ Visually this can be seen using `r ref("mlr3spatiotempcv::autoplot()")` methods. 
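The essential call takes an instantiated spatial resampling, the task, and the fold to display. The sketch below (ours, not evaluated here) uses the `rsmp_sp` and `tsk_ecuador` objects defined above:

```{r beyond_regression_and_classification-043b, eval = FALSE}
# instantiate the spatial resampling on the task, then plot one fold's split
rsmp_sp$instantiate(tsk_ecuador)
autoplot(rsmp_sp, tsk_ecuador, fold_id = 1)
```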
In @fig-sprsmp we visualize how the task is partitioned according to the spatial resampling method (@fig-sprsmp, left) and non-spatial resampling method (@fig-sprsmp, right). There is a clear separation in space for the respective partitions when using the spatial resampling whereas the train and test splits overlap a lot (and are therefore more correlated) using the non-spatial method. -```{r special-047} +```{r beyond_regression_and_classification-044} #| fig-cap: Scatterplots show separation of train (blue) and test (orange) data for the first fold of the first repetition of the cross-validation. Left is spatial resampling where train and test data are clearly separated. Right is non-spatial resampling where there is overlap in train and test data. #| fig-alt: Two scatter plots with points in blue (training data) and orange (test data). Left plot (Spatial Resampling) shows a clean separation between orange and blue points. Right plot (Non-spatial Resampling) shows blue and orange dots randomly scattered among each other. #| label: fig-sprsmp @@ -944,7 +944,7 @@ To enable these predictions we have created a new function, `r ref("mlr3spatial: In the example below we load the `leipzig_points` dataset for training and coerce this to a spatiotemporal task with `r ref("as_task_classif_st")`, and we load the `leipzig_raster` raster. Both files are included as example data in `r mlr3spatial`. -```{r special-048, warning=FALSE, message=FALSE} +```{r beyond_regression_and_classification-045, warning=FALSE, message=FALSE} #| cache: false library(mlr3spatial) library(sf) @@ -964,7 +964,7 @@ leipzig_raster = terra::rast(system.file("extdata", "leipzig_raster.tif", Now we can continue as normal to train and predict with a classification learner, in this case, a random forest. -```{r special-049, cache.lazy=FALSE, cache=FALSE} +```{r beyond_regression_and_classification-046, cache.lazy=FALSE, cache=FALSE} lrn_ranger = lrn("classif.ranger")$train(tsk_leipzig) prediction = predict_spatial(leipzig_raster, lrn_ranger, format = "terra") @@ -973,7 +973,7 @@ prediction In this example, we specified the creation of a `terra` object, which can be visualized with in-built plotting methods. -```{r special-050, message = FALSE, cache.lazy=FALSE, cache=FALSE, out.width = "50%", out.height = "50%"} +```{r beyond_regression_and_classification-047, message = FALSE, cache.lazy=FALSE, cache=FALSE, out.width = "50%", out.height = "50%"} #| fig-cap: Spatial predictions for forest (purple), pasture (blue), urban (green), and water (yellow) categories. #| fig-alt: Very zoomed-in map with x-axis from 732000 to 733000 and 5692500 to 5693500 on y-axis. Different clusters are colored in green, blue, purple and yellow. #| label: fig-beyond-raster diff --git a/book/chapters/chapter14/algorithmic_fairness.qmd b/book/chapters/chapter14/algorithmic_fairness.qmd index 6cf2fe552..01acf9160 100644 --- a/book/chapters/chapter14/algorithmic_fairness.qmd +++ b/book/chapters/chapter14/algorithmic_fairness.qmd @@ -23,7 +23,7 @@ As ML-driven systems are used for highly influential decisions, it is vital to d As we work through this chapter we will use the `"adult_train"` and `"adult_test"` tasks from `mlr3fairness`, which contain a subset of the `Adult` dataset [@uci]. This is a binary classification task to predict if an individual earns more than $50,000 per year and is useful for demonstrating biases in data. 
-```{r special-051} +```{r algorithmic_fairness-001} library(mlr3fairness) tsk_adult_train = tsk("adult_train") tsk_adult_train @@ -35,7 +35,7 @@ In the context of fairness, `r index('bias', aside = TRUE)` refers to disparitie In this chapter, we will concentrate on a subset of bias definitions, those concerning `r index('group fairness', aside = TRUE)`. For example, in the adult dataset, it can be seen that adults in the group 'Male' are significantly more likely to earn a salary greater than $50K per year when compared to the group 'Female'. -```{r} +```{r algorithmic_fairness-002} sex_salary = table(tsk_adult_train$data(cols = c("sex", "target"))) round(proportions(sex_salary), 2) chisq.test(sex_salary) @@ -45,7 +45,7 @@ In this example, we would refer to the 'sex' variable as a `r index('sensitive a The goal of group fairness is then to ascertain if decisions are fair across groups defined by a sensitive attribute. The sensitive attribute in a task is set with the `"pta"` (**p**ro**t**ected **a**ttribute) column role (@sec-row-col-roles). -```{r special-049, eval = FALSE} +```{r algorithmic_fairness-003, eval = FALSE} tsk_adult_train$set_col_roles("sex", add_to = "pta") ``` @@ -83,7 +83,7 @@ If we chose accuracy or PPV for $M$, then we would have concluded that the model In `mlr3fairness` we can construct a fairness metric from any `r ref("Measure")` by constructing `msr("fairness", base_measure, range)` with our metric of choice passed to `base_measure` as well as the possible range the metric can take (i.e., the range in differences possible based on the base measure): -```{r special-0491} +```{r algorithmic_fairness-004} fair_tpr = msr("fairness", base_measure = msr("classif.tpr"), range = c(0, 1)) fair_tpr @@ -95,7 +95,7 @@ We have implemented several `Measure`s in `mlr3fairness` that simplify this step With our sensitive attribute set and the fairness metric selected, we can now train a `r ref("Learner")` and test for bias. Below we use a random forest and evaluate the absolute difference in true positive rate across groups 'Male' and 'Female': -```{r special-052} +```{r algorithmic_fairness-005} tsk_adult_test = tsk("adult_test") lrn_rpart = lrn("classif.rpart", predict_type = "prob") prediction = lrn_rpart$train(tsk_adult_train)$predict(tsk_adult_test) @@ -108,7 +108,7 @@ As well as using fairness metrics to evaluate a single model, they can also be u Visualizations can also help better understand discrepancies between groups or differences between models. `r ref("fairness_prediction_density()")` plots the sub-group densities across group levels and `r ref("compare_metrics()")` scores predictions across multiple metrics: -```{r special-058, message=FALSE, warning=FALSE} +```{r algorithmic_fairness-006, message=FALSE, warning=FALSE} #| fig-cap: Fairness prediction density plot (left) showing the density of predictions for the positive class split by "Male" and "Female" individuals. The metrics comparison barplot (right) displays the model's scores across the specified metrics. #| fig-alt: "Two panel plot. Left: Density plot showing that 'Female' observations are more likely to be predicted as having a salary less than $50K than 'Male' observations. Right: Three bar charts for the metrics 'fairness.fpr', 'fairness.tpr', 'fairness.eod' with bars at roughly 0.08, 0.06, and 0.07 respectively." 
#| label: fig-fairness @@ -150,7 +150,7 @@ Pre- and postprocessing schemes can be integrated using `r mlr3pipelines` (@sec- We provide two examples below, first preprocessing to balance observation weights with `po("reweighing_wts")` and second post-processing predictions using `po("EOd")`. The latter enforces the equalized odds fairness definition by stochastically flipping specific predictions. We also test `lrn("classif.fairzlrm")` against the other methods. -```{r special-054, warning=FALSE, message=FALSE} +```{r algorithmic_fairness-007, warning=FALSE, message=FALSE} # load learners lrn_rpart = lrn("classif.rpart", predict_type = "prob") lrn_rpart$id = "rpart" @@ -181,7 +181,7 @@ bmr$aggregate(meas)[, We can study the result using built-in plotting functions, below we use `r ref("fairness_accuracy_tradeoff()")`, to compare classification accuracy (default accuracy measure for the function) and equalized odds (`msr("fairness.eod")`) across cross-validation folds. -```{r special-0542} +```{r algorithmic_fairness-008} #| fig-cap: Comparison of learners with respect to classification accuracy (x-axis) and equalized odds (y-axis) across (dots) and aggregated over (crosses) folds. #| fig-alt: "Scatterplot with dots and crosses. x-axis is 'classif.acc' between 0.75 and 0.89, y-axis is 'fairness.equalized_odds' between 0 and 0.4. Plot results described in text." #| label: fig-fairness-tradeoff diff --git a/book/chapters/chapter15/predsets_valid_inttune.qmd b/book/chapters/chapter15/predsets_valid_inttune.qmd index bc3f05168..0e3f7a386 100644 --- a/book/chapters/chapter15/predsets_valid_inttune.qmd +++ b/book/chapters/chapter15/predsets_valid_inttune.qmd @@ -13,7 +13,7 @@ aliases: In @sec-performance we have already studied in detail how to train, predict and evaluate many different learners. Evaluating a fully trained model usually requires making predictions on unseen test observations. When we predict directly with a trained learner, we can explicitly control which observations are used: -```{r} +```{r predsets_valid_inttune-001} tsk_sonar = tsk("sonar") lrn_rf = lrn("classif.ranger") lrn_rf$train(tsk_sonar, row_ids = 4:208) @@ -27,21 +27,21 @@ We will now look at how to predict on *train* sets. This is sometimes be of interest for further analysis or to study overfitting. Or maybe we are simply curious. Let's configure our learner to simultaneously predict on *train* and *test*: -```{r} +```{r predsets_valid_inttune-002} lrn_rf$predict_sets = c("train", "test") rr = resample(tsk_sonar, lrn_rf, rsmp("cv", folds = 3)) ``` The learner, during resampling, will now after having been trained for the current iteration, produce predictions on all requested sets. To access them, we can either ask for a list of 3 prediction objects, one per CV fold, or we can ask for a combined prediction object for the whole CV -- which in this case contains as many prediction rows as observations in the task. -```{r} +```{r predsets_valid_inttune-003} str(rr$predictions("test")) # or str(rr$predictions("train")) rr$prediction("test") # or rr$prediction("train") ``` We can also apply performance measures to specific sets of the resample result: -```{r} +```{r predsets_valid_inttune-004} rr$aggregate(list( msr("classif.ce", predict_sets = "train", id = "ce_train"), msr("classif.ce", predict_sets = "test", id = "ce_test") @@ -52,7 +52,7 @@ The default predict set for a measure is usually the test set. 
But we can reques If we only want to access information that is computed during training, we can even configure the learner not to make any predictions at all. This is useful, for example, for learners that already (in their underlying implementation) produce an estimate of their generalization error during training, e.g. using out-of-bag error estimates or validation scores. The former, which is only available to learners with the 'oob_error' property, can be accessed via `r ref("MeasureOOBError")`. The latter is available to learners with the 'validation' property and is implemented as `r ref("MeasureInternalValidScore")`. Below we evaluate a random forest using its out-of-bag error. Since we do not need any predict sets, we can use `r ref("ResamplingInsample")`, which will use the entire dataset for training. -```{r} +```{r predsets_valid_inttune-005} lrn_rf$predict_sets = NULL rsmp_in = rsmp("insample") rr = resample(tsk_sonar, lrn_rf, rsmp_in, store_models = TRUE) @@ -64,7 +64,7 @@ All this works in exactly the same way for benchmarking, tuning, nested resampli Instead of explicitly making predictions on some test data and evaluating them, we use OOB error to evaluate `mtry.ratio`. This can speed up the tuning process considerably, as in this case only one RF is fitted (it is simply trained) and we can access the OOB from this single model, instead of fitting multiple models. As the OOB observations are untouched during the training of each tree in the ensemble, this still produces a valid performance estimate. -```{r} +```{r predsets_valid_inttune-006} lrn_rf$param_set$set_values( mtry.ratio = to_tune(0.1, 1) ) @@ -86,7 +86,7 @@ For iterative training (which many learners use) it can be interesting to track In `mlr3`, learners can have the 'validation' and 'internal_tuning' properties to indicate whether they can make use of a validation set and whether they can internally optimize hyperparameters, for example by stopping early. To check if a given learner supports this, we can simply access its `$properties` field. Examples of such learners are boosting algorithms like XGBoost, LightGBM, or CatBoost, as well as deep learning models from `r ref_pkg("mlr3torch")`. In this section we will train XGBoost on sonar and keep track of its performance on a validation set. -```{r} +```{r predsets_valid_inttune-007} tsk_sonar = tsk("sonar") lrn_xgb = lrn("classif.xgboost") lrn_xgb @@ -107,14 +107,14 @@ If a learner's `$validate` field is set to 'test', we will leak the resampling t Below, we configure the XGBoost learner to use $1/3$ of its training data for validation: -```{r} +```{r predsets_valid_inttune-008} lrn_xgb$validate = 1/3 ``` Next, we set the number of iterations (`nrounds`) and which metric to track (`eval_metric`) and train the learner. Here, $1/3$ of the observations from the training task will be solely used for validation and the remaining $2/3$ for training. If stratification or grouping is enabled in the task, this will also be respected. For further details on this see @sec-performance. -```{r} +```{r predsets_valid_inttune-009} lrn_xgb$param_set$set_values( nrounds = 100, eval_metric = "logloss" @@ -124,13 +124,13 @@ lrn_xgb$train(tsk_sonar) Because the XGBoost learner kept a log of the validation performance, we can now access this through the `$model` slot. Where exactly in the model this information is stored, depends on the specific learning algorithm. 
For XGBoost, the history is stored in `$evaluation_log`: -```{r} +```{r predsets_valid_inttune-010} tail(lrn_xgb$model$evaluation_log) ``` The validation loss over time is visualized in the figure below, with the iterations on the x-axis and the validation logloss on the y-axis: -```{r, out.width = "70%", echo = FALSE, warning = FALSE} +```{r predsets_valid_inttune-011, out.width = "70%", echo = FALSE, warning = FALSE} library(ggplot2) set.seed(1) ggplot(lrn_xgb$model$evaluation_log, aes(x = iter, y = test_logloss)) + @@ -143,19 +143,19 @@ ggplot(lrn_xgb$model$evaluation_log, aes(x = iter, y = test_logloss)) + `mlr3` also provides a standardized acccessor for the final validation performance. We can access this via the `$internal_valid_scores` field, which is a named list containing possibly more than one validation metric. -```{r} +```{r predsets_valid_inttune-012} lrn_xgb$internal_valid_scores ``` In some cases one might want to have more control over the construction of the validation data. This can be useful, for example, if there is a predefined validation split to be used with a task. Such fine-grained control over the validation data is possible by setting the `validate` field to `"predefined"`. -```{r} +```{r predsets_valid_inttune-013} lrn_xgb$validate = "predefined" ``` This allows us to use the `$internal_valid_task` defined in the training task. Below, we set the validation task to use 60 randomly sampled ids and remove them from the primary task. -```{r} +```{r predsets_valid_inttune-014} valid_ids = sample(tsk_sonar$nrow, 60) tsk_valid = tsk_sonar$clone(deep = TRUE) tsk_valid$filter(valid_ids) @@ -166,20 +166,20 @@ tsk_sonar$internal_valid_task = tsk_valid Note that we could have achieved the same by simply setting `tsk_valid$internal_valid_task = valid_ids`, but showed the explicit way for completeness sake. The associated validation task now has 60 observations and the primary task 148: -```{r} +```{r predsets_valid_inttune-015} c(tsk_sonar$internal_valid_task$nrow, tsk_sonar$nrow) ``` When we now train, the learner will validate itself on the specified additional task. Note that the `$internal_valid_task` slot is always used internally, even if you set a ratio value in `learner$validate`, it is simply automatically auto-constructed (and then passed down). -```{r} +```{r predsets_valid_inttune-016} lrn_xgb$train(tsk_sonar) ``` In many cases, however, one does not only train an individual learner, but combines it with other (preprocessing) steps in a `r ref("GraphLearner")`, see @sec-preprocessing. Validation in a `GraphLearner` is still possible, because preprocessing `PipeOp`s also handle the validation task. While the *train* logic of the `PipeOp`s is applied to the primary task, the *predict* logic is applied to the validation data. This ensures that there is no data leakage when the XGBoost learner evaluates its performance on the validation data. Below, we construct a `PipeOpPCA` and apply it to the sonar task with a validation task. 
-```{r} +```{r predsets_valid_inttune-017} po_pca = po("pca") taskout = po_pca$train(list(tsk_sonar))[[1]] taskout$internal_valid_task @@ -187,13 +187,13 @@ taskout$internal_valid_task The preprocessing that is applied to the `$internal_valid_task` during `$train()` is equivalent to predicting on it: -```{r} +```{r predsets_valid_inttune-018} po_pca$predict(list(tsk_sonar$internal_valid_task))[[1L]] ``` This means that tracking validation performance works even in complex graph learners, which would not be possible when simply setting the `watchlist` parameter of XGBoost. Below, we chain the PCA operator to XGBoost and convert it to a learner. -```{r} +```{r predsets_valid_inttune-019} glrn = as_learner(po_pca %>>% lrn_xgb) ``` @@ -204,14 +204,14 @@ While this almost 'just works', we now need to specify the `$validate` field on This configuration can be simplified by using `set_validate()`. When applied to a `GraphLearner`, we can specify the arguments `validate` which determines *how* to create the validation data and optionally the argument `ids` which specifies *which* `PipeOp`s should use it. By default, the latter is set to the `$base_learner()` of the `Graph`, which is the last learner. This means that both calls below are equivalent: -```{r} +```{r predsets_valid_inttune-020} set_validate(glrn, validate = "predefined") set_validate(glrn, validate = "predefined", ids = "classif.xgboost") ``` We can now train the graph learner just as before and inspect the final validation metric, which is now prefixed with the ID of the corresponding `PipeOp`. -```{r} +```{r predsets_valid_inttune-021} glrn$validate = "predefined" glrn$train(tsk_sonar) glrn$internal_valid_scores @@ -227,13 +227,13 @@ Since individual `PipeOp`s cannot control how the validation data is generated, Not only can XGBoost log its validation performance, it can also monitor it to *early stop* its training, i.e. perform internal tuning of the `nrounds` hyperparameter during training. This is marked by the `"internal_tuning"` property: -```{r} +```{r predsets_valid_inttune-022} "internal_tuning" %in% lrn_xgb$properties ``` Early stopping for XGBoost can be enabled by specifying the `early_stopping_rounds` parameter. This is also known as *patience* and specifies for how many iterations the validation loss must not improve for the training to terminate. The metric that is used for early stopping is the first value that we passed to `eval_metric`, which was the logloss. -```{r} +```{r predsets_valid_inttune-023} lrn_xgb$param_set$set_values( early_stopping_rounds = 10, nrounds = 100 @@ -242,14 +242,14 @@ lrn_xgb$param_set$set_values( When we now train the learner, we can access the internally optimized `nrounds` through the `$internal_tuned_values` field. -```{r} +```{r predsets_valid_inttune-024} lrn_xgb$train(tsk_sonar) lrn_xgb$internal_tuned_values ``` By using early stopping, we were able to already terminate training after `r lrn_xgb$internal_tuned_values$nrounds + lrn_xgb$param_set$values$early_stopping_rounds` iterations. Below, we visualize the validation loss over time and the optimal nrounds is marked red. We can see that the logloss plateaus after `r lrn_xgb$internal_tuned_values$nrounds` rounds, but training continues for a while afterwards due to the patience setting. 
-```{r, echo = FALSE, out.width = "70%"} +```{r predsets_valid_inttune-025, echo = FALSE, out.width = "70%"} theme_set(theme_minimal()) data = lrn_xgb$model$evaluation_log ggplot(data, aes(x = iter, y = test_logloss)) + @@ -263,7 +263,7 @@ ggplot(data, aes(x = iter, y = test_logloss)) + So far we have only used the early stopping implementation of XGBoost to optimize `nrounds`, but have not tuned any other hyperparameters. This is where `r mlr3` comes in, as it allows us to combine the internal tuning of a learner with (non-internal) hyperparameter tuning via `r ref_pkg("mlr3tuning")`. To do this, we set both parameters to `to_tune()`, but mark `nrounds` to be tuned internally. -```{r} +```{r predsets_valid_inttune-026} lrn_xgb$param_set$set_values( eta = to_tune(0.001, 0.1, logscale = TRUE), nrounds = to_tune(upper = 500, internal = TRUE) @@ -272,7 +272,7 @@ lrn_xgb$param_set$set_values( In such scenarios, one might often want to use the same validation data to optimize `eta` and `nrounds`. This is possible by specifying the `"test"` option of the `validate` field. This means that in each resampling iteration the validation data will be set to the test set, i.e. the same data that will also be used to evaluate the parameter configuration (to tune `eta`). -```{r} +```{r predsets_valid_inttune-027} lrn_xgb$validate = "test" ``` @@ -287,7 +287,7 @@ When combining internal tuning with hyperparameter optimization via `r ref_pkg(" The advantage of using the first option is that the predict step can be skipped because the internal validation scores are already computed during training. In a certain sense, this is similar to the evaluation of the random forest with the OOB error in @sec-predict-sets. -```{r} +```{r predsets_valid_inttune-028} tsk_sonar = tsk("sonar") lrn_xgb$predict_sets = NULL @@ -304,13 +304,13 @@ ti = tune( The tuning result contains the best found configuration for both `eta` and `nrounds`. -```{r} +```{r predsets_valid_inttune-029} ti$result_learner_param_vals[c("eta", "nrounds")] ``` We now show how to extract the different parameter configurations from the tuning archive. All internally tuned parameters are accessible via the `$internal_tuned_values`. This is a list column, because it is possible to tune more than one parameter internally, e.g. in a `GraphLearner`. Below we extract the values for `eta` (transformed back from its log scale), `nrounds` (internally tuned) and the logloss. The latter was evaluated on the internal validation tasks, which corresponded to the `Resampling`'s test sets as we specified `validate = "test"`. By visualizing the results we can see an inverse relationship between the two tuning parameters: a smaller step size (eta) requires more boosting iterations (nrounds). -```{r, out.width = "70%"} +```{r predsets_valid_inttune-030, out.width = "70%"} d = ti$archive$data d = data.table( @@ -326,7 +326,7 @@ ggplot(data = d, aes(x = eta, y = nrounds, color = logloss)) + This also works with an `r ref("AutoTuner")`, which will use the internally optimized `nrounds`, as well as the offline tuned `eta` for the final model fit. This means that there is no validation or early stopping when training the final model, and we use all available data. 
-```{r} +```{r predsets_valid_inttune-031} at = auto_tuner( tuner = tnr("grid_search"), learner = lrn_xgb, @@ -346,7 +346,7 @@ However, care must be taken when using the test set of a resampling for validati If the purpose of resampling is to get an unbiased performance estimate of algorithms, some of which stop early and some of which don't, this is not OK. In such a situation, the former would have an unfair advantage over the latter. The example below illustrates such a case where this would not be a fair comparison between the two learners. -```{r} +```{r predsets_valid_inttune-032} lrn_xgb$param_set$set_values( eta = 0.1, nrounds = 500, early_stopping_rounds = 10 ) @@ -363,7 +363,7 @@ At last, we will cover how to enable internal tuning when manually specifying a While the latter is more convenient and therefore usually recommended, manually defining a search space gives you for more flexibility with respect to parameter transformations, see e.g. @sec-tune-trafo. We can include the internally tuned parameters in the `search_space`, but need to specify an aggregation function and tag them with `"internal_tuning"`. -```{r} +```{r predsets_valid_inttune-033} search_space = ps( eta = p_dbl(0.001, 0.1, logscale = TRUE), nrounds = p_int(upper = 500, tags = "internal_tuning", @@ -373,7 +373,7 @@ search_space = ps( This search space can be passed to the `AutoTuner` and the optimization will then proceed as before. -```{r} +```{r predsets_valid_inttune-034} at = auto_tuner( tuner = tnr("grid_search"), learner = lrn_xgb, @@ -398,7 +398,7 @@ In this chapter we first learned how to evaluate machine learning methods on dif 3. Consider the code below: - ```{r} + ```{r predsets_valid_inttune-035} branch_lrn = as_learner( ppl("branch", list( lrn("classif.ranger"), @@ -434,7 +434,7 @@ In this chapter we first learned how to evaluate machine learning methods on dif 4. Look at the (failing) code below: - ```{r, eval = FALSE} + ```{r predsets_valid_inttune-036, eval = FALSE} tsk_sonar = tsk("sonar") glrn = as_learner( po("pca") %>>% lrn("classif.xgboost", validate = 0.3) diff --git a/book/chapters/chapter2/data_and_basic_modeling.qmd b/book/chapters/chapter2/data_and_basic_modeling.qmd index 1ee1bb062..b87478f1f 100644 --- a/book/chapters/chapter2/data_and_basic_modeling.qmd +++ b/book/chapters/chapter2/data_and_basic_modeling.qmd @@ -43,7 +43,7 @@ For texts about ML, including detailed methodology and underpinnings of differen In the next few sections we will look at the building blocks of `mlr3` using regression as an example, we will then consider how to extend this to classification in @sec-classif. -```{r basics-fig-1, echo=FALSE} +```{r data_and_basic_modeling-001, echo=FALSE} #| label: fig-ml-abstraction-basics #| fig-cap: "General overview of the machine learning process." #| fig-alt: "A flowchart starting with the 'Data D' with two arrows to 'Dtrain' and 'Dtest'. 'Dtrain' has an arrow to 'Learner', which has an arrow to 'Model'. 'Dtest' has an arrow (labeled with 'Features') to 'Model' and an arrow (labeled with 'Labels') to 'Measure'. The 'Model' box has an arrow to 'Prediction', which has an arrow to 'Measure', which has an arrow to 'Performance'. The whole flowchart has curly brackets next to it that says 'Repeat = Resampling'." @@ -60,14 +60,14 @@ This information is extracted automatically when required, so the user does not `mlr3` includes a few predefined machine learning tasks in the `r ref("mlr_tasks", aside = TRUE)` `Dictionary`. 
-```{r basics-001} +```{r data_and_basic_modeling-002} mlr_tasks ``` To get a task from the dictionary, use the `r ref("tsk()", aside = TRUE)` function and assign the return value to a new variable. Below we retrieve `tsk("mtcars")`, which uses the `r ref("datasets::mtcars")` dataset: -```{r basics-002} +```{r data_and_basic_modeling-003} tsk_mtcars = tsk("mtcars") tsk_mtcars ``` @@ -88,7 +88,7 @@ The simplest way to do this is with the function `r ref("as_task_regr()", aside By example, we will ignore that `mtcars` is already available as a predefined task in `mlr3`. In the code below we load the `datasets::mtcars` dataset, subset the data to only include columns `"mpg"`, `"cyl"`, `"disp"`, print the modified data's properties, and then set up a regression task called `"cars"` (`id = "cars"`) in which we will try to predict miles per gallon (`target = "mpg"`) from the number of cylinders (`"cyl"`) and displacement (`"disp"`): -```{r basics-006} +```{r data_and_basic_modeling-004} data("mtcars", package = "datasets") mtcars_subset = subset(mtcars, select = c("mpg", "cyl", "disp")) str(mtcars_subset) @@ -110,13 +110,13 @@ You can bypass this check by setting `options(mlr3.allow_utf8_names = TRUE)` (bu Printing a task provides a summary and in this case, we can see the task has `r tsk_mtcars$nrow` observations and `r tsk_mtcars$ncol` columns (32 x 3), of which `mpg` is the target, there are no special properties (`Properties: -`), and there are `r length(tsk_mtcars$feature_names)` features stored in double-precision floating point format. -```{r} +```{r data_and_basic_modeling-005} tsk_mtcars ``` We can plot the task using the `r mlr3viz` package, which gives a graphical summary of the distribution of the target and feature values: -```{r basics-008, message=FALSE, output = FALSE} +```{r data_and_basic_modeling-006, message=FALSE, output = FALSE} #| label: fig-mtcars #| fig-cap: "Overview of the mtcars dataset." #| fig-alt: Diagram shows six plots, three are line plots showing the relationship between continuous variables, and three are scatter plots showing relationships between other variables. @@ -131,13 +131,13 @@ We have looked at how to create tasks to store data and metadata, now we will lo Various fields can be used to retrieve metadata about a task. The dimensions, for example, can be retrieved using `$nrow` and `$ncol`: -```{r basics-009} +```{r data_and_basic_modeling-007} c(tsk_mtcars$nrow, tsk_mtcars$ncol) ``` The names of the feature and target columns are stored in the `$feature_names` and `$target_names` slots, respectively. -```{r basics-010} +```{r data_and_basic_modeling-008} c(Features = tsk_mtcars$feature_names, Target = tsk_mtcars$target_names) ``` @@ -145,7 +145,7 @@ c(Features = tsk_mtcars$feature_names, The columns of a task have unique `character`-valued names and the rows are identified by unique natural numbers, called row IDs. They can be accessed through the `$row_ids` field: -```{r basics-011} +```{r data_and_basic_modeling-009} head(tsk_mtcars$row_ids) ``` @@ -153,7 +153,7 @@ Row IDs are not used as features when training or predicting but are metadata th Note that row IDs are not the same as row numbers. This is best demonstrated by example, below we create a regression task from random data, print the original row IDs, which correspond to row numbers 1-5, then we filter three rows (we will return to this method just below) and print the new row IDs, which no longer correspond to the row numbers. 
-```{r} +```{r data_and_basic_modeling-010} task = as_task_regr(data.frame(x = runif(5), y = runif(5)), target = "y") task$row_ids @@ -167,7 +167,7 @@ See @sec-backends for more information on using databases as data backends for t The data contained in a task can be accessed through `$data()`, which returns a `r ref("data.table")` object. This method has optional `rows` and `cols` arguments to specify subsets of the data to retrieve. -```{r basics-012} +```{r data_and_basic_modeling-011} # retrieve all data tsk_mtcars$data() # retrieve data for rows with IDs 1, 5, and 10 and all feature columns @@ -180,7 +180,7 @@ tsk_mtcars$data(rows = c(1, 5, 10), cols = tsk_mtcars$feature_names) You can work with row numbers instead of row IDs by using the `$row_ids` field to extract the row ID corresponding to a given row number: -```{r basics-016, eval = FALSE} +```{r data_and_basic_modeling-012, eval = FALSE} # select the 2nd row of the task by extracting the second row_id: tsk_mtcars$data(rows = task$row_ids[2]) ``` @@ -188,7 +188,7 @@ tsk_mtcars$data(rows = task$row_ids[2]) You can always use 'standard' R methods to extract summary data from a task, for example, to summarize the underlying data: -```{r basics-013} +```{r data_and_basic_modeling-013} summary(as.data.table(tsk_mtcars)) ``` @@ -200,7 +200,7 @@ Therefore, we provide `r index('mutators', aside = TRUE)`, which modify the give Subsetting by features (columns) is possible with `$select()` with the desired feature names passed as a character vector and subsetting by observations (rows) is performed with `$filter()` by passing the row IDs as a numeric vector. `r index(NULL, "$select()", parent = "Task", code = TRUE)` `r index(NULL, "$filter()", parent = "Task", code = TRUE)` -```{r} +```{r data_and_basic_modeling-014} tsk_mtcars_small = tsk("mtcars") # initialize with the full task tsk_mtcars_small$select("cyl") # keep only one feature tsk_mtcars_small$filter(2:3) # keep only these rows @@ -209,7 +209,7 @@ tsk_mtcars_small$data() As `R6` uses reference semantics (@sec-r6), you need to use `$clone()` if you want to modify a task while keeping the original object intact. -```{r basics-015} +```{r data_and_basic_modeling-015} # the wrong way tsk_mtcars = tsk("mtcars") tsk_mtcars_wrong = tsk_mtcars @@ -227,7 +227,7 @@ tsk_mtcars$head() To add extra rows and columns to a task, you can use `$rbind()` and `$cbind()` respectively: `r index(NULL, "$cbind()", parent = "Task", code = TRUE)` `r index(NULL, "$rbind()", parent = "Task", code = TRUE)` -```{r basics-017} +```{r data_and_basic_modeling-016} tsk_mtcars_small$cbind( # add another column data.frame(disp = c(150, 160)) ) @@ -244,7 +244,7 @@ The `r ref("mlr_learners", aside = TRUE)` dictionary contains all the learners a We will discuss the available learners in @sec-lrns-add; for now, we will just use a regression tree learner as an example to discuss the `Learner` interface. As with tasks, you can access learners from the dictionary with a single sugar function, in this case, `r ref("lrn()", aside = TRUE)`. -```{r basics-023} +```{r data_and_basic_modeling-017} lrn("regr.rpart") ``` @@ -261,7 +261,7 @@ To run a machine learning experiment, learners pass through two stages (@fig-bas * `r index('Training', "model training", aside = TRUE)`: A training `Task` is passed to the learner's `r index("$train()", parent = "Learner", code = TRUE)` function which trains and stores a `r index('model')`, i.e., the learned relationship of the features to the target. 
* `r index('Predicting', "model predicting", aside = TRUE)`: New data, potentially a different partition of the original dataset, is passed to the `r index("$predict()", parent = "Learner", code = TRUE)` method of the trained learner to predict the target values. -```{r basics-022, echo=FALSE, out.width = "70%"} +```{r data_and_basic_modeling-018, echo=FALSE, out.width = "70%"} #| label: fig-basics-learner #| fig-cap: Overview of the different stages of a learner. Top -- data (features and a target) are passed to an (untrained) learner. Bottom -- new data are passed to the trained model which makes predictions for the 'missing' target column. #| fig-alt: Diagram shows two boxes, the first is labeled "$train() on Training Data" and shows data pointing at the Learner. The second is labeled "$predict() on New Data to Get Predictions" and shows different data pointing at a learner which now includes a "$model". An arrow then shows predictions being made from the "Learner" in the second box. @@ -272,7 +272,7 @@ include_multi_graphics("mlr3book_figures-2") In the simplest use case, models are trained by passing a task to a learner with the `r index("$train()", parent = "Learner", aside = TRUE, code = TRUE)` method: -```{r} +```{r data_and_basic_modeling-019} # load mtcars task tsk_mtcars = tsk("mtcars") # load a regression tree @@ -283,7 +283,7 @@ lrn_rpart$train(tsk_mtcars) After training, the fitted model is stored in the `r index("$model", parent = "Learner", aside = TRUE, code = TRUE)` field for future inspection and prediction: -```{r} +```{r data_and_basic_modeling-020} # inspect the trained model lrn_rpart$model ``` @@ -298,14 +298,14 @@ When assessing the quality of a model's predictions, you will likely want to par In @sec-performance we will look at resampling and benchmark experiments, which will go into more detail about performance estimation but for now, we will just discuss the simplest method of splitting data using the `r ref("partition()", aside = TRUE)` function. This function creates index sets that randomly split the given task into two disjoint sets: a training set\index{training data} (67% of the total data by default) and a test set\index{test data} (the remaining 33% of the total data not in the training set). -```{r basics-025} +```{r data_and_basic_modeling-021} splits = partition(tsk_mtcars) splits ``` When training we will tell the model to only use the training data by passing the row IDs from `partition` to the `row_ids` argument of `$train()`: -```{r basics-025-1} +```{r data_and_basic_modeling-022} lrn_rpart$train(tsk_mtcars, row_ids = splits$train) ``` @@ -317,13 +317,13 @@ Predicting from trained models is as simple as passing your data as a `Task` to Carrying straight on from our last example, we will call the `$predict()` method of our trained learner and again will use the `row_ids` argument, but this time to pass the IDs of our `r index("test set", "test data")`: -```{r basics-030} +```{r data_and_basic_modeling-023} prediction = lrn_rpart$predict(tsk_mtcars, row_ids = splits$test) ``` The `$predict()` method returns an object inheriting from `r ref("Prediction", aside = TRUE)`, in this case `r ref("PredictionRegr", aside = TRUE)` as this is a regression task. 
-```{r} +```{r data_and_basic_modeling-024} prediction ``` @@ -334,13 +334,13 @@ The `Prediction` object can easily be converted into a `data.table` or `data.fra All data in the above columns can be accessed directly, for example, to get the first two predicted responses: -```{r basics-access-pred} +```{r data_and_basic_modeling-025} prediction$response[1:2] ``` Similarly to plotting `Task`s, `r mlr3viz` provides an `r ref("ggplot2::autoplot()")` method for `Prediction` objects. -```{r basics-035, message = FALSE, warning = FALSE, out.width = "70%"} +```{r data_and_basic_modeling-026, message = FALSE, warning = FALSE, out.width = "70%"} #| label: fig-basics-truthresponse #| fig-cap: "Comparing predicted and ground truth values for the mtcars dataset." #| fig-alt: "A scatter plot with predicted values on one axis and ground truth values on the other. A trend line is fit to show that in general there is good agreement between predicted and ground truth values." @@ -353,7 +353,7 @@ In the examples above we made predictions by passing a task to `$predict()`. However, if you would rather pass a `data.frame` type object directly, then you can use `r index("$predict_newdata()", parent = "Learner", code = TRUE)`. Note, the `truth` column values are all `NA`, as we did not include a target column in the generated data. -```{r basics-032} +```{r data_and_basic_modeling-027} mtcars_new = data.table(cyl = c(5, 6), disp = c(100, 120), hp = c(100, 150), drat = c(4, 3.9), wt = c(3.8, 4.1), qsec = c(18, 19.5), vs = c(1, 0), am = c(1, 1), @@ -369,7 +369,7 @@ Several regression models can also predict standard errors. To predict this, the `r index('$predict_type', parent = "Learner", code = TRUE)` field of a `r ref("LearnerRegr")` must be changed from "response" (the default) to `"se"` before training. The `"rpart"` learner we used above does not support predicting standard errors, so in the example below we will use a linear regression model (`lrn("regr.lm")`). -```{r basics-033} +```{r data_and_basic_modeling-028} library(mlr3learners) lrn_lm = lrn("regr.lm", predict_type = "se") lrn_lm$train(tsk_mtcars, splits$train) @@ -392,7 +392,7 @@ Hyperparameters can be optimized automatically (@sec-optimization), but in this We will continue our running example with a regression tree learner. To access the hyperparameters in the decision tree, we use `r index("$param_set", parent = "Learner", aside = TRUE, code = TRUE)`: -```{r basics} +```{r data_and_basic_modeling-029} lrn_rpart$param_set ``` @@ -430,25 +430,25 @@ From the parameter set output, we know that the `maxdepth` parameter has a defau There are a few different ways we could change this hyperparameter. 
The simplest way is during construction of the learner by passing the hyperparameter name and new value to `lrn()`: -```{r} +```{r data_and_basic_modeling-030} lrn_rpart = lrn("regr.rpart", maxdepth = 1) ``` We can get a list of non-default hyperparameters (i.e., those that have been set) by using `$param_set$values`: -```{r basics-027} +```{r data_and_basic_modeling-031} lrn_rpart$param_set$values ``` Now we can see that `maxdepth = 1` (as we discussed above `xval = 0` is changed during construction) and the learned regression tree reflects this: -```{r} +```{r data_and_basic_modeling-032} lrn_rpart$train(tsk("mtcars"))$model ``` The `$values` field simply returns a `list` of set hyperparameters, so another way to update hyperparameters is by updating an element in the list: -```{r} +```{r data_and_basic_modeling-033} lrn_rpart$param_set$values$maxdepth = 2 lrn_rpart$param_set$values # now with depth 2 @@ -457,7 +457,7 @@ lrn_rpart$train(tsk("mtcars"))$model To set multiple values at once we recommend either setting these during construction or using `r index("$set_values()", parent = "Learner", aside = TRUE, code = TRUE)`, which updates the given hyperparameters (argument names) with the respective values. -```{r} +```{r data_and_basic_modeling-034} lrn_rpart = lrn("regr.rpart", maxdepth = 3, xval = 1) lrn_rpart$param_set$values # or with set_values @@ -472,7 +472,7 @@ lrn_rpart$param_set$values As `lrn_rpart$param_set$values` returns a `list`, some users may be tempted to set hyperparameters by passing a new `list` to `$values` -- this would work but **we do not recommend it**. This is because passing a `list` will wipe any existing hyperparameter values if they are not included in the list. For example: -```{r} +```{r data_and_basic_modeling-035} # set xval and cp lrn_rpart_params = lrn("regr.rpart", xval = 0, cp = 1) # passing maxdepth through a list, removing all other values @@ -489,7 +489,7 @@ lrn_rpart_params$param_set$values Whichever method you choose, all have safety checks to ensure your new values fall within the allowed parameter range: -```{r, error=TRUE} +```{r data_and_basic_modeling-036, error=TRUE} lrn("regr.rpart", cp = 2, maxdepth = 2) ``` @@ -502,13 +502,13 @@ One such example is a `r index('support vector machine')` (`lrn("regr.svm")`). The field `r index("$deps", parent = "ParamSet", code = TRUE)` returns a `data.table`, which lists the hyperparameter dependencies in the `Learner`. For example we can see that the `cost` (`id`-column) parameter is dependent on the `type` (`on`-column) parameter. -```{r} +```{r data_and_basic_modeling-037} lrn("regr.svm")$param_set$deps ``` The `cond` column tells us what the condition is, which will either mean that `id` can be set if `on` equals a single value (`r ref("CondEqual")`) or any value in the listed set (`r ref("CondAnyOf")`). 
-```{r} +```{r data_and_basic_modeling-038} lrn("regr.svm")$param_set$deps[[1, "cond"]] lrn("regr.svm")$param_set$deps[[3, "cond"]] ``` @@ -517,7 +517,7 @@ This tells us that the parameter `cost` should only be set if the `type` paramet The `Learner` will error if dependent hyperparameters are set when their conditions are not met: -```{r, error=TRUE} +```{r data_and_basic_modeling-039, error=TRUE} # error as kernel is not polynomial lrn("regr.svm", kernel = "linear", degree = 1) # works because kernel is polynomial @@ -531,7 +531,7 @@ These are extremely simple or 'weak' learners known as `r index('baselines', asi Baselines are useful in model comparison (@sec-performance) and as fallback learners (@sec-encapsulation-fallback, @sec-fallback). For regression, we have implemented the baseline `lrn("regr.featureless")`, which always predicts new values to be the mean (or median, if the `robust` hyperparameter is set to `TRUE`) of the target in the training data: -```{r} +```{r data_and_basic_modeling-040} # generate data df = as_task_regr(data.frame(x = runif(1000), y = rnorm(1000, 2, 1)), target = "y") @@ -547,7 +547,7 @@ Perhaps *the most* important step of the applied machine learning workflow is ev Without this, we would have no way to know if our trained model makes very accurate predictions, is worse than randomly guessing, or somewhere in between. We will continue with our decision tree example to establish if the quality of our predictions is 'good', first we will rerun the above code so it is easier to follow along. -```{r} +```{r data_and_basic_modeling-041} lrn_rpart = lrn("regr.rpart") tsk_mtcars = tsk("mtcars") splits = partition(tsk_mtcars) @@ -560,7 +560,7 @@ prediction = lrn_rpart$predict(tsk_mtcars, splits$test) The quality of predictions is evaluated using measures that compare them to the ground truth data for supervised learning tasks. Similarly to `Task`s and `Learner`s, the available measures in `mlr3` are stored in a dictionary called `r ref("mlr_measures", aside = TRUE)` and can be accessed with `r index("msr()", "msr()/msrs()", aside = TRUE, code = TRUE)`: -```{r} +```{r data_and_basic_modeling-042} as.data.table(msr()) ``` @@ -569,7 +569,7 @@ As well as these defining elements, other metadata are important to consider whe All this information is encapsulated in the `r ref("Measure", aside = TRUE)` object. By example, let us consider the `r index('mean absolute error')` (MAE): -```{r} +```{r data_and_basic_modeling-043} measure = msr("regr.mae") measure ``` @@ -585,13 +585,13 @@ Now let us see how to use this measure for scoring our predictions. Usually, supervised learning measures compare the difference between predicted values and the ground truth. `mlr3` simplifies the process of bringing these quantities together by storing the predictions and true outcomes in the `r ref("Prediction", index = TRUE)` object as we have already seen. -```{r} +```{r data_and_basic_modeling-044} prediction ``` To calculate model performance, we simply call the `r index("$score()", parent = "Prediction", aside = TRUE, code = TRUE)` method of a `Prediction` object and pass as a single argument the measure that we want to compute: -```{r} +```{r data_and_basic_modeling-045} prediction$score(measure) ``` @@ -600,7 +600,7 @@ Note that all task types have default measures that are used if the argument to It is possible to calculate multiple measures at the same time by passing multiple measures to `$score()`. 
For example, below we compute performance for mean squared error (`"regr.mse"`) and mean absolute error (`"regr.mae"`) -- note we use `r index("msrs()", "msr()/msrs()", aside = TRUE, code = TRUE)` to load multiple measures at once. -```{r basics-039} +```{r data_and_basic_modeling-046} measures = msrs(c("regr.mse", "regr.mae")) prediction$score(measures) ``` @@ -619,7 +619,7 @@ These include: For example, we could score our decision tree to see how many seconds it took to train the model and make predictions: -```{r} +```{r data_and_basic_modeling-047} measures = msrs(c("time_train", "time_predict", "time_both")) prediction$score(measures, learner = lrn_rpart) ``` @@ -629,19 +629,19 @@ Notice a few key properties of these measures: 1) `time_both` is simply the sum of `time_train` and `time_predict`. 2) We had to pass `learner = lrn_rpart` to `$score()` as these measures have the `requires_learner` property: -```{r} +```{r data_and_basic_modeling-048} msr("time_train")$properties ``` 3) These can be used after model training and predicting because we automatically store model run times whenever `$train()` and `$predict()` are called, so the measures above are equivalent to: -```{r} +```{r data_and_basic_modeling-049} c(lrn_rpart$timings, both = sum(lrn_rpart$timings)) ``` The `selected_features` measure calculates how many features were used in the fitted model. -```{r} +```{r data_and_basic_modeling-050} msr_sf = msr("selected_features") msr_sf ``` @@ -649,7 +649,7 @@ msr_sf We can see that this measure contains `r index('control parameters', aside = TRUE)` (`Parameters: normalize=FALSE`), which control how the measure is computed. As with hyperparameters these can be accessed with `r index("$param_set", parent = "Measure", code = TRUE)`: -```{r} +```{r data_and_basic_modeling-051} msr_sf = msr("selected_features") msr_sf$param_set ``` @@ -657,7 +657,7 @@ msr_sf$param_set The `normalize` hyperparameter specifies whether the returned number of selected features should be normalized by the total number of features, this is useful if you are comparing this value across tasks with differing numbers of features. We would change this parameter in the exact same way as we did with the learner above: -```{r basics-measures-hp} +```{r data_and_basic_modeling-052} msr_sf$param_set$values$normalize = TRUE prediction$score(msr_sf, task = tsk_mtcars, learner = lrn_rpart) ``` @@ -671,7 +671,7 @@ What we have not yet attempted is to ascertain if our predictions are any 'good' So before look at how the building blocks of `mlr3` extend to classification, we will take a brief pause to put together everything above in a short experiment to assess the quality of our predictions. We will do this by comparing the performance of a featureless regression learner to a decision tree with changed hyperparameters. -```{r} +```{r data_and_basic_modeling-053} library(mlr3) set.seed(349) # load and partition our task @@ -717,7 +717,7 @@ We will then move to differences in tasks, learners and predictions, before look The interface for classification tasks, learners, and measures, is identical to the regression setting, except the underlying objects inherit from `r ref("TaskClassif", index = TRUE)`, `r ref("LearnerClassif", index = TRUE)`, and `r ref("MeasureClassif", index = TRUE)`, respectively. We can therefore run a very similar experiment to the one above. 
-```{r} +```{r data_and_basic_modeling-054} library(mlr3) set.seed(349) # load and partition our task @@ -749,13 +749,13 @@ Classification tasks, objects inheriting from `r ref("TaskClassif", aside = TRUE You can view the predefined classification tasks in `mlr3` by filtering the `mlr_tasks` dictionary: -```{r} +```{r data_and_basic_modeling-055} as.data.table(mlr_tasks)[task_type == "classif"] ``` You can create your own task with `r ref("as_task_classif", aside = TRUE)`. -```{r} +```{r data_and_basic_modeling-056} as_task_classif(palmerpenguins::penguins, target = "species") ``` @@ -763,7 +763,7 @@ There are two types of classification tasks supported in `mlr3`: binary classifi The `sonar` task is an example of a binary classification problem, as the target can only take two different values, in `mlr3` terminology it has the "twoclass" property: -```{r} +```{r data_and_basic_modeling-057} tsk_sonar = tsk("sonar") tsk_sonar tsk_sonar$class_names @@ -771,7 +771,7 @@ tsk_sonar$class_names In contrast, `tsk("penguins")` is a multiclass problem as there are more than two species of penguins; it has the "multiclass" property: -```{r basics-041} +```{r data_and_basic_modeling-058} tsk_penguins = tsk("penguins") tsk_penguins$properties tsk_penguins$class_names @@ -783,7 +783,7 @@ It is arbitrary which is which, though often the more 'important' (and often sma You can set the positive class during or after construction. If no positive class is specified then `mlr3` assumes the first level in the `target` column is the positive class, which can lead to misleading results. -```{r} +```{r data_and_basic_modeling-059} # Load the "Sonar" dataset from the "mlbench" package as an example data(Sonar, package = "mlbench") # specifying the positive class: @@ -799,7 +799,7 @@ While the choice of positive and negative class is arbitrary, they are essential Finally, plotting is possible with `r ref("autoplot.TaskClassif")`, below we plot a comparison between the target column and features. -```{r basics-043, warning = FALSE, message = FALSE, fig.height = 5} +```{r data_and_basic_modeling-060, warning = FALSE, message = FALSE, fig.height = 5} #| label: fig-penguins-overview #| fig-cap: Overview of part of the penguins dataset. #| fig-alt: Diagram showing the distribution of target and feature values for a subset of the penguins data. The 'Adelie' species has an even split between male/female, short bill length and average bill depth. The 'Chinstrap' species only come from the island 'Dream' and have a lower body mass. The 'Gentoo' species only come from the island 'Biscoe', and have a longer flipper length and higher body mass. @@ -814,7 +814,7 @@ Classification learners, which inherit from `r ref("LearnerClassif", aside = TRU However, a key difference is that the possible predictions in classification are either `"response"` -- predicting an observation's class (a penguin's species in our example, this is sometimes called "hard labeling") -- or `"prob"` -- predicting a vector of probabilities, also called "posterior probabilities", of an observation belonging to each class. 
In classification, the latter can be more useful as it provides information about the confidence of the predictions: -```{r basics-044} +```{r data_and_basic_modeling-061} lrn_rpart = lrn("classif.rpart", predict_type = "prob") lrn_rpart$train(tsk_penguins, splits$train) prediction = lrn_rpart$predict(tsk_penguins, splits$test) @@ -828,7 +828,7 @@ The key difference in usage is that you will need to ensure your selected measur To evaluate `"response"` predictions, you will need measures with `predict_type = "response"`, or to evaluate probability predictions you will need `predict_type = "prob"`. The easiest way to find these measures is by filtering the `r ref("mlr_measures")` dictionary: -```{r} +```{r data_and_basic_modeling-062} as.data.table(msr())[ task_type == "classif" & predict_type == "prob" & !sapply(task_properties, function(x) "twoclass" %in% x)] @@ -838,7 +838,7 @@ We also filtered to remove any measures that have the `"twoclass"` property as t We need to use `sapply` for this, the `task_properties` column is a list column. We can evaluate the quality of our probability predictions and response predictions simultaneously by providing multiple measures: -```{r} +```{r data_and_basic_modeling-063} measures = msrs(c("classif.mbrier", "classif.logloss", "classif.acc")) prediction$score(measures) ``` @@ -857,7 +857,7 @@ Firstly, the added field `$confusion`, and secondly the added method `r index('$ A `r index('confusion matrix', aside = TRUE)` is a popular way to show the quality of classification (response) predictions in a more detailed fashion by seeing if a model is good at (mis)classifying observations in a particular class. For binary and multiclass classification, the confusion matrix is stored in the `r index('$confusion', parent = "PredictionClassif", code = TRUE, aside = TRUE)` field of the `r ref("PredictionClassif")` object: -```{r basics-049} +```{r data_and_basic_modeling-064} prediction$confusion ``` @@ -867,12 +867,12 @@ In this case, the classifier does fairly well classifying all penguins, but we c You can visualize the predicted class labels with `autoplot.PredictionClassif()`. -```{r} +```{r data_and_basic_modeling-065} #| output: false #| cache: false autoplot(prediction) ``` -```{r, out.width = "70%"} +```{r data_and_basic_modeling-066, out.width = "70%"} #| fig-cap: "Counts of each class label in the ground truth data (left) and predictions (right)." #| fig-alt: "Two stacked bar plots. Bottom left corresponds to true number of Gentoo species (41), middle left is true Chinstrap (22) and top left is true Adelie (50). Bottom right is predicted number of Gentoo species (41), middle right is Chinstrap (20), and top right is Adelie (52)." #| label: fig-basics-classlabels @@ -887,7 +887,7 @@ print(plt) In the binary classification case, the top left entry corresponds to `r index('true positives')`, the top right to `r index('false positives')`, the bottom left to `r index('false negatives')` and the bottom right to `r index('true negatives')`. Taking `tsk_sonar` as an example with ``r tsk_sonar$positive`` as the positive class: -```{r} +```{r data_and_basic_modeling-067} splits = partition(tsk_sonar) lrn_rpart$ train(tsk_sonar, splits$train)$ @@ -909,7 +909,7 @@ This 50% value is known as the threshold and it can be useful to change this thr As an example, let us take `tsk("german_credit")` in which 700 customers have good credit and 300 have bad. 
Now we could easily build a model with around "70%" accuracy simply by always predicting a customer will have good credit: -```{r} +```{r data_and_basic_modeling-068} task_credit = tsk("german_credit") lrn_featureless = lrn("classif.featureless", predict_type = "prob") split = partition(task_credit) @@ -918,13 +918,13 @@ prediction = lrn_featureless$predict(task_credit, split$test) prediction$score(msr("classif.acc")) ``` -```{r} +```{r data_and_basic_modeling-069} #| output: false #| cache: false autoplot(prediction) ``` -```{r, out.width = "70%"} +```{r data_and_basic_modeling-070, out.width = "70%"} #| fig-cap: "Class labels ground truth (left) and predictions (right). The learner completely ignores the 'bad' class." #| fig-alt: "Two stacked bar plots. Bottom left corresponds to true number of 'good' customers (231) and top left is 'bad' customers (99). Right is a single bar corresponding to 330 'good' predictions, 'bad' is never predicted." #| label: fig-basics-classlabels-german @@ -941,12 +941,12 @@ While this model may appear to have good performance on the surface, in fact, it Thresholding allows classes to be selected with a different probability threshold, so instead of predicting that a customer has bad credit if P(good) < 50%, we might predict bad credit if P(good) < 70% -- notice how we write this in terms of the positive class, which in this task is 'good'. Let us see this in practice: -```{r} +```{r data_and_basic_modeling-071} prediction$set_threshold(0.7) prediction$score(msr("classif.acc")) ``` -```{r} +```{r data_and_basic_modeling-072} lrn_rpart = lrn("classif.rpart", predict_type = "prob") lrn_rpart$train(task_credit, split$train) prediction = lrn_rpart$predict(task_credit, split$test) @@ -966,7 +966,7 @@ In multiclass classification, thresholding works by first assigning a threshold For example, say we are predicting if a new observation will be of class A, B, C, or D and we have predicted $P(A = 0.2), P(B = 0.4), P(C = 0.1), P(D = 0.3)$. We will assume that the threshold for all classes is identical and `1`: -```{r} +```{r data_and_basic_modeling-073} probs = c(0.2, 0.4, 0.1, 0.3) thresholds = c(A = 1, B = 1, C = 1, D = 1) probs/thresholds @@ -975,7 +975,7 @@ probs/thresholds We would therefore predict our observation is of class B as this is the highest ratio. However, we could change our thresholds so that D has the lowest threshold and is most likely to be predicted, A has the highest threshold, and B and C have equal thresholds: -```{r} +```{r data_and_basic_modeling-074} thresholds = c(A = 0.5, B = 0.25, C = 0.25, D = 0.1) probs/thresholds ``` @@ -986,7 +986,7 @@ In `mlr3`, this is achieved by passing a named list to `$set_threshold()`. This is demonstrated below with `tsk("zoo")`. Before changing the thresholds, some classes are never predicted and some are predicted more often than they occur. -```{r basics-thresholding-3} +```{r data_and_basic_modeling-075} #| label: fig-zoopreds #| fig-cap: "Comparing predicted and ground truth values for the zoo dataset." #| fig-alt: "Four stacked barplots comparing predictions before and after thresholding. Before thresholding some classes are over-predicted and some are never predicted. After thresholding there is still some imbalance but less drastic." @@ -1038,7 +1038,7 @@ Column roles are updated using `r index("$set_col_roles()", parent = "Task", cod When we set the `"order"` column role, the data is ordered according to that column(s). 
In the following example, we set the `"order"` column role and then order data by this column by including `ordered = TRUE`: -```{r} +```{r data_and_basic_modeling-076} df = data.frame(mtcars[1:2, ], idx = 2:1) tsk_mtcars_order = as_task_regr(df, target = "mpg") # original order @@ -1055,7 +1055,7 @@ The `weights` column role is used to weight data points differently. One example of why we would do this is in classification tasks with severe class imbalance, where weighting the minority class more heavily may improve the model's predictive performance for that class. For example in the `breast_cancer` dataset, there are more instances of benign tumors than malignant tumors, so if we want to better predict malignant tumors we could weight the data in favor of this class: -```{r} +```{r data_and_basic_modeling-077} cancer_unweighted = tsk("breast_cancer") summary(cancer_unweighted$data()$class) @@ -1108,7 +1108,7 @@ You can find an up-to-date list of learners at `r link("https://mlr-org.com/lear The dictionary `r ref("mlr_learners", index = TRUE)` contains learners that are supported in loaded packages: -```{r basics-learners-list} +```{r data_and_basic_modeling-078} learners_dt = as.data.table(mlr_learners) learners_dt ``` @@ -1116,13 +1116,13 @@ learners_dt The resulting `data.table` contains a lot of metadata that is useful for identifying learners with particular properties. For example, we can list all learners that support classification problems: -```{r basics-learners-list-regr} +```{r data_and_basic_modeling-079} learners_dt[task_type == "classif"] ``` We can filter by multiple conditions, for example to list all regression learners that can predict standard errors: -```{r basics-learners-regr-se} +```{r data_and_basic_modeling-080} learners_dt[task_type == "regr" & sapply(predict_types, function(x) "se" %in% x)] ``` diff --git a/book/chapters/chapter3/evaluation_and_benchmarking.qmd b/book/chapters/chapter3/evaluation_and_benchmarking.qmd index d41c4907e..0aedadf94 100644 --- a/book/chapters/chapter3/evaluation_and_benchmarking.qmd +++ b/book/chapters/chapter3/evaluation_and_benchmarking.qmd @@ -47,7 +47,7 @@ As a rule of thumb, it is common to use 2/3 of the data for training and 1/3 for In @sec-basics, we used `r ref("partition()")` to apply the holdout method to a `r ref("Task")` object. To recap, let us split `tsk("penguins")` with a 2/3 holdout (default split): -```{r performance-003} +```{r evaluation_and_benchmarking-001} tsk_penguins = tsk("penguins") splits = partition(tsk_penguins) lrn_rpart = lrn("classif.rpart") @@ -59,7 +59,7 @@ We can now estimate the generalization performance of a final model by evaluatin As we have seen in @sec-eval, this is simply a case of choosing one or more measures and passing them to the `$score()` function. So to estimate the accuracy of our final model we would pass the accuracy measure to our intermediate model: -```{r} +```{r evaluation_and_benchmarking-002} prediction$score(msr("classif.acc")) ``` @@ -80,7 +80,7 @@ The `$score()` method summarizes these individual loss values into a an average Other performance measures that are not decomposable instead act on a set of observations, we will return to this in detail when we look at the AUC measure in @sec-roc. @fig-score illustrates the input-output behavior of the `$score()` method, we will return to this when we turn to more complex evaluation strategies. 
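As a quick sanity check of this point-wise view, the sketch below (an illustrative aside that reuses the `prediction` object from the holdout split above) verifies that the aggregated classification error returned by `$score()` is simply the mean of the per-observation 0/1 losses:

```r
# per-observation 0/1 loss: 1 if the predicted class differs from the ground truth
loss = as.numeric(prediction$truth != prediction$response)
# averaging the point-wise losses ...
mean(loss)
# ... matches the aggregated classification error computed by $score()
prediction$score(msr("classif.ce"))
```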
-```{r performance-017, out.width = "80%"} +```{r evaluation_and_benchmarking-003, out.width = "80%"} #| echo: false #| label: fig-score #| fig-cap: "Illustration of the `$score()` method which aggregates predictions of multiple observations contained in a prediction object into a single numeric score" @@ -97,7 +97,7 @@ By repeating the data splitting process, data points are repeatedly used for bot Furthermore, a high number of resampling iterations can reduce the variance in our scores and thus result in a more reliable performance estimate. This means that the performance estimate is less likely to be affected by an 'unlucky' split (e.g., a split that does not reflect the original data distribution). -```{r performance-002, echo=FALSE} +```{r evaluation_and_benchmarking-004, echo=FALSE} #| label: fig-ml-abstraction #| fig-cap: "A general abstraction of the performance estimation process. The available data is (repeatedly) split into training data and test data (data splitting / resampling process). The learner is trained on each training dataset and produces intermediate models (learning process). Each intermediate model makes predictions based on the features in the test data. The performance measure compares these predictions with the ground truth from the test data and computes a performance value for each test dataset. All performance values are aggregated into a scalar value to estimate the generalization performance (evaluation process)." #| fig-alt: "A flowchart-like diagram with 3 overlapping boxes. Left box has the caption 'Data splitting / resampling process', upper right box has caption 'Learning process', and lower right box has caption 'Evaluation process'. The process starts in the left box with 'Data' and an arrow to 'Resampling Strategy', which separates into two elements stacked vertically: 'Train Set(s)' above and 'Test Set(s)' below. The 'Train set(s)' element leads to a 'Learner' box, which is inside the larger 'Learning Process' box. A box that says 'Hyperparameters' also sits within the 'Learning Process' and is connected with an arrow also pointing to 'Learner'. An arrow points from the 'Learner' to a stack of 'Intermediate Model(s)'. One thick arrow goes down into the yellow box to a stack of 'Prediction(s)'. An arrow goes from there to 'Performance measure'. The 'Test set(s)' from earlier also have an arrow to 'Performance measure'. From there, a thick arrow goes to 'Performance Value(s)', which has a final dashed arrow to 'Aggregated Performance'." @@ -131,7 +131,7 @@ In general, the larger the dataset, the fewer splits are required, yet sample-si For settings where one is more interested in proper inference (such as through statistical performance tests or confidence intervals) than bare point estimators of performance, bootstrapping and subsampling are often considered, usually with a higher number of iterations. Bootstrapping has become less common, as having repeated observations in training data can lead to problems in some machine learning setups, especially when combined with model selection methods and nested resampling (as duplicated observations can then end up simultaneously in training and test sets in nested schemes). Also note that in all of these common and simple schemes, resampling performance estimates are not independent, as models are fitted on overlapping training data, making proper inference less than trivial, but a proper treatment of these issues is out of scope for us here. 
For further details and critical discussion we refer to the literature, e.g., @molinaro2005prediction, @kim2009estimating, and @bischl2012resampling. -```{r performance-007, echo=FALSE} +```{r evaluation_and_benchmarking-005, echo=FALSE} #| label: fig-cv-illustration #| fig-cap: "Illustration of a three-fold cross-validation." #| fig-alt: "Complex flow chart in roughly three rows. Top row (Iteration 1) shows Dtrain split into two light blue boxes representing training data and pointing to a 'Learner', which points to a 'Model'. A dark blue box representing test data points to the same 'Model' as well as 'Measure'. 'Model' points to 'Prediction' which also points to 'Measure', which then points to 'Performance', which has an arrow to 'Averaged Performance'. In rows two and three the same process is inferred except with different boxes in dark and light blue so that each box has been dark blue exactly once across all three iterations." @@ -144,7 +144,7 @@ In the rest of this section, we will go through querying and constructing resamp All implemented resampling strategies are stored in the `r ref("mlr_resamplings")` dictionary. -```{r performance-008} +```{r evaluation_and_benchmarking-006} as.data.table(mlr_resamplings) ``` @@ -153,13 +153,13 @@ The `params` column shows the parameters of each resampling strategy (e.g., the `r ref("Resampling", aside = TRUE)` objects can be constructed by passing the strategy 'key' to the sugar function `r ref("rsmp()", aside = TRUE)`. For example, to construct the holdout strategy with a 4/5 split (2/3 by default): -```{r performance-009} +```{r evaluation_and_benchmarking-007} rsmp("holdout", ratio = 0.8) ``` Parameters for objects inheriting from `Resampling` work in the same way as measures and learners and can be set, retrieved, and updated accordingly: -```{r performance-011} +```{r evaluation_and_benchmarking-008} # three-fold CV cv3 = rsmp("cv", folds = 3) # Subsampling with 3 repeats and 9/10 ratio @@ -172,7 +172,7 @@ When a `"Resampling"` object is constructed, it is simply a definition for how t However, it is possible to manually instantiate a resampling strategy, i.e., generate all train-test splits, by calling the `$instantiate()`\index{\texttt{Resampling}!\texttt{\$instantiate()}}[`$instantiate()`]{.aside} method on a given task. So carrying on our `tsk("penguins")` example we can instantiate the three-fold CV object and then view the row indices of the data selected for training and testing each fold using `$train_set()` and `$test_set()` respectively: -```{r performance-012} +```{r evaluation_and_benchmarking-009} cv3$instantiate(tsk_penguins) # first 5 observations in first training set cv3$train_set(1)[1:5] @@ -189,7 +189,7 @@ Therefore, manually instantiating resampling strategies is rarely required but m The `r ref("resample()", aside = TRUE)` function takes a given `Task`, `Learner`, and `r ref("Resampling")` object to run the given resampling strategy. `resample()` repeatedly fits a model on training sets, makes predictions on the corresponding test sets and stores them in a `r ref("ResampleResult", aside = TRUE)` object, which contains all the information needed to estimate the generalization performance. -```{r performance-013} +```{r evaluation_and_benchmarking-010} rr = resample(tsk_penguins, lrn_rpart, cv3) rr ``` @@ -197,7 +197,7 @@ rr Each row of the output corresponds to one of the three iterations/folds. 
As with `Prediction` objects, we can calculate the score *for each iteration* with `$score()`: -```{r performance-014} +```{r evaluation_and_benchmarking-011} acc = rr$score(msr("classif.ce")) acc[, .(iteration, classif.ce)] ``` @@ -209,14 +209,14 @@ By default, `$score()` evaluates the performance in the *test* sets in each iter While `$score()` returns the performance in each evaluation, `r index('$aggregate()', parent = "Learner", aside = TRUE, code = TRUE)`, returns the aggregated score across all resampling iterations. -```{r} +```{r evaluation_and_benchmarking-012} rr$aggregate(msr("classif.ce")) ``` By default, the majority of measures will aggregate scores using a `r index("macro average")`, which first calculates the measure in each resampling iteration separately, and then averages these scores across all iterations. However, it is also possible to aggregate scores using a `r index("micro average")`, which pools predictions across resampling iterations into one `r ref("Prediction")` object and then computes the measure on this directly: -```{r performance-015} +```{r evaluation_and_benchmarking-013} rr$aggregate(msr("classif.ce", average = "micro")) ``` @@ -229,7 +229,7 @@ The default type of aggregation method can be found by querying the `$average` f ## Macro- and Micro-Averaging As a simple example to explain macro- and micro-averaging, consider the difference between taking the mean of a vector (micro) compared to the mean of two group-wise means (macro): -```{r} +```{r evaluation_and_benchmarking-014} # macro mean(mean(c(3, 5, 9)), mean(c(1, 5))) # micro @@ -245,7 +245,7 @@ The aggregated score returned by `$aggregate()` estimates the generalization per While we are usually interested in this aggregated score, it can be useful to look at the individual performance values of each resampling iteration (as returned by the `$score()` method) as well, e.g., to see if any of the iterations lead to very different performance results. @fig-score-aggregate-resampling visualizes the relationship between `$score()` and `$aggregate()` for a small example based on the `"penguins"` task. -```{r performance-017} +```{r evaluation_and_benchmarking-015} #| echo: false #| label: fig-score-aggregate-resampling #| fig-cap: "An example of the difference between `$score()` and `$aggregate()`: The former aggregates predictions to a single score within each resampling iteration, and the latter aggregates scores across all resampling iterations." @@ -258,14 +258,14 @@ Histograms can be useful to visually gauge the variance of the performance resul -```{r} +```{r evaluation_and_benchmarking-016} #| eval: false rr = resample(tsk_penguins, lrn_rpart, rsmp("cv", folds = 10)) autoplot(rr, measure = msr("classif.acc"), type = "boxplot") autoplot(rr, measure = msr("classif.acc"), type = "histogram") ``` -```{r performance-035} +```{r evaluation_and_benchmarking-017} #| layout-ncol: 2 #| label: fig-resamp-viz #| fig-subcap: @@ -292,7 +292,7 @@ We can use the `$predictions()` method to obtain a list of `r ref("Prediction")` This can be used to analyze the predictions of individual intermediate models from each resampling iteration. To understand the class better, we use it here to manually compute a macro averaged performance estimate. 
-```{r performance-018} +```{r evaluation_and_benchmarking-018} # list of prediction objects rrp = rr$predictions() # print first two @@ -306,7 +306,7 @@ The `$prediction()` method can be used to extract a single `Prediction` object t The combined prediction object can, for example, be used to manually compute a micro-averaged performance estimate (see @sec-resampling-exec for how to you can micro-average more conveniently). -```{r} +```{r evaluation_and_benchmarking-019} prediction = rr$prediction() prediction prediction$score() @@ -317,7 +317,7 @@ However, it can sometimes be useful to inspect, compare, or extract information We can configure the `r ref("resample()")` function to keep the fitted intermediate models by setting `store_models = TRUE`. Each model trained in a specific resampling iteration can then be accessed via `$learners[[i]]$model`, where `i` refers to the `i`-th resampling iteration: -```{r performance-021} +```{r evaluation_and_benchmarking-020} rr = resample(tsk_penguins, lrn_rpart, cv3, store_models = TRUE) # get the model from the first iteration rr$learners[[1]]$model @@ -325,7 +325,7 @@ rr$learners[[1]]$model In this example, we could then inspect the most important variables in each iteration to help us learn more about the respective fitted models: -```{r performance-022} +```{r evaluation_and_benchmarking-021} # print 2nd and 3rd iteration lapply(rr$learners[2:3], function(x) x$model$variable.importance) ``` @@ -339,7 +339,7 @@ Sometimes it is necessary to perform resampling with custom splits, e.g., to rep A custom holdout resampling strategy can be constructed using `rsmp("custom")`, where the row IDs of the observations used for training and testing must be defined manually when instantiated with a task. In the example below, we first construct a custom holdout resampling strategy by manually assigning row IDs to the `$train` and `$test` fields, then construct a resampling strategy with two iterations by passing row IDs as list elements: -```{r performance-023} +```{r evaluation_and_benchmarking-022} rsmp_custom = rsmp("custom") # resampling strategy with two iterations @@ -355,7 +355,7 @@ A custom cross-validation strategy can be more efficiently constructed with `rsm In this case, we now have to specify either a custom `factor` variable or a `factor` column from the data to determine the folds. In the example below, we use a smaller version of `tsk("penguins")` and instantiate a custom two-fold CV strategy using a `factor` variable called `folds` where the first and third rows are used as the test set in Fold 1, and the second and fourth rows are used as the test set in Fold 2: -```{r performance-025} +```{r evaluation_and_benchmarking-023} tsk_small = tsk("penguins")$filter(c(1, 100, 200, 300)) rsmp_customcv = rsmp("custom_cv") folds = as.factor(c(1, 2, 1, 2)) @@ -378,7 +378,7 @@ For example, in a longitudinal study, measurements are taken from the same indiv If we do not group these, we might overestimate the model's generalization capability to unseen individuals, because observations of the same individuals might simultaneously be in the train and test set. In this context, the leave-one-out cross-validation strategy can be coarsened to the "leave-one-object-out" cross-validation strategy, where all observations associated with a certain group are left out (@fig-group). 
-```{r performance-026, echo=FALSE} +```{r evaluation_and_benchmarking-024, echo=FALSE} #| label: fig-group #| fig-cap: "Illustration of the train-test splits of a leave-one-object-out cross-validation with 3 groups of observations (highlighted by different colors)." #| fig-alt: "Three images, each shows a green box with text 'Train' and white space around it with text 'Test'. Left (Iteration 1): green box with blue and red dots inside it and yellow dots outside it. Middle (Iteration 2): green box with blue and yellow dots inside it and red dots outside it. Right (Iteration 3): green box with yellow and red dots inside it and blue dots outside it." @@ -388,7 +388,7 @@ include_multi_graphics("mlr3book_figures-7") The `"group"` column role allows us to specify the column in the data that defines the group structure of the observations. In the following code, we construct a leave-one-out resampling strategy, assign the `"group"` role to the 'year' column of `tsk("penguins")`, instantiate the resampling strategy, and finally show how the years are nicely separated in the first fold. -```{r performance-027} +```{r evaluation_and_benchmarking-025} rsmp_loo = rsmp("loo") tsk_grp = tsk("penguins") tsk_grp$set_col_roles("year", "group") @@ -405,7 +405,7 @@ Stratified sampling ensures that one or more discrete features within the traini This is especially useful when a discrete feature is highly imbalanced and we want to make sure that the distribution of that feature is similar in each resampling iteration (@fig-stratification). We can also stratify on the target feature to ensure that each intermediate model is fit on training data where the class distribution of the target is representative of the actual task, this is useful to ensure target classes are not strongly under-represented by random chance in individual resampling iterations, which would lead to degenerate estimations of the generalization performance. -```{r performance-028, echo=FALSE} +```{r evaluation_and_benchmarking-026, echo=FALSE} #| label: fig-stratification #| fig-cap: "Illustration of a three-fold cross-validation with stratification for an imbalanced binary classification task with a majority class that is about twice as large as the minority class. In each resampling iteration, the class distribution from the available data is preserved (which is not necessarily the case for cross-validation without stratification)." #| fig-alt: "The figure shows rectangles in yellow and green to represent the majority and minority class respectively. On the left side are rectangles corresponding to the task before it is split; the majority class (yellow) on the left is clearly larger than the minority class (green) on the right. This is labeled 'Imabalanced Class Distribution'. In the next three boxes, labeled 'Iteration 1-3' respectively, the size difference between the majority and minority classes is preserved, i.e., the difference in size between majority and minority classes are equal." @@ -417,13 +417,13 @@ In this case, strata would be formed out of each combination of the stratified f `tsk("penguins")` displays imbalance in the `species` column, as can be seen in the output below: -```{r performance-029} +```{r evaluation_and_benchmarking-027} prop.table(table(tsk_penguins$data(cols = "species"))) ``` Without specifying a `"stratum"` column role, the `species` column may have quite different class distributions across the CV folds, as can be seen in the example below. 
-```{r performance-030} +```{r evaluation_and_benchmarking-028} rsmp_cv10 = rsmp("cv", folds = 10) rsmp_cv10$instantiate(tsk_penguins) @@ -441,7 +441,7 @@ When imbalance is severe, minority classes might not occur in the training sets Consequently, the intermediate models within these resampling iterations will never predict the missing class, resulting in a misleading performance estimate for any resampling strategy without stratification. The code below uses `species` as `"stratum"` column role to illustrate that the distribution of `species` in each test set will closely match the original distribution: -```{r performance-031} +```{r evaluation_and_benchmarking-029} tsk_str = tsk("penguins") # set species to have both the 'target' and 'stratum' column role tsk_str$set_col_roles("species", c("target", "stratum")) @@ -457,7 +457,7 @@ rbind("Fold 1" = fold1, "Fold 2" = fold2) You can view the observations that fall into each stratum using the `$strata` field of a `Task` object, this can be particularly useful when we are interested in multiple strata: -```{r performance-034} +```{r evaluation_and_benchmarking-030} tsk_str$set_col_roles("year", "stratum") tsk_str$strata # N above matches with numbers in table below @@ -481,7 +481,7 @@ The provided resampling strategy is automatically instantiated on each task to e To use the `benchmark()` function we first call `r ref("benchmark_grid()")`, which constructs an exhaustive *design* to describe all combinations of the learners, tasks and resamplings to be used in a benchmark experiment, and instantiates the resampling strategies. By example, below we set up a design to see if a random forest, decision tree, or featureless baseline (@sec-basics-featureless), performs best across two classification tasks. -```{r performance-037} +```{r evaluation_and_benchmarking-031} tasks = tsks(c("german_credit", "sonar")) learners = lrns(c("classif.rpart", "classif.ranger", "classif.featureless"), predict_type = "prob") @@ -503,14 +503,14 @@ Since this process is stochastic, it is necessary to set a seed **before** calli The constructed benchmark design can then be passed to `benchmark()` to run the experiment and the result is a `r ref("BenchmarkResult")` object: -```{r performance-039} +```{r evaluation_and_benchmarking-032} bmr = benchmark(design) bmr ``` As `benchmark()` is just an extension of `resample()`, we can once again use `$score()`, or `$aggregate()` depending on your use-case, though note that in this case `$score()` will return results over each fold of each learner/task/resampling combination. -```{r performance-040} +```{r evaluation_and_benchmarking-033} bmr$score()[c(1, 7, 13), .(iteration, task_id, learner_id, classif.ce)] bmr$aggregate()[, .(task_id, learner_id, classif.ce)] ``` @@ -530,7 +530,7 @@ Analysis of benchmark experiments, including statistical tests, is covered in mo A `r ref("BenchmarkResult")` object is a collection of multiple `r ref("ResampleResult", index = TRUE)` objects. -```{r performance-043} +```{r evaluation_and_benchmarking-034} bmrdt = as.data.table(bmr) bmrdt[1:2, .(task, learner, resampling, iteration)] ``` @@ -538,7 +538,7 @@ bmrdt[1:2, .(task, learner, resampling, iteration)] The contents of a `BenchmarkResult` and `ResampleResult` (@sec-resampling-inspect) are almost identical and the stored `ResampleResult`s can be extracted via the `$resample_result(i)` method, where `i` is the index of the performed resample experiment. 
This allows us to investigate the extracted `ResampleResult` and individual resampling iterations as shown in @sec-resampling, as well as the predictions from each fold with `$resample_result(i)$predictions()`. -```{r performance-044} +```{r evaluation_and_benchmarking-035} rr1 = bmr$resample_result(1) rr1 rr2 = bmr$resample_result(2) @@ -547,7 +547,7 @@ rr2 = bmr$resample_result(2) In addition, `r ref('as_benchmark_result()')` can be used to convert objects from `ResampleResult` to `BenchmarkResult`. The `c()`-method can be used to combine multiple `BenchmarkResult` objects, which can be useful when conducting experiments across multiple machines: -```{r performance-045} +```{r evaluation_and_benchmarking-036} bmr1 = as_benchmark_result(rr1) bmr2 = as_benchmark_result(rr2) @@ -556,13 +556,13 @@ c(bmr1, bmr2) Boxplots are most commonly used to visualize benchmark experiments as they can intuitively summarize results across tasks and learners simultaneously. -```{r performance-046} +```{r evaluation_and_benchmarking-037} #| output: false #| cache: false autoplot(bmr, measure = msr("classif.acc")) ``` -```{r, out.width = "70%"} +```{r evaluation_and_benchmarking-038, out.width = "70%"} #| fig-height: 5 #| fig-width: 6 #| label: fig-benchmark-box @@ -600,7 +600,7 @@ This classifier must have a very high number of TPs (as FNs are not acceptable a As we saw in @sec-basics-classif-learner, it is possible for a classifier to have a good classification accuracy but to overlook the nuances provided by a full confusion matrix, as in the following `tsk("german_credit")` example: -```{r performance-050} +```{r evaluation_and_benchmarking-039} tsk_german = tsk("german_credit") lrn_ranger = lrn("classif.ranger", predict_type = "prob") splits = partition(tsk_german, ratio = 0.8) @@ -624,7 +624,7 @@ Instead, several normalized measures can be derived (@fig-confusion): * **Accuracy (ACC)\index{accuracy}**: The proportion of correctly classified instances out of the total number of instances. * **F1-score\index{F1}**: The harmonic mean of precision and recall, which balances the trade-off between precision and recall. It is calculated as $2 \times \frac{Precision \times Recall}{Precision + Recall}$. -```{r performance-049} +```{r evaluation_and_benchmarking-040} #| echo: false #| label: fig-confusion #| fig-cap: "Binary confusion matrix of ground truth class vs. predicted class." @@ -634,7 +634,7 @@ include_multi_graphics("confusion_matrix") The `r ref_pkg("mlr3measures")` package allows you to compute several common confusion matrix-based measures using the `r ref("mlr3measures::confusion_matrix()")` function: -```{r performance-051} +```{r evaluation_and_benchmarking-041} mlr3measures::confusion_matrix(truth = prediction$truth, response = prediction$response, positive = tsk_german$positive) ``` @@ -658,7 +658,7 @@ For example, if each positive instance will be randomly classified (ignoring fea If we assign each negative instance randomly to the positive class, we would have an FPR of 0.25. In practice, we should never obtain a classifier below the diagonal and a point in the ROC space below the diagonal might indicate that the positive and negative class labels have been switched by the classifier. 
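A single discrete classifier corresponds to one point in ROC space, with its FPR and TPR as coordinates. As a minimal sketch, reusing the `prediction` object of the random forest from above, both values can be computed directly with the corresponding measures:

```r
# TPR (sensitivity) and FPR (1 - specificity) of the random forest prediction;
# together they give this classifier's coordinates as a single point in ROC space
prediction$score(msrs(c("classif.tpr", "classif.fpr")))
```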
-```{r performance-054, echo = FALSE, fig.height = 3.5, fig.width = 8} +```{r evaluation_and_benchmarking-042, echo = FALSE, fig.height = 3.5, fig.width = 8} #| label: fig-roc #| fig-cap: "Panel (a): ROC space with best discrete classifier, two baseline classifiers -- one that always predicts the positive class and one that never predicts the positive class -- and three 'real' classifiers C1, C2, C3. We cannot say if C1 or C3 is better than the other as both are better in one metric. C2 is clearly worse than C1 and C3, which are better in at least one metric than C2 while not being worse in any other metric. Panel (b): ROC curves of the best classifier (AUC = 1), of a random guessing classifier (AUC = 0.5), and the classifiers C1, C3, and C2." #| fig-alt: "Two plots labeled (a) and (b). Both have 'FPR' between 0-1 on x-axis and 'TPR' between 0-1 on y-axis, both also have a diagonal line y=x with text 'baseline (random classifiers)'. (a): There is a green dot in upper left corner at (0,1). There is a triangle labeled C1 at around (0.1,0.75), a square labeled C2 at around (0.24, 0.75), and a plus labeled C3 at around (0.25, 0.8). (b) is same as (a) except now there are three dashed lines such that each of the points from (a) lies on one of these lines. The lines roughly curve from (0,0) towards (0,1) and then to (1,1)" @@ -763,13 +763,13 @@ Now consider classifiers that predict probabilities instead of discrete classes. Using different thresholds to cut off predicted probabilities and assign them to the positive and negative class will lead to different TPRs and FPRs and by plotting these values across different thresholds we can characterize the behavior of a binary classifier -- this is the ROC curve. For example, we can use the previous `r ref("Prediction")` object to compute all possible TPR and FPR combinations by thresholding the predicted probabilities across all possible thresholds, which is exactly what `mlr3viz::autoplot.PredictionClassif` will do when `type = "roc"` is selected: -```{r performance-055} +```{r evaluation_and_benchmarking-043} #| output: false #| cache: false autoplot(prediction, type = "roc") ``` -```{r, out.width = "70%"} +```{r evaluation_and_benchmarking-044, out.width = "70%"} #| label: fig-basics-roc-ranger #| fig-cap: "ROC-curve based on the `german_credit` dataset and the `classif.ranger` random forest learner. Recall FPR = $1 -$ Specificity and TPR = Sensitivity." #| fig-alt: ROC curve with "1 - Specificity" on x-axis (between 0-1) and "Sensitivity" on y-axis (between 0-1). There is a line from around (0,0) to (0.3,0.75) to (1, 1). @@ -786,10 +786,10 @@ The AUC can be interpreted as the probability that a randomly chosen positive in Therefore, higher values (closer to $1$) indicate better performance. Random classifiers (such as the featureless baseline) will always have an AUC of (approximately, when evaluated empirically) 0.5 (see @fig-roc, panel (b)). -```{r} +```{r evaluation_and_benchmarking-045} prediction$score(msr("classif.auc")) ``` -```{r, echo = FALSE} +```{r evaluation_and_benchmarking-046, echo = FALSE} x = prediction$score(msr("classif.auc")) ``` @@ -806,13 +806,13 @@ The main difference between ROC curves and PR curves is that the number of true- This can be useful in imbalanced populations where the positive class is rare, and where a classifier with high TPR may still not be very informative and have low PPV. See @davis2006relationship for a detailed discussion about the relationship between the PRC and ROC curves. 
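The two axes of the PR curve can also be read off as point measures for the discrete prediction at the default threshold; a small sketch using the same `prediction` object as above:

```{r}
# precision (PPV) and recall (TPR) of the discrete prediction at threshold 0.5
prediction$score(msrs(c("classif.precision", "classif.recall")))
```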
-```{r performance-056} +```{r evaluation_and_benchmarking-047} #| output: false #| cache: false autoplot(prediction, type = "prc") ``` -```{r, out.width = "70%"} +```{r evaluation_and_benchmarking-048, out.width = "70%"} #| fig-cap: 'Precision-Recall curve based on `tsk("german_credit")` and `lrn("classif.ranger")`.' #| label: fig-basics-prc-ranger #| fig-alt: 'Line curve with "Recall" on x-axis (between 0-1) and "Precision" on y-axis (between 0-1). There is a horizontal line through around y=0.74. There is also a line decreasing from (0,1) to (1,0.74).' @@ -826,13 +826,13 @@ print(plt) Another useful way to think about the performance of a classifier is to visualize the relationship of a performance metric over varying thresholds, for example, see @fig-basics-fpracc-ranger to inspect the FPR and accuracy across all possible thresholds: -```{r performance-057} +```{r evaluation_and_benchmarking-049} #| eval: false autoplot(prediction, type = "threshold", measure = msr("classif.fpr")) autoplot(prediction, type = "threshold", measure = msr("classif.acc")) ``` -```{r performance-057-1} +```{r evaluation_and_benchmarking-050} #| label: fig-basics-fpracc-ranger #| fig-cap: 'Comparing threshold and FPR (left) with threshold and accuracy (right) for the random forest trained on `tsk("german_credit")`.' #| fig-alt: 'Two line graphs, both with "Probability Threshold" on x-axis from 0-1. Left: "classif.fpr" on y-axis. Line slowly decreases from (0,1) to (1,0). Right: "classif.acc" on y-axis. Line travels from (0,0.7) to (0.25,0.7) to (0.4,0.75) to (1, 0.3).' @@ -855,7 +855,7 @@ Depending on the problem at hand, this might be a perfectly desirable trade-off. These visualizations are also available for `r ref("ResampleResult")` objects. In this case, the predictions of individual resampling iterations are merged before calculating a ROC or PR curve (micro averaged): -```{r performance-058} +```{r evaluation_and_benchmarking-051} #| eval: false rr = resample( task = tsk("german_credit"), @@ -865,7 +865,7 @@ rr = resample( autoplot(rr, type = "roc") autoplot(rr, type = "prc") ``` -```{r performance-058-1} +```{r evaluation_and_benchmarking-052} #| label: fig-basics-rocpr-ranger #| layout-ncol: 2 #| fig-subcap: @@ -890,7 +890,7 @@ print(plt2) Finally, we can visualize ROC/PR curves for a `r ref("BenchmarkResult")` to compare multiple learners on the same `r ref("Task")`: -```{r performance-059-evalF, eval = FALSE} +```{r evaluation_and_benchmarking-053, eval = FALSE} library(patchwork) design = benchmark_grid( @@ -903,7 +903,7 @@ bmr = benchmark(design) autoplot(bmr, type = "roc") + autoplot(bmr, type = "prc") + plot_layout(guides = "collect") ``` -```{r performance-059-evalT, echo = FALSE, fig.width = 11} +```{r evaluation_and_benchmarking-054, echo = FALSE, fig.width = 11} #| label: fig-basics-rocpr-bmr #| fig-cap: 'Comparing random forest (green) and decision tree (purple) using ROC and PR Curves.' #| fig-alt: 'Two line graphs, each with two lines for decision tree and random forest. Left is ROC curve showing random forest has consistently better TPR/FPR trade-off. Right is PR Curve showing random forest has better Precision/Recall trade-off.' 
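Alongside the visual comparison, the same benchmark result can be summarized numerically with a threshold-independent measure; a hedged sketch, assuming the `bmr` object created in the chunk above has been evaluated with probability predictions:

```{r, eval = FALSE}
# aggregate AUC per task/learner combination as a numeric companion
# to the ROC and PR curves
bmr$aggregate(msr("classif.auc"))[, .(task_id, learner_id, classif.auc)]
```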
diff --git a/book/chapters/chapter4/hyperparameter_optimization.qmd b/book/chapters/chapter4/hyperparameter_optimization.qmd index 364d4324c..14207a73e 100644 --- a/book/chapters/chapter4/hyperparameter_optimization.qmd +++ b/book/chapters/chapter4/hyperparameter_optimization.qmd @@ -30,7 +30,7 @@ For more general details on HPO and more theoretical background, we recommend @h Note that `mlr3` never does any automatic hyperparameter optimization that the user did not explicitly request. -```{r optimization-003, echo = FALSE, out.width = "80%"} +```{r hyperparameter_optimization-001, echo = FALSE, out.width = "80%"} #| label: fig-optimization-loop-basic #| fig-cap: Representation of the hyperparameter optimization loop in mlr3tuning. Blue - Hyperparameter optimization loop. Purple - Objects of the tuning instance supplied by the user. Blue-Green - Internally created objects of the tuning instance. Green - Optimization Algorithm. #| fig-alt: Diagram showing 13 boxes representing model-agnostic HPO. On the top are two boxes, one that says "Search Space" (dark blue) and the other "Tuner" (green), these are connected by a line to "Propose Hyperparameter Configurations" (purple). That box has an arrow pointing towards another box "Evaluate by Resampling" (purple), which has a line to a blue-green box "Objective", which has four blue boxes connected toit "Task", "Learner", "Resampling", and "Measure". "Evaluate by Resampling" also has one line to the right connected to "Archive" (blue-green) which has an arrow to "Terminator" (blue) and "Update Tuner" (purple). "Terminator" has an arrow to "Optimal Hyperparameter Configuration" (purple) and "Update Tuner" has an arrow back to "Propose Hyperparameter Configurations". @@ -53,7 +53,7 @@ Throughout this section, we will look at optimizing an SVM classifier\index{supp The tuning process begins by deciding which hyperparameters to tune and what range to tune them over. The first place to start is therefore picking a learner and looking at the possible hyperparameters to tune with `$param_set`: -```{r optimization-004} +```{r hyperparameter_optimization-002} as.data.table(lrn("classif.svm")$param_set)[, .(id, class, lower, upper, nlevels)] ``` @@ -74,7 +74,7 @@ For numeric hyperparameters (we will explore others later) one must specify the We do this by constructing a learner and using `r ref("to_tune()")` to set the lower and upper limits for the parameters we want to tune. This function allows us to *mark* the hyperparameter as requiring tuning in the specified range. -```{r optimization-006} +```{r hyperparameter_optimization-003} learner = lrn("classif.svm", type = "C-classification", kernel = "radial", @@ -130,7 +130,7 @@ Continuing our example, we will construct a `r index("single-objective")` tuning For this example, we will use three-fold CV and optimize the classification error measure. Note that in the next section, we will continue our example with a grid search tuner, so we select `trm("none")` below as we will want to iterate over the full grid without stopping too soon. 
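For orientation, the available termination criteria can be browsed in their dictionary before settling on `trm("none")` for the construction that follows; a hedged sketch (the displayed columns are an assumption):

```{r, eval = FALSE}
as.data.table(mlr_terminators)[, .(key, label)]
```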
-```{r optimization-007} +```{r hyperparameter_optimization-004} tsk_sonar = tsk("sonar") learner = lrn("classif.svm", @@ -227,7 +227,7 @@ For further details on different tuners and practical recommendations, we refer ## `$param_classes` and `$properties` The `$param_classes` and `$properties` fields of a `Tuner` respectively provide information about which classes of hyperparameters can be handled and what properties the tuner can handle (e.g., hyperparameter dependencies, which are shown in @sec-defining-search-spaces, or multicriteria optimization, which is presented in @sec-multi-metrics-tuning): -```{r} +```{r hyperparameter_optimization-005} tnr("random_search")$param_classes tnr("random_search")$properties ``` @@ -238,14 +238,14 @@ The resolution is the number of distinct values to try *per hyperparameter*, whi All configurations will be tried by the tuner (in random order) until either all configurations are evaluated or the terminator (@sec-terminator) signals that the budget is exhausted. For grid and random search tuners, the `batch_size` parameter controls how many configurations are evaluated at the same time when parallelization is enabled (see @sec-parallel-tuning), and also determines how many configurations should be applied before the terminator should check if the termination criterion has been reached. -```{r optimization-008} +```{r hyperparameter_optimization-006} tuner = tnr("grid_search", resolution = 5, batch_size = 10) tuner ``` The `resolution` and `batch_size` parameters are termed `r index("control parameters", aside = TRUE)` of the tuner, and other tuners will have other control parameters that can be set, as with learners these are accessible with `$param_set`. -```{r optimization-009} +```{r hyperparameter_optimization-007} tuner$param_set ``` @@ -258,7 +258,7 @@ However, some control parameters like `batch_size` often interact with the paral Now that we have introduced all our components, we can start the tuning process. To do this we simply pass the constructed `r ref("TuningInstanceBatchSingleCrit")` to the `$optimize()` method of the initialized `r ref("TunerBatch")`, which triggers the hyperparameter optimization loop (@fig-optimization-loop-basic). -```{r optimization-010} +```{r hyperparameter_optimization-008} tuner$optimize(instance) ``` @@ -266,7 +266,7 @@ The optimizer returns the best hyperparameter configuration and the correspondin The first columns (here `cost` and `gamma`) will be named after the tuned hyperparameters and show the optimal values from the searched tuning spaces. The `$learner_param_vals` field of the `$result` lists the optimal hyperparameters from tuning, as well as the values of any other hyperparameters that were set, this is useful for onward model use (@sec-analyzing-result). -```{r} +```{r hyperparameter_optimization-009} instance$result$learner_param_vals ``` @@ -284,12 +284,12 @@ The correct method is to test the model on more unseen data, which can be effici For many non-negative hyperparameters that have a large upper bound, tuning on a logarithmic scale can be more efficient than tuning on a linear scale. By example, consider sampling uniformly in the interval $[\log(1e-5), \log(1e5)]$ and then exponentiating the outcome, the histograms in @fig-logscale show how we are initially sampling within a narrow range ($[-11.5, 11.5]$) but then exponentiating results in the majority of points being relatively small but a few being very large. 
-```{r optimization-011} +```{r hyperparameter_optimization-010} cost = runif(1000, log(1e-5), log(1e5)) exp_cost = exp(cost) ``` -```{r optimization-012, echo = FALSE} +```{r hyperparameter_optimization-011, echo = FALSE} #| label: fig-logscale #| fig-cap: Histograms of uniformly sampled values from the interval $[\log(1e-5), \log(1e5)]$ before (left) and after (right) exponentiation. #| fig-subcap: @@ -317,7 +317,7 @@ ggplot(data, aes(x = cost)) + To add this transformation to a hyperparameter we simply pass `logscale = TRUE` to `r ref("to_tune()")`. -```{r optimization-013} +```{r hyperparameter_optimization-012} learner = lrn("classif.svm", cost = to_tune(1e-5, 1e5, logscale = TRUE), gamma = to_tune(1e-5, 1e5, logscale = TRUE), @@ -340,7 +340,7 @@ We can see from this example that using the log transformation improved the hype Note that the fields `cost` and `gamma` show the optimal values *before* transformation, whereas `x_domain` and `learner_param_vals` contain optimal values *after* transformation, it is these latter fields you would take forward for future model use. -```{r optimization-014} +```{r hyperparameter_optimization-013} instance$result$x_domain ``` @@ -351,7 +351,7 @@ Now we will look at how to put everything into practice so we can make use of th Independently of whether you use `r ref("ti()")` or `r ref("tune()")`, or if you include transformations or not, the created objects and the output are structurally the same and the instance's archive lists all evaluated hyperparameter configurations: -```{r optimization-016} +```{r hyperparameter_optimization-014} as.data.table(instance$archive)[1:3, .(cost, gamma, classif.ce)] ``` @@ -359,14 +359,14 @@ Each row of the archive is a different evaluated configuration. The columns show the tested configurations (before transformation) and the chosen performance measure. We can also manually inspect the archive to determine other important features such as time of evaluation, model runtime, and any errors or warnings that occurred during tuning. -```{r optimization-017} +```{r hyperparameter_optimization-015} as.data.table(instance$archive)[1:3, .(timestamp, runtime_learners, errors, warnings)] ``` Another powerful feature of the instance is that we can score the internal `r ref("ResampleResult")`s on a different performance measure, for example looking at false negative rate and false positive rate as well as classification error: -```{r optimization-018} +```{r hyperparameter_optimization-016} as.data.table(instance$archive, measures = msrs(c("classif.fpr", "classif.fnr")))[1:5 , .(cost, gamma, classif.ce, classif.fpr, classif.fnr)] @@ -377,7 +377,7 @@ You can access all the resamplings combined in a `r ref("BenchmarkResult")` obje Finally, to visualize the results, you can use `r ref("mlr3viz::autoplot.TuningInstanceBatchSingleCrit")` (@fig-surface). In this example we can observe one of the flaws (by design) in grid search, despite testing 25 configurations, we only saw five unique values for each hyperparameter. -```{r optimization-019} +```{r hyperparameter_optimization-017} #| label: fig-surface #| fig-cap: Model performance with different configurations for `cost` and `gamma`. Bright yellow regions represent the model performing worse and dark blue performing better. We can see that high `cost` values and low `gamma` values achieve the best performance. Note that we should not directly infer the performance of new unseen values from the heatmap since it is only an interpolation based on a surrogate model (`regr.ranger`). 
However, we can see the general interaction between the hyperparameters. #| fig-alt: Heatmap showing model performance during HPO. y-axis is 'gamma' parameter between (-10,10) and x-axis is 'cost' parameter between (-10,10). The heatmap shows squares covering all points on the plot and circular points indicating configurations tried in our optimization. The top-left quadrant is all yellow indicating poor performance when gamma is high and cost is low. The bottom-right is dark blue indicating good performance when cost is high and gamma is low. @@ -389,14 +389,14 @@ autoplot(instance, type = "surface") Once we found good hyperparameters for our learner through tuning, we can use them to train a final model on the whole data. To do this we simply construct a new learner with the same underlying algorithm and set the learner hyperparameters to the optimal configuration: -```{r optimization-020} +```{r hyperparameter_optimization-018} lrn_svm_tuned = lrn("classif.svm") lrn_svm_tuned$param_set$values = instance$result_learner_param_vals ``` Now we can train the learner on the full dataset and we are ready to make predictions. -```{r optimization-021} +```{r hyperparameter_optimization-019} lrn_svm_tuned$train(tsk_sonar)$model ``` @@ -408,7 +408,7 @@ In the previous section, we looked at constructing and manually putting together The first helper function is `r ref("tune()")`, which creates the tuning instance and calls `$optimize()` for you. You may prefer the manual method with `ti()` if you want to view and make changes to the instance before tuning. -```{r optimization-015} +```{r hyperparameter_optimization-020} tnr_grid_search = tnr("grid_search", resolution = 5, batch_size = 5) lrn_svm = lrn("classif.svm", cost = to_tune(1e-5, 1e5, logscale = TRUE), @@ -433,7 +433,7 @@ The other helper function is `r ref("auto_tuner")`, which creates an object of c The `AutoTuner` inherits from the `r ref("Learner")` class and wraps all the information needed for tuning, which means you can treat a learner waiting to be optimized just like any other learner. Under the hood, the `AutoTuner` essentially runs `tune()` on the data that is passed to the model when `$train()` is called and then sets the learner parameters to the optimal configuration. -```{r optimization-022} +```{r hyperparameter_optimization-021} at = auto_tuner( tuner = tnr_grid_search, learner = lrn_svm, @@ -444,7 +444,7 @@ at = auto_tuner( at ``` -```{r performance-028, echo=FALSE, out.width = "60%"} +```{r hyperparameter_optimization-022, echo=FALSE, out.width = "60%"} #| label: fig-auto-tuner #| fig-cap: "Illustration of an Auto-Tuner." #| fig-alt: 'Flow diagram. Top box "Input: Training Data, Learner, Performance Metric, Resampling Strategy, Search Space". This has an arrow to "Auto-Tuner" which is a box containing "Tuning", which has three arrows pointing at each other in a circle representing the tuning process, and "Final Model Fit: Fit Learner with Optimal Hyperparameters on Dtrain". "Auto-Tuner" then points to "Return: Model, Optimal Hyperparameters".' @@ -453,7 +453,7 @@ include_multi_graphics("mlr3book_figures-12") And we can now call `$train()`, which will first tune the hyperparameters in the search space listed above before fitting the optimal model. 
-```{r optimization-023} +```{r hyperparameter_optimization-023} split = partition(tsk_sonar) at$train(tsk_sonar, row_ids = split$train) at$predict(tsk_sonar, row_ids = split$test)$score() @@ -461,7 +461,7 @@ at$predict(tsk_sonar, row_ids = split$test)$score() The `AutoTuner` contains a tuning instance that can be analyzed like any other instance. -```{r} +```{r hyperparameter_optimization-024} at$tuning_instance$result ``` @@ -476,7 +476,7 @@ This is analogous to `r index("optimism of the training error")` described in @j `r index("Nested resampling")` separates model optimization from the process of estimating the performance of the tuned model by adding an additional resampling, i.e., while model performance is estimated using a resampling method in the 'usual way', tuning is then performed by resampling the resampled data (@fig-nested-resampling). For more details and a formal introduction to nested resampling the reader is referred to @hpo_practical and @Simon2007. -```{r optimization-024, echo = FALSE, out.width = "80%"} +```{r hyperparameter_optimization-025, echo = FALSE, out.width = "80%"} #| label: fig-nested-resampling #| fig-cap: An illustration of nested resampling. The large blocks represent three-fold CV for the outer resampling for model evaluation and the small blocks represent four-fold CV for the inner resampling for HPO. The light blue blocks are the training sets and the dark blue blocks are the test sets. #| fig-alt: The image shows three rows of large blocks representing three-fold CV for the outer resampling. Below the blocks are four further rows of small blocks representing four-fold CV for the inner resampling. Text annotations highlight how tuned parameters from the inner resampling are passed to the outer resampling. @@ -512,7 +512,7 @@ If you are interested in identifying optimal configurations, then use `r ref("tu While the theory of nested resampling may seem complicated, it is all automated in `mlr3tuning` by simply passing an `AutoTuner` to `r ref("resample()")` or `r ref("benchmark()")`. Continuing with our previous example, we will use the auto-tuner to resample a support vector classifier with three-fold CV in the outer resampling and four-fold CV in the inner resampling. -```{r optimization-025} +```{r hyperparameter_optimization-026} at = auto_tuner( tuner = tnr_grid_search, learner = lrn_svm, @@ -530,13 +530,13 @@ While we used k-fold CV for both the inner and outer resampling strategy, you co The estimated performance of a tuned model is reported as the aggregated performance of all outer resampling iterations, which is a less biased estimate of future model performance. -```{r optimization-028} +```{r hyperparameter_optimization-027} rr$aggregate() ``` In addition to the methods described in @sec-resampling, `r ref("extract_inner_tuning_results()")` and `r ref("extract_inner_tuning_archives()")` return the optimal configurations (across all outer folds) and full tuning archives, respectively. -```{r optimization-026} +```{r hyperparameter_optimization-028} extract_inner_tuning_results(rr)[, .(iteration, cost, gamma, classif.ce)] extract_inner_tuning_archives(rr)[1:3, @@ -554,7 +554,7 @@ The `TaskGenerator` class is used when you want to simulate data for use in expe We begin by loading our learner, task generator, and generating 100 training data points and 1,000,000 testing data points. 
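The `"moons"` generator used in the next chunk is only one of several built-in generators; they can be browsed in the `mlr_task_generators` dictionary (a hedged sketch, with the displayed columns an assumption):

```{r, eval = FALSE}
as.data.table(mlr_task_generators)[, .(key, label)]
```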
-```{r exp1} +```{r hyperparameter_optimization-029} set.seed(5) lrn_xgboost = lrn("classif.xgboost", eta = to_tune(1e-4, 1, logscale = TRUE), @@ -572,7 +572,7 @@ tsk_moons_test = tsk_moons$generate(1000000) Now we will tune the learner with respect to the classification error, using holdout resampling and random search with 700 evaluations. We then report the tuning performance without nested resampling. -```{r exp2} +```{r hyperparameter_optimization-030} tnr_random = tnr("random_search") rsmp_holdout = rsmp("holdout") trm_evals700 = trm("evals", n_evals = 700) @@ -591,7 +591,7 @@ insample = instance$result_y Next, we estimate generalization error by nested resampling (below we use an outer five-fold CV), using an `AutoTuner`: -```{r exp3} +```{r hyperparameter_optimization-031} # same setup as above at = auto_tuner( tuner = tnr_random, @@ -608,7 +608,7 @@ outsample = resample(tsk_moons_train, at, rsmp_cv5)$aggregate() And finally, we estimate the `r index('generalization error')` by training the tuned learner (i.e., using the values from the `instance` above) on the full training data again and predicting on the test data. -```{r exp4} +```{r hyperparameter_optimization-032} lrn_xgboost_tuned = lrn("classif.xgboost") lrn_xgboost_tuned$param_set$set_values( .values = instance$result_learner_param_vals) @@ -618,7 +618,7 @@ generalization = lrn_xgboost_tuned$train(tsk_moons_train)$ Now we can compare these three values: -```{r} +```{r hyperparameter_optimization-033} round(c(true_generalization = as.numeric(generalization), without_nested_resampling = as.numeric(insample), with_nested_resampling = as.numeric(outsample)), 2) @@ -638,7 +638,7 @@ The `r ref("to_tune()")` function can be used to tune parameters of any class, w To best understand this function, we will consider what is happening behind the scenes. When `to_tune()` is used in a learner, implicitly a `r ref("ParamSet")` is created just for the tuning search space: -```{r optimization-039} +```{r hyperparameter_optimization-034} learner = lrn("classif.svm", cost = to_tune(1e-1, 1e5), gamma = to_tune(1e-1, 1), @@ -654,7 +654,7 @@ In this example, we can see that `gamma` hyperparameter has class `ParamDbl`, wi If we wanted to tune over a non-numeric hyperparameter, we can still use `to_tune()`, which will infer the correct class to construct in the resulting parameter set. For example, say we wanted to tune the numeric `cost`, factor `kernel`, and logical `scale` hyperparameter in our SVM: -```{r} +```{r hyperparameter_optimization-035} learner = lrn("classif.svm", cost = to_tune(1e-1, 1e5), kernel = to_tune(c("radial", "linear")), @@ -670,7 +670,7 @@ The `shrinking` hyperparameter is a logical, there are only two possible values Similarly, for factor parameters, we could also use `to_tune()` without any arguments if we want to tune over all possible values. 
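As a concrete illustration, the sketch below marks the factor `kernel` and the logical `shrinking` hyperparameters of the SVM with an empty `to_tune()`, so each is tuned over all of its possible values:

```{r}
lrn("classif.svm",
  kernel = to_tune(),    # tuned over all kernel levels
  shrinking = to_tune()  # tuned over TRUE/FALSE
)$param_set$search_space()
```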
Finally, we can use `to_tune()` to treat numeric parameters as factors if we want to discretize them over a small subset of possible values, for example, if we wanted to find the optimal number of trees in a random forest we might only consider three scenarios: 100, 200, or 400 trees: -```{r, eval = FALSE} +```{r hyperparameter_optimization-036, eval = FALSE} lrn("classif.ranger", num.trees = to_tune(c(100, 200, 400))) ``` @@ -701,7 +701,7 @@ This function takes named arguments of class `r ref("Domain")`, which can be cre As a simple example, let us look at how to create a search space to tune `cost` and `gamma` again: -```{r} +```{r hyperparameter_optimization-037} search_space = ps( cost = p_dbl(lower = 1e-1, upper = 1e5), kernel = p_fct(c("radial", "linear")), @@ -711,7 +711,7 @@ search_space = ps( This search space would then be passed to the `search_space` argument in `auto_tuner()`: -```{r} +```{r hyperparameter_optimization-038} ti(tsk_sonar, lrn("classif.svm", type = "C-classification"), rsmp_cv3, msr_ce, trm("none"), search_space = search_space) ``` @@ -721,7 +721,7 @@ ti(tsk_sonar, lrn("classif.svm", type = "C-classification"), rsmp_cv3, When manually creating search spaces, make sure all numeric hyperparameters in your search space are bounded, e.g., if you are trying to tune a hyperparameter that could take any value in $(-\infty, \infty)$ then the tuning process will throw an error for nearly all tuners if you do not pass lower and upper limits to `p_dbl()` or `p_int()`. You can use `$is_bounded` on the constructed `r ref("ParamSet")` if you are unsure: -```{r optimization-042} +```{r hyperparameter_optimization-039} ps(cost = p_dbl(lower = 0.1, upper = 1))$is_bounded ps(cost = p_dbl(lower = 0.1, upper = Inf))$is_bounded ``` @@ -734,7 +734,7 @@ ps(cost = p_dbl(lower = 0.1, upper = Inf))$is_bounded In @sec-logarithmic-transformations we saw how to quickly apply log transformations with `r ref("to_tune()")`. As you now know, `to_tune()` is just a wrapper that creates `r ref("ParamSet")` objects, so let us look at what is taking place when we set `logscale = TRUE`: -```{r} +```{r hyperparameter_optimization-040} lrn("classif.svm", cost = to_tune(1e-5, 1e5, logscale = TRUE))$ param_set$search_space() ``` @@ -742,7 +742,7 @@ lrn("classif.svm", cost = to_tune(1e-5, 1e5, logscale = TRUE))$ Notice that now the `lower` and `upper` fields correspond to the transformed bounds, i.e. $[\log(1e-5), \log(1e5)]$. To manually create the same transformation, we can pass the transformation to the `trafo` argument in `p_dbl()` and set the bounds: -```{r optimization-045} +```{r hyperparameter_optimization-041} search_space = ps(cost = p_dbl(log(1e-5), log(1e5), trafo = function(x) exp(x))) # alternatively: 'trafo = exp' search_space @@ -750,21 +750,21 @@ search_space We can confirm it is correctly set by making use of the `$trafo()` method, which takes a named list and applies the specified transformations -```{r} +```{r hyperparameter_optimization-042} search_space$trafo(list(cost = 1)) ``` Where transformations become the most powerful is in the ability to pass arbitrary functions that can act on single parameters or even the entire parameter set. 
As an example, consider a simple transformation to add '2' to our range: -```{r} +```{r hyperparameter_optimization-043} search_space = ps(cost = p_dbl(0, 3, trafo = function(x) x + 2)) search_space$trafo(list(cost = 1)) ``` Simple transformations such as this can even be added directly to a learner by passing a `Param` object to `to_tune()`: -```{r, eval = FALSE} +```{r hyperparameter_optimization-044, eval = FALSE} lrn("classif.svm", cost = to_tune(p_dbl(0, 3, trafo = function(x) x + 2))) ``` @@ -774,7 +774,7 @@ More complex transformations that require multiple arguments should be passed to Below we first exponentiate the value of `cost` and then add '2' if the `kernel` is `"polynomial"`. -```{r} +```{r hyperparameter_optimization-045} search_space = ps( cost = p_dbl(-1, 1, trafo = function(x) exp(x)), kernel = p_fct(c("polynomial", "radial")), @@ -796,7 +796,7 @@ By example, consider the `class.weights` parameter of the SVM, which takes a nam To tune this parameter we could tune a scalar and then transform this to a vector. The code below would result in a value, `x`, between `0.1` and `0.9` being sampled, the result is then transformed to (`x`, `1 - x`) and is then passed to the `Learner`. -```{r optimization-049} +```{r hyperparameter_optimization-046} search_space = ps( class.weights = p_dbl(lower = 0.1, upper = 0.9, trafo = function(x) c(M = x, R = 1 - x)) @@ -808,7 +808,7 @@ For example, say we want to tune the architecture of a `r index('neural network' In this case, the learner expects a vector where each element of the vector corresponds to the number of nodes in a layer and the length of the vector is the number of layers. We could then tune this as follows: -```{r} +```{r hyperparameter_optimization-047} search_space = ps( num_layers = p_int(lower = 1, upper = 20), num_nodes_per_layer = p_int(4, 64), @@ -823,13 +823,13 @@ search_space = ps( Here we are tuning the pseudo-parameter `num_layers` between `1` and `20`, then tuning the pseudo-parameter `num_nodes_per_layer` between `4` and `64`, then combining these into a vector called `num_nodes` (the real hyperparameter) and removing the pseudo-parameters. -```{r} +```{r hyperparameter_optimization-048} search_space$trafo(list(num_layers = 4, num_nodes_per_layer = 12)) ``` Even though this transformation looks complex, it only affects one of the hyperparameters (and does not need access to others), so we could include it in the learner using `to_tune()` by passing the whole `ParamSet` object: -```{r} +```{r hyperparameter_optimization-049} learner = lrn("surv.coxtime") learner$param_set$set_values(num_nodes = to_tune(search_space)) learner$param_set$search_space() @@ -843,7 +843,7 @@ Hyperparameter dependencies occur when a hyperparameter should only be set if an For example, the `degree` parameter in SVM is only valid when `kernel` is `"polynomial"`. 
In the `r ref("ps()")` function, we specify this using the `depends` argument, which takes a named argument of the form ` == value` or ` %in% `: -```{r} +```{r hyperparameter_optimization-050} ps( kernel = p_fct(c("polynomial", "radial")), degree = p_int(1, 3, depends = (kernel == "polynomial")), @@ -858,7 +858,7 @@ Hence setting the dependency tells the tuning process to tune `degree` if `kerne Dependencies can also be passed straight into a learner using `r ref("to_tune()")`: -```{r} +```{r hyperparameter_optimization-051} lrn("classif.svm", kernel = to_tune(c("polynomial", "radial")), degree = to_tune(p_int(1, 3, depends = (kernel == "polynomial"))) @@ -873,7 +873,7 @@ Selected search spaces can require a lot of background knowledge or expertise. The package `r ref_pkg("mlr3tuningspaces")` tries to make HPO more accessible by providing implementations of published search spaces for many popular machine learning algorithms, the hope is that these search spaces are applicable to a wide range of datasets. The search spaces are stored in the dictionary `r ref("mlr_tuning_spaces")`. -```{r optimization-056,message=FALSE} +```{r hyperparameter_optimization-052,message=FALSE} library(mlr3tuningspaces) as.data.table(mlr_tuning_spaces)[1:3, .(key, label)] ``` @@ -882,14 +882,14 @@ The tuning spaces are named according to the scheme `{learner-id}.{tuning-space- The `default` tuning spaces are published in @hpo_practical, other tuning spaces are part of the random bot experiments `rbv1` and `rbv2` published in @kuehn_2018 and @binder2020. The sugar function `r ref("lts()")` (learner tuning space) is used to retrieve a `r ref("TuningSpace")`. -```{r optimization-057} +```{r hyperparameter_optimization-053} lts_rpart = lts("classif.rpart.default") lts_rpart ``` A tuning space can be passed to `r ref("ti()")` or `r ref("auto_tuner()")` as the `search_space`. -```{r optimization-058} +```{r hyperparameter_optimization-054} instance = ti( task = tsk_sonar, learner = lrn("classif.rpart"), @@ -902,7 +902,7 @@ instance = ti( Alternatively, as loaded search spaces are just a collection of tune tokens, we could also pass these straight to a learner: -```{r optimization-059} +```{r hyperparameter_optimization-055} vals = lts_rpart$values vals learner = lrn("classif.rpart") @@ -914,13 +914,13 @@ Note how we used the `.values` parameter of `$set_values()`, which allows us to We could also apply the default search spaces from @hpo_practical by passing the learner to `r ref("lts()")`: -```{r optimization-060} +```{r hyperparameter_optimization-056} lts(lrn("classif.rpart")) ``` Finally, it is possible to overwrite a predefined tuning space in construction, for example, changing the range of the `maxdepth` hyperparameter in a decision tree: -```{r optimization-061} +```{r hyperparameter_optimization-057} lts("classif.rpart.rbv2", maxdepth = to_tune(1, 20)) ``` diff --git a/book/chapters/chapter5/advanced_tuning_methods_and_black_box_optimization.qmd b/book/chapters/chapter5/advanced_tuning_methods_and_black_box_optimization.qmd index 81751346a..09ccafb70 100644 --- a/book/chapters/chapter5/advanced_tuning_methods_and_black_box_optimization.qmd +++ b/book/chapters/chapter5/advanced_tuning_methods_and_black_box_optimization.qmd @@ -31,7 +31,7 @@ Even in simple machine learning problems, there is a lot of potential for things For example, when learners do not converge, run out of memory, or terminate with an error due to issues in the underlying data. 
As a common issue, learners can fail if there are factor levels present in the test data that were not in the training data, models fail in this case as there have been no weights/coefficients trained for these new factor levels: -```{r, error = TRUE, warning = FALSE, message = FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-001, error = TRUE, warning = FALSE, message = FALSE} tsk_pen = tsk("penguins") # remove rows with missing values tsk_pen$filter(tsk_pen$row_ids[complete.cases(tsk_pen$data())]) @@ -54,7 +54,7 @@ This is even worse in nested resampling or benchmarking when errors could cause `r index('Encapsulation')` (@sec-encapsulation) allows errors to be isolated and handled, without disrupting the tuning process. We can tell a learner to encapsulate an error using the `$encapsulate()` method as follows: -```{r optimization-035} +```{r advanced_tuning_methods_and_black_box_optimization-002} learner$encapsulate(method = "evaluate", fallback = lrn("classif.featureless")) ``` @@ -63,7 +63,7 @@ Note by passing `"evaluate"`, we are telling the learner to set up encapsulation Another common issue that cannot be easily solved during HPO is learners not converging and the process running indefinitely. We can prevent this from happening by setting the `timeout` field in a learner, which signals the learner to stop if it has been running for that much time (in seconds), again this can be set for training and prediction individually: -```{r optimization-036} +```{r advanced_tuning_methods_and_black_box_optimization-003} learner$timeout = c(train = 30, predict = 30) ``` @@ -76,7 +76,7 @@ We use `lrn("classif.featureless")`, which always predicts the majority class. We can now run our experiment and see errors that occurred during tuning in the archive. -```{r optimization-038} +```{r advanced_tuning_methods_and_black_box_optimization-004} instance = tune(tnr_random, tsk_pen, learner, rsmp_custom, msr_ce, 10) @@ -88,7 +88,7 @@ instance$archive$resample_result(1)$errors The learner was tuned without breaking because the errors were encapsulated and logged before the fallback learners were used for fitting and predicting: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-005} instance$result ``` @@ -136,7 +136,7 @@ Methodological details on multi-objective hyperparameter optimization can be fou We will tune `cp`, `minsplit`, and `maxdepth`: -```{r optimization-082} +```{r advanced_tuning_methods_and_black_box_optimization-006} learner = lrn("classif.rpart", cp = to_tune(1e-04, 1e-1), minsplit = to_tune(2, 64), maxdepth = to_tune(1, 30)) @@ -146,7 +146,7 @@ measures = msrs(c("classif.ce", "selected_features")) As we are tuning with respect to multiple measures, the function `ti()` automatically creates a `r ref("TuningInstanceBatchMultiCrit")` instead of a `r ref("TuningInstanceBatchSingleCrit")`. Below we set `store_models = TRUE` as this is required by the selected features measure. 
-```{r optimization-083} +```{r advanced_tuning_methods_and_black_box_optimization-007} instance = ti( task = tsk("sonar"), learner = learner, @@ -160,7 +160,7 @@ instance We can then select and tune a tuning algorithm as usual: -```{r optimization-084,output=FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-008,output=FALSE} #| cache: false tuner = tnr("random_search") tuner$optimize(instance) @@ -168,13 +168,13 @@ tuner$optimize(instance) Finally, we inspect the best-performing configurations, i.e., the Pareto set, and visualize the corresponding estimated Pareto front (@fig-pareto). Note that the `selected_features` measure is averaged across the folds, so the values in the archive may not always be integers. -```{r optimization-085} +```{r advanced_tuning_methods_and_black_box_optimization-009} #| cache: false instance$archive$best()[, .(cp, minsplit, maxdepth, classif.ce, selected_features)] ``` -```{r optimization-086,message=FALSE, echo = FALSE, warning = FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-010,message=FALSE, echo = FALSE, warning = FALSE} #| cache: false #| label: fig-pareto #| fig-cap: Pareto front of selected features and classification error. White dots represent tested configurations, each black dot individually represents a Pareto-optimal configuration and all black dots together represent the approximated Pareto front. @@ -211,7 +211,7 @@ By definition, there is no optimal configuration so this may depend on your use You can select one configuration and pass it to a learner for training using `$result_learner_param_vals`, so if we want to select the second configuration we would run: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-011} learner = lrn("classif.rpart") learner$param_set$values = instance$result_learner_param_vals[[2]] ``` @@ -300,7 +300,7 @@ We will load the learner and define the search space. We specify a range from 16 ($r_{min}$) to 128 ($r_{max}$) boosting iterations and tag the parameter with `"budget"` to identify it as a fidelity parameter. For the other hyperparameters, we take the search space for XGBoost from @hpo_practical, which usually works well for a wide range of datasets. -```{r optimization-062, message = FALSE, warning = FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-012, message = FALSE, warning = FALSE} library(mlr3hyperband) learner = lrn("classif.xgboost") @@ -321,7 +321,7 @@ We use `trm("none")` and set the `repetitions` control parameter to `1` so that Note that setting `repetition = Inf` can be useful if you want a terminator to stop the optimization, for example, based on runtime. The `r ref("hyperband_schedule()")` function can be used to display the schedule across the given fidelity levels and budget increase factor. -```{r optimization-064} +```{r advanced_tuning_methods_and_black_box_optimization-013} instance = ti( task = tsk("sonar"), learner = learner, @@ -338,7 +338,7 @@ hyperband_schedule(r_min = 16, r_max = 128, eta = 2) Finally, we can tune as normal and print the result and archive. Note that the archive resulting from a Hyperband run contains the additional columns `bracket` and `stage` which break down the results by the corresponding bracket and stage. 
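To make the bracket/stage breakdown concrete, once the optimization in the next chunk has run, the archive could be summarized as in this hedged sketch (not evaluated here; column names assumed from the description above):

```{r, eval = FALSE}
# number of configurations and budget (nrounds) evaluated per bracket/stage
as.data.table(instance$archive)[,
  .(n_configs = .N, budget = unique(nrounds)), by = .(bracket, stage)]
```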
-```{r optimization-067, message = FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-014, message = FALSE} tuner$optimize(instance) instance$result[, .(classif.ce, nrounds)] @@ -398,7 +398,7 @@ The `Objective` requires specification of the function to optimize its domain an By tagging the codomain with `"minimize"` or `"maximize"` we specify the optimization direction. Note how below our optimization function takes a `list` as an input with one element called `x`. -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-015} library(bbotk) sinus_1D = function(xs) 2 * xs$x * sin(14 * xs$x) @@ -410,7 +410,7 @@ objective = ObjectiveRFun$new(sinus_1D, We can visualize our objective by generating a grid of points on which we evaluate the function (@fig-bayesian-optimization-sinusoidal), this will help us identify its local minima and global minimum. -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-016} #| label: fig-bayesian-optimization-sinusoidal #| fig-cap: Visualization of the sinusoidal function. Local minima in triangles and global minimum in the circle. #| fig-alt: Line graph from (0,1) on the x-axis to (-2,2) on the y-axis; labelled 'x' and 'y' respectively. The line starts with a local minimum at (0,0), increases and then has a local minimum at around (0.35,-0.69), the function then increases and then decreases to the global minimum at around (0.79, -1.56). @@ -431,7 +431,7 @@ ggplot(aes(x = x, y = y), data = xydt) + The global minimizer, `r xydt[y == min(y), x]`, corresponds to the point of the domain with the lowest function value: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-017} xydt[y == min(y), ] ``` @@ -445,7 +445,7 @@ In the following, we use a simple random search to optimize the sinusoidal funct An optimization instance is constructed with the `r ref("oi()")` function. Analogously to tuners, `Optimizer`s in `bbotk` are stored in the `r ref('mlr_optimizers')` dictionary and can be constructed with `r ref('opt()', aside = TRUE)`. -```{r, output=FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-018, output=FALSE} instance = oi(objective, search_space = domain, terminator = trm("evals", n_evals = 20)) @@ -455,7 +455,7 @@ optimizer$optimize(instance) Similarly to how we can use `r ref("tune()")` to construct a tuning instance, here we can use `r ref("bb_optimize()")`, which returns a list with elements `"par"` (best found parameters), `"val"` (optimal outcome), and `"instance"` (the optimization instance); the values given as `"par"` and `"val"` are the same as the values found in `instance$result`: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-019} optimal = bb_optimize(objective, method = "random_search", max_evals = 20) optimal$instance$result @@ -481,7 +481,7 @@ We refer to these elements as the 'building blocks' of BO as it is a highly modu The design of `mlr3mbo` reflects this modularity, with the base class for `r ref("OptimizerMbo")` holding all the key elements: the BO algorithm loop structure (`r ref("loop_function")`), *surrogate* model (`r ref("Surrogate")`), *acquisition function* (`r ref("AcqFunction")`), and *acquisition function optimizer* (`r ref("AcqOptimizer")`). In this section, we will provide a more detailed explanation of these building blocks and explore their interplay and interaction during optimization. 
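As a preview of how these pieces fit together programmatically, the building blocks correspond to fields of the MBO optimizer object; a hedged sketch (field names assumed from the `OptimizerMbo` interface):

```{r, eval = FALSE}
optimizer = opt("mbo")
optimizer$loop_function  # BO loop structure
optimizer$surrogate      # surrogate model
optimizer$acq_function   # acquisition function
optimizer$acq_optimizer  # acquisition function optimizer
```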
-```{r, echo = FALSE, out.width = "80%"} +```{r advanced_tuning_methods_and_black_box_optimization-020, echo = FALSE, out.width = "80%"} #| label: fig-optimization-loop #| fig-cap: Bayesian optimization loop. #| fig-alt: Diagram is a graphical representation of the loop written out above from initial design to surrogate model and then looping until the termination criterion is met. @@ -511,7 +511,7 @@ The Sobol design works similarly to LHS but can provide better coverage than LHS For this reason, LHS or Sobol designs are usually recommended for BO, but usually the influence of the initial design will be smaller compared to other design choices of BO. A random design might work well-enough, but grid designs are usually discouraged. -```{r, echo = FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-021, echo = FALSE} #| label: fig-bayesian-optimization-designs #| fig-cap: Comparing different samplers for constructing an initial design of nine points on a domain of two numeric variables ranging from $0$ to $1$. Dotted horizontal and vertical lines partition the domain into equally sized bins. Histograms on the top and right visualize the marginal distributions of the generated sample. #| fig-alt: "Plot shows four grids with x_1 on x-axis ranging from 0 to 1 and x_2 on y-axis ranging from 0 to 1. Each grid has bars above them and to the right representing marginal distributions. Top left: 'Random Design' nine points are scattered randomly across the grid with poor coverage. Marginal distributions are also random. Top right: 'Grid Design', points are uniformly scattered across the grid on lines x_1=0,x_1=0.5,x_1=1 and same for x_2. Marginal distributions show three long bars at each of the corresponding lines. Bottom left: 'LHS Design', points appear randomly scattered however marginal distributions are completely equal with equal-sized bars along each axis. Bottom right: 'Sobol Design', very similar to 'LHS Design' however one of the bars in the marginal distribution is slightly longer than the others." @@ -573,7 +573,7 @@ grid.arrange(g_random, g_grid, g_lhs, g_sobol, nrow = 2, ncol = 2) Whichever of these methods you choose, the result is a `r ref("Design")` object, which is mostly just a wrapper around a `data.table`: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-022} sample_domain = ps(x1 = p_dbl(0, 1), x2 = p_dbl(0, 1)) generate_design_random(sample_domain, n = 3)$data ``` @@ -582,7 +582,7 @@ Therefore you could also specify a completely custom initial design by defining Either way, when manually constructing an initial design (as opposed to letting `loop_function` automate this), it needs to be evaluated on the `r ref("OptimInstance")` before optimizing it. Returning to our running example of minimizing the sinusoidal function, we will evaluate a custom initial design with `$eval_batch()`: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-023} instance = OptimInstanceSingleCrit$new(objective, terminator = trm("evals", n_evals = 20)) design = data.table(x = c(0.1, 0.34, 0.65, 1)) @@ -604,7 +604,7 @@ Typical choices of regression learners used as surrogate models include `r index A detailed introduction to Gaussian processes can be found in @williams_2006 and an in-depth focus on Gaussian processes in the context of surrogate models in BO is given in @garnett_2022. 
In this example, we use a Gaussian process with Matérn 5/2 kernel, which uses `BFGS` as an optimizer to find the optimal kernel parameters and set `trace = FALSE` to prevent too much output during fitting. -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-024} #| cache: false lrn_gp = lrn("regr.km", covtype = "matern5_2", optim.method = "BFGS", control = list(trace = FALSE)) @@ -612,7 +612,7 @@ lrn_gp = lrn("regr.km", covtype = "matern5_2", optim.method = "BFGS", A `SurrogateLearner` can be constructed by passing a `LearnerRegr` object to the sugar function `r index('srlrn()', aside = TRUE, code = TRUE)`, alongside the `archive` of the instance: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-025} #| cache: false library(mlr3mbo) surrogate = srlrn(lrn_gp, archive = instance$archive) @@ -622,7 +622,7 @@ Internally, the regression learner is fit on a `TaskRegr` where features are the In our running example we have already initialized our archive with the initial design, so we can update our surrogate model, which essentially fits the Gaussian process, note how we use `$learner` to access the wrapped model: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-026} #| cache: false surrogate$update() surrogate$learner$model @@ -647,7 +647,7 @@ In `mlr3mbo`, acquisition functions (of class `r ref( "AcqFunction")`) are store In our running example, we will use the expected improvement (`acqf("ei")`) to choose the next candidate for evaluation. Before we can do that, we have to update (`$update()`) the `AcqFunction`'s view of the incumbent, to ensure it is still using the best value observed so far. -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-027} #| cache: false acq_function = acqf("ei", surrogate = surrogate) acq_function$update() @@ -658,7 +658,7 @@ You can use `$eval_dt()` to evaluate the acquisition function for the domain giv In @fig-bayesian-optimization-ei we evaluated the expected improvement on a uniform grid of points between $0$ and $1$ using the predicted mean and standard deviation from the Gaussian process. We can see that the expected improvement is high in regions where the mean prediction (gray dashed lines) of the Gaussian process is low, or where the uncertainty is high. -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-028} #| cache: false #| label: fig-bayesian-optimization-ei #| fig-cap: Expected improvement (solid dark gray line) based on the mean and uncertainty prediction (dashed gray line) of the Gaussian process surrogate model trained on an initial design of four points (black). Ribbons represent the mean plus minus the standard deviation prediction. @@ -703,7 +703,7 @@ The terminators are the same as those introduced in @sec-terminator. Below we use the DIRECT algorithm and we terminate the acquisition function optimization if there is no improvement of at least `1e-5` for `100` iterations. The `$optimize()` method optimizes the acquisition function and returns the next candidate. -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-029} acq_optimizer = acqo( optimizer = opt("nloptr", algorithm = "NLOPT_GN_ORIG_DIRECT"), terminator = trm("stagnation", iters = 100, threshold = 1e-5), @@ -725,7 +725,7 @@ The `r ref("loop_function", index = TRUE)` determines the behavior of the BO alg Loop functions are stored in the `r ref("mlr_loop_functions")` dictionary. 
As these are `S3` (not `R6`) classes, they can be simply loaded by just referencing the `key` (i.e., there is no constructor required). -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-030} as.data.table(mlr_loop_functions)[, .(key, label, instance)] ``` @@ -735,7 +735,7 @@ A simplified version of this code is shown at the end of this section, both to h In short, the code sets up the relevant components discussed above and then loops the steps above: 1) update the surrogate model 2) update the acquisition function 3) optimize the acquisition function to yield a new candidate 4) evaluate the candidate and add it to the archive. If there is an error during the loop then a fallback is used where the next candidate is proposed uniformly at random, ensuring that the process continues even in the presence of potential issues, we will return to this in @sec-practical-bayesian-optimization. -```{r, eval=FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-031, eval=FALSE} my_simple_ego = function( instance, surrogate, @@ -791,7 +791,7 @@ We are now ready to put everything together to automate the BO process. We use the `bayesopt_ego` loop function provided by `mlr_loop_functions`, which works similarly to the code shown above but takes more care to offer sensible default values for its arguments and handle edge cases correctly. You do not need to pass any of these building blocks to each other manually as the `r ref('opt()', aside = TRUE)` constructor will do this for you: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-032} bayesopt_ego = mlr_loop_functions$get("bayesopt_ego") surrogate = srlrn(lrn("regr.km", covtype = "matern5_2", optim.method = "BFGS", control = list(trace = FALSE))) @@ -813,7 +813,7 @@ Additional arguments for customizing certain loop functions can be passed throug In this example, we will use the same initial design that we created before and will optimize our sinusoidal function using `$optimize()`: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-033} instance = OptimInstanceSingleCrit$new(objective, terminator = trm("evals", n_evals = 20)) design = data.table(x = c(0.1, 0.34, 0.65, 1)) @@ -825,7 +825,7 @@ Using only a few evaluations, BO comes close to the true global optimum (0.792). @fig-bayesian-optimization-sampling shows the sampling trajectory of candidates as the algorithm progressed, we can see that focus is increasingly given to more regions around the global optimum. However, even in later optimization stages, the algorithm still explores new areas, illustrating that the expected improvement acquisition function indeed balances exploration and exploitation as we required. -```{r, echo = FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-034, echo = FALSE} #| label: fig-bayesian-optimization-sampling #| fig-cap: Sampling trajectory of the BO algorithm. Points of the initial design in black triangles. Sampled points are in dots with color progressing from black to white as the algorithm progresses. #| fig-alt: Line graph of the same sinusoidal function as before but now there are dots from white to black along the line. There are more dots around the global minimum in later stages but still a spread of dots throughout the line. 
@@ -844,7 +844,7 @@ ggplot() + If we replicate running our BO algorithm ten times (with random initial designs and varying random seeds) and compare this to a random search, we can see that BO indeed performs much better and on average reaches the global optimum after around 15 function evaluations (@fig-bayesian-sinusoidal_bo_rs). As expected, the performance for the initial design size is close to the performance of the random search. -```{r, eval=FALSE, echo=FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-035, eval=FALSE, echo=FALSE} library(mlr3misc) library(pammtools) set.seed(2906) @@ -882,7 +882,7 @@ ggsave("Figures/bo_1d_sinusoidal_bo_rs.pdf", plot = g, width = 6, height = 4) ggsave("Figures/bo_1d_sinusoidal_bo_rs.png", plot = g, width = 6, height = 4) ``` -```{r, echo=FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-036, echo=FALSE} #| echo: false #| label: fig-bayesian-sinusoidal_bo_rs #| fig-cap: Anytime performance of BO and random search on the 1D sinusoidal function given a budget of 20 function evaluations. Solid line depicts the best observed target value averaged over 10 replications. Ribbons represent standard errors. @@ -896,7 +896,7 @@ knitr::include_graphics("Figures/bo_1d_sinusoidal_bo_rs.png") As an example, below we will tune the `cost` and `gamma` parameters of `lrn("classif.svm")` with a radial kernel on `tsk("sonar")` with three-fold CV. We set up `tnr("mbo")` using the same objects constructed above and then run our tuning experiment as usual: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-037} tuner = tnr("mbo", loop_function = bayesopt_ego, surrogate = surrogate, @@ -923,7 +923,7 @@ ParEGO (`r ref("bayesopt_parego()")`) tackles multi-objective BO via a randomize Other compatible loop functions can be found by looking at the `"instance"` column of `r ref('mlr_loop_functions')`. We will tune three parameters of a decision tree with respect to the true positive (maximize) and false positive (minimize) rates, the `r index('Pareto front', lower = FALSE)` is visualized in @fig-pareto-bayesopt. -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-038} tuner = tnr("mbo", loop_function = bayesopt_parego, surrogate = surrogate, @@ -940,7 +940,7 @@ instance = tune(tuner, tsk("sonar"), lrn_svm, rsmp("cv", folds = 3), msrs(c("classif.tpr", "classif.fpr")), 25) ``` -```{r, echo = FALSE, warning = FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-039, echo = FALSE, warning = FALSE} #| label: fig-pareto-bayesopt #| fig-cap: Pareto front of TPR and FPR obtained via ParEGO. White dots represent tested configurations, each black dot individually represents a Pareto-optimal configuration and all black dots together represent the Pareto front. #| fig-alt: Scatter plot with classif.tpr on y-axis (between 0.75 and 1.00) and classif.fpr on x-axis (between 0.2 and 1.0). The Pareto front is shown as the set of points at roughly (0.23, 0.85), (0.24, 0.88), (0.25, 0.91), (0.30, 0.93), (0.35, 0.95), (0.40, 0.96), (0.8, 1.00). @@ -977,7 +977,7 @@ For example, if you were modeling a machine in a factory to estimate the rate of In `bbotk`, you can mark an `r ref("Objective")` object as noisy by passing the `"noisy"` tag to the `properties` parameter, which allows us to use methods that can treat such objectives differently. 
-```{r} +```{r advanced_tuning_methods_and_black_box_optimization-040} sinus_1D_noisy = function(xs) { y = 2 * xs$x * sin(14 * xs$x) + rnorm(1, mean = 0, sd = 0.1) y @@ -996,7 +996,7 @@ Noisy objectives can be treated in different ways: In the first case, instead of using an interpolating `r index('Gaussian process', lower = FALSE)`, we could instead use Gaussian process regression that estimates the measurement error by setting `nugget.estim = TRUE`: -```{r, output=FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-041, output=FALSE} srlrn(lrn("regr.km", nugget.estim = TRUE)) ``` @@ -1010,7 +1010,7 @@ Finally, `mlr3mbo` allows for explicitly specifying how the final result after o In contrast, the default method, `r ref('ResultAssignerArchive')`, just picks the best point according to the evaluations logged in `archive`. Result assigners are stored in the `r ref('mlr_result_assigners')` dictionary and can be constructed with `r ref('ras()')`. -```{r, eval = FALSE} +```{r advanced_tuning_methods_and_black_box_optimization-042, eval = FALSE} opt("mbo", loop_function = bayesopt_ego, surrogate = surrogate, @@ -1117,7 +1117,7 @@ Use a budget of 40 function evaluations. Run this with both the "default" Gaussian process surrogate model with Matérn 5/2 kernel, and the "default" random forest surrogate model. Compare their anytime performance (similarly as in @fig-bayesian-sinusoidal_bo_rs). You can construct the surrogate models with default settings using: -```{r} +```{r advanced_tuning_methods_and_black_box_optimization-043} surrogate_gp = srlrn(default_gp()) surrogate_rf = srlrn(default_rf()) ``` diff --git a/book/chapters/chapter6/feature_selection.qmd b/book/chapters/chapter6/feature_selection.qmd index e7188248a..568a8fb7b 100644 --- a/book/chapters/chapter6/feature_selection.qmd +++ b/book/chapters/chapter6/feature_selection.qmd @@ -84,14 +84,14 @@ These are accessible from the `r ref("mlr_filters", index = TRUE)` dictionary wi Each object of class `r ref("Filter", index = TRUE)` has a `$calculate()`\index{Filter!\$calculate()}[`$calculate()`]{.aside} method, which computes the filter values and ranks them in a descending order. For example, we can use the information gain filter described above: -```{r feature-selection-001} +```{r feature_selection-001} library(mlr3filters) flt_gain = flt("information_gain") ``` Such a `Filter` object can now be used to calculate the filter on `tsk("penguins")` and get the results: -```{r feature-selection-002} +```{r feature_selection-002} tsk_pen = tsk("penguins") flt_gain$calculate(tsk_pen) @@ -102,7 +102,7 @@ This shows that the flipper and bill measurements are the most informative featu Some filters have hyperparameters that can be changed in the same way as `Learner` hyperparameters. For example, to calculate `"spearman"` instead of `"pearson"` correlation with the correlation filter: -```{r feature-selection-003} +```{r feature_selection-003} flt_cor = flt("correlation", method = "spearman") flt_cor$param_set ``` @@ -113,7 +113,7 @@ To use feature importance filters, we can use a learner with with an `$importanc All learners with the property "importance" have this functionality. 
A list of all learners with this property can be found with -```{r feature-selection-004, eval = FALSE} +```{r feature_selection-004, eval = FALSE} as.data.table(mlr_learners)[ sapply(properties, function(x) "importance" %in% x)] ``` @@ -122,7 +122,7 @@ For some learners, the desired filter method needs to be set as a hyperparameter For example, `lrn("classif.ranger")` comes with multiple integrated methods, which can be selected during construction: To use the `r index('feature importance')` method `"impurity"`, select it during learner construction: -```{r feature-selection-005} +```{r feature_selection-005} lrn("classif.ranger")$param_set$levels$importance lrn_ranger = lrn("classif.ranger", importance = "impurity") ``` @@ -130,14 +130,14 @@ lrn_ranger = lrn("classif.ranger", importance = "impurity") We first have to remove missing data because the learner cannot handle missing data, i.e. it does not have the property "missing". Note we use the `$filter()` method presented in @sec-tasks-mutators here to remove rows; the "filter" name is unrelated to feature filtering, however. -```{r feature-selection-006} +```{r feature_selection-006} tsk_pen = tsk("penguins") tsk_pen$filter(tsk_pen$row_ids[complete.cases(tsk_pen$data())]) ``` Now we can use `flt("importance")` to calculate importance values: -```{r feature-selection-006-2} +```{r feature_selection-007} flt_importance = flt("importance", learner = lrn_ranger) flt_importance$calculate(tsk_pen) as.data.table(flt_importance) @@ -151,14 +151,14 @@ These subsets can be used for feature selection, which we call `r index('embedde The selected features (and those not selected) can be queried if the learner has the `"selected_features"` property. As above, we can find those learners with -```{r feature-selection-007, eval = FALSE} +```{r feature_selection-008, eval = FALSE} as.data.table(mlr_learners)[ sapply(properties, function(x) "selected_features" %in% x)] ``` For example, we can use `lrn("classif.rpart")`: -```{r feature-selection-007-2} +```{r feature_selection-009} tsk_pen = tsk("penguins") lrn_rpart = lrn("classif.rpart") lrn_rpart$train(tsk_pen) @@ -167,7 +167,7 @@ lrn_rpart$selected_features() The features selected by the model can be extracted by a `r ref("Filter")` object, where `$calculate()` corresponds to training the learner on the given task: -```{r feature-selection-008} +```{r feature_selection-010} flt_selected = flt("selected_features", learner = lrn_rpart) flt_selected$calculate(tsk_pen) as.data.table(flt_selected) @@ -183,7 +183,7 @@ Below, we find the names of features with a value of `1` and select those featur At first glance it may appear a bit convoluted to have a filter assign scores based on the feature names returned by `$selected_features()`, only to turn these scores back into the names of the features to be kept. However, this approach allows us to use the same interface for all filter methods, which is especially useful when we want to automate the feature selection process in pipelines, as we will see in @sec-pipelines-featsel. -```{r feature-selection-009} +```{r feature_selection-011} flt_selected$calculate(tsk_pen) # select all features used by rpart @@ -201,7 +201,7 @@ The first option is equivalent to dropping the bottom $p-k$ features. For both options, one has to decide on a threshold, which is often quite arbitrary. 
For example, to implement the first option with the information gain filter: -```{r feature-selection-010} +```{r feature_selection-012} tsk_pen = tsk("penguins") flt_gain = flt("information_gain") flt_gain$calculate(tsk_pen) @@ -214,7 +214,7 @@ tsk_pen$feature_names Or, the second option with $\tau = 0.5$: -```{r feature-selection-011} +```{r feature_selection-013} tsk_pen = tsk("penguins") flt_gain = flt("information_gain") flt_gain$calculate(tsk_pen) @@ -238,7 +238,7 @@ In more detail, wrapper methods iteratively evaluate subsets of features by resa The specific search strategy iteration is defined by a `r ref("FSelectorBatch", index = TRUE)` object. A simple example is the sequential forward selection that starts with computing each single-feature model, selects the best one, and then iteratively always adds the feature that leads to the largest performance improvement (@fig-sequential-forward-selection). -```{r optimization-003, out.width = "80%", echo = FALSE} +```{r feature_selection-014, out.width = "80%", echo = FALSE} #| label: fig-sequential-forward-selection #| fig-cap: A binary representation of sequential forward selection with four features. Gray indicates feature sets that were evaluated, with dark gray indicating the best feature set in each iteration; white indicates feature sets that were not evaluated. We start at the bottom with no selected features (all are '0'). In the next iteration all features are separately tested (each is '1' separately) and the best option (darkest in row two) is selected. This continues for selecting the second, third, and fourth features. #| fig-alt: "A web graph with one element at the bottom, four on the second row, six on third row, four on fourth row and one on fifth (top) row. Each element consists of four numbers, 0s and 1s to represent the selected (1) and unselected (0) features. The diagram is covered to suggest the optimal path was '0000' -> '1000' -> '1010' -> '1011' -> '1111'." @@ -262,7 +262,7 @@ The API is in many places nearly identical, we can use the same terminators, res We start with the simple example from above and do sequential forward selection with `tsk("penguins")`, similarly to how the sugar function `r ref("tune()")` shown in @sec-autotuner works, we can use `r ref('fselect()', aside = TRUE)` to directly start the optimization and select features. -```{r feature-selection-017, message=FALSE} +```{r feature_selection-015, message=FALSE} library(mlr3fselect) # subset features to ease visualization @@ -282,7 +282,7 @@ instance = fselect( To show all analyzed feature subsets and the corresponding performance, we use `as.data.table(instance$archive)`. In this example, the `batch_nr` column represents the iteration of the `r index('sequential forward selection')` and we start by looking at the first iteration. -```{r feature-selection-018} +```{r feature_selection-016} dt = as.data.table(instance$archive) dt[batch_nr == 1, 1:5] ``` @@ -290,11 +290,11 @@ dt[batch_nr == 1, 1:5] We see that the feature `flipper_length` achieved the highest prediction performance in the first iteration and is thus selected. 
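To get a quick overview across all iterations at once, one could also aggregate the archive by batch; the following is only a small sketch, reusing the `dt` object created above and the `classif.acc` column of the archive:

```{r, eval = FALSE}
# best accuracy reached in each batch of the sequential forward selection
dt[, .(best_acc = max(classif.acc)), by = batch_nr]
```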
We plot the performance over the iterations: -```{r feature-selection-018-5, output = FALSE, cache = FALSE} +```{r feature_selection-017, output = FALSE, cache = FALSE} autoplot(instance, type = "performance") ``` -```{r, echo = FALSE, warning = FALSE, message = FALSE} +```{r feature_selection-018, echo = FALSE, warning = FALSE, message = FALSE} #| label: fig-forwardselection #| fig-cap: Model performance in iterations of sequential forward selection. #| fig-alt: 'Scatter and line plot with "Batch" on the x-axis and "classif.acc" on the y-axis. Line shows improving performance from 1 to batch 2 then increases very slightly in batch 3 and decreases in 4, the values are in the printed instance archive.' @@ -306,7 +306,7 @@ plt In the plot, we can see that adding a second feature further improves the performance to over 90%. To see which feature was added, we can go back to the archive and look at the second iteration: -```{r feature-selection-018-2} +```{r feature_selection-019} dt[batch_nr == 2, 1:5] ``` @@ -314,7 +314,7 @@ The improvement in batch three is small so we may even prefer to select a margin To directly show the best feature set, we can use `$result_feature_set` which returns the features in alphabetical order (not order selected): -```{r feature-selection-019} +```{r feature_selection-020} instance$result_feature_set ``` @@ -333,7 +333,7 @@ In the following two sections, these classes will be created manually, to learn To create an `FSelectInstanceBatchSingleCrit` object, we use the sugar function `r ref("fsi()", aside = TRUE)`: -```{r feature-selection-020} +```{r feature_selection-021} instance = fsi( task = tsk_pen, learner = lrn_rpart, @@ -364,7 +364,7 @@ The following algorithms are currently implemented in `mlr3fselect`: Note that all these methods can be stopped (early) with a terminator, e.g. an exhaustive search can be stopped after a given number of evaluations. In this example, we will use a simple random search and retrieve it from the `r ref("mlr_fselectors", index = TRUE)` dictionary with `r ref("fs()", aside = TRUE)`. 
-```{r feature-selection-021} +```{r feature_selection-022} fselector = fs("random_search") ``` @@ -372,7 +372,7 @@ fselector = fs("random_search") To start the feature selection, we pass the `FSelectInstanceBatchSingleCrit` object to the `$optimize()` method of the initialized `FSelectorBatch` object: -```{r feature-selection-022, output=FALSE} +```{r feature_selection-023, output=FALSE} fselector$optimize(instance) ``` @@ -387,20 +387,20 @@ The algorithm proceeds as follows The best feature subset and the corresponding measured performance can be accessed from the instance: -```{r feature-selection-023} +```{r feature_selection-024} as.data.table(instance$result)[, .(features, classif.acc)] ``` As in the forward selection example above, one can investigate all subset evaluations, which are stored in the archive of the `FSelectInstanceBatchSingleCrit` object and can be accessed by using `as.data.table()`: -```{r feature-selection-024} +```{r feature_selection-025} as.data.table(instance$archive)[1:5, .(bill_depth, bill_length, body_mass, flipper_length, classif.acc)] ``` Now the optimized feature subset can be used to subset the task and fit the model on all observations: -```{r feature-selection-025, eval=FALSE} +```{r feature_selection-026, eval=FALSE} tsk_pen = tsk("penguins") tsk_pen$select(instance$result_feature_set) @@ -415,7 +415,7 @@ You might want to use multiple criteria to evaluate the performance of the featu In the following example, we will perform feature selection on the sonar dataset. This time, we will use `r ref("FSelectInstanceBatchMultiCrit")` to select a subset of features that has high sensitivity, i.e. TPR, and high specificity, i.e. TNR. The feature selection process with multiple criteria is similar to that with a single criterion, except that we select two measures to be optimized: -```{r feature-selection-026} +```{r feature_selection-027} instance = fsi( task = tsk("sonar"), learner = lrn_rpart, @@ -429,14 +429,14 @@ The function `r ref("fsi")` creates an instance of `FSelectInstanceBatchMultiCri We now create an `r ref("FSelectorBatch")` and call the `$optimize()` function of the `FSelectorBatch` with the `FSelectInstanceBatchMultiCrit` object, to search for the subset of features with the best TPR and FPR. Note that these two measures cannot both be optimal at the same time (except for the perfect classifier) and we expect several Pareto-optimal solutions. -```{r feature-selection-027, output=FALSE} +```{r feature_selection-028, output=FALSE} fselector = fs("random_search") fselector$optimize(instance) ``` As above, the best feature subsets and the corresponding measured performance can be accessed from the instance. -```{r feature-selection-029} +```{r feature_selection-029} as.data.table(instance$result)[, .(features, classif.tpr, classif.tnr)] ``` @@ -453,7 +453,7 @@ In the example below, a logistic regression learner is created. This learner is then wrapped in a random search feature selector that uses holdout (inner) resampling for performance evaluation. The sugar function `r ref("auto_fselector", aside = TRUE)` can be used to create an instance of `AutoFSelector`: -```{r feature-selection-030} +```{r feature_selection-030} afs = auto_fselector( fselector = fs("random_search"), learner = lrn("classif.log_reg"), @@ -467,7 +467,7 @@ afs The `AutoFSelector` can then be passed to `benchmark()` or `resample()` for nested resampling (@sec-nested-resampling). 
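As a minimal sketch of the `resample()` route (reusing the `afs` object constructed above, with an outer 3-fold CV as the outer resampling):

```{r, eval = FALSE}
rr = resample(tsk("sonar"), afs, rsmp("cv", folds = 3))
# outer estimate of the performance of the whole feature selection process
rr$aggregate(msr("classif.acc"))
```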
Below we compare our wrapped learner `afs` with a normal logistic regression `lrn("classif.log_reg")`. -```{r feature-selection-031, warning=FALSE} +```{r feature_selection-031, warning=FALSE} grid = benchmark_grid(tsk("sonar"), list(afs, lrn("classif.log_reg")), rsmp("cv", folds = 3)) diff --git a/book/chapters/chapter7/sequential_pipelines.qmd b/book/chapters/chapter7/sequential_pipelines.qmd index 2984c5c49..f57784e1d 100644 --- a/book/chapters/chapter7/sequential_pipelines.qmd +++ b/book/chapters/chapter7/sequential_pipelines.qmd @@ -20,7 +20,7 @@ During model training, the `PipeOp`s in a `Graph` transform a given `Task` and s As well as transforming data, `PipeOp`s generate a *state*, which is used to inform the `PipeOp`s operation during prediction, similar to how learners learn and store model parameters/weights during training that go on to inform model prediction. This is visualized in @fig-pipelines-state using the "Scaling" `PipeOp`, which scales features during training and saves the scaling factors as a state to be used in predictions. -```{r, echo = FALSE, out.width="70%"} +```{r sequential_pipelines-001, echo = FALSE, out.width="70%"} #| label: fig-pipelines-state #| fig-cap: 'The `$train()` method of the "Scaling" PipeOp both transforms data (rectangles) as well as creates a state, which is the scaling factors necessary to transform data during prediction.' #| fig-alt: 'Plot shows a box that says "Dtrain" with an arrow to "Scaling" which itself has an arrow to "Transformed Data". Below "Dtrain" is a box that says "Dtest" with an arrow to "Scaling; Scaling Factors" which itself has an arrow to "Transformed Data". There is an arrow pointing from the scaling box on the top row to the one on the bottom. There is also an arrow from the top row scaling box to "Scaling Factors", the implication is the top row created the scaling factors for the bottom row. Finally, there is a curly bracket next to "Scaling Factors" with the text "State (learned parameters)".' @@ -49,14 +49,14 @@ As with other classes, `PipeOp`s can be constructed with a sugar function, `r re An up-to-date list of `PipeOp`s contained in `mlr3pipelines` with links to their documentation can be found at `r link("https://mlr-org.com/pipeops.html")`, a small subset of these are printed below. If you want to extend `mlr3pipelines` with a `PipeOp` that has not been implemented, have a look at our vignette on extending `PipeOp`s by running: `vignette("extending", package = "mlr3pipelines")`. -```{r} +```{r sequential_pipelines-002} as.data.table(po())[1:6, 1:2] ``` Let us now take a look at a `PipeOp` in practice using `r index('principal component analysis')` (PCA)\index{PCA|see{principal component analysis}} as an example, which is implemented in `r ref("PipeOpPCA")`. Below we construct the `PipeOp` using its ID `"pca"` and inspect it. -```{r pipeop-intro-1, eval = TRUE} +```{r sequential_pipelines-003, eval = TRUE} library(mlr3pipelines) po_pca = po("pca", center = TRUE) @@ -72,7 +72,7 @@ A `PipeOp` can be trained using `$train()`, which can have multiple inputs and o Both inputs and outputs are passed as elements in a single `list`. The `"pca"` `PipeOp` takes as input the original task and after training returns the task with features replaced by their principal components. 
-```{r 05-pipelines-in-depth-003, eval = TRUE} +```{r sequential_pipelines-004, eval = TRUE} tsk_small = tsk("penguins_simple")$select(c("bill_depth", "bill_length")) poin = list(tsk_small$clone()$filter(1:5)) poout = po_pca$train(poin) # poin: Task in a list @@ -83,13 +83,13 @@ poout[[1]]$head() During training, PCA transforms incoming data by rotating it in such a way that features become uncorrelated and are ordered by their contribution to the total variance. The rotation matrix is also saved in the internal `$state` field during training (shown in @fig-pipelines-state), which is then used during predictions and applied to new data. -```{r 05-pipelines-in-depth-005, eval = TRUE} +```{r sequential_pipelines-005, eval = TRUE} po_pca$state ``` Once trained, the `$predict()` function can then access the saved state to operate on the test data, which again is passed as a `list`: -```{r 05-pipelines-in-depth-004, eval = TRUE} +```{r sequential_pipelines-006, eval = TRUE} tsk_onepenguin = tsk_small$clone()$filter(42) poin = list(tsk_onepenguin) poout = po_pca$predict(poin) @@ -107,7 +107,7 @@ When given two `PipeOp`s, this operator creates a `Graph` that first executes th It can also be used to connect a `Graph` with a `PipeOp`, or with another `Graph`. The following example uses `po("mutate")` to add a new feature to the task, and `po("scale")` to then `r index('scale')` and center all numeric features. -```{r 05-sequential-01} +```{r sequential_pipelines-007} po_mutate = po("mutate", mutation = list(bill_ratio = ~bill_length / bill_depth) ) @@ -121,10 +121,10 @@ For each `PipOp` (`ID`), we can see information about the state (`State`), as we In this simple `Graph`, the output of the `"mutate"` `PipeOp` is passed directly to the `"scale"` `PipeOp` and neither takes any other inputs or outputs from other `PipeOp`s. The `r index("$plot()", parent = "Graph", aside = TRUE, code = TRUE)` method can be used to visualize the graph. -```{r 05-sequential-01-evalF, eval = FALSE} +```{r sequential_pipelines-008, eval = FALSE} graph$plot(horizontal = TRUE) ``` -```{r 05-sequential-01-evalT, eval = TRUE, echo = FALSE} +```{r sequential_pipelines-009, eval = TRUE, echo = FALSE} #| label: fig-pipelines-basic-plot #| fig-cap: Simple sequential pipeline plot. #| fig-alt: 'Four boxes in a straight line connected by arrows: " -> mutate -> scale -> ".' @@ -138,14 +138,14 @@ The plot demonstrates how a `Graph` is simply a collection of `PipeOp`s that are The collection of `PipeOp`s inside a `Graph` can be accessed through the `$pipeops` \index{\$pipeops} field. The `$edges` \index{\$edges} field can be used to access edges, which returns a `data.table` listing the "source" (`src_id`, `src_channel`) and "destination" (`dst_id`, `dst_channel`) of data flowing along each edge [`$edges`/`$pipeops`]{.aside}. 
-```{r 05-pipelines-in-depth-018-2, eval = TRUE} +```{r sequential_pipelines-010, eval = TRUE} graph$pipeops graph$edges ``` Instead of using `%>>%`, you can also create a `Graph` explicitly using the `$add_pipeop()` and `$add_edge()` methods to create `PipeOp`s and the edges connecting them: -```{r 05-pipelines-in-depth-016} +```{r sequential_pipelines-011} graph = Graph$new()$ add_pipeop(po_mutate)$ add_pipeop(po_scale)$ @@ -163,7 +163,7 @@ A term such as "directed acyclic multigraph" would be more accurate, but we use Once built, a `Graph` can be used by calling `$train()` and `$predict()` as if it were a `Learner` (though it still outputs a `list` during training and prediction): -```{r 05-pipelines-in-depth-019, eval = TRUE} +```{r sequential_pipelines-012, eval = TRUE} result = graph$train(tsk_small) result result[[1]]$data()[1:3] @@ -177,7 +177,7 @@ Possibly the most common application for `mlr3pipelines` is to use it to perform A `Graph` representing this workflow manipulates data and fits a `Learner`-model during training, ensuring that the data is processed the same way during the prediction stage. Conceptually, the process may look as shown in @fig-pipelines-pipeline. -```{r 05-pipelines-modeling-002, eval = TRUE, echo = FALSE} +```{r sequential_pipelines-013, eval = TRUE, echo = FALSE} #| label: fig-pipelines-pipeline #| fig-cap: "Conceptualization of training and prediction process inside a sequential learner-pipeline. During training (top row), the data is passed along the preprocessing operators, each of which modifies the data and creates a `$state`. Finally, the learner receives the data and a model is created. During prediction (bottom row), data is likewise transformed by preprocessing operators, using their respective `$state` (gray boxes) information in the process. The learner then receives data that has the same format as the data seen during training, and makes a prediction." #| fig-alt: "Top pipeline: Dtrain -> Scaling -> Factor Encoding -> Median Imputation -> Decision Tree. Bottom is same as Top except starts with Dtest and at the end has an arrow to Prediction. Each PipeOp in the top row has an arrow to the same PipeOp in the bottom row pointing to a trained state." @@ -191,12 +191,12 @@ In @fig-pipelines-pipeline the final `PipeOp` is a `Learner`. With either method, internally `Learner`s are passed to `po("learner")`. The following code creates a `r ref("Graph")` that uses `po("imputesample")` to impute\index{imputation} missing values by sampling from observed values (@sec-preprocessing-missing) then fits a `r index('logistic regression')` on the transformed task. -```{r 05-pipelines-modeling-1-evalF, eval = FALSE} +```{r sequential_pipelines-014, eval = FALSE} lrn_logreg = lrn("classif.log_reg") graph = po("imputesample") %>>% lrn_logreg graph$plot(horizontal = TRUE) ``` -```{r 05-pipelines-modeling-1-evalT, eval = TRUE, echo = FALSE} +```{r sequential_pipelines-015, eval = TRUE, echo = FALSE} #| label: fig-pipelines-learnerpipeop #| fig-cap: '`"imputesample"` and `"learner"` PipeOps in a sequential pipeline.' #| fig-alt: 'Four boxes in a straight line connected by arrows: " -> imputesample -> classif.log_reg -> ".' @@ -213,7 +213,7 @@ We have seen how training and predicting `Graph`s is possible but has a slightly To use a `Graph` as a `Learner` with an identical interface, it can be wrapped in a `r ref("GraphLearner", index = TRUE)` object with `r ref("as_learner()", index = TRUE)`[`GraphLearner`]{.aside}. 
The `Graph` can then be used like any other `Learner`, so now we can benchmark our pipeline to decide if we should impute by sampling or with the mode of observed values (`po("imputemode")`): -```{r 05-pipelines-modeling-3} +```{r sequential_pipelines-016} glrn_sample = as_learner(graph) glrn_mode = as_learner(po("imputemode") %>>% lrn_logreg) @@ -233,7 +233,7 @@ In this example, we can see that the `r c("sampling", "mode")[which.min(unlist(a In this book, we always use `as_learner()` to convert a `Graph` to a `Learner` explicitly for clarity. While this conversion is necessary when you want to use `Learner`-specific functions like `$predict_newdata()`, builtin `mlr3` methods like `resample()` and `benchmark_grid()` will make this conversion automatically and it is therefore not strictly needed. In the above example, it is therefore also possible to use -```{r, eval = FALSE} +```{r sequential_pipelines-017, eval = FALSE} design = benchmark_grid(tsk("pima"), list(graph, po("imputesample") %>>% lrn_logreg), rsmp("cv", folds = 3)) @@ -248,7 +248,7 @@ design = benchmark_grid(tsk("pima"), You may want to inspect pipelines and the flow of data to learn more about your pipeline or to debug\index{debugging} them. We first need to set the `$keep_results` flag to be `TRUE` so that intermediate results are retained, which is turned off by default to save memory. -```{r 05-pipelines-modeling-debugging, eval = TRUE} +```{r sequential_pipelines-018, eval = TRUE} glrn_sample$graph_model$keep_results = TRUE glrn_sample$train(tsk("pima")) ``` @@ -257,7 +257,7 @@ The `Graph` can be accessed through the `$graph_model` field and then `PipeOp`s In this example, we can see that our `r ref("Task")` no longer has missing data after training the `"imputesample"` `PipeOp`. This can be used to access arbitrary intermediate results: -```{r 05-pipelines-modeling-debugging-1, eval = TRUE} +```{r sequential_pipelines-019, eval = TRUE} imputesample_output = glrn_sample$graph_model$pipeops$imputesample$ .result imputesample_output[[1]]$missings() @@ -266,7 +266,7 @@ imputesample_output[[1]]$missings() We could also use `$pipeops` to access our underlying `r ref("Learner")`, note we need to use `$learner_model` to get the learner from the `r ref("PipeOpLearner")`. We could use a similar method to peek at the state of any `PipeOp` in the graph: -```{r 05-pipelines-modeling-debugging-2, eval = TRUE} +```{r sequential_pipelines-020, eval = TRUE} pipeop_logreg = glrn_sample$graph_model$pipeops$classif.log_reg learner_logreg = pipeop_logreg$learner_model learner_logreg @@ -284,7 +284,7 @@ In this example we could have used `glrn_sample$base_learner()` to immediately a `PipeOp` hyperparameters are collected together in the `$param_set` of a graph and prefixed with the ID of the `PipeOp` to avoid parameter name clashes. Below we use the same `PipeOp` twice but set the `id` to ensure their IDs are unique. -```{r 05-pipelines-in-depth-035, eval = TRUE} +```{r sequential_pipelines-021, eval = TRUE} graph = po("scale", center = FALSE, scale = TRUE, id = "scale") %>>% po("scale", center = TRUE, scale = FALSE, id = "center") %>>% lrn("classif.rpart", cp = 1) @@ -301,7 +301,7 @@ Do not change the ID of a `PipeOp` through `graph$pipeops$$id = Whether a pipeline is treated as a `Graph` or `GraphLearner`, `r index('hyperparameters')` are updated and accessed in the same way. 
-```{r} +```{r sequential_pipelines-022} graph$param_set$values$classif.rpart.maxdepth = 5 graph_learner = as_learner(graph) graph_learner$param_set$values$classif.rpart.minsplit = 2 diff --git a/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd b/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd index 27e5625cf..7020a4305 100644 --- a/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd +++ b/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd @@ -10,7 +10,7 @@ aliases: `r chapter = "Non-sequential Pipelines and Tuning"` `r authors(chapter)` -```{r pipelines-setup, include = FALSE, cache = FALSE} +```{r non-sequential_pipelines_and_tuning-001, include = FALSE, cache = FALSE} library(mlr3oml) dir.create(here::here("book", "openml"), showWarnings = FALSE, recursive = TRUE) options(mlr3oml.cache = here::here("book", "openml", "cache")) @@ -27,7 +27,7 @@ However, by using the `r ref("gunion()")` function, we can instead combine multi In the following example, we create a `Graph` that centers its inputs (`po("scale")`) and then copies the centered data to two parallel streams: one replaces the data with columns that indicate whether data is missing (`po("missind")`), and the other imputes missing data using the median (`po("imputemedian")`), which we will return to in @sec-preprocessing-missing. The outputs of both streams are then combined into a single dataset using `po("featureunion")`. -```{r 05-pipelines-modeling-003-evalF, eval = FALSE} +```{r non-sequential_pipelines_and_tuning-002, eval = FALSE} library(mlr3pipelines) graph = po("scale", center = TRUE, scale = FALSE) %>>% @@ -39,7 +39,7 @@ graph = po("scale", center = TRUE, scale = FALSE) %>>% graph$plot(horizontal = TRUE) ``` -```{r 05-pipelines-modeling-003-evalT, fig.width = 8, eval = TRUE, echo = FALSE} +```{r non-sequential_pipelines_and_tuning-003, fig.width = 8, eval = TRUE, echo = FALSE} #| label: fig-pipelines-parallel-plot #| fig-cap: 'Simple parallel pipeline plot showing a common data source being scaled then the same data being passed to two `PipeOp`s in parallel whose outputs are combined and returned to the user.' #| fig-alt: 'Six boxes where first two are " -> scale", then "scale" has two arrows to "missind" and "imputemedian" which both have an arrow to "featureunion -> ".' @@ -59,7 +59,7 @@ magick::image_trim(fig) When applied to the first three rows of the `"pima"` task we can see how this imputes missing data and adds a column indicating where values were missing. -```{r 05-pipelines-modeling-004, eval = TRUE} +```{r non-sequential_pipelines_and_tuning-004, eval = TRUE} tsk_pima_head = tsk("pima")$filter(1:3) tsk_pima_head$data(cols = c("diabetes", "insulin", "triceps")) result = graph$train(tsk_pima_head)[[1]] @@ -72,7 +72,7 @@ result$data(cols = c("diabetes", "insulin", "missing_insulin", "triceps", It is common in `r ref("Graph")`s for an operation to be applied to a subset of features. In `mlr3pipelines` this can be achieved in two ways (@fig-pipelines-select-affect): either by passing the column subset to the `affect_columns` hyperparameter of a `r ref("PipeOp")` (assuming it has that hyperparameter), which controls which columns should be affected by the `PipeOp`; or, one can use the `r ref("PipeOpSelect", index = TRUE)` operator to create operations in parallel on specified feature subsets, and then unite the result using `r ref("PipeOpFeatureUnion")`. 
-```{r echo = FALSE, out.width = "70%"}
+```{r non-sequential_pipelines_and_tuning-005, echo = FALSE, out.width = "70%"}
 #| label: fig-pipelines-select-affect
 #| layout-nrow: 2
 #| fig-cap: "Two methods of setting up `PipeOp`s (`po(op1)` and `po(op2)`) that operate on complementary features (X and ¬X) of an input task."
@@ -92,7 +92,7 @@ These are helper functions that indicate to a `PipeOp` which features it should
 For example, in @sec-pipelines-pipeops we applied PCA to the bill length and depth of penguins from `tsk("penguins_simple")` by first selecting these columns using the `Task` method `$select()` and then applying the `PipeOp`.
 We can now do this more simply with `selector_grep`, and could go on to use `selector_invert` to apply some other `PipeOp` to other features, below we use `po("scale")` and make use of the `affect_columns` hyperparameter:

-```{r 05-pipelines-multicol-1, eval = TRUE}
+```{r non-sequential_pipelines_and_tuning-006, eval = TRUE}
 sel_bill = selector_grep("^bill")
 sel_not_bill = selector_invert(sel_bill)

@@ -109,7 +109,7 @@ For example, if we had reversed the order of `po("pca")` and `po("scale")` above
 Creating parallel paths with `po("select")` can help mitigate such errors by selecting features given by the `Selector` and creating independent data processing streams with the given feature subset.
 Below we pass the parallel pipelines to `r ref("gunion()")` as a `list` to ensure they receive the same input, and then combine the outputs with `po("featureunion")`.

-```{r 05-pipelines-multicol-3-evalF, eval = FALSE}
+```{r non-sequential_pipelines_and_tuning-007, eval = FALSE}
 po_select_bill = po("select", id = "s_bill", selector = sel_bill)
 po_select_not_bill = po("select", id = "s_notbill",
   selector = sel_not_bill)
@@ -120,7 +120,7 @@ path_scale = po_select_not_bill %>>% po("scale")
 graph = gunion(list(path_pca, path_scale)) %>>% po("featureunion")
 graph$plot(horizontal = TRUE)
 ```
-```{r 05-pipelines-multicol-3-evalT, fig.width = 8, eval = TRUE, echo = FALSE}
+```{r non-sequential_pipelines_and_tuning-008, fig.width = 8, eval = TRUE, echo = FALSE}
 #| label: fig-pipelines-pcascale
 #| fig-cap: Visualization of a `Graph` where features are split into two paths, one with PCA and one with scaling, then combined and returned.
 #| fig-alt: 'Seven boxes where first is "" which points to "s_bill -> pca" and "s_notbill" -> scale", then both "pca" and "scale" point to "featureunion -> ".'
@@ -141,14 +141,14 @@ magick::image_trim(fig)
 The `po("select")` method also has the significant advantage that it allows the same set of features to be used in multiple operations simultaneously, or to both transform features and keep their untransformed versions (by using `po("nop")` in one path).
 `r ref("PipeOpNOP")` performs no operation on its inputs and is thus useful when you only want to perform a transformation on a subset of features and leave the others untouched:

-```{r 05-pipelines-multicol-5-evalF, eval = FALSE}
+```{r non-sequential_pipelines_and_tuning-009, eval = FALSE}
 graph = gunion(list(
   po_select_bill %>>% po("scale"),
   po_select_not_bill %>>% po("nop")
 )) %>>% po("featureunion")
 graph$plot(horizontal = TRUE)
 ```
-```{r 05-pipelines-multicol-5-evalT, fig.width = 8, eval = TRUE, echo = FALSE}
+```{r non-sequential_pipelines_and_tuning-010, fig.width = 8, eval = TRUE, echo = FALSE}
 #| label: fig-pipelines-selectnop
 #| fig-cap: Visualization of our `Graph` where features are split into two paths, features that start with 'bill' are scaled and the rest are untransformed.
#| fig-alt: 'Seven boxes where first is "" which points to "s_bill -> scale" and "s_notbill -> nop", then both "scale" and "nop" point to "featureunion -> ".' @@ -162,7 +162,7 @@ invisible(dev.off()) magick::image_trim(fig) ``` -```{r 05-pipelines-multicol-6, eval = TRUE} +```{r non-sequential_pipelines_and_tuning-011, eval = TRUE} graph$train(tsk("penguins_simple"))[[1]]$data()[1:3, 1:5] ``` @@ -195,7 +195,7 @@ Each learner is trained on a different random sample of the original data. Although we have already seen that a pre-constructed bagging pipeline is available with `ppl("bagging")`, in this section we will build our own pipeline from scratch to showcase how to construct a complex `r ref("Graph")`, which will look something like @fig-pipelines-bagging. -```{r, echo = FALSE, out.width = "70%"} +```{r non-sequential_pipelines_and_tuning-012, echo = FALSE, out.width = "70%"} #| label: fig-pipelines-bagging #| fig-cap: "Graph that performs Bagging by independently subsampling data and fitting individual decision tree learners. The resulting predictions are aggregated by a majority vote `PipeOp`." #| fig-alt: 'Graph shows "Dtrain" with arrows to four separate po("subsample") boxes that each have a separate arrow to four more po("classif.rpart") boxes that each have an arrow to the same one po("classif.avg") box.' @@ -204,19 +204,19 @@ include_multi_graphics("mlr3book_figures-26") To begin, we use `po("subsample")` to sample a fraction of the data (here 70%), which is then passed to a classification tree (note by default `po("subsample")` samples without replacement). -```{r 05-pipelines-non-sequential-009, eval = TRUE} +```{r non-sequential_pipelines_and_tuning-013, eval = TRUE} gr_single_pred = po("subsample", frac = 0.7) %>>% lrn("classif.rpart") ``` Next, we use `ppl("greplicate")` to copy the graph, `gr_single_pred`, 10 times (`n = 10`) and finally `po("classifavg")` to take the majority vote of all predictions, note that we pass `innum = 10` to `"classifavg"` to tell the `r ref("PipeOp")` to expect 10 inputs. -```{r 05-pipelines-non-sequential-010-evalT, eval = FALSE} +```{r non-sequential_pipelines_and_tuning-014, eval = FALSE} gr_pred_set = ppl("greplicate", graph = gr_single_pred, n = 10) gr_bagging = gr_pred_set %>>% po("classifavg", innum = 10) gr_bagging$plot() ``` -```{r 05-pipelines-non-sequential-010-evalF, echo = FALSE} +```{r non-sequential_pipelines_and_tuning-015, echo = FALSE} #| label: fig-pipelines-bagginggraph #| fig-cap: Constructed bagging `Graph` with one input being sampled many times for 10 different learners. #| fig-alt: 'Parallel pipeline showing "" pointing to ten PipeOps "subsample_1",...,"subsample_10" that each separately point to "classif.rpart_1",...,"classif.rpart_10" respectively, which all point to the same "classifavg -> ".' @@ -230,7 +230,7 @@ magick::image_trim(fig) Now let us see how well our bagging pipeline compares to the single decision tree and a random forest when benchmarked against `tsk("sonar")`. -```{r 05-pipelines-non-sequential-013} +```{r non-sequential_pipelines_and_tuning-016} # turn graph into learner glrn_bagging = as_learner(gr_bagging) glrn_bagging$id = "bagging" @@ -247,7 +247,7 @@ The bagged learner performs better than the decision tree but worse than the ran To automatically recreate this pipeline, you can construct `ppl("bagging")` by specifying the learner to 'bag', the number of iterations, the fraction of data to sample, and the `r ref("PipeOp")` to average the predictions, as shown in the code below. 
 Note we set `collect_multiplicity = TRUE` which collects the predictions across paths, that technically use the `r ref("Multiplicity")` method, which we will not discuss here but refer the reader to the documentation.

-```{r, eval = FALSE}
+```{r non-sequential_pipelines_and_tuning-017, eval = FALSE}
 ppl("bagging", lrn("classif.rpart"),
   iterations = 10, frac = 0.7,
   averager = po("classifavg", collect_multiplicity = TRUE))
@@ -258,7 +258,7 @@ While we cannot implement this directly with `mlr3pipelines`, we can use a custo
 We will create this `Selector` by passing a function that takes as input the task and returns a sample of the features, we sample the square root of the number of features to mimic the implementation in `r ref("ranger::ranger")`.
 For efficiency, we will now use `ppl("bagging")` to recreate the steps above:

-```{r 05-bagging-ex}
+```{r non-sequential_pipelines_and_tuning-018}
 # custom selector
 selector_subsample = function(task) {
   sample(task$feature_names, sqrt(length(task$feature_names)))
@@ -301,7 +301,7 @@ Stacking can be built with more than two levels (both conceptually, and in `mlr3
 As with bagging, we will demonstrate how to create a stacking pipeline manually, although a pre-constructed pipeline is available with `ppl("stacking")`.

-```{r echo = FALSE, out.width = "70%"}
+```{r non-sequential_pipelines_and_tuning-019, echo = FALSE, out.width = "70%"}
 #| label: fig-pipelines-stacking
 #| fig-cap: "Graph that performs Stacking by fitting three models and using their outputs as features for another model after combining with `PipeOpFeatureUnion`."
 #| fig-alt: 'Graph shows "Dtrain" with arrows to three boxes: "Decision Tree", "KNN", and "Lasso Regression". Each of these points to the same "Feature Union -> Logistic Regression".'
@@ -317,7 +317,7 @@ We first create the level 0 learners to produce the predictions that will be use
 In this example, we use a classification tree\index{decision tree}, `r index('k-nearest neighbors')` (KNN)\index{KNN|see{k-nearest neighbors}}, and a regularized GLM\index{generalized linear model}.
 Each learner is wrapped in `po("learner_cv")` which performs cross-validation on the input data and then outputs the predictions from the `r ref("Learner")` in a new `r ref("Task")` object.

-```{r 05-pipelines-non-sequential-015}
+```{r non-sequential_pipelines_and_tuning-020}
 lrn_rpart = lrn("classif.rpart", predict_type = "prob")
 po_rpart_cv = po("learner_cv", learner = lrn_rpart,
   resampling.folds = 2, id = "rpart_cv"
@@ -339,7 +339,7 @@ po_glmnet_cv = po("learner_cv",
 These learners are combined using `r ref("gunion()")`, and `po("featureunion")` is used to merge their predictions.
 This is demonstrated in the output of `$train()`:

-```{r 05-pipelines-non-sequential-016, warning = FALSE}
+```{r non-sequential_pipelines_and_tuning-021, warning = FALSE}
 gr_level_0 = gunion(list(po_rpart_cv, po_knn_cv, po_glmnet_cv))
 gr_combined = gr_level_0 %>>% po("featureunion")

@@ -358,7 +358,7 @@ The resulting task contains the predicted probabilities for both classes made fr
 However, as the probabilities always add up to $1$, we only need the predictions for one of the classes (as this is a binary classification task), so we can use `po("select")` to only keep predictions for one class (we choose `"M"` in this example).
-```{r 05-pipelines-non-sequential-017} +```{r non-sequential_pipelines_and_tuning-022} gr_stack = gr_combined %>>% po("select", selector = selector_grep("\\.M$")) ``` @@ -366,12 +366,12 @@ gr_stack = gr_combined %>>% Finally, we can combine our pipeline with the final model that will take these predictions as its input. Below we use `r index('logistic regression')`, which combines the level 0 predictions in a weighted linear sum. -```{r 05-pipelines-non-sequential-018-evalF, eval = FALSE} +```{r non-sequential_pipelines_and_tuning-023, eval = FALSE} gr_stack = gr_stack %>>% po("learner", lrn("classif.log_reg")) gr_stack$plot(horizontal = TRUE) ``` -```{r 05-pipelines-non-sequential-018-evalT, fig.width = 10, echo = FALSE} +```{r non-sequential_pipelines_and_tuning-024, fig.width = 10, echo = FALSE} #| label: fig-pipelines-stackinggraph #| fig-cap: 'Constructed stacking Graph with one input being passed to three weak learners whose predictions are passed to the logistic regression.' #| fig-alt: 'Graph with "" in the first box with arrows to three boxes: "rpart_cv", "knn_cv", "glmnet_cv", which all have arrows pointing to the same boxes: "featureunion -> select -> classif.log_reg -> ".' @@ -384,7 +384,7 @@ magick::image_trim(fig) As our final model was an interpretable logistic regression, we can inspect the weights of the level 0 learners by looking at the final trained model: -```{r 05-pipelines-non-sequential-019-x, warning = FALSE} +```{r non-sequential_pipelines_and_tuning-025, warning = FALSE} glrn_stack = as_learner(gr_stack) glrn_stack$train(tsk("sonar")) glrn_stack$base_learner()$model @@ -393,7 +393,7 @@ glrn_stack$base_learner()$model The model weights suggest that `r c("rpart", "knn", "glmnet")[which.max(glrn_stack$base_learner()$model$coefficients[-1])]` influences the predictions the most with the largest coefficient. To confirm this we can benchmark the individual models alongside the stacking pipeline. -```{r 05-pipelines-non-sequential-019-1-background, warning = FALSE} +```{r non-sequential_pipelines_and_tuning-026, warning = FALSE} glrn_stack$id = "stacking" design = benchmark_grid(tsk("sonar"), list(lrn_rpart, lrn_knn, lrn_glmnet, glrn_stack), rsmp("repeated_cv")) @@ -404,7 +404,7 @@ bmr$aggregate()[, .(learner_id, classif.ce)] This experiment confirms that of the individual models, the KNN learner performs the best, however, our stacking pipeline outperforms them all. Now that we have seen the inner workings of this pipeline, next time you might want to more efficiently create it using `ppl("stacking")`, to copy the example above you would run: -```{r, eval = FALSE} +```{r non-sequential_pipelines_and_tuning-027, eval = FALSE} ppl("stacking", base_learners = lrns(c("classif.rpart", "classif.kknn", "classif.glmnet")), @@ -431,14 +431,14 @@ By wrapping a pipeline inside a `r ref("GraphLearner")`, we can tune it at two l Let us consider a simple, sequential pipeline using `po("pca")` followed by `lrn("classif.kknn")`: -```{r} +```{r non-sequential_pipelines_and_tuning-028} graph_learner = as_learner(po("pca") %>>% lrn("classif.kknn")) ``` The optimal setting of the `rank.` hyperparameter of our PCA `r ref("PipeOp")` may realistically depend on the value of the `k` hyperparameter of the KNN model so jointly tuning them is reasonable. For this, we can simply use the syntax for tuning `Learner`s, which was introduced in @sec-optimization. -```{r} +```{r non-sequential_pipelines_and_tuning-029} lrn_knn = lrn("classif.kknn", k = to_tune(1, 32)) po_pca = po("pca", rank. 
= to_tune(2, 20)) graph_learner = as_learner(po_pca %>>% lrn_knn) @@ -448,7 +448,7 @@ graph_learner$param_set$values We can see how the pipeline's `$param_set` includes the tune tokens for all selected hyperparameters, creating a joint search space. We can compare the tuned and untuned pipeline in a benchmark experiment with nested resampling by using an `AutoTuner`: -```{r} +```{r non-sequential_pipelines_and_tuning-030} glrn_tuned = auto_tuner(tnr("random_search"), graph_learner, rsmp("holdout"), term_evals = 10) glrn_untuned = po("pca") %>>% lrn("classif.kknn") @@ -467,7 +467,7 @@ However, we tuned the PCA without first considering if it was even beneficial at `po("branch")` creates multiple paths such that data can only flow through *one* of these as determined by the `selection` hyperparameter (@fig-pipelines-alternatives). This concept makes it possible to use tuning to decide which `r ref("PipeOp")`s and `r ref("Learner")`s to include in the pipeline, while also allowing all options in every path to be tuned. -```{r, echo = FALSE, out.width = "100%"} +```{r non-sequential_pipelines_and_tuning-031, echo = FALSE, out.width = "100%"} #| label: fig-pipelines-branching #| fig-cap: 'Figure demonstrates the `po("branch")` and `po("unbranch")` operators where three separate branches are created and data only flows through the PCA, which is specified with the argument to `selection`.' #| fig-alt: 'Graph with "Dtrain" on the left with an arrow to `po("branch", selection = "pca")` which then has a dark shaded arrow to a box that says "PCA". Above this box is a transparent box that says "PipeOpNOP" and below the "PCA" box is another transparent box that says "YeoJohnson", the implication is that only the "PCA" box is active. The "PCA" box then has an arrow to `po("unbranch")` -> po("branch", selection = "XGBoost")` which has three arrows to another three boxes with "XGBoost" highlighted and "Random Forest" and "Decision Tree" transparent again. These finally have arrows to the same `po("unbranch")`.' @@ -477,7 +477,7 @@ include_multi_graphics("mlr3book_figures-24") To demonstrate alternative paths we will make use of the MNIST [@lecun1998gradient] data, which is useful for demonstrating preprocessing. The data is loaded from OpenML, which is described in @sec-openml, we subset the data to make the example run faster. -```{r} +```{r non-sequential_pipelines_and_tuning-032} library(mlr3oml) otsk_mnist = otsk(id = 3573) tsk_mnist = as_task(otsk_mnist)$ @@ -489,7 +489,7 @@ tsk_mnist = as_task(otsk_mnist)$ Below we create three branches: do nothing (`po("nop")`), apply PCA (`po("pca")`), remove constant features (`po("removeconstants")`) then apply the `r index('Yeo-Johnson', lower = FALSE)` transform (`po("yeojohnson")`). It is important to use `po("unbranch")` (with the same arguments as `"branch"`) to ensure that the outputs are merged into one result object. -```{r 05-pipelines-non-sequential-003, eval = FALSE} +```{r non-sequential_pipelines_and_tuning-033, eval = FALSE} paths = c("nop", "pca", "yeojohnson") graph = po("branch", paths, id = "brnchPO") %>>% @@ -503,7 +503,7 @@ graph = po("branch", paths, id = "brnchPO") %>>% graph$plot(horizontal = TRUE) ``` -```{r 05-pipelines-non-sequential-004-evalT, fig.width = 10, echo = FALSE} +```{r non-sequential_pipelines_and_tuning-034, fig.width = 10, echo = FALSE} #| label: fig-pipelines-branchone #| fig-cap: 'Graph with branching to three different paths that are split with `po("branch")` and combined with `po("unbranch")`.' 
#| fig-alt: 'Graph starting with " -> brnchPO" which has three arrows to "removeconstants -> yeojohnson", "nop", and "pca", which all then point to "unbrnchPO -> ".' @@ -524,7 +524,7 @@ magick::image_trim(fig) We can see how the output of this `Graph` depends on the setting of the `branch.selection` hyperparameter: -```{r 05-pipelines-branch-01} +```{r non-sequential_pipelines_and_tuning-035} # use the "PCA" path graph$param_set$values$brnchPO.selection = "pca" # new PCA columns @@ -537,20 +537,20 @@ head(graph$train(tsk_mnist)[[1]]$feature_names) `ppl("branch")` simplifies the above by allowing you to just pass the different paths to the `graphs` argument (omitting "`rm_const`" for simplicity here): -```{r, eval = FALSE} +```{r non-sequential_pipelines_and_tuning-036, eval = FALSE} ppl("branch", graphs = pos(c("nop", "pca", "yeojohnson"))) ``` Branching can even be used to tune which of several learners is most appropriate for a given dataset. We extend our example further and add the choice between a decision tree and KKNN: -```{r 05-pipelines-branch-02-evalF, eval = FALSE} +```{r non-sequential_pipelines_and_tuning-037, eval = FALSE} graph_learner = graph %>>% ppl("branch", lrns(c("classif.rpart", "classif.kknn"))) graph_learner$plot(horizontal = TRUE) ``` -```{r 05-pipelines-branch-02-evalT, fig.width = 8, fig.height = 6, echo = FALSE, out.width = "100%"} +```{r non-sequential_pipelines_and_tuning-038, fig.width = 8, fig.height = 6, echo = FALSE, out.width = "100%"} #| label: fig-pipelines-branchtwo #| fig-cap: 'Graph with branching to three different paths that are split with `po("branch")` and combined with `po("unbranch")` then branch and recombine again.' #| fig-alt: 'Graph starts with " -> brnchPO" which has three arrows to "removeconstants -> yeojohnson", "nop", and "pca", which all then point to "unbrnchPO -> branch", which then has two arrows to "classif.rpart" and "classif.kknn" which then both point to "unbranch -> ".' @@ -566,7 +566,7 @@ Tuning the `selection` hyperparameters can help determine which of the possible We additionally tune the `k` hyperparameter of the KNN learner, as it may depend on the type of preprocessing performed. As this hyperparameter is only active when the `"classif.kknn"` path is chosen we will set a dependency (@sec-optimization-depends): -```{r 05-pipelines-branch-03-prep, echo = FALSE} +```{r non-sequential_pipelines_and_tuning-039, echo = FALSE} # instead of plotting, we make autoplot() save the plot so we can edit it afterwards # This is *not* the same as ggplot::last_plot(), but the result is easier to handle in a loop. plt_container = new.env() @@ -577,7 +577,7 @@ autoplot = function(...) { } ``` -```{r 05-pipelines-branch-03} +```{r non-sequential_pipelines_and_tuning-040} graph_learner = as_learner(graph_learner) graph_learner$param_set$set_values( @@ -596,7 +596,7 @@ instance$archive$data[order(classif.ce)[1:5], autoplot(instance) ``` -```{r 05-pipelines-branch-03-post, echo = FALSE, message = FALSE, warning = FALSE} +```{r non-sequential_pipelines_and_tuning-041, echo = FALSE, message = FALSE, warning = FALSE} #| label: fig-nonseq-instance #| fig-cap: Instance after tuning preprocessing branch choice (`brnchPO.selection`), KNN `k` parameter (`classif.kknn.k`), and learning branch choice (`branch.selection`). Dots are different hyperparameter configurations that were tested during tuning, colors separate hyperparameter configurations. #| fig-alt: "Three scatter plots all with y-axis 'classif.ce' from around 0.25 to 0.5. 
Left plot is 'brnchPO.selection', middle is 'classif.knn.k', right is 'branch.selection'. x-axis text is the hyperparameter values to tune. Each 'row' of the y-axis indicates a different hyperparameter configuration (also separated by colored dots). The bottom row (and therefore best configuration) is at around 0.22 and shows the same results as in the instance output. Other 'rows' show a trade-off between KKNN `k` parameter, choice of learner, and choice of operators." @@ -638,7 +638,7 @@ As we can see in the results and @fig-nonseq-instance, the KNN-learner with `k` `po("proxy")` is a meta-operator that performs the operation that is stored in its `content` hyperparameter, which could be another `r ref("PipeOp")` or `r ref("Graph")`. It can therefore be used to tune over and select different `PipeOp`s or `Graph`s that could be passed to this hyperparameter (@fig-pipelines-alternatives). -```{r, echo = FALSE, out.width = "70%"} +```{r non-sequential_pipelines_and_tuning-042, echo = FALSE, out.width = "70%"} #| label: fig-pipelines-alternatives #| fig-cap: 'Figure demonstrates the `po("proxy")` operator with a `PipeOp` as its argument.' #| fig-alt: 'Graph with "Dtrain -> po("proxy", content = PCA) -> po("proxy", content = XGBoost)"; "PCA" and "XGBoost" are represented as boxes that imply PipeOps.' @@ -647,7 +647,7 @@ include_multi_graphics("mlr3book_figures-25") To recreate the example above with `po("proxy")`, the first step is to create placeholder `r ref("PipeOpProxy")` operators to stand in for the operations (i.e., different paths) that should be tuned. -```{r} +```{r non-sequential_pipelines_and_tuning-043} graph_learner = po("proxy", id = "preproc") %>>% po("proxy", id = "learner") graph_learner = as_learner(graph_learner) @@ -656,7 +656,7 @@ graph_learner = as_learner(graph_learner) The tuning space for the `content` hyperparameters should be a discrete set of possibilities to be evaluated, passed as a `r ref("p_fct")` (@sec-tune-ps). For the `"preproc"` proxy operator this would simply be the different `PipeOp`s that we want to consider: -```{r} +```{r non-sequential_pipelines_and_tuning-044} # define content for the preprocessing proxy operator preproc.content = p_fct(list( nop = po("nop"), @@ -670,7 +670,7 @@ The choice of the learner itself (`lrn("classif.rpart")` or `lrn("classif.kknn") To enable this we pass a transformation to `.extra_trafo` (@sec-tune-trafo). Note that inside this transformation we clone `learner.content`, otherwise, we would end up modifying the original `r ref("Learner")` object inside the search space by reference (@sec-r6). -```{r} +```{r non-sequential_pipelines_and_tuning-045} # define content for the learner proxy operator learner.content = p_fct(list( classif.rpart = lrn("classif.rpart"), @@ -690,7 +690,7 @@ trafo = function(x, param_set) { We can now put this all together, add the KNN tuning, and run the experiment. 
-```{r} +```{r non-sequential_pipelines_and_tuning-046} search_space = ps( preproc.content = preproc.content, learner.content = learner.content, @@ -729,7 +729,7 @@ Resampling with less data will still give us some information about the relative In this example, we will optimize the SVM\index{support vector machine} hyperparameters, `cost` and `gamma`, on `tsk("sonar")`: -```{r optimization-070} +```{r non-sequential_pipelines_and_tuning-047} library(mlr3tuning) learner = lrn("classif.svm", id = "svm", type = "C-classification", @@ -740,7 +740,7 @@ learner = lrn("classif.svm", id = "svm", type = "C-classification", We then construct `po("subsample")` and specify that we want to use the `frac` parameter between $[3^{-3}, 1]$ as our fidelity parameter and set the `"budget"` tag to pass this information to Hyperband. We add this to our SVM and create a `r ref("GraphLearner")`. -```{r} +```{r non-sequential_pipelines_and_tuning-048} graph_learner = as_learner( po("subsample", frac = to_tune(p_dbl(3^-3, 1, tags = "budget"))) %>>% learner @@ -749,14 +749,14 @@ graph_learner = as_learner( As good practice, we encapsulate our learner and add a fallback to prevent fatal errors (@sec-tuning-errors). -```{r} +```{r non-sequential_pipelines_and_tuning-049} graph_learner$encapsulate("evaluate", lrn("classif.featureless")) graph_learner$timeout = c(train = 30, predict = 30) ``` Now we can tune our SVM by tuning our `GraphLearner` as normal, below we set `eta = 3` for Hyperband. -```{r optimization-076} +```{r non-sequential_pipelines_and_tuning-050} instance = tune(tnr("hyperband", eta = 3), tsk("sonar"), graph_learner, rsmp("cv", folds = 3), msr("classif.ce")) @@ -778,7 +778,7 @@ Now that we have covered pipelines and tuning, we will briefly return to feature Below we use the information gain filter and select the top three features: -```{r feature-selection-012, warning = FALSE, message = FALSE} +```{r non-sequential_pipelines_and_tuning-051, warning = FALSE, message = FALSE} library(mlr3filters) library(mlr3fselect) @@ -794,7 +794,7 @@ po("filter", filter = flt("information_gain"), filter.nfeat = 3)$ Choosing `3` as the cutoff was fairly arbitrary but by tuning a graph we can optimize this cutoff: -```{r feature-selection-013} +```{r non-sequential_pipelines_and_tuning-052} # tune between 1 and total number of features po_filter = po("filter", filter = flt("information_gain"), filter.nfeat = to_tune(1, task_pen$ncol)) @@ -809,7 +809,7 @@ instance$result In this example, ``r instance$result$information_gain.filter.nfeat`` is the optimal number of features. It can be especially useful in feature selection to visualize the tuning results as there may be cases where the optimal result is only marginally better than a result with less features (which would lead to a model that is quicker to train and possibly easier to interpret). -```{r feature-selection-016} +```{r non-sequential_pipelines_and_tuning-053} #| label: fig-tunefilter #| fig-cap: Model performance with different numbers of features, selected by an information gain filter. #| fig-alt: Plot showing model performance in filter-based feature selection, showing that adding a second, third, and fourth feature to the model improves performance, while adding more features achieves no further performance gain. 
diff --git a/book/chapters/chapter9/preprocessing.qmd b/book/chapters/chapter9/preprocessing.qmd index 95a7e1632..d977916aa 100644 --- a/book/chapters/chapter9/preprocessing.qmd +++ b/book/chapters/chapter9/preprocessing.qmd @@ -34,7 +34,7 @@ The dataset includes 2,930 residential properties (rows) situated in Ames, Iowa, It contains 81 features about various aspects of the property, the size and shape of the lot, and information about its condition and quality. The prediction target is the sale price in USD, hence it is a regression task. -```{r, message=FALSE} +```{r preprocessing-001, message=FALSE} ames = mlr3data::ames_housing ``` @@ -45,7 +45,7 @@ This can be done quite efficiently with a package like `r ref_pkg("DataExplorer" Below we summarize the most important findings for data cleaning, but we only consider this aspect in a cursory manner: -```{r preprocessing-003, message=FALSE} +```{r preprocessing-002, message=FALSE} # 1. `Misc_Feature_2` is a factor with only a single level `Othr`. summary(ames$Misc_Feature_2) # 2. `Condition_2` and `Condition_3` are identical. @@ -56,7 +56,7 @@ cor(ames$Lot_Area, ames$Lot_Area_m2) For all three problems, simply removing the problematic features (or feature in a pair) might be the best course of action. -```{r preprocessing-006, message=FALSE} +```{r preprocessing-003, message=FALSE} to_remove = c("Lot_Area_m2", "Condition_3", "Misc_Feature_2") ``` @@ -69,7 +69,7 @@ Other typical problems that should be checked are: Before we continue with feature engineering we will create a task, measure, and resampling strategy to use throughout the chapter. -```{r preprocessing-007, message=FALSE} +```{r preprocessing-004, message=FALSE} tsk_ames = as_task_regr(ames, target = "Sale_Price", id = "ames") # remove problematic features tsk_ames$select(setdiff(tsk_ames$feature_names, to_remove)) @@ -81,7 +81,7 @@ rsmp_cv3$instantiate(tsk_ames) Lastly, we run a very simple experiment to verify our setup works as expected with a simple featureless baseline, note below we set `robust = TRUE` to always predict the *median* sale price as opposed to the *mean*. -```{r preprocessing-008, message=FALSE} +```{r preprocessing-005, message=FALSE} lrn_baseline = lrn("regr.featureless", robust = TRUE) lrn_baseline$id = "Baseline" rr_baseline = resample(tsk_ames, lrn_baseline, rsmp_cv3) @@ -92,7 +92,7 @@ rr_baseline$aggregate(msr_mae) Many machine learning algorithm implementations, such as XGBoost [@chen2016xgboost], cannot handle categorical data and so categorical features must be encoded\index{encoding} into numerical variables. -```{r preprocessing-010, message=FALSE, error=TRUE} +```{r preprocessing-006, message=FALSE, error=TRUE} lrn_xgb = lrn("regr.xgboost", nrounds = 100) lrn_xgb$train(tsk_ames) ``` @@ -100,13 +100,13 @@ lrn_xgb$train(tsk_ames) Categorical features can be grouped by their cardinality, which refers to the number of levels they contain: binary features (two levels), low-cardinality features, and high-cardinality features; there is no universal threshold for when a feature should be considered high-cardinality and this threshold can even be tuned. For now, we will consider high-cardinality to be features with more than 10 levels: -```{r} +```{r preprocessing-007} names(which(lengths(tsk_ames$levels()) > 10)) ``` Binary features can be trivially encoded by setting one of the feature levels to `1` and the other to `0`. 
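As a quick, self-contained illustration of that mapping (a toy example, separate from the Ames task), `po("encode", method = "treatment")` turns a two-level factor into a single 0/1 column:

```{r}
library(mlr3verse)

# toy regression task with one binary factor feature
toy = data.frame(
  y = c(1.2, 0.7, 1.5, 0.9),
  heating = factor(c("gas", "electric", "gas", "electric"))
)
tsk_toy = as_task_regr(toy, target = "y")

# treatment encoding: "electric" (first level) is the reference (0), "gas" becomes 1
po("encode", method = "treatment")$train(list(tsk_toy))[[1]]$data()
```

The same operator is applied to the binary features of the Ames task in the pipeline constructed later in this section.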
-```{r} +```{r preprocessing-008} names(which(lengths(tsk_ames$levels()) == 2)) ``` @@ -137,7 +137,7 @@ Stratification on such features would be an alternative way to mitigate this (@s In the code below we use `po("removeconstants")` to remove features with only one level, `po("collapsefactors")` to collapse levels that occur less than 1% of the time in the data, `po("encodeimpact")` to impact-encode high-cardinality features, `po("encode", method = "one-hot")` to one-hot encode low-cardinality features, and finally `po("encode", method = "treatment")` to treatment encode binary features. -```{r preprocessing-011, message=FALSE} +```{r preprocessing-009, message=FALSE} factor_pipeline = po("removeconstants") %>>% po("collapsefactors", no_collapse_above_prevalence = 0.01) %>>% @@ -157,7 +157,7 @@ Likewise, once the treatment encoding PipeOp sees the data, all non-binary `fact Now we can apply this pipeline to our xgboost model to use it in a benchmark experiment; we also compare a simpler pipeline that only uses one-hot encoding to demonstrate performance differences resulting from different strategies. -```{r preprocessing-013, message=FALSE} +```{r preprocessing-010, message=FALSE} glrn_xgb_impact = as_learner(factor_pipeline %>>% lrn_xgb) glrn_xgb_impact$id = "XGB_enc_impact" @@ -177,7 +177,7 @@ If you are interested in learning more about different encoding strategies, incl A common problem in real-world data is `r index('missing values', 'missing data')` in features. In the Ames dataset, several variables have at least one missing data point: -```{r} +```{r preprocessing-011} # print first five with missing data names(which(tsk_ames$missings() > 0))[1:5] ``` @@ -189,7 +189,7 @@ Alternatively, one can impute by sampling from the empirical distribution of the Instead of guessing at what a missing feature might be, missing values could instead be replaced by a new level, for example, called `.MISSING` (`po("imputeoor")`). For numeric features, @ding2010investigation show that for binary classification and tree-based models, encoding missing values out-of-range (OOR), e.g. a constant value above the largest observed value, is a reasonable approach. -```{r, echo = FALSE, out.width = "60%"} +```{r preprocessing-012, echo = FALSE, out.width = "60%"} #| label: fig-imputation #| fig-cap: Mean imputation of missing values using observed values. #| fig-alt: "On the left is a vector of numbers in a column, (1.3, NA, 1.5, NA). The non-NA numbers have arrows pointing to (1.3+1.5)/2, which then has an arrow pointing to a vector of numbers in a column on the right but now (1.3, 1.4, 1.5, 1.4) with '1.4' in red to highlight they were imputed with the mean." @@ -204,7 +204,7 @@ Imputing data from minoritized communities would at best mask this data bias, an In the code below we create a pipeline from the `r ref("PipeOp")`s listed above as well as making use of `po("featureunion")` to combine multiple `PipeOp`s acting on the `"integer"` columns. -```{r preprocessing-014-evalF, eval = FALSE} +```{r preprocessing-013, eval = FALSE} impute_hist = list( po("missind", type = "integer", affect_columns = selector_type("integer") @@ -217,7 +217,7 @@ impute_hist = list( impute_hist$plot(horizontal = TRUE) ``` -```{r preprocessing-014-evalT, fig.width = 8, echo = FALSE} +```{r preprocessing-014, fig.width = 8, echo = FALSE} #| label: fig-impute #| fig-cap: Pipeline to impute missing values of numeric features by histogram with binary indicators and missings in categoricals out-of-range with a new level. 
#| fig-alt: "Flow diagram shows '' with arrows to 'missind' and 'imputehist', which both have arrows to 'featureunion', which has an arrow to 'imputeoor' that has an arrow to '." @@ -238,7 +238,7 @@ magick::image_trim(fig) Using this pipeline we can now run experiments with `lrn("regr.ranger")`, which cannot handle missing data; we also compare a simpler pipeline that only uses OOR imputation to demonstrate performance differences resulting from different strategies. -```{r preprocessing-016} +```{r preprocessing-015} glrn_rf_impute_hist = as_learner(impute_hist %>>% lrn("regr.ranger")) glrn_rf_impute_hist$id = "RF_imp_Hist" @@ -282,7 +282,7 @@ By default, when `task` and `learner` are not provided, the graph is set up to b Linear regression is a simple model that cannot handle most problems that we may face when processing data, but with the `ppl("robustify")` we can now include it in our experiment: -```{r preprocessing-019, warning = FALSE} +```{r preprocessing-016, warning = FALSE} glrn_lm_robust = as_learner(ppl("robustify") %>>% lrn("regr.lm")) glrn_lm_robust$id = "lm_robust" @@ -300,7 +300,7 @@ In particular, log transformation of the target can help in making the distribut Similarly, log transformation of skewed features can help to reduce the influence of outliers. In @fig-sale we plot the distribution of the target in the `ames` dataset and then the log-transformed target, we can see how simply taking the log of the variable results in a distribution that is much more symmetrical and with fewer outliers. -```{r preprocessing-001, message=FALSE} +```{r preprocessing-017, message=FALSE} #| output: false #| cache: false library(patchwork) @@ -314,7 +314,7 @@ autoplot(as_task_regr(log_ames, target = "Sale_Price")) + autoplot(as_task_regr(log_ames, target = "logSalePrice")) ``` -```{r} +```{r preprocessing-018} #| label: fig-sale #| fig-cap: Distribution of house sales prices (in USD) in the ames dataset before (left) and after (right) log transformation. Before transformation there is a skewed distribution of prices towards cheaper properties with a few outliers of very expensive properties. After transformation the distribution is much more symmetrical with the majority of points evenly spread around the same range. #| fig-alt: Two boxplots. Left plot shows house prices up to $600,000, the majority of prices are between roughly $100,000-$200,000. Right plot shows log house prices primarily around 12 with an even range between 11 and 13 and a few outliers on both sides. @@ -333,7 +333,7 @@ Many models internally scale the data if required by the algorithm so most of th Any transformations applied to the target during training must be inverted during model prediction to ensure predictions are made on the correct scale. By example, say we are interested in log transforming the target, then we would take the following steps: -```{r} +```{r preprocessing-019} df = data.table(x = runif(5), y = runif(5, 10, 20)) df # 1. log transform the target @@ -374,7 +374,7 @@ In @sec-feature-selection we look at automated `r index('feature selection')` an Functional feature extraction differs from this process as we are now interested in features that are dependent on one another and together may provide useful information but not individually. @fig-functional-features visualizes the difference between regular and functional features. 
-```{r optimization-003, echo = FALSE} +```{r preprocessing-021, echo = FALSE} #| label: fig-functional-features #| fig-cap: Variables x1,x2,x3 are regular features, variables xt1,...,xt365 are functional features that could be plotted to identify important properties. #| fig-alt: On the left is a table with columns 'x1,x2,x3,xt1,xt2,...,xt365'. Below the first three columns is the label 'Regular Features', below the others is the label 'Functional Features, e.g. days in year'. The table has a bidirectional arrow to a line graph that indicates plotting of one row of functional features. @@ -383,14 +383,14 @@ include_multi_graphics("mlr3book_figures-14") As a concrete example, consider the power consumption of kitchen appliances in houses in the Ames dataset. -```{r preprocessing-023, message=FALSE, warning=FALSE} +```{r preprocessing-022, message=FALSE, warning=FALSE} energy_data = mlr3data::energy_usage ``` In this dataset, each row represents one house and each feature is the total power consumption from kitchen appliances at a given time [@bagnall2017great]. The consumption is measured in two-minute intervals, resulting in 720 features. -```{r preprocessing-024, message=FALSE, warning=FALSE} +```{r preprocessing-023, message=FALSE, warning=FALSE} #| label: fig-energy #| fig-cap: Energy consumption of one example house in a day, recorded in two-minute intervals. #| fig-alt: Line plot with '2-Minute Interval' on axis ranging from 1 to 720 and 'Power Consumption' on y-axis ranging from 0 to 20. There are spikes at around (200, 20), (300, 20), and then some consistently raised usage between (500-700, 3). @@ -410,7 +410,7 @@ To do this we add a private method called `.transform_dt` that hardcodes the ope In this example, we select the functional features (which all start with "att"), extract the mean, minimum, maximum, and variance of the power consumption, and then remove the functional features. To read more about building custom `PipeOp`s, open the corresponding vignette by running `vignette("extending", package = "mlr3pipelines")` in R. -```{r preprocessing-025} +```{r preprocessing-024} PipeOpFuncExtract = R6::R6Class("PipeOpFuncExtract", inherit = mlr3pipelines::PipeOpTaskPreprocSimple, private = list( @@ -430,7 +430,7 @@ PipeOpFuncExtract = R6::R6Class("PipeOpFuncExtract", Before using this in an experiment we first test that the `PipeOp` works as expected. -```{r preprocessing-026} +```{r preprocessing-025} tsk_ames_ext = cbind(ames, energy_data) tsk_ames_ext = as_task_regr(tsk_ames_ext, "Sale_Price", "ames_ext") # remove the redundant variables identified at the start of this chapter @@ -445,7 +445,7 @@ tsk_ames_ext$data(1, These outputs look sensible compared to @fig-energy so we can now run our final benchmark experiment using feature extraction. We do not need to add the `PipeOp` to each learner as we can apply it once (as above) before any model training by applying it to all available data. 
-```{r preprocessing-027, warning=FALSE, R.options = list(datatable.print.nrows = 13, datatable.print.class = FALSE, datatable.print.keys = FALSE, datatable.print.trunc.cols = TRUE)} +```{r preprocessing-026, warning=FALSE, R.options = list(datatable.print.nrows = 13, datatable.print.class = FALSE, datatable.print.keys = FALSE, datatable.print.trunc.cols = TRUE)} learners = list(lrn_baseline, lrn("regr.rpart"), glrn_xgb_impact, glrn_rf_impute_oor, glrn_lm_robust, glrn_log_lm_robust) @@ -490,7 +490,7 @@ We will consider a prediction problem similar to the one from this chapter, but To evaluate the models, we again use 10-fold CV, mean absolute error and `lrn("regr.glmnet")`. For now we will ignore the `date` column and simply remove it: -```{r} +```{r preprocessing-027} library("mlr3data") kc_housing = tsk("kc_housing") kc_housing$select(setdiff(kc_housing$feature_names, "date")) diff --git a/book/index.qmd b/book/index.qmd index 0f4091562..1ed77b027 100644 --- a/book/index.qmd +++ b/book/index.qmd @@ -1,6 +1,6 @@ # Getting Started {.unnumbered .unlisted} -```{r, include = FALSE, cache = FALSE} +```{r index-001, include = FALSE, cache = FALSE} library(mlr3book) ``` From 9195731f0e3329e83c28dc0318ddde033270bedc Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 21 May 2025 08:23:47 +0200 Subject: [PATCH 02/21] add learner weights --- book/chapters/chapter1/introduction_and_overview.qmd | 7 +++++++ book/chapters/chapter2/data_and_basic_modeling.qmd | 7 ++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index 8d7b45026..5d4cb2365 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -3,6 +3,13 @@ aliases: - "/introduction_and_overview.html" --- + +```{r} +# extra packages that must be installed in the docker image +remotes::install_github("mlr-org/mlr3") +remotes::install_github("mlr-org/mlr3learners") +``` + # Introduction and Overview {#sec-introduction} {{< include ../../common/_setup.qmd >}} diff --git a/book/chapters/chapter2/data_and_basic_modeling.qmd b/book/chapters/chapter2/data_and_basic_modeling.qmd index 1ee1bb062..c1e0ebf27 100644 --- a/book/chapters/chapter2/data_and_basic_modeling.qmd +++ b/book/chapters/chapter2/data_and_basic_modeling.qmd @@ -1027,7 +1027,8 @@ There are seven column roles: 4. `"order"`: Variable(s) used to order data returned by `$data()`; must be sortable with `order()`. 5. `"group"`: Variable used to keep observations together during resampling. 6. `"stratum"`: Variable(s) to stratify during resampling. -7. `"weight"`: Observation weights. Only one numeric column may have this role. +7. `"weights_learner"`: Weights used during training by the learner. Only one numeric column may have this role. +8. `"weights_measure"`: Weights used during scoring by the measure. Only one numeric column may have this role. We have already seen how features and targets work in @sec-tasks, which are the only column roles that each task must have. In @sec-strat-group we will have a look at the `stratum` and `group` column roles. @@ -1051,7 +1052,7 @@ tsk_mtcars_order$data(ordered = TRUE) In this example we can see that by setting `"idx"` to have the `"order"` column role, it is no longer used as a feature when we run `$data()` but instead is used to order the observations according to its value. This metadata is not passed to a learner. 
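Both of the weighting roles listed above are assigned with `$set_col_roles()`, just like `"order"`. The paragraph below demonstrates `weights_learner`; for `weights_measure`, which is not demonstrated elsewhere, a minimal sketch follows, assuming the in-development `mlr3` API that introduces this role (the `score_weights` column is hypothetical, and whether the weights are actually used depends on the chosen measure):

```{r}
library(mlr3)

df = as.data.frame(tsk("breast_cancer")$data())
# hypothetical scoring weights: errors on malignant tumors count twice
df$score_weights = ifelse(df$class == "malignant", 2, 1)

cancer_scored = as_task_classif(df, target = "class")
cancer_scored$set_col_roles("score_weights", roles = "weights_measure")
cancer_scored$col_roles$weights_measure
```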
-The `weights` column role is used to weight data points differently. +The `weights_learner` column role is used to weight data points differently. One example of why we would do this is in classification tasks with severe class imbalance, where weighting the minority class more heavily may improve the model's predictive performance for that class. For example in the `breast_cancer` dataset, there are more instances of benign tumors than malignant tumors, so if we want to better predict malignant tumors we could weight the data in favor of this class: @@ -1065,7 +1066,7 @@ df$weights = ifelse(df$class == "malignant", 2, 1) # create new task and role cancer_weighted = as_task_classif(df, target = "class") -cancer_weighted$set_col_roles("weights", roles = "weight") +cancer_weighted$set_col_roles("weights", roles = "weights_learner") # compare weighted and unweighted predictions split = partition(cancer_unweighted) From d954914ef0959fe0f6f88035f505a65eb1bd10fa Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 21 May 2025 08:32:29 +0200 Subject: [PATCH 03/21] ... --- book/chapters/chapter1/introduction_and_overview.qmd | 1 - 1 file changed, 1 deletion(-) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index 5d4cb2365..e56a5e8ef 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -7,7 +7,6 @@ aliases: ```{r} # extra packages that must be installed in the docker image remotes::install_github("mlr-org/mlr3") -remotes::install_github("mlr-org/mlr3learners") ``` # Introduction and Overview {#sec-introduction} From 971f649f364be504c043a1f1a375b704112a1831 Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 21 May 2025 08:45:18 +0200 Subject: [PATCH 04/21] ... --- book/chapters/chapter1/introduction_and_overview.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index e56a5e8ef..a8a42e7bf 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -6,7 +6,7 @@ aliases: ```{r} # extra packages that must be installed in the docker image -remotes::install_github("mlr-org/mlr3") +remotes::install_github("mlr-org/mlr3@custom_cv") ``` # Introduction and Overview {#sec-introduction} From d2bf6c2c47f4d2aeaa60a563c7764ab1b6155464 Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 21 May 2025 09:17:15 +0200 Subject: [PATCH 05/21] ... --- book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd b/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd index 7020a4305..a1639242a 100644 --- a/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd +++ b/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd @@ -72,7 +72,7 @@ result$data(cols = c("diabetes", "insulin", "missing_insulin", "triceps", It is common in `r ref("Graph")`s for an operation to be applied to a subset of features. 
In `mlr3pipelines` this can be achieved in two ways (@fig-pipelines-select-affect): either by passing the column subset to the `affect_columns` hyperparameter of a `r ref("PipeOp")` (assuming it has that hyperparameter), which controls which columns should be affected by the `PipeOp`; or, one can use the `r ref("PipeOpSelect", index = TRUE)` operator to create operations in parallel on specified feature subsets, and then unite the result using `r ref("PipeOpFeatureUnion")`. -```{r non-sequential_pipelines_and_tuning-005= FALSE, out.width = "70%"} +```{r non-sequential_pipelines_and_tuning-005, echo = FALSE, out.width = "70%"} #| label: fig-pipelines-select-affect #| layout-nrow: 2 #| fig-cap: "Two methods of setting up `PipeOp`s (`po(op1)` and `po(op2)`) that operate on complementary features (X and ¬X) of an input task." From 108af3a52df69ab8e578aa7281fc91e7099c3319 Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 21 May 2025 09:46:27 +0200 Subject: [PATCH 06/21] ... --- book/chapters/chapter1/introduction_and_overview.qmd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index a8a42e7bf..b1a02a11f 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -6,7 +6,8 @@ aliases: ```{r} # extra packages that must be installed in the docker image -remotes::install_github("mlr-org/mlr3@custom_cv") +remotes::install_github("mlr-org/mlr3") +remotes::install_github("mlr-org/mlr3pipelines") ``` # Introduction and Overview {#sec-introduction} From f7984a06e2acd130c883f8d9ec2393b308f74ed1 Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 21 May 2025 11:17:04 +0200 Subject: [PATCH 07/21] ... --- book/chapters/chapter1/introduction_and_overview.qmd | 1 + 1 file changed, 1 insertion(+) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index b1a02a11f..8414097ec 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -8,6 +8,7 @@ aliases: # extra packages that must be installed in the docker image remotes::install_github("mlr-org/mlr3") remotes::install_github("mlr-org/mlr3pipelines") +remotes::install_github("mlr-org/mlr3fairness@weights") ``` # Introduction and Overview {#sec-introduction} From ba0ae59dc4f2fabf8ce5e8c1ddc8eace225191e3 Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 21 May 2025 11:20:46 +0200 Subject: [PATCH 08/21] ... --- book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd b/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd index a1639242a..c30927c4a 100644 --- a/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd +++ b/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd @@ -301,7 +301,7 @@ Stacking can be built with more than two levels (both conceptually, and in `mlr3 As with bagging, we will demonstrate how to create a stacking pipeline manually, although a pre-constructed pipeline is available with `ppl("stacking")`. 
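For reference, a minimal sketch of the pre-constructed variant (assuming the `base_learners` and `super_learner` arguments of `ppl("stacking")`; the learners are chosen here only to roughly mirror the figure below):

```{r}
library(mlr3verse)

# level-0 learners feed their cross-validated predictions into a level-1 learner
graph_stack_ppl = ppl("stacking",
  base_learners = lrns(c("classif.rpart", "classif.kknn", "classif.glmnet")),
  super_learner = lrn("classif.log_reg")
)
glrn_stack_ppl = as_learner(graph_stack_ppl)
```

The manual construction that follows makes the individual steps of this graph explicit.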
-```{r non-sequential_pipelines_and_tuning-019= FALSE, out.width = "70%"} +```{r non-sequential_pipelines_and_tuning-019, echo = FALSE, out.width = "70%"} #| label: fig-pipelines-stacking #| fig-cap: "Graph that performs Stacking by fitting three models and using their outputs as features for another model after combining with `PipeOpFeatureUnion`." #| fig-alt: 'Graph shows "Dtrain" with arrows to three boxes: "Decision Tree", "KNN", and "Lasso Regression". Each of these points to the same "Feature Union -> Logistic Regression".' From 0bf8c438fa9272a3d6d852c62d07b2d13cfafb34 Mon Sep 17 00:00:00 2001 From: be-marc Date: Wed, 21 May 2025 12:50:30 +0200 Subject: [PATCH 09/21] ... --- book/chapters/chapter11/large-scale_benchmarking.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/chapters/chapter11/large-scale_benchmarking.qmd b/book/chapters/chapter11/large-scale_benchmarking.qmd index 5277ed08a..31f6667b3 100644 --- a/book/chapters/chapter11/large-scale_benchmarking.qmd +++ b/book/chapters/chapter11/large-scale_benchmarking.qmd @@ -379,7 +379,7 @@ Among other things, the experiment registry stores the algorithms, problems, and Below, we create a registry in a subdirectory of our working directory -- on a real cluster, make sure that this folder is stored on a shared network filesystem, otherwise, the nodes cannot access it. We also set the registry's `seed` to `1` and the `packages` to `"mlr3verse"`, which will make these packages available in all our experiments. -```{r large-scale_benchmarking-034= FALSE} +```{r large-scale_benchmarking-034, echo = FALSE} #| cache: false if (dir.exists("experiments")) { unlink("experiments", recursive = TRUE) From d0c2f0c0d1fc7e47178de250631735c1b403dbc0 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 11:37:28 +0200 Subject: [PATCH 10/21] ... --- book/chapters/chapter11/large-scale_benchmarking.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/chapters/chapter11/large-scale_benchmarking.qmd b/book/chapters/chapter11/large-scale_benchmarking.qmd index 31f6667b3..dcd72e3f6 100644 --- a/book/chapters/chapter11/large-scale_benchmarking.qmd +++ b/book/chapters/chapter11/large-scale_benchmarking.qmd @@ -585,7 +585,7 @@ bmr$aggregate()[1:5] In general, we recommend using `mlr3batchmark` for scheduling simpler `mlr3` jobs on an HPC, however, we will also briefly show you how to use `batchtools` without `mlr3batchmark` for finer control over your experiment. Again we start by creating an experiment registry. -```{r large-scale_benchmarking-052= FALSE} +```{r large-scale_benchmarking-052, include = FALSE} #| cache: false if (dir.exists("experiments-custom")) { unlink("experiments-custom", recursive = TRUE) From 15689c0428601026383b51b400dff194fa44bdcc Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 12:10:38 +0200 Subject: [PATCH 11/21] ... --- book/chapters/appendices/solutions.qmd | 6 +++--- book/chapters/appendices/tasks.qmd | 2 +- book/chapters/chapter11/large-scale_benchmarking.qmd | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/book/chapters/appendices/solutions.qmd b/book/chapters/appendices/solutions.qmd index 4f8ccd1a3..dad9012d9 100644 --- a/book/chapters/appendices/solutions.qmd +++ b/book/chapters/appendices/solutions.qmd @@ -1916,7 +1916,7 @@ costs Our cost matrix is as expected so we can plug it into our measure and setup our pipeline. 
-```{r solutions-101='hide'} +```{r solutions-101, results='hide'} # Create measure meas_costs = msr("classif.costs", costs = costs) @@ -2151,9 +2151,9 @@ We'll go through them one by one to deepen our understanding: This is an often encountered problem, as error metrics have a large variance when samples are small. Note, that the pre- and post-processing methods in general do not all support multiple protected attributes. -* We should question whether comparing the metric between all groups actually makes sense for the question we are trying to answer. Instead, we might want to observe the metric between two specific subgroups, in this case between individuals with `sex`: `Female` and `race`: `"Black"` or `"White`. +* We should question whether comparing the metric between all groups actually makes sense for the question we are trying to answer. Instead, we might want to observe the metric between two specific subgroups, in this case between individuals with `sex`: `Female` and `race`: `"Black"` or `"White"`. -First, we create a subset of only `sex`: `Female` and `race`: `"Black", "White`. +First, we create a subset of only `sex`: `Female` and `race`: `"Black", "White"`. ```{r solutions-121} adult_subset = tsk_adult_test$clone() diff --git a/book/chapters/appendices/tasks.qmd b/book/chapters/appendices/tasks.qmd index 2aba75489..9d9eb1624 100644 --- a/book/chapters/appendices/tasks.qmd +++ b/book/chapters/appendices/tasks.qmd @@ -8,7 +8,7 @@ aliases: {{< include ../../common/_setup.qmd >}} -```{r tasks-001=FALSE} +```{r tasks-001, include = FALSE} library(mlr3verse) library(mlr3proba) library(mlr3spatiotempcv) diff --git a/book/chapters/chapter11/large-scale_benchmarking.qmd b/book/chapters/chapter11/large-scale_benchmarking.qmd index dcd72e3f6..9613dc525 100644 --- a/book/chapters/chapter11/large-scale_benchmarking.qmd +++ b/book/chapters/chapter11/large-scale_benchmarking.qmd @@ -585,7 +585,7 @@ bmr$aggregate()[1:5] In general, we recommend using `mlr3batchmark` for scheduling simpler `mlr3` jobs on an HPC, however, we will also briefly show you how to use `batchtools` without `mlr3batchmark` for finer control over your experiment. Again we start by creating an experiment registry. -```{r large-scale_benchmarking-052, include = FALSE} +```{r large-scale_benchmarking-052 = FALSE} #| cache: false if (dir.exists("experiments-custom")) { unlink("experiments-custom", recursive = TRUE) From 9148578365f0ad1809cf8cb87c4b79bd5e16fea6 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 12:16:08 +0200 Subject: [PATCH 12/21] ... --- book/chapters/chapter9/preprocessing.qmd | 1 + 1 file changed, 1 insertion(+) diff --git a/book/chapters/chapter9/preprocessing.qmd b/book/chapters/chapter9/preprocessing.qmd index 95a7e1632..acd7d7797 100644 --- a/book/chapters/chapter9/preprocessing.qmd +++ b/book/chapters/chapter9/preprocessing.qmd @@ -239,6 +239,7 @@ magick::image_trim(fig) Using this pipeline we can now run experiments with `lrn("regr.ranger")`, which cannot handle missing data; we also compare a simpler pipeline that only uses OOR imputation to demonstrate performance differences resulting from different strategies. 
```{r preprocessing-016} +#| eval: false glrn_rf_impute_hist = as_learner(impute_hist %>>% lrn("regr.ranger")) glrn_rf_impute_hist$id = "RF_imp_Hist" From 5bc056b1d9505c7fef6939cbd7aae2e6eb5def4d Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 13:14:55 +0200 Subject: [PATCH 13/21] add section about mirai --- .../advanced_technical_aspects_of_mlr3.qmd | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd b/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd index 7d76f27f2..92eed8b4a 100644 --- a/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd +++ b/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd @@ -491,6 +491,98 @@ lrn_rpart$parallel_predict = TRUE prediction = lrn_rpart$predict(tsk_sonar) ``` +### Parallelization with `mirai` + +With `mlr3` 1.0.0, we integrated the `mirai` package, which provides a new backend for parallelization. +`mirai` starts persistent R sessions which are called daemons to evaluate tasks in parallel. +Daemons can be launched locally or on a remote machine via ssh or cluster managers. +`mirai` is much faster than `future`. +Like the parallelization with `future`, as a user you only have to configure the backend before starting any computations. + + + +```{r, include = FALSE} +mirai::daemons(0) +``` + +Parallelization of Resamplings and Benchmarks with `mirai`. + +```{r} +library(mirai) + +mirai::daemons(2) + +tsk_sonar = tsk("sonar") +lrn_rpart = lrn("classif.rpart") +rsmp_cv3 = rsmp("cv", folds = 3) +system.time({resample(tsk_sonar, lrn_rpart, rsmp_cv3)}) +``` + +Setting a chunk size is a thing of the past with `mirai`. + +Parallelization of Tuning with `mirai`. + +The daemon is still running, so we can continue with the tuning. + +```{r} +instance = tune( + tnr("random_search", batch_size = 12), + tsk("penguins"), + lrn("classif.rpart", minsplit = to_tune(2, 128)), + rsmp("cv", folds = 3), + term_evals = 20 +) + +instance$archive$n_evals +``` + + +Nested resampling with `mirai`. +Parallelize the outer loop with `daemons(5)` + +```{r} +lrn_rpart = lrn("classif.rpart", + minsplit = to_tune(2, 128)) + +lrn_rpart_tuned = auto_tuner(tnr("random_search", batch_size = 2), + lrn_rpart, rsmp("cv", folds = 3), msr("classif.ce"), 2) + +rr = resample(tsk("penguins"), lrn_rpart_tuned, rsmp("cv", folds = 5)) +``` + +Parallelizing the outer and inner loop with `mirai`. +Use `everywhere()` to setup the daemons of the inner loop on the workers of the outer loop. + +```{r, eval = FALSE} +# reset daemons +mirai::daemons(0) + +mirai::daemons(5) + +everywhere({ + mirai::daemons(3) +}) +``` + +Running the outer loop in the main session and the inner loop in the workers is currently not supported. +But you can run the outer loop in a single daemon and the inner loop on multiple daemons. + +```{r, eval = FALSE} +# reset daemons +mirai::daemons(0) + +mirai::daemons(1) + +everywhere({ + mirai::daemons(3) +}) +``` + + + + + + ## Error Handling {#sec-error-handling} In large experiments, it is not uncommon that a model fit or prediction fails with an error.\index{debugging} From 55c42cdaf7623f6693846ca4511fa3ca360d02ca Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 13:15:45 +0200 Subject: [PATCH 14/21] ... 
--- book/chapters/chapter11/large-scale_benchmarking.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/chapters/chapter11/large-scale_benchmarking.qmd b/book/chapters/chapter11/large-scale_benchmarking.qmd index 9613dc525..dcd72e3f6 100644 --- a/book/chapters/chapter11/large-scale_benchmarking.qmd +++ b/book/chapters/chapter11/large-scale_benchmarking.qmd @@ -585,7 +585,7 @@ bmr$aggregate()[1:5] In general, we recommend using `mlr3batchmark` for scheduling simpler `mlr3` jobs on an HPC, however, we will also briefly show you how to use `batchtools` without `mlr3batchmark` for finer control over your experiment. Again we start by creating an experiment registry. -```{r large-scale_benchmarking-052 = FALSE} +```{r large-scale_benchmarking-052, include = FALSE} #| cache: false if (dir.exists("experiments-custom")) { unlink("experiments-custom", recursive = TRUE) From 1c28b4a01025a0c04f25651152cfc842f5aab619 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 13:17:39 +0200 Subject: [PATCH 15/21] ... --- book/chapters/chapter1/introduction_and_overview.qmd | 1 + 1 file changed, 1 insertion(+) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index 8414097ec..368e4f901 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -9,6 +9,7 @@ aliases: remotes::install_github("mlr-org/mlr3") remotes::install_github("mlr-org/mlr3pipelines") remotes::install_github("mlr-org/mlr3fairness@weights") +remotes::install_github("mlr-org/mlr3learners") ``` # Introduction and Overview {#sec-introduction} From 792ae66b9b8e2c695c7a586cfbbf3106182f3266 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 13:30:59 +0200 Subject: [PATCH 16/21] ... --- .../chapter1/introduction_and_overview.qmd | 4 +- .../advanced_technical_aspects_of_mlr3.qmd | 57 ++++++++++--------- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index 30027a0ed..fd0be8fa7 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -6,9 +6,11 @@ aliases: ```{r} # extra packages that must be installed in the docker image -remotes::install_github("mlr-org/mlr3") +remotes::install("mirai") +remotes::install_github("mlr-org/mlr3@mirai") remotes::install_github("mlr-org/mlr3pipelines") remotes::install_github("mlr-org/mlr3fairness@weights") +remotes::install_github("mlr-org/mlr3learners") ``` # Introduction and Overview {#sec-introduction} diff --git a/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd b/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd index 92eed8b4a..8238b5066 100644 --- a/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd +++ b/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd @@ -491,38 +491,42 @@ lrn_rpart$parallel_predict = TRUE prediction = lrn_rpart$predict(tsk_sonar) ``` -### Parallelization with `mirai` - -With `mlr3` 1.0.0, we integrated the `mirai` package, which provides a new backend for parallelization. -`mirai` starts persistent R sessions which are called daemons to evaluate tasks in parallel. -Daemons can be launched locally or on a remote machine via ssh or cluster managers. -`mirai` is much faster than `future`. 
-Like the parallelization with `future`, as a user you only have to configure the backend before starting any computations. - - +### Parallelization with `mirai` {#sec-parallel-mirai} ```{r, include = FALSE} mirai::daemons(0) ``` -Parallelization of Resamplings and Benchmarks with `mirai`. +With `mlr3` 1.0.0, we integrated the `r ref_pkg("mirai")` package as an alternative parallelization backend. +`mirai` provides a lightweight approach to parallelization by starting persistent R sessions called daemons that evaluate tasks in parallel. +These daemons can be launched either locally or on remote machines via SSH or cluster managers. +Compared to the `r ref_pkg("future")` package, `mirai` has significantly lower overhead per task. +Like parallelization with `future`, users only need to configure the backend before starting any computations. +The following sections demonstrate how to use `mirai` for parallelizing resamplings, benchmarks, and tuning. -```{r} +To use `mirai` for parallelization, we first need to start the daemons. +We start two daemons and check the status of the daemons. + +```{r, eval = FALSE} library(mirai) mirai::daemons(2) +mirai::status() +``` + +We parallelize a three-fold CV for a decision tree on the sonar task. + +```{r} tsk_sonar = tsk("sonar") lrn_rpart = lrn("classif.rpart") rsmp_cv3 = rsmp("cv", folds = 3) system.time({resample(tsk_sonar, lrn_rpart, rsmp_cv3)}) ``` -Setting a chunk size is a thing of the past with `mirai`. - -Parallelization of Tuning with `mirai`. +One advantage of `mirai` is that it eliminates the need to manually set chunk sizes, as it automatically handles task distribution efficiently. -The daemon is still running, so we can continue with the tuning. +Since the daemons are already running, we can proceed directly with the tuning example. ```{r} instance = tune( @@ -536,11 +540,16 @@ instance = tune( instance$archive$n_evals ``` - -Nested resampling with `mirai`. -Parallelize the outer loop with `daemons(5)` +`mirai` also supports nested resampling, where the outer loop can be parallelized while the inner loop runs sequentially. +We start a daemons for each outer resampling iteration. +The inner loop runs sequentially. ```{r} +# reset daemons +mirai::daemons(0) + +mirai::daemons(5) + lrn_rpart = lrn("classif.rpart", minsplit = to_tune(2, 128)) @@ -550,8 +559,7 @@ lrn_rpart_tuned = auto_tuner(tnr("random_search", batch_size = 2), rr = resample(tsk("penguins"), lrn_rpart_tuned, rsmp("cv", folds = 5)) ``` -Parallelizing the outer and inner loop with `mirai`. -Use `everywhere()` to setup the daemons of the inner loop on the workers of the outer loop. +We can also parallelize both outer and inner loops using the `everywhere()` function to set up daemons for the inner loop on the daemons of the outer loop. ```{r, eval = FALSE} # reset daemons @@ -564,8 +572,8 @@ everywhere({ }) ``` -Running the outer loop in the main session and the inner loop in the workers is currently not supported. -But you can run the outer loop in a single daemon and the inner loop on multiple daemons. +Note that running the outer loop in the main session while parallelizing the inner loop is currently not supported. 
+However, you can run the outer loop in a single daemon and the inner loop on multiple daemons ```{r, eval = FALSE} # reset daemons @@ -578,11 +586,6 @@ everywhere({ }) ``` - - - - - ## Error Handling {#sec-error-handling} In large experiments, it is not uncommon that a model fit or prediction fails with an error.\index{debugging} From d75668a5df01691a485d1d9bb0628fd2c8377736 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 13:32:13 +0200 Subject: [PATCH 17/21] ... --- book/chapters/chapter1/introduction_and_overview.qmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index fd0be8fa7..4b9991943 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -6,11 +6,11 @@ aliases: ```{r} # extra packages that must be installed in the docker image -remotes::install("mirai") +remotes::install_cran("mirai") remotes::install_github("mlr-org/mlr3@mirai") remotes::install_github("mlr-org/mlr3pipelines") remotes::install_github("mlr-org/mlr3fairness@weights") -remotes::install_github("mlr-org/mlr3learners") +remotes::install_github("mlr-org/mlr3") ``` # Introduction and Overview {#sec-introduction} From 7b768b8e7f257c49be52e4e02d5009584e7e1c62 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 13:37:54 +0200 Subject: [PATCH 18/21] ... --- book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd b/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd index 8238b5066..247d49af9 100644 --- a/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd +++ b/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd @@ -541,7 +541,7 @@ instance$archive$n_evals ``` `mirai` also supports nested resampling, where the outer loop can be parallelized while the inner loop runs sequentially. -We start a daemons for each outer resampling iteration. +We start a daemon for each outer resampling iteration. The inner loop runs sequentially. ```{r} From 82b42db99f0a607a378cdddf3ae4cc99765bd336 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 13:47:06 +0200 Subject: [PATCH 19/21] ... --- book/chapters/chapter1/introduction_and_overview.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index 4b9991943..575c84a0e 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -10,7 +10,7 @@ remotes::install_cran("mirai") remotes::install_github("mlr-org/mlr3@mirai") remotes::install_github("mlr-org/mlr3pipelines") remotes::install_github("mlr-org/mlr3fairness@weights") -remotes::install_github("mlr-org/mlr3") +remotes::install_github("mlr-org/mlr3learners") ``` # Introduction and Overview {#sec-introduction} From eb5f96afce3f0ac70bb3dc19da15ac34c4631013 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 14:20:16 +0200 Subject: [PATCH 20/21] ... 
--- book/chapters/chapter1/introduction_and_overview.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index 575c84a0e..a6651039f 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -6,11 +6,11 @@ aliases: ```{r} # extra packages that must be installed in the docker image -remotes::install_cran("mirai") remotes::install_github("mlr-org/mlr3@mirai") remotes::install_github("mlr-org/mlr3pipelines") remotes::install_github("mlr-org/mlr3fairness@weights") remotes::install_github("mlr-org/mlr3learners") +remotes::install_github("r-lib/mirai") ``` # Introduction and Overview {#sec-introduction} From 3be9abeacefb6425f56300631df0534bb42bf10f Mon Sep 17 00:00:00 2001 From: be-marc Date: Mon, 28 Jul 2025 13:13:18 +0200 Subject: [PATCH 21/21] ... --- book/chapters/chapter1/introduction_and_overview.qmd | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index a6651039f..15bdeccc8 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -7,10 +7,7 @@ aliases: ```{r} # extra packages that must be installed in the docker image remotes::install_github("mlr-org/mlr3@mirai") -remotes::install_github("mlr-org/mlr3pipelines") -remotes::install_github("mlr-org/mlr3fairness@weights") -remotes::install_github("mlr-org/mlr3learners") -remotes::install_github("r-lib/mirai") +remotes::install_cran("mirai") ``` # Introduction and Overview {#sec-introduction}