## ----include = FALSE----------------------------------------------------------
# Purled vignette script: demonstrates PFI/LOCO feature importance with
# xplainfi on an mlr3 regression task. Chunk-label comments below come from
# the original .Rmd and delimit the self-contained demo sections.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6
)
set.seed(123) # reproducibility for resampling, ranger, and permutations
# Quiet down
lgr::get_logger("mlr3")$set_threshold("warn")
options("xplain.progress" = interactive())

## ----setup--------------------------------------------------------------------
library(xplainfi)
library(mlr3)
library(mlr3learners)
library(data.table)
library(ggplot2)

## ----setup-problem------------------------------------------------------------
# Friedman1 benchmark: known important vs. unimportant features.
task <- tgen("friedman1")$generate(n = 300)
learner <- lrn("regr.ranger", num.trees = 100)
measure <- msr("regr.mse")
resampling <- rsmp("cv", folds = 3)

## ----pfi-basic----------------------------------------------------------------
# Permutation Feature Importance with 10 permutation repeats per fold.
pfi <- PFI$new(
  task = task,
  learner = learner,
  measure = measure,
  resampling = resampling,
  n_repeats = 10
)
pfi$compute()
pfi$importance()

## ----pfi-parameters------------------------------------------------------------
# More permutation repeats -> more stable PFI estimates.
pfi_stable <- PFI$new(
  task = task,
  learner = learner,
  measure = measure,
  resampling = resampling,
  n_repeats = 50
)
pfi_stable$compute()
pfi_stable$importance()

## ----pfi-nrepeats--------------------------------------------------------------
# Visualize per-repeat variability of one feature's score within each fold.
pfi_stable$scores()[feature == "important2", ] |>
  ggplot(aes(y = importance, x = factor(iter_rsmp))) +
  geom_boxplot() +
  labs(
    title = "PFI variability within resampling iterations",
    subtitle = "Setting n_repeats higher improves PFI estimates",
    y = "PFI score (important2)",
    x = "Resampling iteration (3-fold CV)"
  ) +
  theme_minimal()

## ----pfi-scores-tmp, echo=FALSE-------------------------------------------------
pfi_important2_range <- round(
  range(pfi_stable$scores()[feature == "important2", importance]),
  2
)

## ----pfi-ratio-----------------------------------------------------------------
# Importance as permuted/original loss ratio instead of difference.
pfi_stable$importance(relation = "ratio")

## ----loco-basic----------------------------------------------------------------
# Leave-One-Covariate-Out: refits without each feature.
loco <- LOCO$new(
  task = task,
  learner = learner,
  measure = measure,
  resampling = resampling,
  n_repeats = 10
)
loco$compute()
loco$importance()

## ----samplers-demo--------------------------------------------------------------
# Conditional sampler (ARF-based) used by conditional importance methods.
arf_sampler <- ConditionalARFSampler$new(task)
sample_data <- task$data(rows = 1:5)
sample_data[, .(important1, important2)]

## ----conditional-sampling--------------------------------------------------------
# Resample important1 conditional on important2 and important3.
sampled_conditional <- arf_sampler$sample_newdata(
  feature = "important1",
  newdata = sample_data,
  conditioning_set = c("important2", "important3")
)
sample_data[, .(important1, important2, important3)]
sampled_conditional[, .(important1, important2, important3)]

## ----detailed-scores-------------------------------------------------------------
pfi$scores() |>
  head(10) |>
  knitr::kable(digits = 4, caption = "Detailed PFI scores (first 10 rows)")

## ----scoring-summary--------------------------------------------------------------
# One score per feature x fold x permutation repeat.
pfi$scores()[, .(
  features = uniqueN(feature),
  resampling_folds = uniqueN(iter_rsmp),
  permutation_iters = uniqueN(iter_repeat),
  total_scores = .N
)]

## ----detailed-scores-ratio---------------------------------------------------------
pfi$scores(relation = "ratio") |>
  head(10) |>
  knitr::kable(digits = 4, caption = "PFI scores using the ratio (first 10 rows)")

## ----pfi-obs-scores-----------------------------------------------------------------
pfi$obs_loss()

## ----pretrained-pfi------------------------------------------------------------------
# PFI with an already-trained learner: use an instantiated holdout resampling
# so computation happens only on the held-out set.
resampling_holdout <- rsmp("holdout")$instantiate(task)
learner_trained <- lrn("regr.ranger", num.trees = 100)
learner_trained$train(task, row_ids = resampling_holdout$train_set(1))
pfi_pretrained <- PFI$new(
  task = task,
  learner = learner_trained,
  measure = measure,
  resampling = resampling_holdout,
  n_repeats = 10
)
pfi_pretrained$compute()
pfi_pretrained$importance()

## ----pretrained-custom----------------------------------------------------------------
# Simulate: learner was trained elsewhere, we have new data to use
new_data <- tgen("friedman1")$generate(n = 100)
# Same as rsmp_all_test(task)
resampling_custom <- rsmp("custom")$instantiate(
  new_data,
  train_sets = list(integer(0)),
  test_sets = list(new_data$row_ids)
)
pfi_newdata <- PFI$new(
  task = new_data,
  learner = learner_trained,
  measure = measure,
  resampling = resampling_custom,
  n_repeats = 10
)
pfi_newdata$compute()
pfi_newdata$importance()

## ----pretrained-error, error = TRUE----------------------------------------------------
# A trained learner with an uninstantiated multi-fold resampling is invalid.
try({
  PFI$new(
    task = task,
    learner = learner_trained,
    measure = measure,
    resampling = rsmp("cv", folds = 3)
  )
})

## ----parallel-future, eval = FALSE------------------------------------------------------
# library(future)
# plan("multisession", workers = 2)
#
# # PFI with parallelization across features
# pfi_parallel <- PFI$new(
#   task,
#   learner = lrn("regr.ranger"),
#   measure = msr("regr.mse"),
#   n_repeats = 10
# )
# pfi_parallel$compute()
# pfi_parallel$importance()
#
# # LOCO with parallelization (uses mlr3fselect internally)
# loco_parallel <- LOCO$new(
#   task,
#   learner = lrn("regr.ranger"),
#   measure = msr("regr.mse")
# )
# loco_parallel$compute()
# loco_parallel$importance()

## ----parallel-mirai, eval = FALSE--------------------------------------------------------
# library(mirai)
# daemons(n = 2)
#
# # Same PFI/LOCO code works with mirai backend
# pfi_parallel <- PFI$new(
#   task,
#   learner = lrn("regr.ranger"),
#   measure = msr("regr.mse"),
#   n_repeats = 10
# )
# pfi_parallel$compute()
# pfi_parallel$importance()
#
# # Clean up daemons when done
# daemons(0)