## ----ex_setup, include=FALSE------------------------------------------------------------ knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>", eval = rlang::is_installed("ggplot2") && rlang::is_installed("modeldata") ) options(digits = 3, width = 90) ## ----ggplot2_setup, include = FALSE----------------------------------------------------- library(ggplot2) theme_set(theme_bw()) ## ----attrition, message=FALSE----------------------------------------------------------- library(rsample) data("attrition", package = "modeldata") names(attrition) table(attrition$Attrition) ## ----form, message=FALSE---------------------------------------------------------------- mod_form <- as.formula(Attrition ~ JobSatisfaction + Gender + MonthlyIncome) ## ----model_vfold, message=FALSE--------------------------------------------------------- library(rsample) set.seed(4622) rs_obj <- vfold_cv(attrition, v = 10, repeats = 10) rs_obj ## ----lm_func---------------------------------------------------------------------------- ## splits will be the `rsplit` object with the 90/10 partition holdout_results <- function(splits, ...) { # Fit the model to the 90% mod <- glm(..., data = analysis(splits), family = binomial) # Save the 10% holdout <- assessment(splits) # `augment` will save the predictions with the holdout data set res <- broom::augment(mod, newdata = holdout) # Class predictions on the assessment set from class probs lvls <- levels(holdout$Attrition) predictions <- factor(ifelse(res$.fitted > 0, lvls[2], lvls[1]), levels = lvls) # Calculate whether the prediction was correct res$correct <- predictions == holdout$Attrition # Return the assessment data set with the additional columns res } ## ----onefold, warning = FALSE----------------------------------------------------------- example <- holdout_results(rs_obj$splits[[1]], mod_form) dim(example) dim(assessment(rs_obj$splits[[1]])) ## newly added columns: example[1:10, setdiff(names(example), names(attrition))] ## ----model_purrr, warning=FALSE--------------------------------------------------------- library(purrr) rs_obj$results <- map(rs_obj$splits, holdout_results, mod_form) rs_obj ## ----model_acc-------------------------------------------------------------------------- rs_obj$accuracy <- map_dbl(rs_obj$results, function(x) mean(x$correct)) summary(rs_obj$accuracy) ## ----type_plot, fig.alt = "Two boxplots of monthly income separated by gender, showing a slight difference in median but largely overlapping boxes."---- ggplot(attrition, aes(x = Gender, y = MonthlyIncome)) + geom_boxplot() + scale_y_log10() ## ----mean_diff-------------------------------------------------------------------------- median_diff <- function(splits) { x <- analysis(splits) median(x$MonthlyIncome[x$Gender == "Female"]) - median(x$MonthlyIncome[x$Gender == "Male"]) } ## ----boot_mean_diff--------------------------------------------------------------------- set.seed(353) bt_resamples <- bootstraps(attrition, times = 500) ## ----stats------------------------------------------------------------------------------ bt_resamples$wage_diff <- map_dbl(bt_resamples$splits, median_diff) ## ----stats_plot, fig.alt = "The bootstrap distribution of the differences in median monthly income: it is slightly bimodal and left-skewed."---- ggplot(bt_resamples, aes(x = wage_diff)) + geom_line(stat = "density", adjust = 1.25) + xlab("Difference in Median Monthly Income (Female - Male)") ## ----ci--------------------------------------------------------------------------------- quantile(bt_resamples$wage_diff, probs = c(0.025, 0.975)) ## ----coefs------------------------------------------------------------------------------ glm_coefs <- function(splits, ...) { ## use `analysis` or `as.data.frame` to get the analysis data mod <- glm(..., data = analysis(splits), family = binomial) as.data.frame(t(coef(mod))) } bt_resamples$betas <- map(.x = bt_resamples$splits, .f = glm_coefs, mod_form) bt_resamples bt_resamples$betas[[1]] ## ----tidy_rsplit------------------------------------------------------------------------ first_resample <- bt_resamples$splits[[1]] class(first_resample) tidy(first_resample) ## ----tidy_rset-------------------------------------------------------------------------- class(bt_resamples) tidy(bt_resamples)