## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse  = TRUE,
  comment   = "#>",
  eval      = TRUE,
  warning   = FALSE,
  message   = FALSE
)

## ----install, eval = FALSE----------------------------------------------------
# # Install from GitHub
# remotes::install_github("JustinManjourides/misl")
# 
# # Optional backend packages for additional learners
# install.packages(c("ranger", "xgboost", "earth"))

## ----simulate-----------------------------------------------------------------
library(misl)

set.seed(42)
n <- 300

sim_data <- data.frame(
  # Continuous predictors (always observed)
  age    = round(rnorm(n, mean = 50, sd = 12)),
  bmi    = round(rnorm(n, mean = 26, sd = 4), 1),
  # Continuous outcome with missingness
  sbp    = round(120 + 0.4 * rnorm(n, mean = 50, sd = 12) +
                       0.3 * rnorm(n, mean = 26, sd = 4) + rnorm(n, sd = 10)),
  # Binary outcome with missingness (0 = no, 1 = yes)
  smoker = rbinom(n, 1, prob = 0.3),
  # Categorical outcome with missingness
  group  = factor(sample(c("A", "B", "C"), n, replace = TRUE,
                         prob = c(0.4, 0.35, 0.25)))
)

# Introduce missing values
sim_data[sample(n, 40), "sbp"]    <- NA
sim_data[sample(n, 30), "smoker"] <- NA
sim_data[sample(n, 30), "group"]  <- NA

# Summarise missingness
sapply(sim_data, function(x) sum(is.na(x)))

## ----list-learners------------------------------------------------------------
knitr::kable(list_learners())

## ----basic-usage--------------------------------------------------------------
misl_imp <- misl(
  sim_data,
  m          = 5,
  maxit      = 5,
  con_method = "glm",
  bin_method = "glm",
  cat_method = "multinom_reg",
  seed       = 42,
  quiet      = TRUE
)

## ----inspect-output-----------------------------------------------------------
# Number of imputed datasets
length(misl_imp)

# Confirm no missing values remain
anyNA(misl_imp[[1]]$datasets)

# Compare observed vs imputed distribution for sbp
obs_mean <- mean(sim_data$sbp, na.rm = TRUE)
imp_mean <- mean(misl_imp[[1]]$datasets$sbp)

cat("Observed mean sbp (non-missing):", round(obs_mean, 2), "\n")
cat("Imputed mean sbp  (all values): ", round(imp_mean, 2), "\n")

# Check binary imputations are valid
table(misl_imp[[1]]$datasets$smoker)

# Check categorical imputations stay within observed levels
levels(misl_imp[[1]]$datasets$group)

## ----multi-learner, eval = FALSE----------------------------------------------
# misl_stack <- misl(
#   sim_data,
#   m          = 5,
#   maxit      = 5,
#   con_method = c("glm", "rand_forest"),
#   bin_method = c("glm", "rand_forest"),
#   cat_method = c("multinom_reg", "rand_forest"),
#   cv_folds   = 3,
#   seed       = 42
# )

## ----pool-results-------------------------------------------------------------
# Fit a linear model to each imputed dataset
models <- lapply(misl_imp, function(imp) {
  lm(sbp ~ age + bmi + smoker + group, data = imp$datasets)
})

# Pool point estimates and standard errors using Rubin's rules
m       <- length(models)
ests    <- sapply(models, function(fit) coef(fit))
vars    <- sapply(models, function(fit) diag(vcov(fit)))

Q_bar   <- rowMeans(ests)                          # pooled estimate
U_bar   <- rowMeans(vars)                          # within-imputation variance
B       <- apply(ests, 1, var)                     # between-imputation variance
T_total <- U_bar + (1 + 1 / m) * B                # total variance

pooled <- data.frame(
  term     = names(Q_bar),
  estimate = round(Q_bar, 4),
  std.error = round(sqrt(T_total), 4),
  conf.low  = round(Q_bar - 1.96 * sqrt(T_total), 4),
  conf.high = round(Q_bar + 1.96 * sqrt(T_total), 4)
)
print(pooled)

## ----pool-mice, eval = FALSE--------------------------------------------------
# library(mice)
# pooled_mice <- summary(pool(models), conf.int = TRUE)

## ----trace-plot---------------------------------------------------------------
# Extract trace data across all m datasets
trace <- do.call(rbind, lapply(misl_imp, function(r) r$trace))

# Plot mean of imputed sbp values across iterations for each dataset
sbp_trace <- subset(trace, variable == "sbp" & statistic == "mean")

plot(
  sbp_trace$iteration,
  sbp_trace$value,
  col  = sbp_trace$m,
  pch  = 16,
  xlab = "Iteration",
  ylab = "Mean imputed sbp",
  main = "Trace plot: mean of imputed sbp values",
  xaxt = "n"
)
axis(1, at = 1:5)
legend("topright", legend = paste("m =", 1:5),
       col = 1:5, pch = 16, cex = 0.8)

## ----parallel, eval = FALSE---------------------------------------------------
# library(future)
# 
# # Use all available cores
# plan(multisession)
# 
# misl_parallel <- misl(
#   sim_data,
#   m          = 10,
#   maxit      = 5,
#   con_method = c("glm", "rand_forest"),
#   bin_method = c("glm", "rand_forest"),
#   cat_method = c("multinom_reg", "rand_forest"),
#   seed       = 42
# )
# 
# # Always reset the plan when done
# plan(sequential)

## ----parallel-limited, eval = FALSE-------------------------------------------
# plan(multisession, workers = 4)

## ----detect-cores, eval = FALSE-----------------------------------------------
# parallel::detectCores()

## ----session-info-------------------------------------------------------------
sessionInfo()