## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) ## ----load-packages------------------------------------------------------------ library(miceFast) library(dplyr) library(data.table) library(ggplot2) set.seed(123456) ## ----data--------------------------------------------------------------------- data(air_miss) str(air_miss) ## ----upset, eval=requireNamespace("UpSetR", quietly=TRUE)--------------------- upset_NA(air_miss, 6) ## ----vif---------------------------------------------------------------------- VIF( air_miss, posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp", "x_character", "Day", "weights", "groups") ) ## ----fill-na-basic------------------------------------------------------------ data(air_miss) result <- air_miss %>% # Continuous variable: Bayesian linear model (stochastic) mutate(Ozone_imp = fill_NA( x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") )) %>% # Categorical variable: LDA mutate(x_char_imp = fill_NA( x = ., model = "lda", posit_y = "x_character", posit_x = c("Wind", "Temp") )) head(result[, c("Ozone", "Ozone_imp", "x_character", "x_char_imp")]) ## ----fill-na-grouped---------------------------------------------------------- data(air_miss) result_grouped <- air_miss %>% group_by(groups) %>% do(mutate(., Solar_R_imp = fill_NA( x = ., model = "lm_pred", posit_y = "Solar.R", posit_x = c("Wind", "Temp", "Intercept"), w = .[["weights"]] ) )) %>% ungroup() head(result_grouped[, c("Solar.R", "Solar_R_imp", "groups")]) ## ----fill-na-logreg----------------------------------------------------------- data(air_miss) result_log <- air_miss %>% mutate(Ozone_imp = fill_NA( x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp", "Intercept"), logreg = TRUE )) # Compare distributions: log imputation avoids negative values summary(result_log[c("Ozone", "Ozone_imp")]) ## ----fill-na-position--------------------------------------------------------- data(air_miss) result_pos <- air_miss %>% mutate(Ozone_imp = fill_NA( x = ., model = "lm_bayes", posit_y = 1, posit_x = c(4, 6), logreg = TRUE )) head(result_pos[, c("Ozone", "Ozone_imp")]) ## ----fill-na-dt--------------------------------------------------------------- data(air_miss) setDT(air_miss) air_miss[, Ozone_imp := fill_NA( x = .SD, model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") )] air_miss[, x_char_imp := fill_NA( x = .SD, model = "lda", posit_y = "x_character", posit_x = c("Wind", "Temp") )] # With grouping air_miss[, Solar_R_imp := fill_NA( x = .SD, model = "lm_pred", posit_y = "Solar.R", posit_x = c("Wind", "Temp", "Intercept"), w = .SD[["weights"]] ), by = .(groups)] ## ----fill-na-n-dplyr---------------------------------------------------------- data(air_miss) result_n <- air_miss %>% # PMM with 20 draws. Imputes from observed values. mutate(Ozone_pmm = fill_NA_N( x = ., model = "pmm", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"), k = 20 )) %>% # lm_noise with 30 draws and weights mutate(Ozone_noise = fill_NA_N( x = ., model = "lm_noise", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"), w = .[["weights"]], logreg = TRUE, k = 30 )) ## ----fill-na-n-dt------------------------------------------------------------- data(air_miss) setDT(air_miss) air_miss[, Ozone_pmm := fill_NA_N( x = .SD, model = "pmm", posit_y = "Ozone", posit_x = c("Wind", "Temp", "Intercept"), w = .SD[["weights"]], logreg = TRUE, k = 20 ), by = .(groups)] ## ----compare-imp-------------------------------------------------------------- data(air_miss) air_miss_imp <- air_miss %>% mutate( Ozone_bayes = fill_NA(x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")), Ozone_pmm = fill_NA_N(x = ., model = "pmm", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"), k = 20) ) compare_imp(air_miss_imp, origin = "Ozone", target = c("Ozone_bayes", "Ozone_pmm")) ## ----mi-workflow-------------------------------------------------------------- data(air_miss) # Select the 4 core variables: Ozone and Solar.R have NAs; Wind and Temp are complete. df <- air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")] # Step 1: Generate m = 10 completed datasets. # Impute Solar.R first (predictors fully observed), then Ozone # (can now use the freshly imputed Solar.R). This sequential order # ensures all NAs are filled in a single pass. set.seed(1234) completed <- lapply(1:10, function(i) { df %>% mutate(Solar.R = fill_NA(., "lm_bayes", "Solar.R", c("Wind", "Temp"))) %>% mutate(Ozone = fill_NA(., "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp"))) }) # Step 2: Fit the analysis model on each completed dataset fits <- lapply(completed, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d)) # Step 3: Pool using Rubin's rules pool_res <- pool(fits) pool_res summary(pool_res) ## ----full-impute-all---------------------------------------------------------- data(air_miss) # Define a function that fills all variables with NAs in one pass. # Order matters: Solar.R first (Wind, Temp are complete), then Ozone # (uses the freshly imputed Solar.R), then categorical variables. impute_all <- function(data) { data %>% # Continuous: Solar.R (predictors fully observed) mutate(Solar.R = fill_NA( x = ., model = "lm_bayes", posit_y = "Solar.R", posit_x = c("Wind", "Temp") )) %>% # Continuous: Ozone (Solar.R now complete) mutate(Ozone = fill_NA( x = ., model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") )) %>% # Categorical: x_character (LDA with random ridge for stochasticity) mutate(x_character = fill_NA( x = ., model = "lda", posit_y = "x_character", posit_x = c("Wind", "Temp"), ridge = runif(1, 0.5, 50) )) %>% # Categorical: Ozone_chac (use tryCatch for safety with small groups) group_by(groups) %>% do(mutate(., Ozone_chac = tryCatch( fill_NA( x = ., model = "lda", posit_y = "Ozone_chac", posit_x = c("Temp", "Wind") ), error = function(e) .[["Ozone_chac"]] ))) %>% ungroup() } # MI: generate m = 10 completed datasets set.seed(2024) m <- 10 completed <- lapply(1:m, function(i) impute_all(air_miss)) # Fit the analysis model on each completed dataset fits <- lapply(completed, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d)) # Pool using Rubin's rules pool_result <- pool(fits) pool_result summary(pool_result) ## ----full-impute-all-dt------------------------------------------------------- data(air_miss) setDT(air_miss) impute_all_dt <- function(dt) { d <- copy(dt) # Order: Solar.R first (predictors complete), then Ozone, then categoricals d[, Solar.R := fill_NA( x = .SD, model = "lm_bayes", posit_y = "Solar.R", posit_x = c("Wind", "Temp") )] d[, Ozone := fill_NA( x = .SD, model = "lm_bayes", posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") )] d[, x_character := fill_NA( x = .SD, model = "lda", posit_y = "x_character", posit_x = c("Wind", "Temp"), ridge = runif(1, 0.5, 50) )] d[, Ozone_chac := tryCatch( fill_NA( x = .SD, model = "lda", posit_y = "Ozone_chac", posit_x = c("Temp", "Wind") ), error = function(e) .SD[["Ozone_chac"]] ), by = .(groups)] d } set.seed(2024) completed_dt <- lapply(1:10, function(i) impute_all_dt(air_miss)) fits_dt <- lapply(completed_dt, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d)) pool(fits_dt) ## ----oop-simple--------------------------------------------------------------- data <- cbind(as.matrix(airquality[, 1:4]), intercept = 1, index = 1:nrow(airquality)) model <- new(miceFast) model$set_data(data) # Single imputation with linear model (col 1 = Ozone) model$update_var(1, model$impute("lm_pred", 1, 5)$imputations) # Averaged multiple imputation for Solar.R (col 2, Bayesian, k=10 draws) model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5), k = 10)$imputations) model$which_updated() head(model$get_data(), 4) ## ----oop-groups--------------------------------------------------------------- data <- cbind(as.matrix(airquality[, -5]), intercept = 1, index = 1:nrow(airquality)) weights <- rgamma(nrow(data), 3, 3) groups <- as.numeric(airquality[, 5]) model <- new(miceFast) model$set_data(data) model$set_w(weights) model$set_g(groups) model$update_var(1, model$impute("lm_pred", 1, 6)$imputations) model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5, 6), k = 10)$imputations) # Recover original row order head(cbind(model$get_data(), model$get_g(), model$get_w())[order(model$get_index()), ], 4) ## ----oop-unsorted------------------------------------------------------------- data <- cbind(as.matrix(airquality[, -5]), intercept = 1, index = 1:nrow(airquality)) weights <- rgamma(nrow(data), 3, 3) groups <- as.numeric(sample(1:8, nrow(data), replace = TRUE)) model <- new(miceFast) model$set_data(data) model$set_w(weights) model$set_g(groups) model$update_var(1, model$impute("lm_pred", 1, 6)$imputations) model$update_var(2, model$impute_N("lm_bayes", 2, c(1, 3, 4, 5, 6), 10)$imputations) head(cbind(model$get_data(), model$get_g(), model$get_w())[order(model$get_index()), ], 4) ## ----oop-mi------------------------------------------------------------------- data(air_miss) # Prepare a numeric matrix with an intercept and row index mat <- cbind( as.matrix(air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")]), intercept = 1, index = seq_len(nrow(air_miss)) ) groups <- as.numeric(air_miss$groups) impute_oop <- function(mat, groups) { m <- new(miceFast) m$set_data(mat + 0) # copy. set_data uses the matrix by reference. m$set_g(groups) # col 1 = Ozone, col 2 = Solar.R, col 3 = Wind, col 4 = Temp, col 5 = intercept m$update_var(1, m$impute("lm_bayes", 1, c(2, 3, 4))$imputations) m$update_var(2, m$impute("lm_bayes", 2, c(3, 4, 5))$imputations) completed <- m$get_data()[order(m$get_index()), ] as.data.frame(completed) } set.seed(2025) completed <- lapply(1:10, function(i) impute_oop(mat, groups)) fits <- lapply(completed, function(d) lm(V1 ~ V3 + V4, data = d)) pool(fits) summary(pool(fits)) ## ----fcs-dt------------------------------------------------------------------- data(air_miss) setDT(air_miss) fcs_dt <- function(dt, n_iter = 5) { d <- copy(dt) na_ozone <- is.na(d$Ozone) na_solar <- is.na(d[["Solar.R"]]) d <- naive_fill_NA(d) # initialise for (iter in seq_len(n_iter)) { set(d, which(na_ozone), "Ozone", NA_real_) # restore NAs d[, Ozone := fill_NA(.SD, "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp"))] set(d, which(na_solar), "Solar.R", NA_real_) d[, Solar.R := fill_NA_N(.SD, "pmm", "Solar.R", c("Ozone", "Wind", "Temp", "Intercept"))] } d } set.seed(2025) completed_dt <- lapply(1:10, function(i) fcs_dt(air_miss)) fits_dt <- lapply(completed_dt, function(d) lm(Ozone ~ Wind + Temp, data = d)) pool(fits_dt) ## ----corrdata-example--------------------------------------------------------- # 3 continuous variables, 100 observations means <- c(10, 20, 30) cor_matrix <- matrix(c( 1.0, 0.7, 0.3, 0.7, 1.0, 0.5, 0.3, 0.5, 1.0 ), nrow = 3) cd <- new(corrData, 100, means, cor_matrix) sim_data <- cd$fill("contin") round(cor(sim_data), 2) ## ----corrdata-discrete-------------------------------------------------------- # With 2 categorical variables: first argument is nr_cat cd2 <- new(corrData, 2, 200, means, cor_matrix) sim_discrete <- cd2$fill("discrete") head(sim_discrete)