## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- # Load necessary libraries library(auxvecLASSO) library(survey) library(sampling) library(dplyr) # Set seed for reproducibility set.seed(123) ## ----create_pop_file---------------------------------------------------------- # Load the population data data(api) # Load the population data file and add binary variables api_pop <- apipop api_pop$api00_bin <- as.factor(ifelse(api_pop$api00 > 650, 1, 0)) api_pop$growth_bin <- as.factor(ifelse(api_pop$growth > 25, 1, 0)) api_pop$meals_bin <- as.factor(ifelse(api_pop$meals > 40, 1, 0)) api_pop$ell_bin <- as.factor(ifelse(api_pop$ell > 15, 1, 0)) api_pop$hsg_bin <- as.factor(ifelse(api_pop$hsg > 20, 1, 0)) api_pop$full_bin <- as.factor(ifelse(api_pop$full > 90, 1, 0)) api_pop$sch.wide_bin <- as.factor(ifelse(api_pop$sch.wide == "Yes", 1, 0)) api_pop$awards_bin <- as.factor(ifelse(api_pop$awards == "Yes", 1, 0)) api_pop$comp.imp_bin <- as.factor(ifelse(api_pop$comp.imp == "Yes", 1, 0)) api_pop$stype <- as.factor(api_pop$stype) # Keep only relevant variables api_pop <- api_pop |> dplyr::select("cds", "stype", "enroll", ends_with("_bin")) ## ----create_sample------------------------------------------------------------ strata_sample <- sampling::strata( api_pop, # The population dataset stratanames = c("stype"), # Stratification variable size = c(150, 200, 175), # Desired sample sizes per stratum method = "srswor" # Sampling without replacement ) api_sample <- getdata(api_pop, strata_sample) api_sample$SamplingWeight <- 1 / api_sample$Prob ## ----dataimport--------------------------------------------------------------- # Artificial logistic regression coefficients (for both main effects and interactions) coefficients <- c( "api00_bin" = 0.5, "growth_bin" = -0.3, "ell_bin" = -0.6, "interaction1" = 0.9, # Interaction term api00_bin:growth_bin "interaction2" = 1.2 # Interaction term api00_bin:ell_bin ) # Create interaction terms for logistic model api_sample$interaction1 <- as.numeric(api_sample$api00_bin) * as.numeric(api_sample$growth_bin) api_sample$interaction2 <- as.numeric(api_sample$api00_bin) * as.numeric(api_sample$ell_bin) # Calculate the logit (log-odds) based on the artificial coefficients and interaction terms logit <- -1.2 + coefficients["api00_bin"] * as.numeric(api_sample$api00_bin) + coefficients["growth_bin"] * as.numeric(api_sample$growth_bin) + coefficients["ell_bin"] * as.numeric(api_sample$ell_bin) + coefficients["interaction1"] * api_sample$interaction1 + coefficients["interaction2"] * api_sample$interaction2 # Compute the logistic probabilities api_sample$response_prob <- 1 / (1 + exp(-logit)) # Logistic function # Simulate the response variable (1 for response, 0 for non-response) api_sample$response <- as.factor(rbinom(n = nrow(api_sample), size = 1, prob = api_sample$response_prob)) # --- Check the summary of the sample --- summary(api_sample) ## ----lassomodeling------------------------------------------------------------ # Apply LASSO for selecting auxiliary variables lasso_result <- select_auxiliary_variables_lasso_cv( df = api_sample, outcome_vars = c("response", "enroll"), auxiliary_vars = c( "api00_bin", "growth_bin", "comp.imp_bin", "meals_bin", "meals_bin", "ell_bin", "hsg_bin", "full_bin", "stype" ), must_have_vars = "stype", # Include the domain variable (stype) in the model check_twoway_int = TRUE, # Include two-way interactions nfolds = 5, # Use 5-fold cross-validation verbose = FALSE, # Don't print progress messages standardize = TRUE, # Standardize the predictors return_models = FALSE, # Don't return the models, only the selection results parallel = FALSE # Run without parallel processing ) # Display the results lasso_result ## ----create_resp-set---------------------------------------------------------- api_resp <- api_sample[api_sample$response == 1, ] ## ----svydesign---------------------------------------------------------------- design <- survey::svydesign( ids = ~1, data = api_resp, strata = ~stype, weights = ~SamplingWeight ) ## ----cal_reg_means------------------------------------------------------------ register_pop_means <- list( total = list(sch.wide_bin = mean(api_pop$sch.wide_bin == "1")), by_domain = list( stype = tapply( api_pop$sch.wide_bin, api_pop$stype, function(x) mean(x == "1") ), awards_bin = tapply( api_pop$sch.wide_bin, api_pop$awards_bin, function(x) mean(x == "1") ) ) ) ## ----calib_totals------------------------------------------------------------- ## --- Define the calibration formula --- calibration_formula <- ~ stype + growth_bin + api00_bin + ell_bin + comp.imp_bin + hsg_bin + api00_bin:stype + hsg_bin:stype + comp.imp_bin:stype + api00_bin:growth_bin ## --- Calculate population totals with a single terms object built on the POPULATION --- pop_totals <- generate_population_totals( population_df = api_pop, calibration_formula ) ## ----assess_vec--------------------------------------------------------------- result_diagnostics <- assess_aux_vector( design = design, df = api_resp, already_calibrated = FALSE, calibration_formula = calibration_formula, calibration_pop_totals = pop_totals$population_totals, register_vars = c("sch.wide_bin"), register_pop_means = register_pop_means, survey_vars = c("enroll", "full_bin"), domain_vars = c("stype", "awards_bin"), diagnostics = c( "weight_variation", "register_diagnostics", "survey_diagnostics" ), verbose = FALSE ) ## --- Display output --- result_diagnostics