## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
# Load necessary libraries
library(auxvecLASSO)
library(survey)
library(sampling)
library(dplyr)

# Set seed for reproducibility
set.seed(123)

## ----create_pop_file----------------------------------------------------------
# Load the population data
data(api)

# Load the population data file and add binary variables
api_pop <- apipop
api_pop$api00_bin <- as.factor(ifelse(api_pop$api00 > 650, 1, 0))
api_pop$growth_bin <- as.factor(ifelse(api_pop$growth > 25, 1, 0))
api_pop$meals_bin <- as.factor(ifelse(api_pop$meals > 40, 1, 0))
api_pop$ell_bin <- as.factor(ifelse(api_pop$ell > 15, 1, 0))
api_pop$hsg_bin <- as.factor(ifelse(api_pop$hsg > 20, 1, 0))
api_pop$full_bin <- as.factor(ifelse(api_pop$full > 90, 1, 0))
api_pop$sch.wide_bin <- as.factor(ifelse(api_pop$sch.wide == "Yes", 1, 0))
api_pop$awards_bin <- as.factor(ifelse(api_pop$awards == "Yes", 1, 0))
api_pop$comp.imp_bin <- as.factor(ifelse(api_pop$comp.imp == "Yes", 1, 0))
api_pop$stype <- as.factor(api_pop$stype)

# Keep only relevant variables
api_pop <- api_pop |>
  dplyr::select("cds", "stype", "enroll", ends_with("_bin"))

## ----create_sample------------------------------------------------------------
strata_sample <- sampling::strata(
  api_pop, # The population dataset
  stratanames = c("stype"), # Stratification variable
  size = c(150, 200, 175), # Desired sample sizes per stratum
  method = "srswor" # Sampling without replacement
)

api_sample <- getdata(api_pop, strata_sample)
api_sample$SamplingWeight <- 1 / api_sample$Prob

## ----dataimport---------------------------------------------------------------
# Artificial logistic regression coefficients (for both main effects and interactions)
coefficients <- c(
  "api00_bin" = 0.5,
  "growth_bin" = -0.3,
  "ell_bin" = -0.6,
  "interaction1" = 0.9, # Interaction term api00_bin:growth_bin
  "interaction2" = 1.2 # Interaction term api00_bin:ell_bin
)

# Create interaction terms for logistic model
api_sample$interaction1 <- as.numeric(api_sample$api00_bin) * as.numeric(api_sample$growth_bin)
api_sample$interaction2 <- as.numeric(api_sample$api00_bin) * as.numeric(api_sample$ell_bin)

# Calculate the logit (log-odds) based on the artificial coefficients and interaction terms
logit <- -1.2 +
  coefficients["api00_bin"] * as.numeric(api_sample$api00_bin) +
  coefficients["growth_bin"] * as.numeric(api_sample$growth_bin) +
  coefficients["ell_bin"] * as.numeric(api_sample$ell_bin) +
  coefficients["interaction1"] * api_sample$interaction1 +
  coefficients["interaction2"] * api_sample$interaction2

# Compute the logistic probabilities
api_sample$response_prob <- 1 / (1 + exp(-logit)) # Logistic function

# Simulate the response variable (1 for response, 0 for non-response)
api_sample$response <- as.factor(rbinom(n = nrow(api_sample), size = 1, prob = api_sample$response_prob))

# --- Check the summary of the sample ---
summary(api_sample)

## ----lassomodeling------------------------------------------------------------
# Apply LASSO for selecting auxiliary variables
lasso_result <- select_auxiliary_variables_lasso_cv(
  df = api_sample,
  outcome_vars = c("response", "enroll"),
  auxiliary_vars = c(
    "api00_bin", "growth_bin", "comp.imp_bin", "meals_bin",
    "meals_bin", "ell_bin", "hsg_bin", "full_bin", "stype"
  ),
  must_have_vars = "stype", # Include the domain variable (stype) in the model
  check_twoway_int = TRUE, # Include two-way interactions
  nfolds = 5, # Use 5-fold cross-validation
  verbose = FALSE, # Don't print progress messages
  standardize = TRUE, # Standardize the predictors
  return_models = FALSE, # Don't return the models, only the selection results
  parallel = FALSE # Run without parallel processing
)

# Display the results
lasso_result

## ----create_resp-set----------------------------------------------------------
api_resp <- api_sample[api_sample$response == 1, ]

## ----svydesign----------------------------------------------------------------
design <- survey::svydesign(
  ids     = ~1,
  data    = api_resp,
  strata  = ~stype,
  weights = ~SamplingWeight
)

## ----cal_reg_means------------------------------------------------------------
register_pop_means <- list(
  total = list(sch.wide_bin = mean(api_pop$sch.wide_bin == "1")),
  by_domain = list(
    stype = tapply(
      api_pop$sch.wide_bin, api_pop$stype,
      function(x) mean(x == "1")
    ),
    awards_bin = tapply(
      api_pop$sch.wide_bin, api_pop$awards_bin,
      function(x) mean(x == "1")
    )
  )
)

## ----calib_totals-------------------------------------------------------------
## --- Define the calibration formula ---
calibration_formula <- ~ stype + growth_bin + api00_bin + ell_bin + comp.imp_bin + hsg_bin + api00_bin:stype + hsg_bin:stype + comp.imp_bin:stype + api00_bin:growth_bin

## --- Calculate population totals with a single terms object built on the POPULATION ---
pop_totals <- generate_population_totals(
  population_df = api_pop,
  calibration_formula
)

## ----assess_vec---------------------------------------------------------------
result_diagnostics <- assess_aux_vector(
  design = design,
  df = api_resp,
  already_calibrated = FALSE,
  calibration_formula = calibration_formula,
  calibration_pop_totals = pop_totals$population_totals,
  register_vars = c("sch.wide_bin"),
  register_pop_means = register_pop_means,
  survey_vars = c("enroll", "full_bin"),
  domain_vars = c("stype", "awards_bin"),
  diagnostics = c(
    "weight_variation",
    "register_diagnostics",
    "survey_diagnostics"
  ),
  verbose = FALSE
)

## --- Display output ---
result_diagnostics