## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  warning = FALSE,
  message = FALSE
)

## ----load_package--------------------------------------------------------------
library(EstemPMM)
set.seed(2025)

## ----basic_linear_bootstrap-----------------------------------------------------
# Generate data with skewed errors
n <- 200
x <- rnorm(n, mean = 5, sd = 2)

# True parameters
beta_0 <- 10
beta_1 <- 2.5

# Skewed errors: centered chi-squared distribution
errors <- rchisq(n, df = 4) - 4

# Response
y <- beta_0 + beta_1 * x + errors
dat <- data.frame(y = y, x = x)

# Fit PMM2 model
fit_pmm2 <- lm_pmm2(y ~ x, data = dat, verbose = FALSE)

# Perform bootstrap inference
boot_results <- pmm2_inference(fit_pmm2, formula = y ~ x, data = dat,
                               B = 1000, seed = 123)

# Display summary
summary(boot_results)

## ----bootstrap_distribution, fig.width=7, fig.height=6--------------------------
# Extract bootstrap samples for visualization
# (Note: the samples are regenerated by hand here; in practice, use the summary
#  output of pmm2_inference)

# Refit to get bootstrap samples
set.seed(123)
B <- 1000
boot_samples <- matrix(0, nrow = B, ncol = 2)

for(b in 1:B) {
  # Resample residuals
  res_b <- sample(residuals(fit_pmm2), size = n, replace = TRUE)

  # Generate bootstrap data
  y_b <- fitted(fit_pmm2) + res_b
  dat_b <- dat
  dat_b$y <- y_b

  # Refit
  fit_b <- lm_pmm2(y ~ x, data = dat_b, verbose = FALSE)
  boot_samples[b, ] <- coef(fit_b)
}

# Plot bootstrap distributions
par(mfrow = c(2, 2))

# Intercept
hist(boot_samples[, 1], breaks = 30, main = "Bootstrap: Intercept",
     xlab = "Intercept", col = "lightblue", border = "white")
abline(v = beta_0, col = "red", lwd = 2, lty = 2)
abline(v = coef(fit_pmm2)[1], col = "darkblue", lwd = 2)
legend("topright", legend = c("True", "Estimate"),
       col = c("red", "darkblue"), lty = c(2, 1), lwd = 2, cex = 0.8)

# Q-Q plot for intercept
qqnorm(boot_samples[, 1], main = "Q-Q Plot: Intercept")
qqline(boot_samples[, 1], col = "red", lwd = 2)

# Slope
hist(boot_samples[, 2], breaks = 30, main = "Bootstrap: Slope",
     xlab = "Slope", col = "lightgreen", border = "white")
abline(v = beta_1, col = "red", lwd = 2, lty = 2)
abline(v = coef(fit_pmm2)[2], col = "darkblue", lwd = 2)
legend("topright", legend = c("True", "Estimate"),
       col = c("red", "darkblue"), lty = c(2, 1), lwd = 2, cex = 0.8)

# Q-Q plot for slope
qqnorm(boot_samples[, 2], main = "Q-Q Plot: Slope")
qqline(boot_samples[, 2], col = "red", lwd = 2)

par(mfrow = c(1, 1))
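
## ----manual_percentile_ci--------------------------------------------------------
# A minimal sketch (not part of the EstemPMM API): percentile intervals,
# bootstrap bias, and bootstrap SEs computed directly from the manually
# regenerated boot_samples matrix above. summary(boot_results) from
# pmm2_inference() remains the supported route; this only illustrates what a
# percentile interval is.
manual_ci <- t(apply(boot_samples, 2, quantile, probs = c(0.025, 0.975)))
rownames(manual_ci) <- c("Intercept", "Slope")
print(manual_ci)

manual_summary <- data.frame(
  Estimate  = coef(fit_pmm2),
  Boot_Bias = colMeans(boot_samples) - coef(fit_pmm2),
  Boot_SE   = apply(boot_samples, 2, sd)
)
print(manual_summary, digits = 3)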
Error"] ci_ols <- confint(fit_ols) # Bootstrap CIs (from summary) boot_summary <- boot_results # Create comparison table comparison <- data.frame( Parameter = c("Intercept", "Slope"), True = c(beta_0, beta_1), PMM2_Estimate = coef(fit_pmm2), OLS_SE = se_ols, PMM2_Boot_SE = boot_summary$Std.Error, SE_Ratio = boot_summary$Std.Error / se_ols ) print(comparison, row.names = FALSE, digits = 3) cat("\n=== Confidence Interval Comparison ===\n\n") cat("OLS 95% CI (Normal-based):\n") print(ci_ols) cat("\nPMM2 95% CI (Bootstrap Percentile):\n") print(data.frame( Parameter = rownames(ci_ols), Lower = boot_summary$conf.low, Upper = boot_summary$conf.high )) ## ----hypothesis_testing------------------------------------------------------- # H0: Slope = 0 (no relationship) # From bootstrap summary boot_summary <- boot_results cat("=== Hypothesis Test: Slope = 0 ===\n\n") cat("t-statistic:", boot_summary$t.value[2], "\n") cat("p-value: ", boot_summary$p.value[2], "\n\n") if(boot_summary$p.value[2] < 0.05) { cat("Decision: Reject H0 at 5% level\n") cat("Conclusion: Significant relationship between x and y\n") } else { cat("Decision: Fail to reject H0\n") } ## ----multiple_regression_test------------------------------------------------- # Generate multiple regression data n <- 250 x1 <- rnorm(n) x2 <- rnorm(n) errors <- rexp(n, rate = 1) - 1 # True model: similar coefficients beta <- c(5, 1.5, 1.8, -0.5) y <- beta[1] + beta[2]*x1 + beta[3]*x2 + beta[4]*x1*x2 + errors dat_multi <- data.frame(y = y, x1 = x1, x2 = x2) # Fit model fit_multi <- lm_pmm2(y ~ x1 + x2 + x1:x2, data = dat_multi, verbose = FALSE) # Bootstrap boot_multi <- pmm2_inference(fit_multi, formula = y ~ x1 + x2 + x1:x2, data = dat_multi, B = 1000, seed = 456) # Test H0: beta_x1 = beta_x2 summary(boot_multi) ## ----ar_bootstrap------------------------------------------------------------- # Generate AR(1) data with skewed innovations n <- 300 phi <- 0.7 innovations <- rgamma(n, shape = 2, rate = 2) - 1 x <- numeric(n) x[1] <- innovations[1] for(i in 2:n) { x[i] <- phi * x[i-1] + innovations[i] } # Fit AR(1) model fit_ar <- ar_pmm2(x, order = 1, method = "pmm2", include.mean = FALSE) # Bootstrap inference for time series boot_ar <- ts_pmm2_inference(fit_ar, B = 1000, seed = 789) # Summary summary(boot_ar) # Check if AR coefficient is significantly different from 0.5 cat("\n=== Testing H0: phi = 0.5 ===\n") boot_summary_ar <- boot_ar ci_lower <- boot_summary_ar$conf.low[1] ci_upper <- boot_summary_ar$conf.high[1] cat("95% CI: [", round(ci_lower, 3), ",", round(ci_upper, 3), "]\n") if(0.5 >= ci_lower && 0.5 <= ci_upper) { cat("Decision: Cannot reject H0 (0.5 is within CI)\n") } else { cat("Decision: Reject H0 (0.5 is outside CI)\n") } ## ----arma_bootstrap----------------------------------------------------------- # Generate ARMA(1,1) with heavy-tailed innovations n <- 400 phi <- 0.5 theta <- 0.4 innovations <- rt(n, df = 4) x <- arima.sim(n = n, model = list(ar = phi, ma = theta), innov = innovations) # Fit ARMA model fit_arma <- arma_pmm2(x, order = c(1, 1), method = "pmm2", include.mean = FALSE) # Bootstrap boot_arma <- ts_pmm2_inference(fit_arma, B = 1000, seed = 999, method = "block", block_length = 20) # Display results summary(boot_arma) ## ----bootstrap_stability, eval=FALSE------------------------------------------ # # Assess stability of bootstrap SEs with different B # B_values <- c(100, 500, 1000, 2000) # se_results <- data.frame(B = B_values, SE_Intercept = NA, SE_Slope = NA) # # for(i in seq_along(B_values)) { # boot_temp <- 
## ----bootstrap_stability, eval=FALSE------------------------------------------
# # Assess stability of bootstrap SEs with different B
# B_values <- c(100, 500, 1000, 2000)
# se_results <- data.frame(B = B_values, SE_Intercept = NA, SE_Slope = NA)
# 
# for(i in seq_along(B_values)) {
#   boot_temp <- pmm2_inference(fit_pmm2, formula = y ~ x, data = dat,
#                               B = B_values[i], seed = 123)
#   summary_temp <- boot_temp
#   se_results$SE_Intercept[i] <- summary_temp$Std.Error[1]
#   se_results$SE_Slope[i] <- summary_temp$Std.Error[2]
# }
# 
# print(se_results)

## ----parallel_bootstrap, eval=FALSE-------------------------------------------
# # Enable parallel bootstrap (requires the 'parallel' package)
# boot_parallel <- pmm2_inference(fit_pmm2,
#                                 formula = y ~ x,
#                                 data = dat,
#                                 B = 2000,
#                                 seed = 123,
#                                 parallel = TRUE,
#                                 cores = 4)  # Use 4 CPU cores
# 
# # Check speedup
# # Typical speedup: 3-4x on 4 cores

## ----bootstrap_diagnostics-------------------------------------------------------
# Rerun the bootstrap with a smaller B to illustrate diagnostics
boot_verbose <- pmm2_inference(fit_pmm2, formula = y ~ x, data = dat,
                               B = 500, seed = 321)

# Check for failed bootstrap samples
# (In production code, inspect the convergence attribute)
summary(boot_verbose)

cat("\nAll bootstrap samples should converge successfully.\n")
cat("If many fail, consider:\n")
cat("- Increasing max_iter in lm_pmm2()\n")
cat("- Checking for outliers or influential points\n")
cat("- Simplifying the model\n")

## ----boot_viz, fig.width=7, fig.height=4-----------------------------------------
# For demonstration, regenerate bootstrap samples
set.seed(123)
B <- 1000
sample_size <- nrow(dat)
boot_coefs <- matrix(0, nrow = B, ncol = 2)

for(b in 1:B) {
  res_b <- sample(residuals(fit_pmm2), sample_size, replace = TRUE)
  y_b <- fitted(fit_pmm2) + res_b
  dat_b <- dat
  dat_b$y <- y_b
  fit_b <- lm_pmm2(y ~ x, data = dat_b, verbose = FALSE)
  boot_coefs[b, ] <- coef(fit_b)
}

# Scatter plot of bootstrap estimates
par(mfrow = c(1, 2))

# Bivariate distribution
plot(boot_coefs[, 1], boot_coefs[, 2],
     pch = 16, col = rgb(0, 0, 1, 0.1),
     xlab = "Intercept", ylab = "Slope",
     main = "Joint Bootstrap Distribution")
points(coef(fit_pmm2)[1], coef(fit_pmm2)[2],
       pch = 19, col = "red", cex = 2)

# Correlation between intercept and slope estimates
cor_boot <- cor(boot_coefs[, 1], boot_coefs[, 2])
cat("Bootstrap correlation between intercept and slope:", round(cor_boot, 3), "\n")

# Density plot
plot(density(boot_coefs[, 2]), main = "Slope Bootstrap Density",
     xlab = "Slope", lwd = 2, col = "blue")
abline(v = beta_1, col = "red", lwd = 2, lty = 2)
abline(v = coef(fit_pmm2)[2], col = "darkblue", lwd = 2)
legend("topright", legend = c("True", "Estimate", "Bootstrap"),
       col = c("red", "darkblue", "blue"), lty = c(2, 1, 1), lwd = 2)

par(mfrow = c(1, 1))

## ----studentized_bootstrap, eval=FALSE--------------------------------------------
# # Not yet implemented in EstemPMM
# # Future feature: studentized CIs for improved small-sample performance

## ----block_bootstrap, eval=FALSE---------------------------------------------------
# # Not yet implemented in EstemPMM
# # Current approach: residual bootstrap (adequate for most cases)
# # Future: moving block bootstrap for highly dependent series
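
## ----studentized_bootstrap_sketch, eval=FALSE--------------------------------------
# # A rough manual sketch of a studentized (bootstrap-t) interval for the slope,
# # shown only to illustrate the idea behind the planned feature. A nested
# # residual bootstrap estimates the SE within each replicate, so this is slow;
# # B_out and B_in are illustrative choices, not package options, and the
# # generics residuals()/fitted()/coef() are assumed to work on each refit.
# set.seed(97531)
# B_out <- 200
# B_in  <- 25
# slope_hat <- coef(fit_pmm2)[2]
# se_hat    <- sd(boot_coefs[, 2])   # outer bootstrap SE from the boot_viz chunk
# t_star    <- numeric(B_out)
# 
# for(b in 1:B_out) {
#   # Outer replicate
#   res_b <- sample(residuals(fit_pmm2), nrow(dat), replace = TRUE)
#   dat_b <- dat
#   dat_b$y <- fitted(fit_pmm2) + res_b
#   fit_b <- lm_pmm2(y ~ x, data = dat_b, verbose = FALSE)
# 
#   # Inner bootstrap: SE of this replicate's slope
#   inner <- replicate(B_in, {
#     res_i <- sample(residuals(fit_b), nrow(dat), replace = TRUE)
#     dat_i <- dat_b
#     dat_i$y <- fitted(fit_b) + res_i
#     coef(lm_pmm2(y ~ x, data = dat_i, verbose = FALSE))[2]
#   })
#   t_star[b] <- (coef(fit_b)[2] - slope_hat) / sd(inner)
# }
# 
# # Bootstrap-t interval for the slope
# q <- quantile(t_star, c(0.975, 0.025))
# c(lower = slope_hat - q[1] * se_hat, upper = slope_hat - q[2] * se_hat)

## ----manual_block_bootstrap_sketch---------------------------------------------------
# A minimal sketch of a moving block bootstrap of the regression residuals,
# for readers whose errors are serially dependent. This is not a package
# feature (the regression bootstrap above resamples residuals i.i.d.); the
# block length here is purely illustrative, and with the independent errors
# simulated earlier the two schemes should give similar answers.
set.seed(2468)
block_length <- 10
n_obs        <- nrow(dat)
n_blocks     <- ceiling(n_obs / block_length)
res_ordered  <- residuals(fit_pmm2)
block_starts <- 1:(n_obs - block_length + 1)

B_blk <- 500
blk_coefs <- matrix(0, nrow = B_blk, ncol = 2)

for(b in 1:B_blk) {
  # Draw overlapping blocks of residuals and stitch them together
  starts <- sample(block_starts, size = n_blocks, replace = TRUE)
  res_b  <- unlist(lapply(starts, function(s) res_ordered[s:(s + block_length - 1)]))
  res_b  <- res_b[1:n_obs]                 # trim to the original sample size

  dat_b <- dat
  dat_b$y <- fitted(fit_pmm2) + res_b
  fit_b <- lm_pmm2(y ~ x, data = dat_b, verbose = FALSE)
  blk_coefs[b, ] <- coef(fit_b)
}

# Compare block-bootstrap SEs with the i.i.d. residual-bootstrap SEs above
apply(blk_coefs, 2, sd)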