## ----chunk-1, message=FALSE, warning=FALSE------------------------------------ # simulate dataframe called simulated_data_norm library(hyreg2) library(flexmix) ### CLASS 1 ### # Set parameters set.seed(42) n_samples_tto1 <- 200 n_samples_dce1 <- 100 sigma1 <- 1.0 theta1 <- 5 beta1 <- c(0.5, -0.3, 0.8) # true values for beta # simulate matrix x_tto1 <- matrix(rnorm(n_samples_tto1 * length(beta1)), ncol = length(beta1)) x_dce1 <- matrix(rnorm(n_samples_dce1 * length(beta1)), ncol = length(beta1)) #Xb (linear predictor following formula xb = x1*beta1 + x2*beta2 + x3*beta3) Xb_tto1 <- x_tto1 %*% beta1 Xb_dce1 <- (x_dce1 %*% beta1) * theta1 # generate y y_tto1 <- rnorm(n_samples_tto1, mean = Xb_tto1, sd = sigma1) # continuous outcomes logistic_tmp1 <- 0.5 + 0.5 * tanh(Xb_dce1 / 2) y_dce1 <- rbinom(n_samples_dce1, size = 1, prob = logistic_tmp1) # binary outcomes ### dataset ### data_tto1 <- data.frame( type = rep("TTO", n_samples_tto1), y = y_tto1, x1 = x_tto1[, 1], x2 = x_tto1[, 2], x3 = x_tto1[, 3] ) data_dce1 <- data.frame( type = rep("DCE_A", n_samples_dce1), y = y_dce1, x1 = x_dce1[, 1], x2 = x_dce1[, 2], x3 = x_dce1[, 3] ) simulated_data1 <- rbind(data_tto1, data_dce1) ### CLASS 2 ### # Set parameters set.seed(37) n_samples_tto2 <- 200 n_samples_dce2 <- 100 sigma2 <- 0.5 theta2 <- 2 beta2 <- c(1.4, 2.3, -0.2) # true values for beta # simulate matrix x_tto2 <- matrix(rnorm(n_samples_tto2 * length(beta2)), ncol = length(beta2)) x_dce2 <- matrix(rnorm(n_samples_dce2 * length(beta2)), ncol = length(beta2)) # Xb (linear predictor following formula xb = x1*beta1 + x2*beta2 + x3*beta3) Xb_tto2 <- x_tto2 %*% beta2 Xb_dce2 <- (x_dce2 %*% beta2) * theta2 # generate y y_tto2 <- rnorm(n_samples_tto2, mean = Xb_tto2, sd = sigma2) # continuous outcomes logistic_tmp2 <- 0.5 + 0.5 * tanh(Xb_dce2 / 2) y_dce2 <- rbinom(n_samples_dce2, size = 1, prob = logistic_tmp2) # binary outcomes ### dataset ### data_tto2 <- data.frame( type = rep("TTO", n_samples_tto2), y = y_tto2, x1 = x_tto2[, 1], x2 = x_tto2[, 2], x3 = x_tto2[, 3] ) data_dce2 <- data.frame( type = rep("DCE_A", n_samples_dce2), y = y_dce2, x1 = x_dce2[, 1], x2 = x_dce2[, 2], x3 = x_dce2[, 3] ) simulated_data2 <- rbind(data_tto2, data_dce2) ### DATASET with two classes ### simulated_data1$class <- 1 simulated_data2$class <- 2 # set id numerbs simulated_data1$id <- c(1:100, 1:100, 1:100) simulated_data2$id <- c(101:200, 101:200, 101:200) simulated_data_norm<- rbind(simulated_data1,simulated_data2) # clean environment rm(list = setdiff(ls(), "simulated_data_norm")) ## ----chunk-2, message=FALSE, warning=FALSE------------------------------------ # visualize true classes from simulated_data_norm p <- plot_hyreg2(data = simulated_data_norm, x = "id", y = "y", id_col = "id", class_df_model = simulated_data_norm[,c("id","class")], colors = c("#F8766D","#00BFC4")) p <- p + ggplot2::labs(title = "Figure 1") p ## ----chunk-3, message=FALSE, warning=FALSE, results='hide'-------------------- set.seed(277) formula <- y ~ -1 + x1 + x2 + x3 k <- 2 stv <- setNames(c(0.2,0,1,1,1),c("x1","x2","x3","sigma","theta")) control <- list(iter.max = 1000, verbose = 4) mod <- hyreg2(formula = formula, data = simulated_data_norm, type = simulated_data_norm$type, stv = stv, k = k, type_cont = "TTO", type_dich = "DCE_A", opt_method = "L-BFGS-B", control = control, latent = "both" ) ## ----chunk-4, message=FALSE, warning=FALSE------------------------------------ summary_hyreg2(mod) ## ----chunk-5, message=FALSE, warning=FALSE------------------------------------ # if model was estimated without grouping variable simulated_data_norm$observation <- rownames(simulated_data_norm) p <- plot_hyreg2(data = simulated_data_norm, x = "id", y = "y", id_col = "observation", class_df_model = give_class(data = simulated_data_norm, model = mod, id_col = "observation")) p <- p + ggplot2::labs(title = "Figure 2") p ## ----chunk-6, message=FALSE, warning=FALSE------------------------------------ # only continuous datapoints p <- plot_hyreg2(data = simulated_data_norm, x = "id", y = "y", id_col = "observation", class_df_model = give_class(data = simulated_data_norm, model = mod, id_col = "observation"), type_to_plot = list("type","TTO")) p <- p + ggplot2::labs(title = "Figure 3") p ## ----chunk-7, message=FALSE, warning=FALSE------------------------------------ # only dichotomous datapoints p <- plot_hyreg2(data = simulated_data_norm, x = "id", y = "y", id_col = "observation", class_df_model = give_class(data = simulated_data_norm, model = mod, id_col = "observation"), type_to_plot = list("type","DCE_A")) p <- p + ggplot2::labs(title = "Figure 4") p ## ----chunk-8, message=FALSE, warning=FALSE------------------------------------ # ratio of correct classified datapoints (sum(mod@cluster == simulated_data_norm$class))/dim(simulated_data_norm)[1] # cross table of observation types and assigned classes proof <- merge(simulated_data_norm, give_class(simulated_data_norm,mod,"observation"), by = "observation") table(proof$type,proof$mod_comp) ## ----chunk-9, message=FALSE, warning=FALSE, results='hide'-------------------- set.seed(227) formula <- y ~ -1 + x1 + x2 + x3 | id k <- 2 stv <- setNames(c(0.2,0,1,1,1),c("x1","x2","x3","sigma","theta")) control <- list(iter.max = 1000, verbose = 4) mod_id <- hyreg2(formula = formula, data = simulated_data_norm, type = simulated_data_norm$type, stv = stv, k = k, type_cont = "TTO", type_dich = "DCE_A", opt_method = "L-BFGS-B", control = control, latent = "both" ) ## ----chunk-9a, message=FALSE, warning=FALSE----------------------------------- summary_hyreg2(mod_id) ## ----chunk-10, message=FALSE, warning=FALSE----------------------------------- p <- plot_hyreg2(data = simulated_data_norm, x ="id", y = "y", id_col ="id", class_df_model = unique(give_class(data = simulated_data_norm, model = mod_id, id_col = "id"))) p <- p + ggplot2::labs(title = "Figure 5") p ## ----chunk-13, message=FALSE, warning=FALSE----------------------------------- # ratio of correct classified datapoints (sum(mod_id@cluster == simulated_data_norm$class))/dim(simulated_data_norm)[1] # cross table of observation types and assigned classes proof <- merge(simulated_data_norm, unique(give_class(simulated_data_norm,mod_id,"id")), by = "id") table(proof$type,proof$mod_comp) ## ----chunk-14, message=FALSE, warning=FALSE, results='hide'------------------- set.seed(227) formula <- y ~ -1 + x1 + x2 + x3 | id k <- 2 stv <- setNames(c(0.2,0,1,1,1),c("x1","x2","x3","sigma","theta")) control <- list(iter.max = 1000, verbose = 4) mod_cont <- hyreg2(formula = formula, data = simulated_data_norm, type = simulated_data_norm$type, stv = stv, k = k, type_cont = "TTO", type_dich = "DCE_A", opt_method = "L-BFGS-B", control = control, latent = "cont", id_col = "id" ) ## ----chunk-14a, message=FALSE, warning=FALSE---------------------------------- summary_hyreg2(mod_cont) ## ----chunk-15, message=FALSE, warning=FALSE----------------------------------- p <- plot_hyreg2(data = simulated_data_norm, x ="id", y = "y", id_col ="id", class_df_model = give_class(data = simulated_data_norm, model = mod_cont, id_col = "id")) p <- p + ggplot2::labs(title = "Figure 6") p ## ----chunk-16, message=FALSE, warning=FALSE----------------------------------- # ratio of correct classified datapoints proof <- merge(unique(simulated_data_norm[,c("id","class")]),mod_cont[["id_classes"]], by = "id") sum((proof$class == proof$mod_comp)/dim(proof)[1]) # cross table of observation types and assigned classes proof <- merge(simulated_data_norm, mod_cont[[k+1]], by = "id") table(proof$type,proof$mod_comp) ## ----chunk-18, message=FALSE, warning=FALSE, results='hide'------------------- # upper bound for simulated_data_norm$y at 3 # new variable y_cens for(i in 1:length(simulated_data_norm$y)){ if(simulated_data_norm$y[i] <= 3){ simulated_data_norm$y_cens[i] <- simulated_data_norm$y[i] }else{ simulated_data_norm$y_cens[i] <- 3 } } set.seed(227) formula <- y_cens ~ -1 + x1 + x2 + x3 | id k <- 2 stv <- setNames(c(0.2,0,1,1,1),c("x1","x2","x3","sigma","theta")) control <- list(iter.max = 1000, verbose = 4) mod_cens <- hyreg2(formula = formula, data = simulated_data_norm, type = simulated_data_norm$type, stv = stv, k = k, type_cont = "TTO", type_dich = "DCE_A", opt_method = "L-BFGS-B", control = control, upper = 3, latent = "cont", id_col = "id" ) ## ----chunk-19, message=FALSE, warning=FALSE, results='hide', fig.show='hide'---- summary_hyreg2(mod_cens) plot_hyreg2(data = simulated_data_norm, x ="id", y = "y_cens", id_col = "id", class_df_model = give_class(data = simulated_data_norm, model = mod_cens, id_col = "id")) ## ----chunk-22, message=FALSE, warning=FALSE, results='hide'------------------- set.seed(227) formula <- y ~ -1 + x1 + x2 + x3 | id k <- 2 stv1 <- setNames(c(0.2,0,1,1,1),c("x1","x2","x3","sigma","theta")) stv2 <- setNames(c(1,2,0.5,1,1),c("x1","x2","x3","sigma","theta")) control <- list(iter.max = 1000, verbose = 4) mod_stvl<- hyreg2(formula = formula, data = simulated_data_norm, type = simulated_data_norm$type, stv = list(stv1,stv2), k = k, type_cont = "TTO", type_dich = "DCE_A", opt_method = "L-BFGS-B", control = control, latent = "cont", id_col = "id" ) summary_hyreg2(mod_stvl) ## ----chunk-24, message=FALSE, warning=FALSE, results='hide'------------------- set.seed(227) formula <- y ~ -1 + x1 + x2 + x3 | id formula_sigma <- y ~ x2 + x3 k <- 2 stv1 <- get_stv(mod_cont[[1]])[-4] # -4 since "sigma" is on fourth place in return vector stv2 <- get_stv(mod_cont[[2]])[-4] # -4 since "sigma" is on fourth place in return vector stv_sigma <- setNames(rep(0.1,3), c("x2","x3","(Intercept)")) control <- list(iter.max = 1000, verbose = 4) mod_het <- hyreg2_het(formula = formula, formula_sigma = formula_sigma, data = simulated_data_norm, type = simulated_data_norm$type, stv = list(stv1,stv2), stv_sigma = stv_sigma, k = k, type_cont = "TTO", type_dich = "DCE_A", opt_method = "L-BFGS-B", control = control, latent = "cont", id_col = "id" ) ## ----chunk-24a, message=FALSE, warning=FALSE---------------------------------- summary_hyreg2(mod_het) ## ----chunk-23, message=FALSE, warning=FALSE, results='hide'------------------- set.seed(227) formula <- y ~ -1 + x1 + x2 + x3 | id k <- 2 stv <- setNames(c(0.2,0,1,1,1),c(colnames(simulated_data_norm)[3:5],c("sigma","theta"))) control <- list(iter.max = 1000, verbose = 4) mod_partialc <- hyreg2(formula = formula, data = simulated_data_norm, type = simulated_data_norm$type, stv = stv, k = k, type_cont = "TTO", type_dich = "DCE_A", opt_method = "L-BFGS-B", control = control, latent = "cont", id_col = "id", variables_both = "x1", variables_cont = "x3", variables_dich = "x2" ) summary_hyreg2(mod_partialc) ## ----chunk-26, message=FALSE, warning=FALSE----------------------------------- # simulate vector y_non and add this to simulated_data_norm #library(hyreg2) ### CLASS 1 ### # Set parameters (same as in simulation above) set.seed(42) n_samples_tto1 <- 200 n_samples_dce1 <- 100 sigma1 <- 1.0 theta1 <- 5 beta_non1 <- c(0.5, -0.3, 0.8) # true values for beta # simulate matrix x_tto1 <- matrix(rnorm(n_samples_tto1 * length(beta_non1)), ncol = length(beta_non1)) x_dce1 <- matrix(rnorm(n_samples_dce1 * length(beta_non1)), ncol = length(beta_non1)) ### NON-classic ### # x1,x2 and x3 are columnnames of the dataset, beta1, beta2 and beta3 are the parameters to be estimated formula1 <- y ~ (x1 * beta1 + x2 * beta2 ) * (x1 *beta1 + x3 * beta3) x_tto_non1 <- data.frame(x1 = x_tto1[,1], x2 = x_tto1[,2], x3 = x_tto1[,3]) x_dce_non1 <- data.frame(x1 = x_dce1[,1], x2 = x_dce1[,2], x3 = x_dce1[,3]) beta_non1 <- setNames(beta_non1[1:3], c("beta1","beta2","beta3")) Xb_tto_non1 <- hyreg2:::eval_formula_non(formula1, x_tto_non1, beta_non1) Xb_dce_non1 <- (hyreg2:::eval_formula_non(formula1, x_dce_non1, beta_non1)) * theta1 # generate y_non y_tto_non1 <- rnorm(n_samples_tto1, mean = Xb_tto_non1, sd = sigma1) # continuous outcomes logistic_tmp_non1 <- 0.5 + 0.5 * tanh(Xb_dce_non1 / 2) y_dce_non1 <- rbinom(n_samples_dce1, size = 1, prob = logistic_tmp_non1) # binary outcomes ### CLASS 2 ### # Set parameters (same as in simulation above) set.seed(37) n_samples_tto2 <- 200 n_samples_dce2 <- 100 sigma2 <- 0.5 theta2 <- 2 beta_non2 <- c(1.4, 2.3, -0.2) # true values for beta # simulate matrix x_tto2 <- matrix(rnorm(n_samples_tto2 * length(beta_non2)), ncol = length(beta_non2)) x_dce2 <- matrix(rnorm(n_samples_dce2 * length(beta_non2)), ncol = length(beta_non2)) ### NON-classic ### # x1,x2 and x3 are columnnames of the dataset, beta1, beta2 and beta3 are the parameters to be estimated formula2 <- y ~ (x1 * beta1 + x2 * beta2 ) * (x1 * beta1 + x3 * beta3) x_tto_non2 <- data.frame(x1 = x_tto2[,1], x2 = x_tto2[,2], x3 = x_tto2[,3]) x_dce_non2 <- data.frame(x1 = x_dce2[,1], x2 = x_dce2[,2], x3 = x_dce2[,3]) beta_non2 <- setNames(beta_non2[1:3], c("beta1","beta2","beta3")) Xb_tto_non2 <- hyreg2:::eval_formula_non(formula2, x_tto_non2, beta_non2) Xb_dce_non2 <- (hyreg2:::eval_formula_non(formula2, x_dce_non2, beta_non2)) * theta2 # generate y_non y_tto_non2 <- rnorm(n_samples_tto2, mean = Xb_tto_non2, sd = sigma2) # continuous outcomes logistic_tmp_non2 <- 0.5 + 0.5 * tanh(Xb_dce_non2 / 2) y_dce_non2 <- rbinom(n_samples_dce2, size = 1, prob = logistic_tmp_non2) # binary outcomes simulated_data_norm$y_non <- c(y_tto_non1,y_dce_non1,y_tto_non2,y_dce_non2) # clean environment rm(n_samples_tto1, n_samples_dce1, sigma1, theta1, beta_non1, n_samples_tto2, n_samples_dce2, sigma2, theta2, beta_non2, logistic_tmp1, logistic_tmp2, logistic_tmp_non1, logistic_tmp_non2, x_tto_non1, x_tto_non2, x_dce_non1, x_dce_non2, Xb_tto_non1, Xb_tto_non2, Xb_dce_non1, Xb_dce_non2, x_dce1, x_dce2, x_tto1, x_tto2, y_dce_non1, y_dce_non2, y_tto_non1, y_tto_non2, formula1, formula2 ) ## ----chunk-25, message=FALSE, warning=FALSE, results='hide'------------------- set.seed(259) formula <- y_non ~ (x1 * beta1 + x2 * beta2 ) * (x1 * beta1 + x3 * beta3)| id k <- 2 stv <- setNames(c(0.2,0,1,1,1),c("beta1","beta2","beta3","sigma","theta")) control <- list(iter.max = 1000, verbose = 4) mod_non <- hyreg2(formula = formula, data = simulated_data_norm, type = simulated_data_norm$type, stv = stv, k = k, type_cont = "TTO", type_dich = "DCE_A", opt_method = "L-BFGS-B", control = control, latent = "cont", id_col = "id", formula_type_classic = FALSE ) ## ----chunk-21, message=FALSE, warning=FALSE, results='hide'------------------- summary_hyreg2(mod_non) p <- plot_hyreg2(data = simulated_data_norm, x ="id", y = "y_non", id_col = "id", class_df_model = give_class(data = simulated_data_norm, model = mod_non, id_col = "id")) p <- p + ggplot2::labs(title = "Figure 7") p