## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(slideimp)

## -----------------------------------------------------------------------------
data(khanmiss1)
obj <- t(khanmiss1)
set.seed(1234)

## -----------------------------------------------------------------------------
n_genes <- 30
n_samples <- 5
n_reps <- 10
na_location <- replicate(
  n = n_reps,
  {
    chosen_genes <- sample.int(ncol(obj), size = n_genes)
    values <- lapply(
      chosen_genes, \(x) {
        chosen_samples <- sample.int(nrow(obj), size = n_samples)
        matrix(c(chosen_samples, rep(x, n_samples)), ncol = 2, dimnames = list(NULL, c("row", "col")))
      }
    )
    do.call(rbind, values)
  },
  simplify = FALSE
)
na_location[[1]][1:10, ]

## -----------------------------------------------------------------------------
tune_knn <- tune_imp(obj, parameters = data.frame(k = 10), rep = na_location, cores = 4)
tune_pca <- tune_imp(obj, parameters = data.frame(ncp = 5), rep = na_location)

## -----------------------------------------------------------------------------
rmse_knn <- mean(compute_metrics(tune_knn, metrics = "rmse")$.estimate)
rmse_pca <- mean(compute_metrics(tune_pca, metrics = "rmse")$.estimate)
c("knn" = rmse_knn, "pca" = rmse_pca)

## -----------------------------------------------------------------------------
# 500 features, 50 samples, 2 chromosomes
sim_obj <- sim_mat(n = 500, m = 50, nchr = 2)
# sim_mat returns matrix with features in rows, so we use t() to put features in columns
obj <- t(sim_obj$input)
# `group_feature` contains metadata for the simulated data
head(sim_obj$group_feature)
n_needed <- 100
# Randomly select CpGs that are needed for clock calculation
subset_cpgs <- sample(colnames(obj), size = n_needed)

## -----------------------------------------------------------------------------
# Construct the group data.frame with one row per group (i.e., chromosome)
group_df <- group_features(obj, sim_obj$group_feature, subset = subset_cpgs)
group_df

## -----------------------------------------------------------------------------
pca_group <- group_df
# For PCA imputation, we use ncp = 5 for the first group and ncp = 10 for the second group
pca_group$parameters <- list(data.frame(ncp = 5), data.frame(ncp = 10))
# For K-NN imputation, we use k = 5 for both groups
knn_group <- group_df
knn_group$parameters <- list(data.frame(k = 5), data.frame(k = 5))

pca_group
knn_group

## -----------------------------------------------------------------------------
system.time(
  knn_results <- group_imp(obj, group = knn_group, cores = 4)
)
system.time(
  pca_results <- group_imp(obj, group = pca_group)
)

## -----------------------------------------------------------------------------
sample_names <- paste0("S", 1:10)
positions <- 500

methyl <- tibble::tibble(
  chr = "chr1",
  start = seq_len(positions),
  end = start,
  strand = "+"
)

set.seed(1234)
for (i in seq_along(sample_names)) {
  methyl[[paste0("numCs", i)]] <- sample.int(100, size = positions, replace = TRUE)
  methyl[[paste0("numTs", i)]] <- sample.int(100, size = positions, replace = TRUE)
  methyl[[paste0("coverage", i)]] <- methyl[[paste0("numCs", i)]] + methyl[[paste0("numTs", i)]]
}

methyl[1:5, 1:10]

## -----------------------------------------------------------------------------
methyl <- methyl[order(methyl$start), ] # <---- Important, sort!
numCs_matrix <- as.matrix(methyl[, paste0("numCs", seq_along(sample_names))])
cov_matrix <- as.matrix(methyl[, paste0("coverage", seq_along(sample_names))])
beta_matrix <- numCs_matrix / cov_matrix

colnames(beta_matrix) <- sample_names
rownames(beta_matrix) <- methyl$start

beta_matrix <- t(beta_matrix)
# Set 10% of the data to missing
set.seed(1234)
beta_matrix[sample.int(length(beta_matrix), floor(length(beta_matrix) * 0.1))] <- NA
beta_matrix[1:5, 1:5]

## -----------------------------------------------------------------------------
# For example, we are tuning `k` value here.
slide_knn_params <- tibble::tibble(k = c(10, 20), n_feat = 100, n_overlap = 10)
# Increase rep from 2 in actual analyses
tune_slide_knn <- tune_imp(
  obj = beta_matrix,
  parameters = slide_knn_params,
  cores = 4,
  rep = 2
)
compute_metrics(tune_slide_knn)

## -----------------------------------------------------------------------------
imputed_beta <- slide_imp(obj = beta_matrix, n_feat = 100, n_overlap = 10, k = 10, cores = 4)
imputed_beta