## ----setup, message = FALSE, warning = FALSE, collapse=TRUE-------------------
library(rtestim)
library(ggplot2)
library(dplyr)
library(nnet)
library(forcats)
library(tidyr)
theme_set(theme_bw())

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  dpi = 300,
  collapse = FALSE,
  comment = "#>",
  fig.asp = 0.618,
  fig.width = 6,
  out.width = "80%",
  fig.align = "center"
)

## ----gamma-pdf, echo = FALSE--------------------------------------------------
ggplot(data.frame(x = seq(.001, 20, length.out = 100)), aes(x)) +
  stat_function(fun = dgamma, args = list(shape = 2.5, scale = 2.5), color = "orange") +
  stat_function(fun = dgamma, args = list(shape = 1, scale = 2), color = "cornflowerblue") +
  stat_function(fun = dgamma, args = list(shape = 2, scale = 1), color = "forestgreen") +
  scale_y_continuous(expand = expansion(c(0, 0.05)))

## ----can-default--------------------------------------------------------------
can_default <- estimate_rt(cancovid$incident_cases, x = cancovid$date, nsol = 20L)
plot(can_default) + coord_cartesian(ylim = c(0.5, 2))

## ----cancov-nonpar------------------------------------------------------------
# Data from Backer et al.
delay <- read.csv("backer.csv") |>
  filter(delay > 0) |>
  select(-delay)
delay <- rowSums(delay)
delay <- delay / sum(delay)

## ----cancov-nonpar-plot, echo=FALSE-------------------------------------------
ggplot(
  data.frame(delay = 1:length(delay), probability = delay),
  aes(delay, probability)
) +
  geom_point(colour = "cornflowerblue") +
  geom_segment(yend = 0, colour = "cornflowerblue") +
  scale_y_continuous(expand = expansion(c(0, 0.05)))

## ----can-nonpar----------------------------------------------------------------
can_nonpar <- estimate_rt(
  cancovid$incident_cases,
  x = cancovid$date,
  delay_distn = delay,
  nsol = 20L
)
plot(can_nonpar) + coord_cartesian(ylim = c(0.5, 2))

## ----cdfs, echo = FALSE--------------------------------------------------------
cdfs <- data.frame(
  x = 0:20,
  default = pgamma(0:20, shape = 2.5, scale = 2.5),
  backer = cumsum(c(0, delay, rep(0, 20 - length(delay))))
)
ggplot(cdfs |> pivot_longer(-x), aes(x, value, color = name)) +
  geom_step() +
  scale_y_continuous(expand = expansion(0)) +
  scale_color_manual(values = c("orange", "cornflowerblue"), name = "")

## ----backer-matrix, eval=FALSE-------------------------------------------------
# library(Matrix)
# n <- nrow(cancovid)
# backer_delay <- c(0, delay, rep(0, n - length(delay) - 1))
# delay_mat <- matrix(0, n, n)
# delay_mat[1, 1] <- 1
# for (iter in 2:n) delay_mat[iter, 1:iter] <- rev(backer_delay[1:iter])
# delay_mat <- drop0(as(delay_mat, "CsparseMatrix")) # make it sparse, not necessary
# delay_mat <- delay_mat / rowSums(delay_mat) # renormalize

## ----duotang-processing, eval=FALSE, echo=TRUE----------------------------------
# # Run on 19 April 2024
# library(readr)   # read_tsv(), read_csv()
# library(stringr) # str_replace(), str_locate(), str_replace_all()
# duotang <- read_tsv("https://github.com/CoVaRR-NET/duotang/raw/main/data_needed/virusseq.metadata.csv.gz")
# columnlist <- c(
#   "fasta_header_name", "province", "host_gender", "host_age_bin",
#   "sample_collection_date", "sample_collected_by",
#   "purpose_of_sampling", "purpose_of_sequencing", "lineage",
#   "raw_lineage", "gisaid_accession", "isolate"
# )
# unknown.str <- c(
#   "Undeclared", "Not Provided", "Restricted Access", "Missing",
#   "Not Applicable", "", "NA", "unknow"
# )
# duotang <- duotang |>
#   rename(province = geo_loc_name_state_province_territory) |>
#   select(all_of(columnlist))
#
# meta <- duotang |>
#   mutate(
#     week = cut(sample_collection_date, "week"),
#     month =
#       gsub("-..$", "", as.character(cut(sample_collection_date, "month")))
#   )
# source("https://github.com/CoVaRR-NET/duotang/raw/main/scripts/scanlineages.R")
# meta <- meta |>
#   mutate(gisaid_accession = str_replace(gisaid_accession, "EPI_ISL_", "")) |>
#   rename(GID = gisaid_accession) |>
#   rowwise() |>
#   mutate(raw_lineage = ifelse(
#     grepl("^X", raw_lineage),
#     str_replace_all(paste0(
#       realtorawlineage(substr(
#         raw_lineage, 1, str_locate(raw_lineage, "\\.") - 1
#       )),
#       ".",
#       substr(raw_lineage, str_locate(raw_lineage, "\\.") + 1, nchar(raw_lineage))
#     ), "[\r\n]", ""),
#     raw_lineage
#   )) |>
#   ungroup()
# # rebuild the lineage dictionary so the correct names get assigned
# # to XBB descendants not named XBB
# dico <- makepangolindico()
#
# VOCVOI <- read_csv("https://raw.githubusercontent.com/CoVaRR-NET/duotang/main/resources/vocvoi.csv")
# meta$pango_group <- create.pango.group(VOCVOI, meta)
# meta <- select(meta, province, week, pango_group) |>
#   mutate(week = as.Date(week))
#
# counts <- group_by(meta, province, week, pango_group) |>
#   count() |>
#   ungroup() |>
#   arrange(province, week, pango_group)
# can_counts <- group_by(meta, week, pango_group) |>
#   count() |>
#   ungroup() |>
#   arrange(week, pango_group) |>
#   mutate(province = "Canada")
# counts <- bind_rows(can_counts, counts)
# saveRDS(counts, "duotang-counts.rds")

## ----duotang-counts, echo=FALSE, message=FALSE----------------------------------
props <- readRDS("duotang-counts.rds") |>
  pivot_wider(names_from = pango_group, values_from = n, values_fill = 0) |>
  mutate(total = rowSums(across(-c(week, province)))) |>
  mutate(across(-c(week, province, total), ~ .x / total)) |>
  select(-total)

smooth_it <- function(props_group) {
  z <- props_group |> select(-week, -province)
  n <- names(z)
  nn <- gsub(" ", "_", n)
  names(z) <- nn
  form_resp <- paste0("cbind(", paste0(names(z), collapse = ","), ") ~ ")
  z$time <- as.numeric(props_group$week)
  form <- as.formula(paste0(form_resp, "poly(time, degree = 3)"))
  fits <- multinom(form, z, trace = FALSE)
  rng <- range(props_group$week)
  alltime <- as.numeric(seq(rng[1], rng[2], by = 1))
  z <- as_tibble(predict(fits, data.frame(time = alltime), type = "probs")) |>
    mutate(Date = as.Date(alltime))
  z
}

can_props_smoothed <- smooth_it(props |> filter(province == "Canada"))
can_props_smoothed |>
  pivot_longer(-Date) |>
  ggplot(aes(Date, y = value, fill = name)) +
  geom_area(position = "stack") +
  ylab("Variants in circulation") +
  xlab("") +
  theme_bw() +
  scale_x_date(name = "", date_breaks = "1 year", date_labels = "%Y", expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_viridis_d(name = "")

can_pred_class <- can_props_smoothed |>
  filter(Date <= max(cancovid$date)) |>
  pivot_longer(-Date) |>
  group_by(Date) |>
  summarise(var = name[which.max(value)], .groups = "drop") |>
  mutate(var = case_when(
    var == "other" ~ "Ancestral lineage",
    var == "Alpha" ~ "Alpha",
    var == "Beta" ~ "Beta",
    var == "Delta" ~ "Delta",
    TRUE ~ "Omicron"
  ))
boot <- data.frame(
  Date = seq(min(cancovid$date), min(can_props_smoothed$Date) - 1, by = "day"),
  var = "Ancestral lineage"
)
can_pred_class <- bind_rows(boot, can_pred_class)

## ----echo=FALSE, fig.height=1, dev='png'----------------------------------------
ggplot(can_pred_class, aes(x = Date, fill = fct_relevel(var, "Ancestral lineage"))) +
  geom_ribbon(aes(ymax = 1, ymin = 0)) +
  scale_y_continuous(expand = expansion(0)) +
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
  scale_fill_brewer(palette = "Dark2", name = "")
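
## ----dominant-variant-check, eval=FALSE-----------------------------------------
# # Added sketch, not part of the original analysis (chunk name is illustrative).
# # The time-varying delay matrix constructed below assumes `can_pred_class`
# # assigns exactly one dominant variant to every day covered by `cancovid`.
# # These lines simply report whether that assumption holds.
# all(cancovid$date %in% can_pred_class$Date)
# anyDuplicated(can_pred_class$Date) == 0
# table(can_pred_class$var)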
## ----meta-analysis, eval=FALSE---------------------------------------------------
# data_raw <- readxl::read_excel("xu-etal-DATA_RAW.xlsx") |>
#   select(type, para, n = Sample_size, mean, sd, se, median) |>
#   filter(!is.na(type)) |>
#   mutate(across(-c(type, para), as.numeric))
#
# bonehead_meta <- data_raw |>
#   group_by(type, para) |>
#   mutate(
#     no_n = all(is.na(n)),
#     n = case_when(!is.na(n) ~ n, no_n ~ 1, TRUE ~ median(n, na.rm = TRUE))
#   ) |>
#   ungroup() |>
#   mutate(
#     mean = case_when(!is.na(mean) ~ mean, TRUE ~ median),
#     sd = case_when(!is.na(sd) ~ sd, TRUE ~ se * sqrt(n))
#   ) |>
#   group_by(type, para) |>
#   summarise(
#     mean = median(mean, na.rm = TRUE),
#     sd = median(sd, na.rm = TRUE),
#     .groups = "drop"
#   )
# ## There's only one Beta estimate, and it covers only IP.
# ## We use the corresponding sd for Alpha IP,
# ## and duplicate Alpha for GT / ST.
#
# Beta_IP <- bonehead_meta |> filter(type == "Beta")
# Beta_IP$sd <- bonehead_meta |>
#   filter(type == "Alpha", para == "IP") |>
#   pull(sd)
# Beta <- bind_rows(
#   Beta_IP,
#   bonehead_meta |>
#     filter(type == "Alpha", para != "IP") |>
#     mutate(type = "Beta")
# )
#
# delay_dstns_byvar <- bonehead_meta |>
#   filter(type != "Beta") |>
#   bind_rows(Beta) |>
#   arrange(type, para) |>
#   mutate(shape = mean^2 / sd^2, scale = mean / shape)
# saveRDS(delay_dstns_byvar, "delay-distns-byvar.rds")

## ----variant-si-plot, echo=FALSE--------------------------------------------------
delay_dstns_can <- readRDS("delay-distns-byvar.rds") |>
  filter(para == "SI", type %in% unique(can_pred_class$var))

delay_dstns_can |>
  rowwise() |>
  mutate(probability = list(discretize_gamma(0:20, shape, scale))) |>
  ungroup() |>
  select(type, probability) |>
  unnest(probability) |>
  group_by(type) |>
  mutate(delay = row_number() - 1) |>
  ggplot(aes(delay, probability, colour = type)) +
  geom_line() +
  scale_color_brewer(palette = "Dark2", name = "") +
  scale_y_continuous(expand = expansion(c(0, 0.05)))

## ----tvar-matrix, message=FALSE----------------------------------------------------
library(Matrix)
n <- nrow(cancovid)
delay_mat <- matrix(0, n, n)
delay_mat[1, 1] <- 1
for (iter in 2:n) {
  current_var <- can_pred_class$var[iter]
  current_pars <- delay_dstns_can |> filter(type == current_var)
  delay <- discretize_gamma(0:(iter - 1), current_pars$shape, current_pars$scale)
  delay_mat[iter, 1:iter] <- rev(delay)
}
delay_mat <- drop0(as(delay_mat, "CsparseMatrix")) # make it sparse, not necessary
delay_mat <- delay_mat / rowSums(delay_mat) # renormalize

## ----tvar-Rt------------------------------------------------------------------------
can_tvar <- estimate_rt(
  cancovid$incident_cases,
  x = cancovid$date,
  delay_distn = delay_mat,
  nsol = 20L
)
plot(can_tvar) + coord_cartesian(ylim = c(0.5, 2))
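
## ----tvar-matrix-check, eval=FALSE---------------------------------------------------
# # Added sketch, not part of the original analysis (chunk name is illustrative).
# # Quick sanity checks on the time-varying delay matrix: after renormalization
# # every row should be a probability vector, and the matrix should be lower
# # triangular because cases reported on day t can only follow infections on
# # days 1, ..., t.
# all(abs(rowSums(delay_mat) - 1) < 1e-8)
# isTriangular(delay_mat, upper = FALSE)
# # The gamma parameters in `delay_dstns_can` come from moment matching:
# # shape = mean^2 / sd^2 and scale = sd^2 / mean, so that shape * scale = mean.
# with(delay_dstns_can, all.equal(shape * scale, mean))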