--- title: "Normalizations Comparisons" output: github_document vignette: > %\VignetteIndexEntry{Comparisons of Normalization Methods} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- Load libraries ```{r} library(ggplot2) library(SurprisalAnalysis) library(data.table) library(pheatmap) library(latex2exp) ``` Let us test the normalization methods here pseudocount vs log1p ```{r} #' Plot Bland Altman and density plots of the difference between the two zero handling #' methods #' #' @param aa the gene expression matrix #' @param pc the pseudocount #' #' @return two plots, the Bland Altman plot and the density plot plot.norm.diffs <- function(X, pc){ mat_logpc <- log(X + pc) mat_log1p <- log1p(X) avg_vals <- (mat_logpc + mat_log1p) / 2 diff_vals <- mat_logpc - mat_log1p df_ba <- data.frame( Avg = as.vector(avg_vals), Diff = as.vector(diff_vals) ) p_ba <- ggplot(df_ba, aes(x = Avg, y = Diff)) + geom_point(size = 0.8, color = "darkred") + geom_hline(yintercept = 0, linetype = 2, color = "navy") + labs( title = "Bland–Altman Plot", subtitle = "log(x+1e-6) vs log1p(x)", x = "Average of two transforms", y = "Difference (log(x+1e-6) - log1p(x))" ) + theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5, size = 20), axis.text = element_text(size = 15), text = element_text(size = 18) ) df_diff <- data.frame(Diff = as.vector(diff_vals)) p_density <- ggplot(df_diff, aes(x = Diff)) + geom_density(aes(y = after_stat(scaled)), fill = "steelblue", alpha = 0.5) + geom_vline(xintercept = 0, linetype = 2, color = "navy") + labs( title = "Distribution of Differences", x = "Difference (log(x+1e-6) - log1p(x))", y = "Density" ) + theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5, size = 20), axis.text = element_text(size = 15), text = element_text(size = 18) ) return(p_ba + p_density) } ``` ```{r, eval = FALSE} #pseudocount pc <- 1e-6 df <- read.table(system.file("extdata", "GSE60450_Lactation-GenewiseCounts.txt", package = "SurprisalAnalysis"),header = TRUE, sep = "\t", check.names = FALSE) df[,3:ncol(df)]->testing.case.1 df$EntrezGeneID->rownames(testing.case.1) as.matrix(testing.case.1)->testing.case.1 stopifnot(all(c("EntrezGeneID","Length") %in% names(df))) gene_ids <- df$EntrezGeneID len_bp <- df$Length counts <- as.matrix(df[ , -(1:2)]) keep <- is.finite(len_bp) & (len_bp > 0) counts <- counts[keep, , drop = FALSE] len_bp <- len_bp[keep] gene_ids <- gene_ids[keep] rownames(counts) <- gene_ids len_kb <- len_bp / 1000 #FPKM #FPKM_ij = counts_ij / (len_kb_i * libsize_j_in_millions) libsize <- colSums(counts, na.rm = TRUE) libsize_mill <- libsize / 1e6 fpkm <- sweep(counts, 1, len_kb, "/") fpkm <- sweep(fpkm, 2, libsize_mill, "/") #TPM # TPM_ij = (counts_ij / len_kb_i) / sum_i(counts_ij / len_kb_i) * 1e6 rpk <- sweep(counts, 1, len_kb, "/") scale <- colSums(rpk, na.rm = TRUE) tpm <- sweep(rpk, 2, scale, "/") * 1e6 testing.case.2 <- fpkm testing.case.3 <- tpm plot.norm.diffs(testing.case.1, pc) plot.norm.diffs(testing.case.2, pc) plot.norm.diffs(testing.case.3, pc) baseline <- data.frame(raw = surprisal_analysis(testing.case.1)[[1]][,1], fpkm = surprisal_analysis(testing.case.2)[[1]][,1], tpm = surprisal_analysis(testing.case.3)[[1]][,1]) baseline.pseudo <- ggplot(baseline)+ geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+ geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+ theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5, size = 20), axis.text = element_text(size = 15), text = element_text(size = 18) )+ scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+ labs(x="Sample", title=TeX("$\\lambda_0$"), y="Value",colour="Normalization") baseline.pseudo lambda_1 <- data.frame(raw = surprisal_analysis(testing.case.1)[[1]][,2], fpkm = surprisal_analysis(testing.case.2)[[1]][,2], tpm = surprisal_analysis(testing.case.3)[[1]][,2]) lambda_1.pseudo <- ggplot(lambda_1)+ geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+ geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+ theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5, size = 20), axis.text = element_text(size = 15), text = element_text(size = 18) )+ scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+ labs(x="Sample", title=TeX("$\\lambda_1$"), y="Value",colour="Normalization") lambda_1.pseudo lambda_2 <- data.frame(raw = surprisal_analysis(testing.case.1)[[1]][,3], fpkm = surprisal_analysis(testing.case.2)[[1]][,3], tpm = surprisal_analysis(testing.case.3)[[1]][,3]) lambda_2.pseudo <- ggplot(lambda_2)+ geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+ geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+ theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5, size = 20), axis.text = element_text(size = 15), text = element_text(size = 18) )+ scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+ labs(x="Sample", title=TeX("$\\lambda_2$"), y="Value",colour="Normalization") lambda_2.pseudo baseline <- data.frame(raw = surprisal_analysis(testing.case.1, "log1p")[[1]][,1], fpkm = surprisal_analysis(testing.case.2, "log1p")[[1]][,1], tpm = surprisal_analysis(testing.case.3, "log1p")[[1]][,1]) baseline.log1p <- ggplot(baseline)+ geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+ geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+ theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5, size = 20), axis.text = element_text(size = 15), text = element_text(size = 18) )+ scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+ labs(x="Sample", title=TeX("$\\lambda_0$"), y="Value",colour="Normalization") baseline.log1p lambda_1 <- data.frame(raw = surprisal_analysis(testing.case.1, "log1p")[[1]][,2], fpkm = surprisal_analysis(testing.case.2, "log1p")[[1]][,2], tpm = surprisal_analysis(testing.case.3, "log1p")[[1]][,2]) lambda_1.log1p <- ggplot(lambda_1)+ geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+ geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+ theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5, size = 20), axis.text = element_text(size = 15), text = element_text(size = 18) )+ scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+ labs(x="Sample", title=TeX("$\\lambda_1$"), y="Value",colour="Normalization") lambda_1.log1p lambda_2 <- data.frame(raw = surprisal_analysis(testing.case.1, "log1p")[[1]][,3], fpkm = surprisal_analysis(testing.case.2, "log1p")[[1]][,3], tpm = surprisal_analysis(testing.case.3, "log1p")[[1]][,3]) lambda_2.log1p <- ggplot(lambda_2)+ geom_line(aes(x=1:nrow(baseline), y=raw, colour="Raw"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=raw, colour="Raw"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=fpkm, color = "FPKM Normalized"), size=1)+ geom_point(aes(x=1:nrow(baseline), y=fpkm, colour = "FPKM Normalized"), shape=8, stroke=1)+ geom_line(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), size=1,)+ geom_point(aes(x=1:nrow(baseline), y=tpm, colour="TPM Normalized"), shape=8, stroke=1)+ theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5, size = 20), axis.text = element_text(size = 15), text = element_text(size = 18) )+ scale_color_manual(values=c("Raw"="#9ECAD6", "FPKM Normalized" = "#4A9782","TPM Normalized"="#004030" ))+ labs(x="Sample", title=TeX("$\\lambda_2$"), y="Value",colour="Normalization") lambda_2.log1p #(baseline.pseudo+baseline.log1p)/(lambda.1.pseudo+lambda.1.log1p)/(lambda_2.pseudo+lambda_2.log1p) running.test.case.1<-data.frame(Sample = as.numeric(rownames(testing.case.1)), testing.case.1) running.test.case.2<-data.frame(Sample = as.numeric(rownames(testing.case.2)), testing.case.2) running.test.case.3<-data.frame(Sample = as.numeric(rownames(testing.case.3)), testing.case.3) testing.case.2.log1p <- GO_analysis_surprisal_analysis(surprisal_analysis(running.test.case.2, "log1p")[[2]], 0.85, 2, key_type = "ENTREZID", flip = FALSE, species.db.str = "org.Mm.eg.db", top_GO_terms=15) testing.case.2.pseudo <- GO_analysis_surprisal_analysis(surprisal_analysis(running.test.case.2, "pseudocount")[[2]], 0.85, 2, key_type = "ENTREZID", flip = FALSE, species.db.str = "org.Mm.eg.db", top_GO_terms=15) #ggplot(testing.case.2.pseudo, aes(x = reorder(Description, -Count), y = Count, fill = p.adjust)) + #geom_col() + #coord_flip() + #scale_fill_gradient(low = "steelblue", high = "red") + #labs( # title = "Top 15 Enriched GO Terms", # x = "GO Term", # y = "Gene Count", # fill = "Adj. p-value" #) + #theme_minimal(base_size = 14) #Jaccard similarity extreme_pct_lambda <- function(sa_mat, lambda_col = "lambda_1", p = 0.05) { stopifnot(lambda_col %in% colnames(sa_mat)) v <- sa_mat[, lambda_col] n <- length(v) k <- max(1, ceiling(p * n)) top_ids <- names(sort(v, decreasing = TRUE))[seq_len(k)] bot_ids <- names(sort(v, decreasing = FALSE))[seq_len(k)] return(sort(unique(c(top_ids, bot_ids)))) } sa1_log1p <- surprisal_analysis(running.test.case.1, "log1p")[[2]] sa1_pc <- surprisal_analysis(running.test.case.1, "pseudocount")[[2]] sa2_log1p <- surprisal_analysis(running.test.case.2, "log1p")[[2]] sa2_pc <- surprisal_analysis(running.test.case.2, "pseudocount")[[2]] sa3_log1p <- surprisal_analysis(running.test.case.3, "log1p")[[2]] sa3_pc <- surprisal_analysis(running.test.case.3, "pseudocount")[[2]] set1_log1p <- extreme_pct_lambda(sa1_log1p, "lambda_1", 0.05) set1_pc <- extreme_pct_lambda(sa1_pc, "lambda_1", 0.05) set2_log1p <- extreme_pct_lambda(sa2_log1p, "lambda_1", 0.05) set2_pc <- extreme_pct_lambda(sa2_pc, "lambda_1", 0.05) set3_log1p <- extreme_pct_lambda(sa3_log1p, "lambda_1", 0.05) set3_pc <- extreme_pct_lambda(sa3_pc, "lambda_1", 0.05) sets <- list( `Raw-log1p` = set1_log1p, `Raw-pseudocount` = set1_pc, `FPKM-log1p` = set2_log1p, `FPKM-pseudocount` = set2_pc, `TPM-log1p` = set3_log1p, `TPM-pseudocount` = set3_pc ) pairwise_matrix <- function(sets, fun) { n <- length(sets) M <- matrix(0, n, n, dimnames = list(names(sets), names(sets))) for (i in seq_len(n)) for (j in seq_len(n)) M[i, j] <- fun(sets[[i]], sets[[j]]) M } overlap_fun <- function(a, b) length(intersect(a, b)) jaccard_fun <- function(a, b) { inter <- length(intersect(a, b)); uni <- length(union(a, b)) if (uni == 0) return(NA_real_) else inter / uni } overlap_mat <- pairwise_matrix(sets, overlap_fun) diag(overlap_mat) <- vapply(sets, length, integer(1)) jaccard_mat <- pairwise_matrix(sets, jaccard_fun) pheatmap::pheatmap( jaccard_mat, main = TeX("Jaccard similarity (Top & Bottom 5% of $\\lambda_0$)"), display_numbers = TRUE, number_format = "%.2f", fontsize_row = 10, fontsize_col = 10, border_color = NA, color = colorRampPalette(c("#1B3C53", "#456882", "#91C8E4"))(100) ) set1_log1p <- extreme_pct_lambda(sa1_log1p, "lambda_2", 0.05) set1_pc <- extreme_pct_lambda(sa1_pc, "lambda_2", 0.05) set2_log1p <- extreme_pct_lambda(sa2_log1p, "lambda_2", 0.05) set2_pc <- extreme_pct_lambda(sa2_pc, "lambda_2", 0.05) set3_log1p <- extreme_pct_lambda(sa3_log1p, "lambda_2", 0.05) set3_pc <- extreme_pct_lambda(sa3_pc, "lambda_2", 0.05) sets <- list( `Raw-log1p` = set1_log1p, `Raw-pseudocount` = set1_pc, `FPKM-log1p` = set2_log1p, `FPKM-pseudocount` = set2_pc, `TPM-log1p` = set3_log1p, `TPM-pseudocount` = set3_pc ) overlap_mat <- pairwise_matrix(sets, overlap_fun) diag(overlap_mat) <- vapply(sets, length, integer(1)) jaccard_mat <- pairwise_matrix(sets, jaccard_fun) pheatmap::pheatmap( jaccard_mat, main = TeX("Jaccard similarity (Top & Bottom 5% of $\\lambda_1$)"), display_numbers = TRUE, number_format = "%.2f", fontsize_row = 10, fontsize_col = 10, border_color = NA, color = colorRampPalette(c("#1B3C53", "#456882", "#91C8E4"))(100) ) set1_log1p <- extreme_pct_lambda(sa1_log1p, "lambda_3", 0.05) set1_pc <- extreme_pct_lambda(sa1_pc, "lambda_3", 0.05) set2_log1p <- extreme_pct_lambda(sa2_log1p, "lambda_3", 0.05) set2_pc <- extreme_pct_lambda(sa2_pc, "lambda_3", 0.05) set3_log1p <- extreme_pct_lambda(sa3_log1p, "lambda_3", 0.05) set3_pc <- extreme_pct_lambda(sa3_pc, "lambda_3", 0.05) sets <- list( `Raw-log1p` = set1_log1p, `Raw-pseudocount` = set1_pc, `FPKM-log1p` = set2_log1p, `FPKM-pseudocount` = set2_pc, `TPM-log1p` = set3_log1p, `TPM-pseudocount` = set3_pc ) overlap_mat <- pairwise_matrix(sets, overlap_fun) diag(overlap_mat) <- vapply(sets, length, integer(1)) jaccard_mat <- pairwise_matrix(sets, jaccard_fun) pheatmap::pheatmap( jaccard_mat, main = TeX("Jaccard similarity (Top & Bottom 5% of $\\lambda_2$)"), display_numbers = TRUE, number_format = "%.2f", fontsize_row = 10, fontsize_col = 10, border_color = NA, color = colorRampPalette(c("#1B3C53", "#456882", "#91C8E4"))(100) ) ```