## ----global_options, include=FALSE-------------------------------------------- knitr::opts_chunk$set( echo = TRUE, warning = FALSE, message = FALSE, fig.pos = "h", collapse = TRUE, comment = "#>", knitr.table.format = "simple" ) ## ----setup, include = FALSE--------------------------------------------------- chooseCRANmirror(graphics = FALSE, ind = 1) options(tinytex.verbose = TRUE) ## ----install_proBatch, fig.show='hold', eval = FALSE-------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) { # install.packages("BiocManager") # } # BiocManager::install("proBatch") ## ----load_packages, message=FALSE--------------------------------------------- library(proBatch) library(dplyr) library(tibble) library(ggplot2) library(QFeatures) ## ----load_data---------------------------------------------------------------- data("example_ecoli_data", package = "proBatch") ## ----extract-data------------------------------------------------------------- # Extracting data all_metadata <- example_ecoli_data$all_metadata all_precursors <- example_ecoli_data$all_precursors all_protein_groups <- example_ecoli_data$all_protein_groups all_precursor_pg_match <- example_ecoli_data$all_precursor_pg_match # Keeping only essential rm(example_ecoli_data) ## ----build-pbf-from-matrix---------------------------------------------------- # Building from a (features x samples) matrix pbf <- ProBatchFeatures( data_matrix = all_precursors, # matrix of peptide intensities (features x samples) sample_annotation = all_metadata, # data.frame of sample metadata sample_id_col = "Run", # column in metadata that matches sample IDs level = "peptide" # label this assay level as "peptide" ) pbf # show basic info about the ProBatchFeatures object ## ----add-new-pbf-level-------------------------------------------------------- # Adding proteins as a new level and mapping them to precursors # all_precursor_pg_match has columns: "Precursor.Id", "Protein.Ids" pbf <- pb_add_level( object = pbf, from = "peptide::raw", new_matrix = all_protein_groups, to_level = "protein", # will name "protein::raw" by default mapping_df = all_precursor_pg_match, from_id = "Precursor.Id", to_id = "Protein.Ids", map_strategy = "as_is" ) pbf ## ----check_pbf---------------------------------------------------------------- # Check validObject(pbf) # should be TRUE assayLink(pbf, "protein::raw") # verify link summary # Remove unused variables to clean up the workspace rm(all_metadata, all_precursor_pg_match, all_precursors, all_protein_groups) ## ----nNA_calc----------------------------------------------------------------- # pb_nNA returns a DataFrame with counts of NAs per feature/sample na_counts <- pb_nNA(pbf) # na_counts contains info about total number of NAs and % in the data: na_counts[["nNA"]] %>% as_tibble() ## ----nNA_individual----------------------------------------------------------- # check NAs per sample: head(na_counts[["peptide::raw"]]$nNAcols) %>% as_tibble() ## ----print_na_counts---------------------------------------------------------- print(names(na_counts[["peptide::raw"]])) ## ----define_colors------------------------------------------------------------ # color scheme for lab and condition labels color_scheme <- sample_annotation_to_colors( pbf, sample_id_col = "Run", factor_columns = c("Lab", "Condition"), numeric_columns = NULL ) ## ----plot_na_heatmap_single, fig.show='hold', fig.width=10, fig.height=4------ plot_NA_heatmap( pbf, # by default the last created assay show_rownames = FALSE, show_row_dend = FALSE, color_by = "Lab" ) ## ----plot_na_heatmap_multi, fig.show='hold', fig.width=10, fig.height=4------- plot_NA_heatmap( pbf, pbf_name = c("peptide::raw", "protein::raw"), show_rownames = FALSE, show_row_dend = FALSE, color_by = "Lab" ) ## ----plot_na_frequency_panel, fig.show='hold', fig.width=7, fig.height=4------ plot_NA_frequency( pbf, pbf_name = c("peptide::raw", "protein::raw"), show_percent = FALSE ) ## ----apply_missing_filter----------------------------------------------------- pbf <- pb_filterNA( pbf, # without specifying, the filter will be applied to all assays in the object inplace = TRUE, # if false, filtered matrix will be saved as a new assay. pNA = 0.6 # the maximal proportion of missing values per feature ) pbf ## ----plot_na_heatmap_filtered, fig.show='hold', fig.width=7, fig.height=4----- plot_NA_heatmap( pbf, pbf_name = c("peptide::raw", "protein::raw"), show_rownames = FALSE, show_row_dend = FALSE, color_by = "Lab", # currently only one level is supported border_color = NA # pheatmap parameter to remove cell borders ) ## ----run_log_transform-------------------------------------------------------- pbf <- log_transform_dm( pbf, log_base = 2, offset = 1, pbf_name = "protein::raw" ) pbf <- log_transform_dm( pbf, log_base = 2, offset = 1, pbf_name = "peptide::raw" ) pbf ## ----plot_logmean, fig.height=6, fig.width=8, warning=FALSE------------------- plot_boxplot( pbf, pbf_name = c("peptide::log2_on_raw", "protein::log2_on_raw"), # plot multiple on one sample_id_col = "Run", order_col = "Run", batch_col = "Lab", color_by_batch = TRUE, color_scheme = color_scheme, base_size = 7, plot_ncol = 1, # how many columns use for plotting multy-assay plot plot_title = c( # title for each assay "Peptide level - log2 scale", "Protein level - log2 scale" ) ) ## ----plot_na_density_panels, fig.show='hold', fig.width=10, fig.height=4------ plot_NA_density( pbf, pbf_name = c("peptide::log2_on_raw", "protein::log2_on_raw") ) ## ----plot_PCA, fig.show='hold', fig.width=8, fig.height=4--------------------- pca1 <- plot_PCA( pbf, pbf_name = "protein::log2_on_raw", sample_id_col = "Run", color_scheme = color_scheme, color_by = "Lab", shape_by = "Condition", fill_the_missing = NULL, # remove all rows with missing values. Can be also "remove" plot_title = "NA rows removed, protein, log2", base_size = 10, point_size = 3, point_alpha = 0.5 ) pca2 <- plot_PCA( pbf, pbf_name = "protein::log2_on_raw", sample_id_col = "Run", color_scheme = color_scheme, color_by = "Condition", shape_by = "Lab", fill_the_missing = 0, # default value is -1 plot_title = "NA replaced with 0, protein, log2", base_size = 10, point_size = 3, point_alpha = 0.5 ) library(gridExtra) grid.arrange(pca1, pca2, ncol = 2, nrow = 1) ## ----apply_median_normalization----------------------------------------------- pbf <- pb_transform( object = pbf, from = "peptide::log2_on_raw", steps = "medianNorm" ) pbf <- pb_transform( object = pbf, from = "protein::log2_on_raw", steps = "medianNorm" ) pbf ## ----compare_median_norm_diagnostics, fig.show='hold', fig.width=8, fig.height=4---- # boxplot plot_boxplot( pbf, sample_id_col = "Run", order_col = "Run", batch_col = "Lab", color_by_batch = TRUE, color_scheme = color_scheme, base_size = 7, pbf_name = c("protein::log2_on_raw", "protein::medianNorm_on_log2_on_raw"), plot_ncol = 1, plot_title = c( "Before Median Normalization, protein level", "After Median Normalization, protein level" ) ) # plot PCA plot plot_PCA( pbf, pbf_name = c("protein::log2_on_raw", "protein::medianNorm_on_log2_on_raw"), sample_id_col = "Run", color_scheme = color_scheme, color_by = "Lab", shape_by = "Condition", fill_the_missing = NULL, # remove all rows with missing values. Can be also "remove" plot_title = c( "Before Median Normalization, protein level", "After Median Normalization, protein level" ), base_size = 10, point_size = 3, point_alpha = 0.5 ) ## ----prepare_sample_annotations----------------------------------------------- # sample annotations used by both correction methods sa <- colData(pbf) %>% as.data.frame() ## ----apply_limma_correction--------------------------------------------------- # limma::removeBatchEffect method pbf <- pb_transform( pbf, from = "protein::log2_on_raw", steps = "limmaRBE", params_list = list(list( sample_annotation = sa, batch_col = "Lab", covariates_cols = c("Condition"), fill_the_missing = FALSE )) ) pbf <- pb_transform( pbf, from = "protein::medianNorm_on_log2_on_raw", steps = "limmaRBE", params_list = list(list( sample_annotation = sa, batch_col = "Lab", covariates_cols = c("Condition"), fill_the_missing = FALSE )) ) ## ----inspect_pbf_after_limma-------------------------------------------------- print(pbf) ## ----apply_combat_correction-------------------------------------------------- # sva::ComBat wrapped via pb_transform() pbf <- pb_transform( pbf, from = "protein::log2_on_raw", steps = "combat", params_list = list(list( sample_annotation = sa, batch_col = "Lab", sample_id_col = "Run", par.prior = TRUE, fill_the_missing = "remove" )) ) pbf <- pb_transform( pbf, from = "protein::medianNorm_on_log2_on_raw", steps = "combat", params_list = list(list( sample_annotation = sa, batch_col = "Lab", sample_id_col = "Run", par.prior = TRUE, fill_the_missing = "remove" )) ) ## ----inspect_pbf_after_combat------------------------------------------------- print(pbf) ## ----apply_batch_correction_peptide------------------------------------------- pbf <- pb_transform( pbf, from = "peptide::medianNorm_on_log2_on_raw", steps = "limmaRBE", params_list = list( list( sample_annotation = sa, batch_col = "Lab", covariates_cols = c("Condition"), fill_the_missing = FALSE ) ) ) pbf <- pb_transform( pbf, from = "peptide::medianNorm_on_log2_on_raw", steps = "combat", params_list = list( list( sample_annotation = sa, batch_col = "Lab", sample_id_col = "Run", par.prior = TRUE, fill_the_missing = "remove" ) ) ) pbf ## ----plot_pca_limma_series, fig.height=7, fig.show='hold', fig.width=8, message=FALSE, warning=FALSE---- plot_PCA( pbf, pbf_name = c( "protein::log2_on_raw", "protein::medianNorm_on_log2_on_raw", "protein::limmaRBE_on_medianNorm_on_log2_on_raw", "protein::combat_on_medianNorm_on_log2_on_raw" ), sample_id_col = "Run", color_scheme = color_scheme, color_by = "Lab", shape_by = "Condition", fill_the_missing = NULL, base_size = 8, point_size = 3, point_alpha = 0.5 ) ## ----plot_pca_peptides, fig.height=7, fig.show='hold', fig.width=8, message=FALSE, warning=FALSE---- plot_PCA( pbf, pbf_name = c( "peptide::log2_on_raw", "peptide::medianNorm_on_log2_on_raw", "peptide::limmaRBE_on_medianNorm_on_log2_on_raw", "peptide::combat_on_medianNorm_on_log2_on_raw" ), sample_id_col = "Run", color_scheme = color_scheme, color_by = "Lab", shape_by = "Condition", fill_the_missing = "remove", base_size = 8, point_size = 3, point_alpha = 0.5 ) ## ----plot_hclust_combat, fig.show='hold', fig.width=12, fig.height=4---------- plot_hierarchical_clustering( pbf, pbf_name = "protein::combat_on_medianNorm_on_log2_on_raw", sample_id_col = "Run", label_font = 0.6, color_list = color_scheme ) ## ----plot_hclust_limma, fig.show='hold', fig.width=12, fig.height=4----------- plot_hierarchical_clustering( pbf, pbf_name = "protein::limmaRBE_on_medianNorm_on_log2_on_raw", sample_id_col = "Run", label_font = 0.6, color_list = color_scheme, fill_the_missing = "remove" ) ## ----plot_pvca_panels, fig.height=6, fig.show='hold', fig.width=5, message=FALSE, warning=FALSE---- plot_PVCA( pbf, pbf_name = c( "protein::medianNorm_on_log2_on_raw", "protein::combat_on_medianNorm_on_log2_on_raw" ), sample_id_col = "Run", technical_factors = c("Lab"), biological_factors = c("Condition"), fill_the_missing = NULL, base_size = 7, plot_ncol = 2, # the number of plots in a row variance_threshold = 0 # the the percentile value of weight each of the # covariates needs to explain # (the rest will be lumped together) ) ## ----show_operation_log------------------------------------------------------- get_operation_log(pbf) %>% as_tibble() ## ----compute_pipeline_name---------------------------------------------------- pb_pipeline_name(pbf, "protein::combat_on_medianNorm_on_log2_on_raw") ## ----matr--------------------------------------------------------------------- extracted_matrix <- pb_assay_matrix( pbf, assay = "peptide::raw" ) # show extracted_matrix[1:5, 1:5] rm(extracted_matrix) ## ----extract_to_long---------------------------------------------------------- extracted_long <- pb_as_long( pbf, sample_id_col = "Run", pbf_name = "peptide::raw" ) # show extracted_long[1:3, ] rm(extracted_long) ## ----sessionInfo-------------------------------------------------------------- sessionInfo() ## ----citation----------------------------------------------------------------- citation("proBatch")