## ----loadLibs, include = FALSE------------------------------------------------ # Required package: library(GRIN2) # Suggested packages: load if available if (requireNamespace("knitr", quietly = TRUE)) { library(knitr) } if (requireNamespace("survival", quietly = TRUE)) { library(survival) } #old<- options() # Load example datasets from GRIN2 package data(clin_data) data(lesion_data) data(expr_data) data(hg38_gene_annotation) data(hg38_chrom_size) data(hg38_cytoband) data(pathways) # Set knitr options knitr::opts_chunk$set( comment = "#>", collapse = TRUE, echo = TRUE, error = TRUE, eval = TRUE, digits = 3, tidy = FALSE, background = "#FFFF00", fig.align = 'center', warning = FALSE, message = FALSE ) # Reset options at end of vignette oldOpt <- options(width = 55, digits = 3) on.exit(options(oldOpt), add = TRUE) # Helper function getInfo <- function(what = "Suggests") { text <- packageDescription("GRIN2")[what][[1]] text <- gsub("\n", ", ", text, fixed = TRUE) text <- gsub(">=", "$\\\\ge$", text, fixed = TRUE) eachPkg <- strsplit(text, ", ", fixed = TRUE)[[1]] eachPkg <- gsub(",", "", eachPkg, fixed = TRUE) length(eachPkg) } ## ----------------------------------------------------------------------------- data(clin_data) data(lesion_data) data(expr_data) # Please make sure that coordinates in the lesion data file (loc.start and loc.end) are based on GRCh38 (hg38) genome assembly. Multiple tools that include the UCSC LiftOver tool () can be used for such conversions. head(lesion_data) # Specify a folder on your local machine to store the analysis results: # resultsPath=tempdir() # knitr::opts_knit$set(root.dir = normalizePath(path = resultsPath)) ## ----message=FALSE, warning=FALSE--------------------------------------------- # A subset (417 genes) retrieved from ensembl BioMart will be used as an example gene annotation data file: data(hg38_gene_annotation) # To retrieve a complete annotation data for genes and regulatory features from ensembl BioMart, users can use get.ensembl.annotation function: # hg38.ann <- get.ensembl.annotation("Human_GRCh38") head(hg38_gene_annotation) ## ----message=FALSE, warning=FALSE--------------------------------------------- # chromosome size data: head(hg38_chrom_size) # chromosome size data can be also retrieved for GRCh38 (hg38) genome build from chr.info txt file available on UCSC genome browser: ## ----message=FALSE, warning=FALSE--------------------------------------------- # grin.stats is the function that can be used to excute the GRIN statistical framework: grin.results=grin.stats(lesion_data, hg38_gene_annotation, hg38_chrom_size) # In addition to annotated genes, users can run GRIN analysis on regulatory sequences from ensembl regulatory build and FANTOM5 project. ## ----------------------------------------------------------------------------- # Extract GRIN results table: grin.table=grin.results$gene.hits sorted.results <- grin.table[order(as.numeric(as.character(grin.table$p2.nsubj))),] ## ----------------------------------------------------------------------------- head(sorted.results[,c(7,11:14)]) ## ----------------------------------------------------------------------------- head(sorted.results[,c(7,19:22)]) ## ----------------------------------------------------------------------------- head(sorted.results[,c(7,27:30)]) ## ----------------------------------------------------------------------------- head(sorted.results[,c(7,31:34)]) ## ----------------------------------------------------------------------------- # write.grin.xlsx function return an excel file with multiple sheets that include GRIN results table, interpretation of each column in the results, and methods paragraph # write.grin.xlsx(grin.results, "T-ALL_GRIN_result_annotated_genes.xlsx") # To return the results table without other information (will be helpful in case of large lesion data files where the gene.lsn.data sheet will be > 1 million rows that halt the write.grin.xlsx function). grin.res.table=grin.results$gene.hits ## ----genomewide.lesion.plot, fig.height = 7, fig.width = 8, fig.cap="Figure 1. Genome-wide lesion plot"---- genomewide.plot=genomewide.lsn.plot(grin.results, max.log10q=50) # This function use the list of grin.results ## ----grin.stacked.barplot, fig.height = 10, fig.width = 10, fig.cap="Figure 2. stacked barplot with number of patients affected by different types of lesions in a list of genes of interest"---- # This barplot shows the number of patients affected by different types of lesions in a list of genes of interest: count.genes=as.vector(c("CDKN2A", "NOTCH1", "CDKN2B", "TAL1", "FBXW7", "PTEN", "IRF8", "NRAS", "BCL11B", "MYB", "LEF1","RB1", "MLLT3", "EZH2", "ETV6", "CTCF", "JAK1", "KRAS", "RUNX1", "IKZF1", "KMT2A", "RPL11", "TCF7", "WT1", "JAK2", "JAK3", "FLT3")) # return the stacked barplot grin.barplt(grin.results, count.genes) ## ----message=FALSE, warning=FALSE--------------------------------------------- # First identify the list of genes to be included in the oncoprint: oncoprint.genes=as.vector(c("ENSG00000101307", "ENSG00000171862", "ENSG00000138795", "ENSG00000139083", "ENSG00000162434", "ENSG00000134371", "ENSG00000118058", "ENSG00000171843", "ENSG00000139687", "ENSG00000184674", "ENSG00000118513", "ENSG00000197888", "ENSG00000111276", "ENSG00000258223", "ENSG00000187266", "ENSG00000174473", "ENSG00000133433", "ENSG00000159216", "ENSG00000107104", "ENSG00000099984", "ENSG00000078403", "ENSG00000183150", "ENSG00000081059", "ENSG00000175354", "ENSG00000164438")) # Prepare a lesion matrix for the selected list of genes with each row as a gene and each column is a patient (this matrix is compatible with oncoPrint function in ComplexHeatmap package): oncoprint.mtx=grin.oncoprint.mtx(grin.results, oncoprint.genes) head(oncoprint.mtx[,1:6]) # Use onco.print.props function to specify a height proportion for each lesion category: onco.props<-onco.print.props(lesion.data, hgt = c("gain"=5, "loss"=4, "mutation"=2, "fusion"=1)) column_title = "" # optional # use oncoprint function from ComplexHeatmap library to plot the oncoprint: ## ----locus lesion plot, fig.keep='last', fig.height =8, fig.width = 9, fig.cap="Figure 3. lesion plot showing all deletions affecting CDKN2A locus on chromosome 9"---- # First call the data for "hg38_cytoband": data(hg38_cytoband) # Plots Showing a Selected Type of Lesion (Ex: deletions) Affecting a region of Interest: cdkn2a.locus=lsn.transcripts.plot(grin.results, transTrack = FALSE, hg38.cytoband=hg38_cytoband, chrom=9, plot.start=19900000, plot.end=25600000, lesion.grp = "loss", spec.lsn.clr = "blue") # This type of lesion plots can be also prepared for a certain gene or locus of interest with transcripts track added using the same lsn.transcripts.plot function. ## ----regional.lesion.plot, fig.keep='last', fig.height =10, fig.width = 8, fig.cap="Figure 4. Plots showing all different types of lesions affecting the whole chromosome"---- # Plots Showing Different Types of Lesions Affecting the whole chromosome: chrom.plot=lsn.transcripts.plot(grin.results, transTrack = FALSE, hg38.cytoband=hg38_cytoband, chrom=9, plot.start=1, plot.end=141000000) ## ----message=FALSE, warning=FALSE--------------------------------------------- # Prepare gene and lesion data for later computations # This lesion matrix has all lesion types that affect a single gene in one row. It can be used to run association analysis with expression data (part of alex.prep.lsn.expr function) # First step is to prepare gene and lesion data for later computations gene.lsn=prep.gene.lsn.data(lesion_data, hg38_gene_annotation) # Then determine lesions that overlap each gene (locus) gene.lsn.overlap= find.gene.lsn.overlaps(gene.lsn) # Finally, build the lesion matrix using prep.lsn.type.matrix function: gene.lsn.type.mtx=prep.lsn.type.matrix(gene.lsn.overlap, min.ngrp=5) # prep.lsn.type.matrix function return each gene in a row, if the gene is affected by multiple types of lesions (for example gain AND mutations), entry will be denoted as "multiple" for this specific patient. # min.ngrp can be used to specify the minimum number of patients with a lesion to be included in the final lesion matrix. head(gene.lsn.type.mtx[,1:5]) ## ----message=FALSE, warning=FALSE--------------------------------------------- # alex.prep.lsn.expr function prepare expression, lesion data and return the set of genes with both types of data available ordered by gene IDs in rows and patient IDs in columns: alex.data=alex.prep.lsn.expr(expr_data, lesion_data, hg38_gene_annotation, min.expr=1, min.pts.lsn=5) # ALEX ordered lesion data: alex.lsn=alex.data$alex.lsn head(alex.lsn[,1:5]) # ALEX ordered expression data: alex.expr=alex.data$alex.expr head(alex.expr[,1:5]) ## ----message=FALSE, warning=FALSE--------------------------------------------- # KW.hit.express function runs Kruskal-Wallis test for association between lesion groups and expression level of the same corresponding gene: alex.kw.results=KW.hit.express(alex.data, hg38_gene_annotation, min.grp.size=5) ## ----------------------------------------------------------------------------- # order the genes by the ones with most significant KW q-value: sorted.kw <- alex.kw.results[order(as.numeric(as.character(alex.kw.results$q.KW))),] ## ----------------------------------------------------------------------------- head(sorted.kw[,c(6,7,11,12)]) ## ----------------------------------------------------------------------------- head(sorted.kw[,c(13:18)]) ## ----------------------------------------------------------------------------- head(sorted.kw[,c(19:24)]) ## ----JAK2.waterfall.plot, fig.height =6, fig.width = 6, fig.cap="Figure 5. JAK2 Water-fall plot which offers a side-by-side graphical representation of lesion and expression data for each patient"---- # waterfall plots allow a side-by-side representation of expression and lesion data of the gene of interest. # First prepare expression and lesion data for waterfall plots: WT1.waterfall.prep=alex.waterfall.prep(alex.data, alex.kw.results, "WT1", lesion_data) # alex.waterfall.plot can be used to return the plot WT1.waterfall.plot=alex.waterfall.plot(WT1.waterfall.prep, lesion_data) ## ----lesion.expression.pathway, fig.height =6, fig.width = 6, fig.cap="Figure 6. Ordered Lesion and Expression Data based on the Clustering Analysis on the pathway level (JAK/STAT pathway)"---- data("pathways") # alex.pathway will return two panels figure of lesion and expression data of ordered subjects based on the computed lesions distance in all genes assigned to the pathway of interest: alex.path=alex.pathway(alex.data, lsn.data = lesion_data, pathways = pathways, selected.pathway = "Jak_Pathway") # To return ordered lesion and expression data of the genes assigned to the pathway of interest (same patients order in the plot): alex.path[1:10,1:5] ## ----message=FALSE, warning=FALSE--------------------------------------------- # This type of lesion matrices with each gene affected by a certain type of lesion in a separate row is very helpful to run multiple levels of association analysis that include association between lesions and treatment outcomes. # Users should first Prepare gene and lesion data and determine lesions that overlap each gene (locus): gene.lsn=prep.gene.lsn.data(lesion_data, hg38_gene_annotation) gene.lsn.overlap= find.gene.lsn.overlaps(gene.lsn) # use prep.binary.lsn.mtx function to prepare the lesion binary matrix: lsn.binary.mtx.atleast5=prep.binary.lsn.mtx(gene.lsn.overlap, min.ngrp=5) # Each row is a lesion type that affect a certain gene for example NOTCH1_mutation (entry will be labelled as 1 if the patient is affected by by this type of lesion and 0 otherwise). # min.ngrp can be used to specify the minimum number of patients with a lesion to be included in the final lesion matrix. head(lsn.binary.mtx.atleast5[,1:5]) ## ----message=FALSE, warning=FALSE--------------------------------------------- # Prepare Event-free Survival (EFS) and Overall Survival (OS) as survival objects: clin_data$EFS <- Surv(clin_data$efs.time, clin_data$efs.censor) clin_data$OS <- Surv(clin_data$os.time, clin_data$os.censor) # List all clinical variables of interest to be included in the association analysis: clinvars=c("MRD.binary", "EFS", "OS") # Run association analysis between lesions and clinical variables: assc.outcomes=grin.assoc.lsn.outcome(lsn.binary.mtx.atleast5, clin_data, hg38_gene_annotation, clinvars) # Optional: Adjust for covariates using the 'covariate' argument ## ----------------------------------------------------------------------------- # order the genes by the ones with most significant KW q-value: sorted.outcomes <- assc.outcomes[order(as.numeric(as.character(assc.outcomes$`MRD.binary.p-value`))),] ## ----------------------------------------------------------------------------- head(sorted.outcomes[1:7,c(6,11,14,15)]) ## ----------------------------------------------------------------------------- head(sorted.outcomes[1:7,c(6, 16:19)]) ## ----setup-------------------------------------------------------------------- library(GRIN2)