The material in this course requires R version 3.2 and Bioconductor version 3.2
stopifnot(
    getRversion() >= '3.2' && getRversion() < '3.3',
    BiocInstaller::biocVersion() == "3.2"
)
TxDb packagese.g., `{r Biocpkg(“TxDb.Hsapiens.UCSC.hg19.knownGene”)}
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
## # GenomicFeatures version at creation time: 1.21.30
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
methods(class=class(txdb))
##  [1] annotatedDataFrameFrom asBED                  asGFF                  as.list               
##  [5] assayData              assayData<-            cdsByOverlaps          cdsBy                 
##  [9] cds                    coerce                 columns                combine               
## [13] contents               dbconn                 dbfile                 dbInfo                
## [17] dbmeta                 dbschema               disjointExons          distance              
## [21] $<-                    $                      exonsByOverlaps        exonsBy               
## [25] exons                  ExpressionSet          extractUpstreamSeqs    featureNames<-        
## [29] featureNames           fiveUTRsByTranscript   genes                  initialize            
## [33] intronsByTranscript    isActiveSeq<-          isActiveSeq            isNA                  
## [37] keys                   keytypes               locateVariants         mapIds                
## [41] mappedkeys             mapToTranscripts       metadata               microRNAs             
## [45] nhit                   organism               predictCoding          promoters             
## [49] revmap                 sample                 sampleNames<-          sampleNames           
## [53] saveDb                 select                 seqinfo                seqlevels0            
## [57] seqlevels<-            show                   species                storageMode<-         
## [61] storageMode            summarizeVariants      taxonomyId             threeUTRsByTranscript 
## [65] transcriptsByOverlaps  transcriptsBy          transcripts            tRNAs                 
## [69] updateObject          
## see '?methods' for accessing help and source code
TxDb objects
dbfile(txdb)GenomicFeatures::makeTxDbFrom*()Accessing gene models
exons(), transcripts(), genes(), cds() (coding sequence)promoters() & friendsexonsBy() & friends – exons by gene, transcript, …keytypes(), columns(), keys(), select(), mapIds()exons(txdb)
## GRanges object with 289969 ranges and 1 metadata column:
##                  seqnames         ranges strand   |   exon_id
##                     <Rle>      <IRanges>  <Rle>   | <integer>
##        [1]           chr1 [11874, 12227]      +   |         1
##        [2]           chr1 [12595, 12721]      +   |         2
##        [3]           chr1 [12613, 12721]      +   |         3
##        [4]           chr1 [12646, 12697]      +   |         4
##        [5]           chr1 [13221, 14409]      +   |         5
##        ...            ...            ...    ... ...       ...
##   [289965] chrUn_gl000241 [35706, 35859]      -   |    289965
##   [289966] chrUn_gl000241 [36711, 36875]      -   |    289966
##   [289967] chrUn_gl000243 [11501, 11530]      +   |    289967
##   [289968] chrUn_gl000243 [13608, 13637]      +   |    289968
##   [289969] chrUn_gl000247 [ 5787,  5816]      -   |    289969
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
exonsBy(txdb, "tx")
## GRangesList object of length 82960:
## $1 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12613, 12721]      + |         3        <NA>         2
##   [3]     chr1 [13221, 14409]      + |         5        <NA>         3
## 
## $2 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12595, 12721]      + |       2      <NA>         2
##   [3]     chr1 [13403, 14409]      + |       6      <NA>         3
## 
## $3 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12646, 12697]      + |       4      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
## 
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
BSgenome packagese.g., `{r Biocpkg(“BSgenome.Hsapiens.UCSC.hg19”)}
library(BSgenome.Hsapiens.UCSC.hg19)
genome <- BSgenome.Hsapiens.UCSC.hg19
getSeq(genome, exons(txdb)[1:100])
##   A DNAStringSet instance of length 100
##       width seq
##   [1]   354 CTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGT...AAAGATGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCA
##   [2]   127 GCTCCTGTCTCCCCCCAGGTGTGTGGTGATGCCAGGCATGCCC...GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG
##   [3]   109 GTGTGTGGTGATGCCAGGCATGCCCTTCCCCAGCATCAGGTCT...GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG
##   [4]    52 CATCAGGTCTCCAGAGCTGCAGAAGACGACGGCCGACTTGGATCACACTCTT
##   [5]  1189 GCAGGGCCATCAGGCACCAAAGGGATTCTGCCAGCATAGTGCT...ACAGCTAGAGATCCTTTATTAAAAGCACACTGTTGGTTTCTG
##   ...   ... ...
##  [96]   251 CCCGTCCCCGGCGCGGCCCGCGCGCTCCTCCGCCGCCTCTCGC...ATCCTCAACGTGGACCCGGTGCAGCACACGTACTCCTGCAAG
##  [97]   262 GTTCGGGTCTGGCGGTACTTGAAGGGCAAAGACCTGGTGGCCC...TCACCCTGCGGAACCTGGAGGAGGTGGAGTTCTGTGTGGAAG
##  [98]    48 ATAAACCCGGGACCCACTTCACTCCAGTGCCTCCGACGCCTCCTGATG
##  [99]   216 CGTGCCGGGGAATGCTGTGCGGCTTCGGCGCCGTGTGCGAGCC...GCCAGCAGCGCCGCATCCGCCTGCTCAGCCGCGGGCCGTGCG
## [100]   225 GCTCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAG...CCCGCCAGGAGAATGTCTTCAAGAAGTTCGACGGCCCTTGTG
OrgDblibrary(org.Hs.eg.db)
org.Hs.eg.db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: HUMAN_DB
## | ORGANISM: Homo sapiens
## | SPECIES: Human
## | EGSOURCEDATE: 2015-Sep27
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 9606
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 20150919
## | GOEGSOURCEDATE: 2015-Sep27
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19
## | GPSOURCEDATE: 2010-Mar22
## | ENSOURCEDATE: 2015-Jul16
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Fri Oct  9 19:54:03 2015
## 
## Please see: help('select') for usage information
OrgDb objects
TxDbkeytypes(), columns(), keys(), select(), mapIds()select()
Specification of key type
select(org.Hs.eg.db, c("BRCA1", "PTEN"), c("ENTREZID", "GENENAME"), "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
##   SYMBOL ENTREZID                       GENENAME
## 1  BRCA1      672   breast cancer 1, early onset
## 2   PTEN     5728 phosphatase and tensin homolog
keytypes(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"    
##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"        
## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"
columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"    
##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"        
## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"Related functionality
mapIds() – special case for mapping from 1 identifier to anotherOrganismDb objects: combined org.*, TxDb.*, and other annotation resources for easy access
library(Homo.sapiens)
## Loading required package: OrganismDbi
## Loading required package: GO.db
## 
## Now getting the GODb Object directly
## Now getting the OrgDb Object directly
## Now getting the TxDb Object directly
select(Homo.sapiens, c("BRCA1", "PTEN"), 
       c("TXNAME", "TXCHROM", "TXSTART", "TXEND"), 
       "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
##    SYMBOL     TXNAME TXCHROM  TXSTART    TXEND
## 1   BRCA1 uc010whl.2   chr17 41196312 41276132
## 2   BRCA1 uc002icp.4   chr17 41196312 41277340
## 3   BRCA1 uc010whm.2   chr17 41196312 41277340
## 4   BRCA1 uc002icu.3   chr17 41196312 41277468
## 5   BRCA1 uc010cyx.3   chr17 41196312 41277468
## 6   BRCA1 uc002icq.3   chr17 41196312 41277500
## 7   BRCA1 uc002ict.3   chr17 41196312 41277500
## 8   BRCA1 uc010whn.2   chr17 41196312 41277500
## 9   BRCA1 uc010who.3   chr17 41196312 41277500
## 10  BRCA1 uc010whp.2   chr17 41196312 41322420
## 11  BRCA1 uc010whq.1   chr17 41215350 41256973
## 12  BRCA1 uc002idc.1   chr17 41215350 41277468
## 13  BRCA1 uc010whr.1   chr17 41215350 41277468
## 14  BRCA1 uc002idd.3   chr17 41243117 41276132
## 15  BRCA1 uc002ide.1   chr17 41243452 41256973
## 16  BRCA1 uc010cyy.1   chr17 41243452 41277340
## 17  BRCA1 uc010whs.1   chr17 41243452 41277468
## 18  BRCA1 uc010cyz.2   chr17 41243452 41277500
## 19  BRCA1 uc010cza.2   chr17 41243452 41277500
## 20  BRCA1 uc010wht.1   chr17 41243452 41277500
## 21   PTEN uc001kfb.3   chr10 89623195 89728532
## 22   PTEN uc021pvw.1   chr10 89623195 89728532biomaRt, AnnotationHubhttp://biomart.org; Bioconductor package biomaRt
## NEEDS INTERNET ACCESS !!
library(biomaRt)
head(listMarts(), 3)                      ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <-                                ## fully specified mart
    useMart("ensembl", dataset = "hsapiens_gene_ensembl")
head(listFilters(ensembl), 3)             ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## return values
myValues <- c("21", "22")
head(listAttributes(ensembl), 3)          ## attributes
myAttributes <- c("ensembl_gene_id","chromosome_name")
## assemble and query the mart
res <- getBM(attributes =  myAttributes, filters =  myFilter,
             values =  myValues, mart = ensembl)
Other internet resources
Example: Ensembl ‘GTF’ files to R / Bioconductor GRanges and TxDb
library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("Ensembl", "80", "gtf"))
## ensgtf = display(hub)                   # visual choice
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)
Example: non-model organism OrgDb packages
library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")
Example: Map Roadmap epigenomic marks to hg38
Roadmap BED file as GRanges
library(AnnotationHub)
hub <- AnnotationHub()
query(hub , c("EpigenomeRoadMap", "E126", "H3K4ME2"))
E126 <- hub[["AH29817"]]UCSC ‘liftOver’ file to map coordinates
query(hub , c("hg19", "hg38", "chainfile"))
chain <- hub[["AH14150"]]lift over – possibly one-to-many mapping, so GRanges to GRangesList
library(rtracklayer)
E126hg38 <- liftOver(E126, chain)
E126hg38Example: read variants from a VCF file, and annotate with respect to a known gene model
## input variants
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
    TxDb.Hsapiens.UCSC.hg19.knownGene,
    CodingVariants())
head(coding)
## GRanges object with 6 ranges and 9 metadata columns:
##     seqnames               ranges strand | LOCATION  LOCSTART    LOCEND   QUERYID        TXID
##        <Rle>            <IRanges>  <Rle> | <factor> <integer> <integer> <integer> <character>
##   1    chr22 [50301422, 50301422]      - |   coding       939       939        24       75253
##   2    chr22 [50301476, 50301476]      - |   coding       885       885        25       75253
##   3    chr22 [50301488, 50301488]      - |   coding       873       873        26       75253
##   4    chr22 [50301494, 50301494]      - |   coding       867       867        27       75253
##   5    chr22 [50301584, 50301584]      - |   coding       777       777        28       75253
##   6    chr22 [50302962, 50302962]      - |   coding       698       698        57       75253
##             CDSID      GENEID       PRECEDEID        FOLLOWID
##     <IntegerList> <character> <CharacterList> <CharacterList>
##   1        218562       79087                                
##   2        218562       79087                                
##   3        218562       79087                                
##   4        218562       79087                                
##   5        218562       79087                                
##   6        218563       79087                                
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
Acknowledgements
Core (Seattle): Sonali Arora, Marc Carlson, Nate Hayden, Jim Hester, Valerie Obenchain, Hervé Pagès, Paul Shannon, Dan Tenenbaum.
The research reported in this presentation was supported by the National Cancer Institute and the National Human Genome Research Institute of the National Institutes of Health under Award numbers U24CA180996 and U41HG004059, and the National Science Foundation under Award number 1247813. The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health or the National Science Foundation.
sessionInfo()sessionInfo()
## R version 3.2.2 (2015-08-14)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux stretch/sid
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8       
##  [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] Homo.sapiens_1.3.1                      GO.db_3.2.2                            
##  [3] OrganismDbi_1.11.43                     biomaRt_2.25.3                         
##  [5] AnnotationHub_2.1.45                    VariantAnnotation_1.15.34              
##  [7] RNAseqData.HNRNPC.bam.chr14_0.7.0       GenomicAlignments_1.5.18               
##  [9] Rsamtools_1.21.21                       ALL_1.11.0                             
## [11] org.Hs.eg.db_3.2.3                      RSQLite_1.0.0                          
## [13] DBI_0.3.1                               ggplot2_1.0.1                          
## [15] airway_0.103.1                          limma_3.25.18                          
## [17] DESeq2_1.9.51                           RcppArmadillo_0.6.100.0.0              
## [19] Rcpp_0.12.1                             BSgenome.Hsapiens.UCSC.hg19_1.4.0      
## [21] BSgenome_1.37.6                         rtracklayer_1.29.28                    
## [23] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 GenomicFeatures_1.21.33                
## [25] AnnotationDbi_1.31.19                   SummarizedExperiment_0.3.11            
## [27] Biobase_2.29.1                          GenomicRanges_1.21.32                  
## [29] GenomeInfoDb_1.5.16                     microbenchmark_1.4-2                   
## [31] Biostrings_2.37.8                       XVector_0.9.4                          
## [33] IRanges_2.3.26                          S4Vectors_0.7.23                       
## [35] BiocGenerics_0.15.11                    BiocStyle_1.7.9                        
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6                 RColorBrewer_1.1-2           httr_1.0.0                  
##  [4] tools_3.2.2                  R6_2.1.1                     rpart_4.1-10                
##  [7] Hmisc_3.17-0                 colorspace_1.2-6             nnet_7.3-11                 
## [10] gridExtra_2.0.0              graph_1.47.2                 formatR_1.2.1               
## [13] sandwich_2.3-4               labeling_0.3                 scales_0.3.0                
## [16] mvtnorm_1.0-3                genefilter_1.51.1            RBGL_1.45.1                 
## [19] stringr_1.0.0                digest_0.6.8                 foreign_0.8-66              
## [22] rmarkdown_0.8.1              htmltools_0.2.6              BiocInstaller_1.19.14       
## [25] shiny_0.12.2                 zoo_1.7-12                   BiocParallel_1.3.54         
## [28] acepack_1.3-3.3              RCurl_1.95-4.7               magrittr_1.5                
## [31] Formula_1.2-1                futile.logger_1.4.1          munsell_0.4.2               
## [34] proto_0.3-10                 stringi_0.5-5                multcomp_1.4-1              
## [37] yaml_2.1.13                  MASS_7.3-44                  zlibbioc_1.15.0             
## [40] plyr_1.8.3                   grid_3.2.2                   lattice_0.20-33             
## [43] splines_3.2.2                annotate_1.47.4              locfit_1.5-9.1              
## [46] knitr_1.11                   geneplotter_1.47.0           reshape2_1.4.1              
## [49] codetools_0.2-14             futile.options_1.0.0         XML_3.98-1.3                
## [52] evaluate_0.8                 latticeExtra_0.6-26          lambda.r_1.1.7              
## [55] httpuv_1.3.3                 gtable_0.1.2                 mime_0.4                    
## [58] xtable_1.7-4                 survival_2.38-3              cluster_2.0.3               
## [61] TH.data_1.0-6                interactiveDisplayBase_1.7.3