Simple example: classic vs. tidy
aggregate(hp ~ cyl, mtcars, mean)                     # classic
##   cyl        hp
## 1   4  82.63636
## 2   6 122.28571
## 3   8 209.21429
library(tidyverse)
mtcars %>% group_by(cyl) %>% summarize(hp = mean(hp)) # tidy
## # A tibble: 3 x 2
##     cyl        hp
##   <dbl>     <dbl>
## 1     4  82.63636
## 2     6 122.28571
## 3     8 209.21429
Biological example
library(airway)
data(airway)
airway                                   # rich
## class: RangedSummarizedExperiment 
## dim: 64102 8 
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
hist(log10(rowMeans(assay(airway))))
library(reshape2)
library(tibble)
tbl <- as_tibble(melt(
    assay(airway), c("Gene", "Sample"), value.name="Count"
))
tbl                                      # tidy (incomplete -- no metadata)
## # A tibble: 512,816 x 3
##               Gene     Sample Count
##             <fctr>     <fctr> <int>
##  1 ENSG00000000003 SRR1039508   679
##  2 ENSG00000000005 SRR1039508     0
##  3 ENSG00000000419 SRR1039508   467
##  4 ENSG00000000457 SRR1039508   260
##  5 ENSG00000000460 SRR1039508    60
##  6 ENSG00000000938 SRR1039508     0
##  7 ENSG00000000971 SRR1039508  3251
##  8 ENSG00000001036 SRR1039508  1433
##  9 ENSG00000001084 SRR1039508   519
## 10 ENSG00000001167 SRR1039508   394
## # ... with 512,806 more rows
tbl %>% group_by(Gene) %>% summarize(mean(Count))
## # A tibble: 64,102 x 2
##               Gene `mean(Count)`
##             <fctr>         <dbl>
##  1 ENSG00000000003       741.875
##  2 ENSG00000000005         0.000
##  3 ENSG00000000419       534.875
##  4 ENSG00000000457       242.000
##  5 ENSG00000000460        58.375
##  6 ENSG00000000938         0.375
##  7 ENSG00000000971      6034.750
##  8 ENSG00000001036      1305.000
##  9 ENSG00000001084       615.125
## 10 ENSG00000001167       392.500
## # ... with 64,092 more rows
Context
data.frame().data.frame(); see the tidyverseVocabulary
Constraints (e.g., probes & samples)
Flexibility
Programming contract
Lessons learned / best practices
library(BiocFileCache)
library(TENxGenomics)              # https://github.com/mtmorgan/TENxGenomics
fl <- bfcrpath(BiocFileCache(), "1M_neurons_filtered_gene_bc_matrices_h5.h5")
## Warning: call dbDisconnect() when finished working with a connection
TENxGenomics(fl)                          # big!
## class: TENxGenomics 
## h5path: /home/mtmorgan/.cache/BiocFileCache/38e45fadc64 
## dim(): 27998 x 1306127
m = as.matrix(TENxMatrix(fl)[, 1:1000])   # first 1000 cells
sum(m == 0) / length(m)                   # % zeros
## [1] 0.9276541
sum(rowSums(m == 0) == ncol(m)) / nrow(m) # % genes with no expression
## [1] 0.4264947
hist(log10(1 + colSums(m)))               # reads per cell
library(ggplot2)
library(plotly)
static <- mtcars %>% mutate(cyl = factor(cyl)) %>%
    ggplot(aes(x=qsec, y=mpg)) +
        geom_point(aes(color=cyl))
ggplotly(static)
::ggplotly() makes interactive visualization more-or-less a no-brainer.devtools create(), load_all(), check()
roxygen2 (e.g., devtools::document())
Rd-style.testthat (e.g., devtools::use_testthat(), devtools::test())
Pipes and lazy evaluation (e.g., magrittr, %>%)
S3 classes or S4 ‘reference’ classes
Other class systems
Continuous integration
R CMD build / R CMD check before pushing commits.(R / Bioconductor) Docker and AMI containers
Research reported in this course was supported by the National Human Genome Research Institute and the National Cancer Institute of the National Institutes of Health under award numbers U41HG004059 and U24CA180996.
This project has received funding from the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (grant agreement number 633974)