## ----------------------------------------------------------------------------- library(ggplot2); theme_set(theme_bw()) require(glmpca) ## ----------------------------------------------------------------------------- set.seed(202) ngenes <- 5000 #must be divisible by 10 ngenes_informative<-ngenes*.1 ncells <- 50 #number of cells per cluster, must be divisible by 2 nclust<- 3 # simulate two batches with different depths batch<-rep(1:2, each = nclust*ncells/2) ncounts <- rpois(ncells*nclust, lambda = 1000*batch) # generate profiles for 3 clusters profiles_informative <- replicate(nclust, exp(rnorm(ngenes_informative))) profiles_const<-matrix(ncol=nclust,rep(exp(rnorm(ngenes-ngenes_informative)),nclust)) profiles <- rbind(profiles_informative,profiles_const) # generate cluster labels clust <- sample(rep(1:3, each = ncells)) # generate single-cell transcriptomes counts <- sapply(seq_along(clust), function(i){ rmultinom(1, ncounts[i], prob = profiles[,clust[i]]) }) rownames(counts) <- paste("gene", seq(nrow(counts)), sep = "_") colnames(counts) <- paste("cell", seq(ncol(counts)), sep = "_") # clean up rows Y <- counts[rowSums(counts) > 0, ] sz<-colSums(Y) Ycpm<-1e6*t(t(Y)/sz) Yl2<-log2(1+Ycpm) z<-log10(sz) pz<-1-colMeans(Y>0) cm<-data.frame(total_counts=sz,zero_frac=pz,clust=factor(clust),batch=factor(batch)) ## ---- fig.width=6, fig.height=4----------------------------------------------- set.seed(202) system.time(res1<-glmpca(Y,2,fam="poi")) #about 9 seconds print(res1) pd1<-cbind(cm,res1$factors,dimreduce="glmpca-poi") #check optimizer decreased deviance plot(res1$dev,type="l",xlab="iterations",ylab="Poisson deviance") ## ---- fig.width=6, fig.height=4----------------------------------------------- set.seed(202) system.time(res2<-glmpca(Y,2,fam="nb")) #about 32 seconds print(res2) pd2<-cbind(cm,res2$factors,dimreduce="glmpca-nb") #check optimizer decreased deviance plot(res2$dev,type="l",xlab="iterations",ylab="negative binomial deviance") ## ----------------------------------------------------------------------------- #standard PCA system.time(res3<-prcomp(log2(1+t(Ycpm)),center=TRUE,scale.=TRUE,rank.=2)) #<1 sec pca_factors<-res3$x colnames(pca_factors)<-paste0("dim",1:2) pd3<-cbind(cm,pca_factors,dimreduce="pca-logcpm") ## ---- fig.width=6, fig.height=15---------------------------------------------- pd<-rbind(pd1,pd2,pd3) #visualize results ggplot(pd,aes(x=dim1,y=dim2,colour=clust,shape=batch))+geom_point(size=4)+facet_wrap(~dimreduce,scales="free",nrow=3) ## ---- fig.width=6, fig.height=4----------------------------------------------- nbres<-res2 names(nbres) dim(Y) dim(nbres$factors) dim(nbres$loadings) dim(nbres$coefX) hist(nbres$coefX[,1],breaks=100,main="feature-specific intercepts") print(nbres$glmpca_family)