set.seed(0) knitr::opts_chunk$set( out.extra = 'style="display:block; margin: auto"', fig.align = "center", fig.path = "es/", collapse = TRUE, comment = "#>", dev = "png")
pkgs <- c("Biobase", "arrayQualityMetrics", "dplyr", "knitr", "mclust", "pQTLtools", "rgl", "scatterplot3d") for (p in pkgs) if (length(grep(paste("^package:", p, "$", sep=""), search())) == 0) { if (!requireNamespace(p)) warning(paste0("This vignette needs package `", p, "'; please install")) } invisible(suppressMessages(lapply(pkgs, require, character.only = TRUE)))
We start with Bioconductor/Biobase's ExpressionSet example and finish with a real application.
dataDirectory <- system.file("extdata", package="Biobase") exprsFile <- file.path(dataDirectory, "exprsData.txt") exprs <- as.matrix(read.table(exprsFile, header=TRUE, sep="\t", row.names=1, as.is=TRUE)) pDataFile <- file.path(dataDirectory, "pData.txt") pData <- read.table(pDataFile, row.names=1, header=TRUE, sep="\t") all(rownames(pData)==colnames(exprs)) metadata <- data.frame(labelDescription=c("Patient gender", "Case/control status", "Tumor progress on XYZ scale"), row.names=c("gender", "type", "score")) phenoData <- Biobase::AnnotatedDataFrame(data=pData, varMetadata=metadata) experimentData <- Biobase::MIAME(name="Pierre Fermat", lab="Francis Galton Lab", contact="pfermat@lab.not.exist", title="Smoking-Cancer Experiment", abstract="An example ExpressionSet", url="www.lab.not.exist", other=list(notes="Created from text files")) exampleSet <- pQTLtools::make_ExpressionSet(exprs,phenoData,experimentData=experimentData, annotation="hgu95av2") data(sample.ExpressionSet, package="Biobase") identical(exampleSet,sample.ExpressionSet)
The great benefit is to use the object directly as a data.frame.
lm.result <- Biobase::esApply(exampleSet,1,function(x) lm(score~gender+x)) beta.x <- unlist(lapply(lapply(lm.result,coef),"[",3)) beta.x[1] lm(score~gender+AFFX.MurIL2_at,data=exampleSet)
We wish to examine the distribution of each feature via histogram, scatter and boxplot.
One could resort to esApply()
for its simplicity as before
invisible(Biobase::esApply(exampleSet[1:2],1,function(x) {par(mfrow=c(3,1));boxplot(x);hist(x);plot(x)} ))
but it is nicer to add feature name in the title.
par(mfrow=c(1,3)) f <- Biobase::featureNames(exampleSet[1:2]) invisible(sapply(f,function(x) { d <- t(Biobase::exprs(exampleSet[x])) fn <- Biobase::featureNames(exampleSet[x]) hist(d,main="",xlab=fn); plot(d, ylab=fn); boxplot(d,ylab=fn) } ) )
where the expression set is indexed using feature name(s).
This illustrates one mechanism,
list_outliers <- function(es, method="upperquartile") arrayQualityMetrics::outliers(exprs(es),method=method) for (method in c("KS","sum","upperquartile")) { ZWK_outliers <- list_outliers(protein_ZWK,method=method) print(ZWK_outliers@statistic[ZWK_outliers@which]) }
We employ model-based clustering absed on principal compoents to see potential groupings in the data,
pca <- prcomp(na.omit(t(Biobase::exprs(exampleSet))), rank=10, scale=TRUE) pc1pc2pc3 <- with(pca,x)[,1:3] mc <- mclust::Mclust(pc1pc2pc3,G=3) with(mc, { cols <- c("blue","red", "purple") s3d <- scatterplot3d::scatterplot3d(with(pca,x[,c(2,1,3)]), color=cols[classification], pch=16, type="h", main="Plot of the PC1, PC2 and PC3") s3d.coords <- s3d$xyz.convert(with(pca,x[,c(2,1,3)])) text(s3d.coords$x, s3d.coords$y, cex = 1.2, col = cols[classification], labels = row.names(pc1pc2pc3), pos = 4) legend("right", legend=levels(as.factor(classification)), col=cols[classification], pch=16) rgl::open3d(width = 500, height = 500) rgl::plot3d(with(pca,x[,c(2,1,3)]),cex=1.2,col=cols[classification],size=5) rgl::text3d(with(pca,x[,c(2,1,3)]),cex=1.2,col=cols[classification],texts=row.names(pc1pc2pc3)) htmlwidgets::saveWidget(rgl::rglwidget(), file = "mcpca3d.html") })
An interactive version is also available,
htmltools::tags$iframe(src = "mcpca3d.html", width = "100%", height = "450px")
Suppose we wish use log2 for those greater than zero but set those negative values to be missing.
log2.na <- function(x) log2(ifelse(x>0, x, NA)) Biobase::exprs(exampleSet) <- log2.na(Biobase::exprs(exampleSet))
We generate a lod.max
~ U[0,1] variable to experiment
Biobase::fData(exampleSet) Biobase::fData(exampleSet)$lod.max <- apply(Biobase::exprs(exampleSet),1,quantile,runif(nrow(exampleSet))) lod <- pQTLtools::get.prop.below.LLOD(exampleSet) x <- dplyr::arrange(Biobase::fData(lod),desc(pc.belowLOD.new)) knitr::kable(head(lod)) plot(x[,2], main="Random quantile cutoff", ylab="<lod%")
The quantity has been shown to have a big impact on protein abundance and therefore pQTL detection as is shown with a real example.
rm(list=ls()) dir <- "~/rds/post_qc_data/interval/phenotype/olink_proteomics/post-qc/" eset <- readRDS(paste0(dir,"eset.inf1.flag.out.outlier.in.rds")) x <- pQTLtools::get.prop.below.LLOD(eset) annot <- Biobase::fData(x) annot$MissDataProp <- as.numeric(gsub("\\%$", "", annot$MissDataProp)) plot(annot$MissDataProp, annot$pc.belowLOD.new, xlab="% <LLOD in Original", ylab="% <LLOD in post QC dataset", pch=19) INF <- Sys.getenv("INF") np <- read.table(paste(INF, "work", "INF1.merge.nosig", sep="/"), header=FALSE, col.names = c("prot", "uniprot")) kable(np, caption="Proteins with no pQTL") annot$pQTL <- rep(NA, nrow(annot)) no.pQTL.ind <- which(annot$uniprot.id %in% np$uniprot) annot$pQTL[no.pQTL.ind] <- "red" annot$pQTL[-no.pQTL.ind] <- "blue" annot <- annot[order(annot$pc.belowLOD.new, decreasing = TRUE),] annot <- annot[-grep("^BDNF$", annot$ID),] saveRDS(annot,file=file.path("~","pQTLtools","tests","annot.RDS"))
annot <- readRDS(file.path(find.package("pQTLtools"),"tests","annot.RDS")) %>% dplyr::left_join(pQTLdata::inf1[c("prot","target.short","alt_name")],by=c("ID"="prot")) %>% dplyr::mutate(prot=if_else(is.na(alt_name),target.short,alt_name),order=1:n()) %>% dplyr::arrange(desc(order)) xtick <- seq(1, nrow(annot)) attach(annot) par(mar=c(10,5,1,1)) plot(100-pc.belowLOD.new,cex=2,pch=19,col=pQTL,xaxt="n",xlab="",ylab="",cex.axis=0.8) text(66,16,"IL-17C",offset=0,pos=2,cex=1.5,font=2,srt=0) arrows(67,16,71,16,lwd=2) axis(1, at=xtick, labels=prot, lwd.tick=0.5, lwd=0, las=2, hadj=1, cex.axis=0.8) mtext("% samples above LLOD",side=2,line=2.5,cex=1.2) mtext("Ordered protein",side=1,line=6.5,cex=1.2,font=1) legend(x=1,y=25,c("without pQTL","with pQTL"),box.lwd=0,cex=2,col=c("red","blue"),pch=19) detach(annot) options(width=120) knitr::kable(annot,caption="Summary statistics",row.names=FALSE)
This is available by design.
Web: https://bioconductor.org/packages/release/workflows/html/maEndToEnd.html.
Examples can be found on PCA, heatmap, normalisation, linear models, and enrichment analysis from this Bioconductor package.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.