oppar.R
In oppar: Outlier profile and pathway analysis in R

## ---- echo=FALSE--------------------------------------------------------------
require(knitr)
knitr::opts_chunk$set(message = FALSE, 
											library(oppar), 
											suppressPackageStartupMessages(library(Biobase)),
											suppressPackageStartupMessages(library(limma)),
											suppressPackageStartupMessages(library(GO.db)),
											suppressPackageStartupMessages(library(org.Hs.eg.db))
											)


## -----------------------------------------------------------------------------
data(Tomlins) # loads processed Tomlins data

# the first 21 samples are Normal samples, and the rest of 
# the samples are our cases (metastatic). We, thus, generate a group
# variable for the samples based on this knowledge.

g <- factor(c(rep(0,21),rep(1,ncol(exprs(eset)) - 21)))
g


# Apply opa on Tomlins data, to detect outliers relative to the
# lower 10% (lower.quantile = 0.1) and upper 5% (upper.quantile = 0.95 -- Default) of 
# gene expressions.
tomlins.opa <- opa(eset, group = g, lower.quantile = 0.1)
tomlins.opa


## -----------------------------------------------------------------------------
tomlins.opa$profileMatrix[1:6,1:5]
tomlins.opa$upper.quantile
tomlins.opa$lower.quantile

## -----------------------------------------------------------------------------
getSampleOutlier(tomlins.opa, c(1,5))


## -----------------------------------------------------------------------------

outlier.list <- getSubtypeProbes(tomlins.opa, 1:ncol(tomlins.opa$profileMatrix))

## -----------------------------------------------------------------------------
# gene set testing with limma::mroast
#BiocManager::install(org.Hs.eg.db)
library(org.Hs.eg.db)
library(limma)
org.Hs.egGO2EG
go2eg <- as.list(org.Hs.egGO2EG)
head(go2eg)

# Gene Set analysis using rost from limma

# need to subset gene express data based on up outliers
up.mtrx <- exprs(eset)[fData(eset)$ID %in% outlier.list[["up"]], ]
# get Entrez gene IDs for genes in up.mtrx

entrez.ids.up.mtrx <- fData(eset)$Gene.ID[fData(eset)$ID %in% rownames(up.mtrx)]

# find the index of genes in GO gene set in the gene expression matrix
gset.idx <- lapply(go2eg, function(x){
	match(x, entrez.ids.up.mtrx)
})

# remove missing values
gset.idx <- lapply(gset.idx, function(x){
	x[!is.na(x)]
})

# removing gene sets with less than 10 elements
gset.ls <- unlist(lapply(gset.idx, length))
gset.idx <- gset.idx[which(gset.ls > 10)]

# need to define a model.matrix for mroast
design <- model.matrix(~ g)
up.mroast <- mroast(up.mtrx, index = gset.idx, design = design) 
head(up.mroast, n=5)

## -----------------------------------------------------------------------------
go.terms <- rownames(up.mroast[1:10,])
#BiocManager::install(GO.db)
library(GO.db)
columns(GO.db)
keytypes(GO.db)

r2tab <- select(GO.db, keys=go.terms,
                columns=c("GOID","TERM"), 
                keytype="GOID")
r2tab

## -----------------------------------------------------------------------------
library(org.Hs.eg.db)
library(limma)
org.Hs.egGO2EG
go2eg <- as.list(org.Hs.egGO2EG)
head(go2eg)
# subsetting gene expression matrix based on down outliers
down_mtrx <- exprs(eset)[fData(eset)$ID %in% outlier.list[["down"]], ]
entrez_ids_down_mtrx <- fData(eset)$Gene.ID[fData(eset)$ID %in% rownames(down_mtrx)]

gset_idx_down <- lapply(go2eg, function(x){
	match(x, entrez_ids_down_mtrx)
})

# remove missing values
gset_idx_down <- lapply(gset_idx_down, function(x){
	x[!is.na(x)]
})

# removing gene sets with less than 10 elements
gset_ls_down <- unlist(lapply(gset_idx_down, length))
gset_idx_down <- gset_idx_down[which(gset_ls_down > 10)]

# apply mroast
down_mroast <- mroast(down_mtrx, gset_idx_down, design) 
head(down_mroast, n=5)

## -----------------------------------------------------------------------------

go_terms_down <- rownames(down_mroast[1:10,])

dr2tab <- select(GO.db, keys=go_terms_down,
                columns=c("GOID","TERM"), 
                keytype="GOID")
dr2tab


## -----------------------------------------------------------------------------
data("Maupin")
names(maupin)
head(maupin$data)
head(maupin$sig)

geneSet<- maupin$sig$EntrezID    #Symbol  ##EntrezID # both up and down genes:

up_sig<- maupin$sig[maupin$sig$upDown == "up",]
d_sig<- maupin$sig[maupin$sig$upDown == "down",]
u_geneSet<- up_sig$EntrezID   #Symbol   # up_sig$Symbol  ## EntrezID
d_geneSet<- d_sig$EntrezID

enrichment_scores <- gsva(maupin$data, list(up = u_geneSet, down= d_geneSet), mx.diff=1,
               verbose=TRUE, abs.ranking=FALSE, is.gset.list.up.down=TRUE, parallel.sz = 1 )$es.obs
			


head(enrichment_scores)

## calculating enrichment scores using ssgsea method
# es.dif.ssg <- gsva(maupin, list(up = u_geneSet, down= d_geneSet),
# 														 verbose=TRUE, abs.ranking=FALSE, is.gset.list.up.down=TRUE,
# 														 method = "ssgsea")

## ---- fig.width= 6, fig.height= 5, fig.align= "center"------------------------
hist(enrichment_scores, main = "enrichment scores", xlab="es")
lines(density(enrichment_scores[,1:3]), col = "blue") # control samples
lines(density(enrichment_scores[,4:6]), col = "red") # TGFb samples
legend("topleft", c("Control","TGFb"), lty = 1, col=c("blue","red"), cex = 0.6)