##################################################
## R script for miRNet
## Description: expression data I/O, normalization and differential expression analysis
## Author: Jeff Xia, jeff.xia@mcgill.ca
###################################################
# read tab delimited file
# stored in dataSet list object
# can have many classes, stored in meta.info
# type: array, count, qpcr
#' Read Expression Data
#' @export
ReadTabExpressData <- function(dataName) {
dataSet <- ReadTabData(dataName);
# rename data to data.orig
int.mat <- dataSet$data;
dataSet$cls <- dataSet$meta.info[,1];
dataSet$data <- NULL;
dataSet$listData <- FALSE;
msg <- paste("a total of ", ncol(int.mat), " samples and ", nrow(int.mat), " features were found. ");
# remove NA, null
row.nas <- apply(is.na(int.mat)|is.null(int.mat), 1, sum);
good.inx<- row.nas/ncol(int.mat) < 0.5;
if(sum(!good.inx) > 0){
int.mat <- int.mat[good.inx,];
msg <- c(msg, paste("removed ", sum(!good.inx), " features with over 50% missing values"));
}
# remove constant values
filter.val <- apply(int.mat, 1, IQR, na.rm=T);
good.inx2 <- filter.val > 0;
if(sum(!good.inx2) > 0){
int.mat <- int.mat[good.inx2,];
msg <- c(msg, paste("removed ", sum(!good.inx2), " features with constant values"));
}
if(nrow(int.mat) > 2000){
filter.val <- filter.val[good.inx2];
rk <- rank(-filter.val, ties.method='random');
var.num <- nrow(int.mat);
kept.num <- 0.95*var.num;
int.mat <- int.mat[rk < kept.num, ];
# msg <- c(msg, paste("removed 5% features with near-constant values"));
}
minVal <- min(int.mat, na.rm=T);
na.inx <- is.na(int.mat);
if(sum(na.inx) > 0){
int.mat[na.inx] <- minVal/2;
# msg <- c(msg, "the remaining", sum(na.inx), "missing variables were replaced with data min");
}
current.msg <<- paste(msg, collapse="; ");
saveRDS(int.mat, file="data.proc");
rownames(dataSet$meta.info) <- colnames(int.mat);
dataSet <<- dataSet;
if(.on.public.web){
return (1);
}else{
return (current.msg);
}
}
#' Setup miRNA Expression Data
#' @export
SetupMirExpressData <- function(){
idType <- dataSet$id.current;
mydata <- data.matrix(dataSet$sig.mat[,"max.logFC",drop=FALSE]);
if(idType == "mir_id"){ # note, in the mirnet database, all mir ids are lower case! miR=>mir
mir.vec <- rownames(mydata);
rownames(mydata) <- tolower(mir.vec);
}
dataSet$idType <- idType;
dataSet$mir.orig <- mydata;
dataSet$data[["gene"]] <- mydata;
dataSet$id.types[["gene"]] <- idType;
dataSet<<- dataSet;
if(.on.public.web){
return (nrow(mydata));
}else{
return("Setup complete!")
}
}
#' Get class info
#' @export
GetClassInfo <- function(){
return(levels(dataSet$cls));
}
#' Get annotated names
#' @export
GetAnotNames<-function(){
return(rownames(dataSet$data.anot));
}
#' Peform data annotation
#' @export
PerformDataAnnot<-function(org, idType, tissue, lvlOpt, matchMin=0.5){
data.org <<- dataSet$org <- org;
dataSet$id.orig <- dataSet$id.current <- idType;
dataSet$annotation <- NULL;
dataSet$annotated <- F;
dataSet$tissue <- tissue;
# should not contain duplicates, however sanity check
data.proc <- readRDS("data.proc");
dataSet$lvlOpt <- lvlOpt;
dataSet$data.anot <- RemoveDuplicates(data.proc, lvlOpt, quiet=T);
feature.vec <- rownames(data.proc);
if(idType == "mir_id" | idType=="mir_acc"){
#do nothing
matched.len <- length(feature.vec);
}else{ # genes
if(tolower(org) != 'na' & tolower(idType) != 'na'){
anot.id <- doMirGeneAnnotation(feature.vec, idType);
hit.inx <- !is.na(anot.id);
matched.len <- sum(hit.inx);
if(matched.len < length(feature.vec)*0.25){
perct <- round(matched.len/length(feature.vec),3)*100;
current.msg <<- paste('Only ', perct, '% ID were matched. Please choose the correct ID type or use default.', sep="");
return(0);
}else{
current.msg <<- paste("ID annotation: ", "Total [", length(anot.id),
"] Matched [", matched.len, "] Unmatched [", sum(!hit.inx),"]", collapse="\n");
dataSet$anotList <- list();
dataSet$anotList$total <- length(anot.id);
dataSet$anotList$match <- length(matched.len);
dataSet$anotList$unmatch <- sum(!hit.inx);
dataSet$anotList$smpl.num <- colnames(dataSet$data.anot);
if(lvlOpt != 'NA' | idType == "entrez"){
# do actual summarization to gene level
matched.entrez <- anot.id[hit.inx];
data.anot <- data.proc[hit.inx,];
rownames(data.anot) <- matched.entrez;
current.msg <<- paste(current.msg, "Data is now transformed to gene-level (Entrez) expression.");
dataSet$data.anot <- RemoveDuplicates(data.anot, lvlOpt, quiet=F);
#dataSet$id.current <- "entrez";
dataSet$id.current <- "symbol";
dataSet$annotated <- T;
}else{
# record the annotation
dataSet$annotation <- anot.id; # this need to be updated to gether with data from now on
current.msg <<- paste(current.msg, "No gene level summarization was performed.");
}
}
}else{ # no conversion will be performed
matched.len <- 9; # dummies
minLvl <- 1;
current.msg <<- paste("No annotation was performed. Make sure organism and gene ID are specified correctly!");
}
}
dataSet$data.norm <- dataSet$data.anot; # before
fast.write.csv(dataSet$data.anot, file="cleaned_annot.csv");
dataSet <<- dataSet;
if(.on.public.web){
return(matched.len);
}else{
return("Annotation was performed successfully!")
}
}
# update result based on new cutoff
#' Get significant genes
#' @export
GetSigGenes<-function(p.lvl, fc.lvl, direction, update=T){
resTable <- readRDS("resTable");
if(update){
current.msg <<- "";
}
# select based on p
hit.inx.p <- resTable$adj.P.Val <= p.lvl;
resTable<-resTable[hit.inx.p,,drop=F];
if(nrow(resTable) == 0){
current.msg <<- paste(current.msg, "No significant genes were identified using the given design and cutoff.");
return();
}
# now rank by logFC, note, the logFC for each comparisons
# are returned in resTable before the AveExpr columns
# for two-class, only one column, multiple columns can be involved
# for > comparisons - in this case, use the largest logFC among all comparisons
fc.lvl <- abs(fc.lvl);
#if(fc.lvl > 0){ # further filter by logFC
if(direction == "both"){
hit.inx.fc <- abs(resTable$max.logFC) >= fc.lvl;
}else if(direction == "up"){
hit.inx.fc <- resTable$max.logFC >= fc.lvl;
}else{
hit.inx.fc <- resTable$max.logFC <= -fc.lvl;
}
resTable<-resTable[hit.inx.fc,,drop=F];
if(nrow(resTable) == 0){
current.msg <<- paste(current.msg, "No significant genes were identified using the given design and cutoff.");
return();
}
#}
fast.write.csv(signif(resTable,5), file="mirnet_siggenes.csv");
de.Num <- nrow(resTable);
current.msg <<- paste(current.msg, "A total of", de.Num, "significant genes were identified!");
gene.anot <- NULL;
# display at most 5000 genes for the server (two main reasons)
# 1) should not have more 22% (human: 23000) DE of all genes (biological)
# 2) IE canvas can display no more than 6800 pixels (computational)
if(nrow(resTable) > 5000){
resTable <- resTable[1:5000,];
gene.anot <- gene.anot[1:5000,];
current.msg <<- paste(current.msg, " Due to computational constraints, only top 5000 genes will be used. ", collapse="\n");
}
# may need to update data, class and meta.info
data <- dataSet$data.norm;
cls <- dataSet$cls;
meta.info <- dataSet$meta.info;
grp.nms <- levels(cls);
hit.inx <- cls %in% grp.nms;
if(sum(hit.inx) < length(hit.inx)){
current.msg <<- paste(current.msg, "Only groups selected for comparisons: ", paste(grp.nms, collapse=", "), "are included.");
cls <- factor(cls[hit.inx]);
cls.lvls <- levels(cls);
data <- data[,hit.inx];
meta.info <- dataSet$meta.info[hit.inx,];
}
dataSet$fc.val <- fc.lvl
dataSet$pval <- p.lvl
dataSet$sig.mat <- resTable;
dataSet$sig.genes.anot <- gene.anot;
dataSet$cls.stat <- cls;
dataSet$meta.stat <- meta.info;
dataSet <<- dataSet;
# return DE num
if(.on.public.web){
return(de.Num);
}else{
return(current.msg)
}
}
# note, here also update data type array/count
#' Peform array data normalization
#' @export
PerformArrayDataNormalization <- function(norm.opt){
data <- dataSet$data.anot;
row.nms <- rownames(data);
col.nms <- colnames(data);
msg <- NULL;
if(norm.opt=="log"){
min.val <- min(data[data>0], na.rm=T)/10;
data <- log2((data + sqrt(data^2 + min.val^2))/2);
msg <- "Log2 transformation";
}else if(norm.opt=="quant"){
library('preprocessCore');
data <- normalize.quantiles(data, copy=FALSE);
msg <- "Quantile normalization";
}else if(norm.opt == "combine"){
min.val <- min(data[data>0], na.rm=T)/10;
data <- log2((data + sqrt(data^2 + min.val^2))/2);
library('preprocessCore');
data <- normalize.quantiles(data, copy=FALSE);
msg <- "Log2 transformation followed by normalization";
}else{
msg <-"No log normalization was performed.";
print(msg);
}
rownames(data) <- row.nms;
colnames(data) <- col.nms;
dataSet$data.norm <- data;
dataSet$norm.opt <- norm.opt;
current.msg <<- msg;
dataSet <<- dataSet;
if(.on.public.web){
return(1);
}else{
return("Normalization was performed successfully!")
}
}
#' Perform limma
#' @export
PerformLimma<-function(target.grp){
myargs <- list();
cls <- dataSet$cls;
dataSet$comparison <- target.grp;
design <- model.matrix(~ 0 + cls) # no intercept
colnames(design) <- levels(cls);
grp.nms <- strsplit(target.grp, " vs. ")[[1]];
myargs[[1]] <- paste(grp.nms, collapse="-");
filename = paste("SigGene_", paste(grp.nms, collapse="_vs_"), sep="");
library(limma);
myargs[["levels"]] <- design;
contrast.matrix <- do.call(makeContrasts, myargs);
fit = lmFit(dataSet$data.norm, design);
# sanity check
if(!is.fullrank(design)){
current.msg <<- paste("This metadata combination is not full rank! Please use other combination.");
return(0);
}
df.residual <- fit$df.residual;
if (all(df.residual == 0)){
current.msg <<- paste("There is not enough replicates in each group (no residual degrees of freedom)!");
return(0);
}
fit2 <- contrasts.fit(fit, contrast.matrix);
fit2 <- eBayes(fit2);
topFeatures <- topTable(fit2, number=Inf, adjust.method="fdr");
# add a common column for FC selection
# this is to prevent no logFC for multi grps or for counts data
hit.inx <- which(colnames(topFeatures) == "AveExpr");
maxFC.inx <- hit.inx - 1; # not sure if this is also true for edgeR
logfc.mat <- topFeatures[,1:maxFC.inx, drop=F];
# extract the max FC together with direction
pos.vec <- apply(abs(logfc.mat), 1, which.max);
pos.mat <- cbind(1:length(pos.vec), pos.vec);
max.logFC <- logfc.mat[pos.mat];
topFeatures <- cbind(topFeatures, max.logFC = max.logFC);
dataSet$filename <- filename;
dataSet$norm.opt <- "limma";
dataSet <<- dataSet;
saveRDS(topFeatures, file="resTable");
if(.on.public.web){
return(1);
}else{
return("Differential Analysis was performed successfully!")
}
}
###########################
## for RNAseq data
##########################
#' Perform count data normalization
#' #' @export
PerformCountDataNormalization <- function(norm.opt, disp.opt){
msg <- NULL;
cls <- dataSet$cls;
design <- model.matrix(~ 0 + cls) # no intercept
colnames(design) <- levels(cls);
library(edgeR);
y <- DGEList(counts=dataSet$data.anot, group=dataSet$cls);
if(norm.opt=="tmm"){
y <- calcNormFactors(y)
}else{
msg <-"No log normalization was performed.";
print(msg);
}
y <- estimateGLMCommonDisp(y, design, verbose=FALSE);
if(disp.opt=="tagwise"){
y <- estimateGLMTrendedDisp(y, design);
y <- estimateGLMTagwiseDisp(y, design, trend=TRUE);
}
saveRDS(y, file="edger.y");
dataSet$design <- design;
current.msg <<- msg;
dataSet$norm.opt <- norm.opt;
dataSet <<- dataSet;
if(.on.public.web){
return(1);
}else{
return("Normalization was performed successfully!")
}
}
#' Perform EdgeR
#' @export
PerformEdgeR<-function(target.grp){
myargs <- list();
dataSet$comparison <- target.grp;
grp.nms <- strsplit(target.grp, " vs. ")[[1]];
myargs[[1]] <- paste(grp.nms, collapse="-");
filename = paste("SigGene_", paste(grp.nms, collapse="_vs_"), sep="");
myargs[["levels"]] <- dataSet$design;
contrast.matrix <- do.call(makeContrasts, myargs);
edger.y <- readRDS("edger.y");
fit <- glmFit(edger.y, dataSet$design);
lrt <- glmLRT(fit, contrast=contrast.matrix);
topFeatures<-topTags(lrt,n=Inf)$table;
# need to change the FDR to adj.P.Val same as limma
nms <- colnames(topFeatures);
nms[which(nms == 'FDR')] <- 'adj.P.Val';
colnames(topFeatures) <- nms;
# add a common column for FC selection
# this is to prevent no logFC for multi grps or for counts data
hit.inx <- which(colnames(topFeatures) == "logCPM");
maxFC.inx <- hit.inx - 1; # not sure if this is also true for edgeR
logfc.mat <- topFeatures[,1:maxFC.inx, drop=F];
# extract the max FC together with direction
pos.vec <- apply(abs(logfc.mat), 1, which.max);
pos.mat <- cbind(1:length(pos.vec), pos.vec);
max.logFC <- logfc.mat[pos.mat];
topFeatures <- cbind(topFeatures, max.logFC = max.logFC);
dataSet$filename <- filename;
dataSet$de.method <- "EdgeR";
dataSet <<- dataSet;
saveRDS(topFeatures, file="resTable");
if(.on.public.web){
return(1);
}else{
return("Differential Analysis was performed successfully!")
}
}
###########################
## for QPCR data
##########################
#' Perform qPCR data normalization
#' @export
PerformQpcrDataNormalization <- function(norm.opt = "quantile") {
dataSet$norm.opt <- norm.opt;
data <- dataSet$data.anot;
library('HTqPCR');
# need to create a qPCRset with only info needed
obj <- new("qPCRset", exprs=data);
featureNames(obj) <- rownames(data);
if(norm.opt =="deltaCt"){
if(!exists('delta.genes')){
current.msg <<- "Cannot find endogenous control genes!";
print(current.msg);
return (0);
}
norm.obj <- normalizeCtData(obj, norm="deltaCt", deltaCt.genes=delta.genes)
}else{
norm.obj <- normalizeCtData(obj, norm=norm.opt)
}
dataSet$data.norm <- exprs(norm.obj);
dataSet <<- dataSet;
if(.on.public.web){
return(1);
}else{
return("Normalization was performed successfully!")
}
}
#' Perform HTqPCR
#' @export
PerformHTqPCR<-function(target.grp, method){
dataSet$comparison <- target.grp;
if(method == "limma"){
return(PerformLimma(target.grp));
}
# need to subselect groups and data
grp.nms <- strsplit(target.grp, " vs. ")[[1]];
my.inx <- dataSet$cls %in% grp.nms;
my.cls <- factor(dataSet$cls[my.inx]);
my.dat <- dataSet$data.norm[, my.inx];
nonpar <- TRUE;
if(method == "ttest"){
nonpar <- FALSE;
}
p.value <- GetTtestP(my.dat, my.cls, grp.nms[1], grp.nms[2], FALSE, TRUE, nonpar);
fdr.p <- p.adjust(p.value, "fdr");
fc <-GetFC(my.dat, my.cls, grp.nms[1], grp.nms[2]);
resTable <- data.frame(p.value=p.value, adj.P.Val = fdr.p, ddCt = fc);
topFeatures <- cbind(resTable, max.logFC = fc);
rownames(topFeatures)<-rownames(my.dat);
inx<-order(p.value);
topFeatures<-topFeatures[inx,];
filename = paste("SigGene_", paste(grp.nms, collapse="_vs_"), sep="");
dataSet$filename <- filename;
dataSet$de.method <- method;
dataSet <<- dataSet;
saveRDS(topFeatures, file="resTable");
if(.on.public.web){
return(1);
}else{
return("Differential Analysis was performed successfully!")
}
}
# utility method to get p values
#' Get p-value from t-test
#' @export
GetTtestP <- function(my.dat, my.cls, grp1, grp2, paired=FALSE, equal.var=TRUE, nonpar=F){
inx1 <- which(my.cls==grp1);
inx2 <- which(my.cls==grp2);
if(nonpar){
p.value <- apply(as.matrix(my.dat), 1, function(x) {
tmp <- try(wilcox.test(x[inx1], x[inx2], paired = paired));
if(class(tmp) == "try-error") {
return(NA);
}else{
return(tmp$p.value);
}
})
}else{
if(nrow(my.dat) < 1000){
p.value <- apply(as.matrix(my.dat), 1, function(x) {
tmp <- try(t.test(x[inx1], x[inx2], paired = paired, var.equal = equal.var));
if(class(tmp) == "try-error") {
return(NA);
}else{
return(tmp$p.value);
}
})
}else{ # use fast version
library(genefilter);
p.value <- try(rowttests(t(as.matrix(my.dat)), my.cls)$p.value);
if(class(p.value) == "try-error") {
p.value <- NA;
}
}
}
return(p.value);
}
# utility method to calculate FC
#' Get Fold Change
#' @export
GetFC <- function(my.dat, my.cls, grp1, grp2){
m1 <- rowMeans(my.dat[, which(my.cls==grp1)]);
m2 <- rowMeans(my.dat[, which(my.cls==grp2)]);
# create a named matrix of sig vars for display
fc <- signif (m1-m2, 5);
return(fc);
}
#' Plot Data Overview
#' @export
PlotDataOverview<-function(imgNm){
dat <- dataSet$data.anot;
library('lattice');
subgene=10000;
if (nrow(dat)>subgene) {
set.seed(28051968);
sg = sample(nrow(dat), subgene)
Mss = dat[sg,,drop=FALSE]
} else {
Mss = dat
}
subsmpl=100;
if (ncol(Mss)>subsmpl) {
set.seed(28051968);
ss = sample(ncol(Mss), subsmpl)
Mss = Mss[,ss,drop=FALSE]
} else {
Mss = Mss
}
sample_id = rep(seq_len(ncol(Mss)), each = nrow(Mss));
values = as.numeric(Mss)
formula = sample_id ~ values
Cairo(file=imgNm, width=460, height=420, type="png", bg="white");
box = bwplot(formula, groups = sample_id, layout = c(1,1), as.table = TRUE,
strip = function(..., bg) strip.default(..., bg ="#cce6ff"),
horizontal = TRUE,
pch = "|", col = "black", do.out = FALSE, box.ratio = 2,
xlab = "", ylab = "Samples",
fill = "#1c61b6AA",
panel = panel.superpose,
scales = list(x=list(relation="free"), y=list(axs="i")),
ylim = c(ncol(Mss)+0.7,0.3),
prepanel = function(x, y) {
list(xlim = quantile(x, probs = c(0.01, 0.99), na.rm=TRUE))
},
panel.groups = function(x, y, ...) {
panel.bwplot(x, y, ...)
})
print(box);
dev.off();
infoSet <- readSet(infoSet, "infoSet");
infoSet$imgSet$pca_box <- imgNm;
saveSet(infoSet, "infoSet");
}
# given a data with duplicates, dups is the one with duplicates
#' Remove Duplicates
#' @export
RemoveDuplicates <- function(data, lvlOpt, quiet=T){
all.nms <- rownames(data);
colnms <- colnames(data);
dup.inx <- duplicated(all.nms);
dim.orig <- dim(data);
data <- apply(data, 2, as.numeric); # force to be all numeric
dim(data) <- dim.orig; # keep dimension (will lost when only one item)
rownames(data) <- all.nms;
colnames(data) <- colnms;
if(sum(dup.inx) > 0){
uniq.nms <- all.nms[!dup.inx];
uniq.data <- data[!dup.inx,,drop=F];
dup.nms <- all.nms[dup.inx];
uniq.dupnms <- unique(dup.nms);
uniq.duplen <- length(uniq.dupnms);
for(i in 1:uniq.duplen){
nm <- uniq.dupnms[i];
hit.inx.all <- which(all.nms == nm);
hit.inx.uniq <- which(uniq.nms == nm);
# average the whole sub matrix
if(lvlOpt == "mean"){
uniq.data[hit.inx.uniq, ]<- apply(data[hit.inx.all,,drop=F], 2, mean, na.rm=T);
}else if(lvlOpt == "median"){
uniq.data[hit.inx.uniq, ]<- apply(data[hit.inx.all,,drop=F], 2, median, na.rm=T);
}else if(lvlOpt == "max"){
uniq.data[hit.inx.uniq, ]<- apply(data[hit.inx.all,,drop=F], 2, max, na.rm=T);
}else{ # sum
uniq.data[hit.inx.uniq, ]<- apply(data[hit.inx.all,,drop=F], 2, sum, na.rm=T);
}
}
if(!quiet){
current.msg <<- paste(current.msg, paste("A total of ", sum(dup.inx), " of duplicates were replaced by their ", lvlOpt, ".", sep=""), collapse="\n");
}
return(uniq.data);
}else{
if(!quiet){
current.msg <<- paste(current.msg, "All IDs are unique.", collapse="\n");
}
return(data);
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.