R/miRLAB.R

Defines functions identifymiRTargetsByEnsemble identifymiRTargetsByICPPam50 ICPPam50 prepareDataForPam50 filterAndCompare experiment KEGGenrichment GOBPenrichment filterAndCompare ReadExtResult ValidateAll ValidationT Validation bRank ReOrder BordaTopk Borda Extopk ProMISe Zscore Elastic Lasso RDC RDCParameter IDA MI EMI Hoeffding Dcov Kendall Spearman Pearson DiffExpAnalysis ImputeNormData Standardise queryTargetFile readHeader Read getData TCGAdownload `%:::%`

Documented in Borda BordaTopk bRank Dcov DiffExpAnalysis Elastic experiment Extopk filterAndCompare getData GOBPenrichment Hoeffding ICPPam50 IDA identifymiRTargetsByEnsemble identifymiRTargetsByICPPam50 ImputeNormData KEGGenrichment Kendall Lasso MI Pearson ProMISe RDC Read ReadExtResult readHeader Spearman Standardise ValidateAll Validation ValidationT Zscore

# This file contains a collection of methods for inferring miRNA-mRNA regulatory relationships
# Each method will take the dataset (.csv) as an input with the first n colomns are miRNAs 
# and the last m columns are mRNAs. Rows are samples. We need to specify the number of miRNAs in the dataset
# and the index of the miRNA, i.e. column that contains the miNRA. Topk is the number of top k interactions we
# would like to extract.






#getData from TCGA by TCGAbiolinks package
#' @import TCGAbiolinks
#' @import dplyr
#' @import SummarizedExperiment
#' @importFrom utils read.csv
#' @importFrom utils write.csv
#' @importFrom methods new
#' @importFrom stats p.adjust
#' @importFrom stats cor
#' @importFrom stats cov
#' @importFrom stats rnorm
#' @importFrom stats cancor
#' @importFrom stats phyper
#' @importFrom stats median
#' @import RCurl
#' @import httr
#' @import stringr
#' @import ctc
#' @import heatmap.plus


`%:::%` = function(pkg, fun) get(fun, envir = asNamespace(pkg), inherits = FALSE)

# A list of normal/tumor sample types, you can get 19 types via the command: TCGAbiolinks:::getBarcodeDefinition()$tissue.definition

TCGAdownload = function(CancerType="BRCA", Datatype="miRNA", Platform="miRNA-Seq", SampleType="tumor", ncases) {
  normal_pos = c(10:14)
  # tumor_barcode = TCGAbiolinks:::getBarcodeDefinition()$tissue.definition[-normal_pos]
  # normal_barcode = TCGAbiolinks:::getBarcodeDefinition()$tissue.definition[normal_pos]
  x <- 'TCGAbiolinks' %:::% 'getBarcodeDefinition'
  tumor_barcode = x()$tissue.definition[-normal_pos]
  normal_barcode = x()$tissue.definition[normal_pos]
  
  #find the projects conducted by The Cancer Genome Atlas genome program 
  project = paste("TCGA-", CancerType, sep = "")
  
  sample_type = switch(tolower(SampleType), 
                       "tumor" = tumor_barcode,
                       "normal" = normal_barcode)
  
  switch(tolower(Datatype), 
         "mirna" = { 
           #data_category = "Transcriptome Profiling"
           data_category = "Gene expression"
           data_type = "miRNA gene quantification"
           #data_type="miRNA Expression Quantification"
           file_type = "mirbase20.mirna" 
         },
         
         "mrna" = { 
           #data_category = "Gene expression"
           data_category="Gene expression"
           data_type = "Gene expression quantification"
           #file_type = ".rsem.genes.normalized_results"
           file_type = "normalized_results"
         },
         
         "somatic mutation" = { 
           data_category = "Simple nucleotide variation"
           data_type = "Simple somatic mutation"
           file_type = ".automated.somatic.maf"
         },
         
         "clinical" = {
           result = GDCquery_clinic(project)
           return(result)
         },
         
         "methylation" = { #get data of methylation
           data_category = "DNA methylation"
           data_type = "Methylation beta value"
           file_type = FALSE
         }
  )
  
  #eg. function argument Platform = RNA-Seq = experimental.strategy of GDCdownload
  experimental_strategy = Platform
  
  #query all of the cases based on the project and their types of data
  query = GDCquery(project = project, 
                   data.category =data_category,
                   data.type =data_type,
                   platform = "Illumina HiSeq",
                   #platform = "IlluminaGA_RNASeqV2",
                   file.type = file_type,
                   legacy = TRUE, access = "open", 
                   experimental.strategy = experimental_strategy,
                   #sample.type = sample_type,
                   #barcode=c("TCGA-A2-A1FZ-01A-51R-A14D-07")
  )
  print(query$result[[1]])
  # get barcode for the first [ncases] cases and download them, if ncases = NA then download all
  if (!missing(ncases)) {
    getcases = getResults(query)$cases[1:ncases]
    
    query = GDCquery(project = project,
                     data.category = data_category,
                     data.type = data_type,
                     file.type = file_type,
                     legacy = TRUE, access = "open",
                     experimental.strategy = experimental_strategy,
                     sample.type = sample_type,
                     barcode = getcases
    )
  }
  
  GDCdownload(query)
  
  result_tmp =GDCprepare(query)
  #output
  switch(tolower(Datatype), 
         "mirna" = { 
           rownames(result_tmp) = t(as.character(result_tmp[,1]))
           result_tmp = as.matrix(result_tmp[,grep("reads_per_million", colnames(result_tmp))])
           colnames(result_tmp) = substring(colnames(result_tmp), 32)
           return(result_tmp)
         },
         
         "mrna" = { 
           return(assay(result_tmp, "normalized_count")) 
         }, 
         
         "somatic mutation" = {
           return(result_tmp)
         },
         
         "methylation" = {
           return(assay(result_tmp))
         }
  )
}


#' getData from GDC
#' @param cancerName The name of cancer in string format
#' @return dataset in matrix format
#' @export
getData=function(cancerName){
  mRNA = TCGAdownload(CancerType=cancerName,Datatype = "mRNA", Platform = "RNA-Seq")
  miRNA = TCGAdownload(CancerType=cancerName,Datatype = "miRNA", Platform = "miRNA-Seq")
  unlink("GDCdata",recursive = TRUE)
  unlink("MANIFEST.txt",recursive = TRUE)

  samples_mRNA = colnames(mRNA)
  samples_miRNA = colnames(miRNA)
  match = match(substring(samples_miRNA, 1, 20), substring(samples_mRNA, 1, 20))
  
  getmatch_data = samples_mRNA[match]
  match_data_names = cbind(samples_miRNA, getmatch_data)
  
  ans = c() #ans is our goal of combination mRNA and miRNA
  
  for (i in c(1: length(getmatch_data))) {
    if (!is.na(match_data_names[i, 2])) {
      #sample_name = substring(match_data_names[i, 1], 1, 20)
      tmp = c(as.numeric(miRNA[, i]), as.numeric(mRNA[, match[i]])) #data of sample
      ans = rbind(ans, tmp)
    }
  }
  
  empty = which(is.na(match_data_names[, 2]))
  match_data_names = match_data_names[-empty, ]
  rownames(ans) = NULL
  colnames(ans) = c(rownames(miRNA), rownames(mRNA))
  return(ans)
}





#' Read dataset from csv file
#' @param dataset The input dataset in csv format
#' @return dataset in matrix format
#' @examples
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' data=Read(dataset)
#' @export
Read<-function(dataset){
		data<-read.csv(dataset, header=TRUE, sep=",")
		#data<-data[,-1]
		return(data)
	}

############ Get data header from dataset ###############
#' Read the header of the dataset
#' @return the header of the dataset
#' @param dataset the character string of the names of the dataset in csv format, e.g. "ToyEMT.csv"
#' @examples
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' header=readHeader(dataset)
#' @export
readHeader<-function(dataset){
  data<-read.csv(dataset, header=FALSE)
  header<-character()
  for (i in 1:ncol(data)){
  header[i]=toString(data[1,i])
  }
  return(header)
}

############ Convert target information into a binary matrix, the rows are mRNAs and the columns are miRNAs ###############
queryTargetFile<-function(miRNA,mRNA,file){
  #the column name of file should be tagged as "mir" and "gene"
  data<-Read(file)
  mir=as.character(data$mir)
  #   mir<-paste("hsa-",sep="",mir);mir<-sub('r','R',mir)
  gene=as.character(data$gene)
  #   symbol<-geneSymbol(gene)
  sum=0
  rep<-replicate(length(miRNA),mir)
  edge=matrix(FALSE,length(miRNA)+length(mRNA),length(miRNA)+length(mRNA))
  for (i in 1:length(mir)){
    #     print(i)
    if (length(which(rep[i,]==miRNA)>0)){
      match1<-which(rep[i,]==miRNA,arr.ind=TRUE)
      #gene: gene[i] #mirna: miRNA[match]
      rep2<-replicate(length(mRNA),gene[i])
      match2<-which(rep2==mRNA,arr.ind=TRUE)
      edge[match1,match2+length(miRNA)]=TRUE
    }
  }
  return(edge)
}
	
	
#' Stardarsise the dataset
#' Stadardise the dataset to have mean=0 and std=1 in each column.
#' @param dataset The input dataset in csv format. e.g. "ToyEMT.csv". The first column is the sample name.
#' @return The standardised dataset.
#' @examples
#' \dontrun{
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' stdata=Standardise(dataset)
#' }
	Standardise<-function(dataset){
		ncol<-ncol(dataset)
		nrow<-nrow(dataset)
		stdData<-matrix(nrow=nrow,ncol=ncol-1)
              rownames(stdData)<-dataset[,1]
              colnames<-colnames(dataset)
              colnames(stdData)<-colnames[2:ncol]
		for (i in 2:ncol){
			stdData[,i-1]<-scale(dataset[i],center=TRUE, scale=TRUE)
			}
		return(stdData)
	}

#' Filter, impute, and normalise data.
#' 
#' Remove the genes (rows) that have more than r\% of missing data;
#' use the impute package to fill in missing data, and finally normalise the data.
#' @importFrom impute impute.knn
#' @importFrom limma normalizeBetweenArrays
#' @param dataset The input dataset in csv format. e.g. "EMT.csv" 
#' @param r The rate threshold to filter the records (genes). Genes with more than r\% missing data will be removed.
#' @return The processed dataset.
#' @references 
#' 1. Hastie T, Tibshirani R, Narasimhan B and Chu G. impute: Imputation for microarray data. R package version 1.42.0.
#' 
#' 2. Smyth, G.K. (2005). Limma: linear models for microarray data. In Bioinformatics and computational biology solutions using R and Bioconductor (pp. 397-420). Springer New York.
#' @examples
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' impdata=ImputeNormData(dataset, 0.1)
#' @export
ImputeNormData<-function(dataset, r){
               #library(impute)
               Rawdata<-Read(dataset)
               #Rawnames<-as.matrix(Rawdata)[,1]
              # Rawdata<-Rawdata[,-1]
               #rownames(Rawdata)<-Rawnames
                              
               #Removing genes which samples having more r% zero counts or missing value
               Rawdata<-Rawdata[which(rowSums(Rawdata==0)<r*dim(Rawdata)[2]),]
               data<-Rawdata[which(rowSums(is.na(Rawdata))<r*dim(Rawdata)[2]),]
             
               #if(exists(".Random.seed")) rm(.Random.seed)
               imputed<-impute.knn(as.matrix(data) ,k = 10, rowmax = 0.5, colmax = 0.8, maxp = 1500, rng.seed=362436069)
               dataimputed<-imputed$data
               
               #Make log2 transformation for expression data
               dataimputed<-log2(dataimputed)                    
                            
               #normalizeBetweenArrays in limma
               #library(limma)
               normlog<-matrix(data = dataimputed, nrow =dim(dataimputed)[1], ncol = dim(dataimputed)[2], byrow = TRUE, dimnames = NULL)
               normlogbtwarray<-normalizeBetweenArrays(normlog,method='quantile')
               rownames(normlogbtwarray)=rownames(data)             
               colnames(normlogbtwarray)=colnames(data)

               return(normlogbtwarray)
}

#' Differentially expressed analysis
#' 
#' Find the top miRNAs and mRNAs that are differently expression between different conditions, e.g. cancer vs normal
#' @importFrom limma lmFit makeContrasts contrasts.fit eBayes topTable
#' @param miR1 the miRNA dataset for condition 1, e.g. cancer
#' @param miR2 the miRNA dataset for condition 1, e.g. normal
#' @param mR1  the mRNA dataset for condition 1, e.g. cancer
#' @param mR2  the mRNA dataset for condition 2, e.g. normal
#' @param topkmiR the maximum number of miRNAs that we would like to extract, e.g. top 50 miRNAs.
#' @param topkmR the maximum number of mRNAs that we would like to extract, e.g. top 2000 mRNAs.
#' @param p.miR cutoff value for adjusted p-values when conducting differentially expressed analysis for miRNAs.
#' @param p.mR cutoff value for adjusted p-values when conducting differentially expressed analysis for mRNAs.
#' @return the dataset that includes differentially expressed miRNAs and mRNAs. columns are miRNAs and mRNAs  and rows are samples
#' @references
#' Smyth, G.K. (2005). Limma: linear models for microarray data. In Bioinformatics and computational biology solutions using R and Bioconductor (pp. 397-420). Springer New York.
#' @export
DiffExpAnalysis=function(miR1, miR2, mR1, mR2,topkmiR, topkmR, p.miR, p.mR){
	#miR1: miRNA dataset in condition 1, rows are miRNAs, samples are columns
	#miR2: condition2
	#mR1, mR2: genes in 2 conditions.
	# topk: the number of genes that we want to extract.
	
	#library(limma)
	miR1=read.csv(miR1, header=TRUE, sep=",")
	miRnames=miR1[,1]
	miR1=miR1[,-1]
	c1=ncol(miR1)
	
	miR2=read.csv(miR2, header=TRUE, sep=",")
	miR2=miR2[,-1]
	c2=ncol(miR2)
	
	miR=cbind(miR1, miR2)
	rownames(miR)=miRnames
	#miR=t(miR)
	
	mR1=read.csv(mR1, header=TRUE, sep=",")
	mRnames=mR1[,1]
	mR1=mR1[,-1]
	mR2=read.csv(mR2, header=TRUE, sep=",")
	mR2=mR2[,-1]
	mR=cbind(mR1, mR2)
	rownames(mR)=mRnames
	#mR=t(mR)
	Normal=NULL
  Cancer=NULL
	########## miR ############
	design=cbind(Normal=c(rep(1,c1), rep(0,c2)), Cancer=c(rep(0,c1), rep(1,c2)))
	miRfit=lmFit(miR, design)
	contrast.matrix=makeContrasts(NormalvCancer=Normal - Cancer, levels=design)
	miRfit2=contrasts.fit(miRfit, contrast.matrix)
	miRfit2=eBayes(miRfit2)
	miRresults=topTable(miRfit2, number= topkmiR, p.value=p.miR, sort.by="p", adjust.method="BH")
	write.csv(miRresults, file="DiffExpmiR.csv")
	########## mR ############
	mRfit=lmFit(mR, design)
	contrast.matrix=makeContrasts(NormalvCancer=Normal - Cancer, levels=design)
	mRfit2=contrasts.fit(mRfit, contrast.matrix)
	mRfit2=eBayes(mRfit2)
	mRresults=topTable(mRfit2, number= topkmR, p.value=p.mR, sort.by="p", adjust.method="BH")
	write.csv(mRresults, file="DiffExpmR.csv")
        miRSymbol=rownames(miRresults)
        mRSymbol=rownames(mRresults)
        miRExp=miR[which(miRnames %in% miRSymbol),]
        mRExp=mR[which(mRnames %in% mRSymbol),]
        
        miRExp=t(miRExp)
        mRExp=t(mRExp)
        
        DExp=cbind(miRExp,mRExp)
        
        return(DExp)
        	
}




#' miRNA target prediction with the Pearson correlation coefficient method
#' 
#' Calculate the Pearson correlation coefficient of each pair of miRNA-mRNA,and return a matrix of correlation coefficients with columns are miRNAs and rows are mRNAs.
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the Pearson correlation coefficients. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=Pearson(dataset, 1:3, 4:18) 
#' 
#' @references
#' Pearson, K. (1920) Notes on the history of correlation. Biometrika, 13, 25 - 45.
#' @export 

## 1. Pearson ##
Pearson=function(datacsv, cause, effect, targetbinding=NA){
data=Read(datacsv)
data=scale(data) #standardise the data
header<-readHeader(datacsv)
num_miRNA<-length(cause)
miR<-header[1:num_miRNA]
mR<-header[-(1:num_miRNA)]

miRNA=data[,cause]
mRNA=data[,effect]
corMat=cor(mRNA, miRNA, method="pearson")# miRNAs in columns and mRNAs in rows

if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
corMat=corMat*edge
}

return(corMat)
}

#' miRNA target prediction with the Spearman correlation coefficient method
#' 
#' Calculate the Spearman correlation coefficient of each pair of miRNA-mRNA,and return a matrix of correlation coefficients with columns are miRNAs and rows are mRNAs.
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the Spearman correlation coefficients. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=Spearman(dataset, 1:3, 4:18) 
#' 
#' @references
#' Spearman, C. (1904) General intelligence, objectively determined and measured. Am. J. Psychol., 15, 201 - 92.
#' @export 
## 2. Spearman ##
Spearman=function(datacsv, cause, effect, targetbinding=NA){
data=Read(datacsv)
data=scale(data) #standardise the data
header<-readHeader(datacsv)
num_miRNA<-length(cause)
miR<-header[1:num_miRNA]
mR<-header[-(1:num_miRNA)]

miRNA=data[,cause]
mRNA=data[,effect]
corMat=cor(mRNA, miRNA, method="spearman")# miRNAs in columns and mRNAs in rows


if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
corMat=corMat*edge
}

return(corMat)
}

#' miRNA target prediction with the Kendall correlation coefficient method
#' 
#' Calculate the Kendall correlation coefficient of each pair of miRNA-mRNA,and return a matrix of correlation coefficients with columns are miRNAs and rows are mRNAs.
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the Kendall correlation coefficients. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=Kendall(dataset, 1:3, 4:18) 
#' @references
#' Kendall, M. (1938) A new measure of rank correlation. Biometrika, 30, 81 - 9.
#' @export 
## 3. Kendall ##
Kendall=function(datacsv, cause, effect, targetbinding=NA){
data=Read(datacsv)
data=scale(data) #standardise the data
header<-readHeader(datacsv)
num_miRNA<-length(cause)
miR<-header[1:num_miRNA]
mR<-header[-(1:num_miRNA)]

miRNA=data[,cause]
mRNA=data[,effect]
corMat=cor(mRNA, miRNA, method="kendall")# miRNAs in columns and mRNAs in rows


if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
corMat=corMat*edge
}

return(corMat)
}

#' miRNA target prediction with the Distance correlation  method
#' 
#' Calculate the Distance correlation  of each pair of miRNA-mRNA,and return a matrix of correlation coefficients with columns are miRNAs and rows are mRNAs.
#' @importFrom energy dcov
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the Distance correlation values. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=Dcov(dataset, 1:3, 4:18) 
#' @references
#' Szekely, G., Rizzo, M. and Bakirov, N. (2007) Measuring and testing independence by correlation of distances. Ann. Stat., 35, 2769 - 94.
#' @export 
## 4. Dcov(Distance correlation) ##
Dcov <- function(datacsv, cause, effect, targetbinding=NA) {

       # library(energy)
        data=Read(datacsv)
        data=scale(data)
        header<-readHeader(datacsv)
        num_miRNA<-length(cause)
        miR<-header[1:num_miRNA]
        mR<-header[-(1:num_miRNA)]

        allnames=colnames(data)
	causenames=allnames[cause]
	effectnames=allnames[effect]
	Result<-matrix(nrow=length(effect), ncol=length(cause))
        for (i in cause){
                     for (j in effect){
                        Result[j-length(cause),i]<-dcov(data[,i],data[,j]) # Calculate Distance correlation.
                      }
         }
       colnames(Result)=causenames
       rownames(Result)=effectnames
       
       
if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
Result=Result*edge
}
 
return(Result)
}
#' miRNA target prediction with the Hoeffding correlation coefficient method
#' 
#' Calculate the Hoeffding correlation coefficient of each pair of miRNA-mRNA,and return a matrix of correlation coefficients with columns are miRNAs and rows are mRNAs.
#' @importFrom Hmisc hoeffd 
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the Hoeffding correlation coefficients. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=Hoeffding(dataset, 1:3, 4:18) 
#' @references
#' Hoeffding, W. (1948) A non-parametric test of independence. Ann. Math. Stat., 19, 546 - 57.
#' @export 
## 5. Hoeffding(Hoeffding's D measure) ##
Hoeffding <- function(datacsv, cause, effect, targetbinding=NA) {

       #library(Hmisc)
        data=Read(datacsv)
        data=scale(data)
        header<-readHeader(datacsv)
        num_miRNA<-length(cause)
        miR<-header[1:num_miRNA]
        mR<-header[-(1:num_miRNA)]        

        allnames=colnames(data)
	causenames=allnames[cause]
	effectnames=allnames[effect]
	Result<-matrix(nrow=length(effect), ncol=length(cause))
        for (i in cause){
                     for (j in effect){
                        Result[j-length(cause),i]<-as.numeric(c(hoeffd(data[,i],data[,j])["D"], recursive = TRUE)[2]) # Calculate Hoeffding's D measure.
                      }
         }
       colnames(Result)=causenames
       rownames(Result)=effectnames

if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
Result=Result*edge
}

return(Result)
}



#' @importFrom entropy mi.empirical
#' @importFrom gplots hist2d
      
EMI <- function(x, y) {
   # library(ghyp)
   # library(entropy)
    Mx <- matrix(x, length(x), 1)
    My <- matrix(y, length(y), 1)
    nbins <- ceiling(log(length(Mx[,1]),2)) + 1
    a <- hist2d(cbind(Mx,My), nbins=nbins,show=FALSE)
    result <- mi.empirical(a$counts)
    return(result)
}

#' miRNA target prediction with  mutual information method
#' 
#' Calculate the mutual information of each pair of miRNA-mRNA,and return a matrix of mutual information values with columns are miRNAs and rows are mRNAs.
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the mutual information values. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=MI(dataset, 1:3, 4:18) 
#' @references
#' Moon, Y.I., Balaji, R., and Lall, U. (1995) Estimation of mutual information using kernel density estimators. Phys. Rev. E, 52, 2318 - 21.
#' @export 

MI <- function(datacsv, cause, effect, targetbinding=NA) {
        data=Read(datacsv)
        data=scale(data)
        header<-readHeader(datacsv)
        num_miRNA<-length(cause)
        miR<-header[1:num_miRNA]
        mR<-header[-(1:num_miRNA)]

        allnames=colnames(data)
	causenames=allnames[cause]
	effectnames=allnames[effect]
	Result<-matrix(nrow=length(effect), ncol=length(cause))
        for (i in cause){
                     for (j in effect){
                        Result[j-length(cause),i]<-EMI(data[,i],data[,j]) # Calculate Mutual Information.
                      }
         }
       colnames(Result)=causenames
       rownames(Result)=effectnames       

if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
Result=Result*edge
}

return(Result)
}

#' miRNA target prediction with the IDA method
#' 
#' Calculate the causal effect of each pair of miRNA-mRNA,and return a matrix of causal effects with columns are miRNAs and rows are mRNAs.
#' @importFrom pcalg pc idaFast gaussCItest
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param pcmethod choose different versons of the PC algorithm, including "original" (default)
#' "stable", and "stable.fast"
#' @param alpha significance level for the conditional independence test, e.g. 0.05.
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the causal effects. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=IDA(dataset, 1:3, 4:18) 
#' @references
#' 1. Le, T.D., Liu, L., Tsykin, A., Goodall, G.J., Liu, B., Sun, B.Y. and Li, J. (2013) Inferring microRNA-mRNA causal regulatory relationships from expression data. Bioinformatics, 29, 765-71.
#' 
#' 2. Zhang, J., Le, T.D., Liu, L., Liu, B., He, J., Goodall, G.J. and Li, J. (2014) Identifying direct miRNA-mRNA causal regulatory relationships in heterogeneous data. J. Biomed. Inform., 52, 438-47.
#' 
#' 3. Maathuis, H.M., Colombo, D., Kalisch, M. and Buhlmann, P. (2010) Predicting causal effects in large-scale systems from observational data. Nat. Methods, 7, 247-249.
#' 
#' 4. Maathuis, H.M., Kalisch, M. and Buhlmann, P. (2009) Estimating high-dimensional intervention effects from observational data. Ann. Stat., 37, 3133-3164.
#' @export 
## 7. IDA ##
IDA=function(datacsv, cause, effect, pcmethod="original", alpha=0.05, targetbinding=NA){
     #   library(pcalg)

	if(is.character(datacsv)){
		data=Read(datacsv)
		#data=data[,-1] # if the dataset have the sample names column, otherwise comment this out.
	} else {
		data=datacsv #Assume there is no samplenames column and this is a data.frame.
	}							
	data=scale(data) #standardise the data
	header<-readHeader(datacsv)
        num_miRNA<-length(cause)
        miR<-header[1:num_miRNA]
        mR<-header[-(1:num_miRNA)]

        #print(data[1:5,])
	allnames=colnames(data)
	causenames=allnames[cause]
	effectnames=allnames[effect]
		
	multiset=character(0)
	result=matrix(nrow=length(effect), ncol=length(cause))
	suffStat=list(C=cor(data), n=nrow(data))
	indepTest=gaussCItest
	
	pcFit <- pc(suffStat, indepTest, p=ncol(data), alpha=alpha, skel.method=pcmethod)
	for (l in cause){
				
			#Inferring causal effects
			caef<-idaFast(l,effect,cov(data), pcFit@graph )
		
			#min of absolute values.
			caef1<-matrix(nrow=length(effect),ncol=1)
			for (k in 1:length(effect)){
				caefabs<-abs(caef)
				index<-which(caefabs==min(caefabs[k,]), arr.ind=TRUE)
				pos<-index[1,2]
				caef1[k,]<-caef[k,pos]
			}
			result[,l]<-caef1
	}
	colnames(result)=causenames
	rownames(result)=effectnames

if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
result=result*edge
}

return(result)	
}



## 9. RDC(Randomized Dependence Coefficient) ##
RDCParameter <- function(x,y,k=20,s=1/6,f=sin) {
  x <- cbind(apply(as.matrix(x),2,function(u)rank(u)/length(u)),1)
  y <- cbind(apply(as.matrix(y),2,function(u)rank(u)/length(u)),1)
  x <- s/ncol(x)*x%*%matrix(rnorm(ncol(x)*k),ncol(x))
  y <- s/ncol(y)*y%*%matrix(rnorm(ncol(y)*k),ncol(y))
  cancor(cbind(f(x),1),cbind(f(y),1))$cor[1]
}

#' miRNA target prediction with the Randomized Dependence Coefficient method
#' 
#' Calculate the Randomized Dependence coefficient of each pair of miRNA-mRNA,and return a matrix of  coefficients with columns are miRNAs and rows are mRNAs.
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the  correlation coefficients. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=RDC(dataset, 1:3, 4:18) 
#' @export 

RDC <- function(datacsv,cause, effect, targetbinding=NA) {
        data=Read(datacsv)
        data=scale(data) #standardise the data
        header<-readHeader(datacsv)
        num_miRNA<-length(cause)
        miR<-header[1:num_miRNA]
        mR<-header[-(1:num_miRNA)]

        allnames=colnames(data)
	causenames=allnames[cause]
	effectnames=allnames[effect]
        
	Result<-matrix(nrow=length(effect), ncol=length(cause))
        for (i in cause){
                     for (j in effect){
                        Result[j-length(cause),i]<-RDCParameter(data[,i],data[,j],k=20,s=1/6,f=sin) # Calculate Randomized Dependence Coefficient.
                      }
         }

        colnames(Result)=causenames
	rownames(Result)=effectnames

if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
Result=Result*edge
}
        
return(Result)
}
#' miRNA target prediction with the Lasso method
#' 
#' Calculate the Lasso regression coefficient of each pair of miRNA-mRNA, and return a matrix of coefficients with columns are miRNAs and rows are mRNAs.
#' @importFrom glmnet glmnet
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the Lasso regression coefficients. Columns are miRNAs, rows are mRNAs.
#' @examples
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=Lasso(dataset, 1:3, 4:18) 
#' @references
#' 1. Le, T.D., Zhang, J., Liu, L., and Li, J. (2015) Ensemble Methods for miRNA Target Prediction from Expression Data, submitted.
#' 
#' 2. Tibshirani, R. (1996). Regression shrinkage and selection via the lasso. J. R. Stat. Soc. Series B Stat. Methodol., 267-288.
#' @export 
## 10. Lasso ##
Lasso=function(datacsv, cause, effect, targetbinding=NA){
 # library(glmnet)
	if(is.character(datacsv)){
		data=Read(datacsv)
		#data=data[,-1] # if the dataset have the sample names column, otherwise comment this out.
	} else {
		data=datacsv #Assume there is no samplenames column and this is a data.frame.
	}							#To allow both .csv data input or a matrix in R. This will help the IDAbootstrap, as IDA can run on sampling matrices.
	data=scale(data) #standardise the data
        header<-readHeader(datacsv)
        num_miRNA<-length(cause)
        miR<-header[1:num_miRNA]
        mR<-header[-(1:num_miRNA)]
  data=as.matrix(data)
	#print(data[1:5,])
	allnames=colnames(data)
	causenames=allnames[cause]
	effectnames=allnames[effect]
	res=matrix(nrow=length(effectnames), ncol=length(causenames), 0)
	colnames(res)=causenames
	rownames(res)=effectnames
		
	for(gene in effectnames) {
		aGene <- glmnet(data[,cause],data[,gene],alpha=1)$beta #return the the effects of all miRNAs on the gene
    aGene=as.matrix(aGene)
		aGene=rowMeans(aGene) # take the means of all values output from lasso    
		res[gene,]=aGene # assign the effects of all miRNAs on the gene to the result. 
	}

if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
res=res*edge
}

return(res)
}

#' miRNA target prediction with the Elastic-net regression coefficient method
#' 
#' Calculate the Elastic-net regression coefficient of each pair of miRNA-mRNA,and return a matrix of correlation coefficients with columns are miRNAs and rows are mRNAs.
#' @importFrom glmnet glmnet
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the Elastic-net regression coefficients. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=Elastic(dataset, 1:3, 4:18) 
#' @references
#' 1. Le, T.D., Zhang, J., Liu, L., and Li, J. (2015) Ensemble Methods for miRNA Target Prediction from Expression Data, under review.
#' 
#' 2. Zou, H., & Hastie, T. (2005). Regularization and variable selection via the elastic net. J. R. Stat. Soc. Series B Stat. Methodol., 67, 301-320.
#' @export 
## 11. Elastic-net ##
Elastic=function(datacsv, cause, effect, targetbinding=NA){
 # library(glmnet)
	if(is.character(datacsv)){
		data=Read(datacsv)
		#data=data[,-1] # if the dataset have the sample names column, otherwise comment this out.
	} else {
		data=datacsv #Assume there is no samplenames column and this is a data.frame.
	}							#To allow both .csv data input or a matrix in R. This will help the IDAbootstrap, as IDA can run on sampling matrices.
	data=scale(data) #standardise the data
        header<-readHeader(datacsv)
        num_miRNA<-length(cause)
        miR<-header[1:num_miRNA]
        mR<-header[-(1:num_miRNA)]
	data=as.matrix(data)
	#print(data[1:5,])
	allnames=colnames(data)
	causenames=allnames[cause]
	effectnames=allnames[effect]
	res=matrix(nrow=length(effectnames), ncol=length(causenames), 0)
	colnames(res)=causenames
	rownames(res)=effectnames
		
	for(gene in effectnames) {
		aGene <- glmnet(data[,cause],data[,gene],alpha=0.5)$beta #return the the effects of all miRNAs on the gene
		aGene=as.matrix(aGene)
    aGene=rowMeans(aGene) # take the means of all values output from lasso     
		res[gene,]=aGene # assign the effects of all miRNAs on the gene to the result. 
	}

if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
res=res*edge
}

return(res)
}

#' miRNA target prediction with the Z-score method
#' 
#' Calculate the Z-score value of each pair of miRNA-mRNA, and return a matrix of values with columns are miRNAs and rows are mRNAs.
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the Z-score values. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=Zscore(dataset, 1:3, 4:18) 
#' @references
#' Prill, R.J., Marbach, D., Saez-Rodriguez, J., Sorger, P.K., Alexopoulos, L.G., Xue, X., Clarke, N.D., Altan-Bonnet, G. and Stolovitzky, G. (2010) Towards a rigorous assessment of systems biology models: the DREAM3 challenges. PLoS One, 5, e9202.
#' @export 
## 12. Z-score ##
Zscore=function(datacsv, cause, effect, targetbinding=NA){
	dt=read.csv(datacsv, header=TRUE, sep=",")
	dt=scale(dt)
        header<-readHeader(datacsv)
        num_miRNA<-length(cause)
        miR<-header[1:num_miRNA]
        mR<-header[-(1:num_miRNA)]

	effectnames=colnames(dt)[effect]
	causenames=colnames(dt)[cause]
	res=matrix(nrow=length(effectnames), ncol=length(causenames), 0)
	colnames(res)=causenames
	rownames(res)=effectnames
	causes=dt[,cause]
	effects=dt[,effect]
	#zAB=(xBminA-meanB)/sdB
	for (i in 1:length(cause)){
		for (j in 1: length(effect)){
			indexminA=which(causes[,i]==min(causes[,i]))
			xBminA=effects[indexminA,j]
			xBminA=median(xBminA)
			#sdB=sd(effects[,j]) #if we standardise the sdB=1
			#meanB=mean(effects[,j]) if we standardise the mean is 0
			#zij=abs(xBminA-meanB)/sdB
			zij=abs(xBminA)
			
			res[j,i]=zij
		}
	}

if(is.na(targetbinding)==FALSE){
#query knowledge matrix from file
edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
edgeTargetScan<-edgeTargetScan!=0;
edge=edgeTargetScan[effect,cause]
res=res*edge
}

return(res)
}

#' miRNA target prediction with the ProMISe method
#' 
#' Calculate the ProMISe score of each pair of miRNA-mRNA, and return a matrix of values with columns are miRNAs and rows are mRNAs.
#' @importFrom Roleswitch roleswitch
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param targetbinding the putative target, e.g. "TargetScan.csv". If targetbinding is not specified, only expression data is used.
#' If targetbinding is specified, the prediction results using expression data with be intersected with the interactions in the target binding file.
#' @return A  matrix that includes the ProMISe scores. Columns are miRNAs, rows are mRNAs.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' results=ProMISe(dataset, 1:3, 4:18) 
#' @references
#' Li, Y., Liang, C., Wong, K.C., Jin, K., and Zhang, Z. (2014). Inferring probabilistic miRNA - mRNA interaction signatures in cancers: a role-switch approach. Nucleic Acids Res., 42, e76-e76.
#' @export 
## 13. ProMISe ##
ProMISe=function(datacsv, cause, effect, targetbinding=NA){
  # library("Roleswitch")
  dt<-Read(datacsv)
  dd<-colMeans(dt)
  stdData<-as.matrix(dd)
  header<-readHeader(datacsv)
  num_miRNA<-length(cause)
  miR<-header[1:num_miRNA]
  mR<-header[-(1:num_miRNA)]
  
  x.o<-matrix(stdData[effect,],dimnames=list(c(1:length(effect)),"mRNA"))
  z.o<-matrix(stdData[cause,],dimnames=list(c(1:length(cause)),"miRNA"))
  c<-matrix(1,length(effect),length(cause)) #Generate ones matrix
  rownames(c)<-c(1:length(effect))
  colnames(c)<-c(1:length(cause))
  
  rMatrix <- roleswitch(x.o,z.o,c)$p.xz # Calculate ProMISe probabilistic
  rownames(rMatrix) <- colnames(dt)[effect]
  colnames(rMatrix) <- colnames(dt)[cause]
  
  if(is.na(targetbinding)==FALSE){
    #query knowledge matrix from file
    edgeTargetScan<-queryTargetFile(miR,mR,targetbinding); 
    edgeTargetScan<-edgeTargetScan+t(edgeTargetScan);
    edgeTargetScan<-edgeTargetScan!=0;
    edge=edgeTargetScan[effect,cause]
    rMatrix=rMatrix*edge
  }
  
  return(rMatrix)
}


#' Extract top k miRNA-mRNA interactions 
#' 
#' Rank the miRNA-mRNA interactions based on absolute values of the correlations/scores/causal effects, and return
#' the topk interactions.
#' @param cormat the correlation matrix that need to be extracted with columns are miRNAs and rows are mRNAs
#' @param topk the number of interactions that need to be extracted.
#' @return topk interactions 
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' EMTresults=Pearson(dataset, 1:3, 4:18)
#' top10=Extopk(EMTresults, 10)
#' @export
Extopk <- function(cormat,topk){
          mir<-ncol(cormat)
          gene<-nrow(cormat)
          mirname<-colnames(cormat)
          mirname = gsub("\\.", "-", mirname)  # replace "." with "-" for miRNA
          for(index in 1:length(mirname)){
          if(substr(mirname, nchar(mirname),nchar(mirname))[index]=="-") {
	          substring(mirname[index], nchar(mirname[index]), nchar(mirname[index]))="*" 
                  }
          }
          genename<-rownames(cormat)
          Result<-matrix(-1,ncol=4,nrow=mir*gene)
          n<-0
          for(i in 1:mir){
                   for(j in 1:gene){
                       n<-(n+1)
                       Result[n, 1] <- mirname[i]
                       Result[n, 2] <- genename[j]
                       Result[n, 3] <- cormat[j,i]
                       Result[n, 4] <- abs(as.numeric(Result[n, 3])) # Calculate absolute value
                }
           } 
           TrResult <- Result[sort.list(as.numeric(Result[,4]),decreasing=TRUE),] #Rank the whole interations by decreasing the absolute value
           
           return(TrResult[1:topk,]) # Extract top k miRNA-mRNA interactions
        	
}      


#' Ensemble method for miRNA target prediction using Borda count election
#' 
#' Use the Borda count election method to integrate the rankings from different miRNA target prediction methods
#' @param listCEmatrices a list of matrices that include the correlation coefficients/causal effects/scores resulting from different target prediction methods
#' @return a matrix of ranking scores (averaging all the rankings from different methods). Columns are miRNAs and rows are mRNAs
#' @examples
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' ps=Pearson(dataset, cause=1:3, effect=4:18)
#' ida=IDA(dataset, cause=1:3, effect=4:18)
#' borda=Borda(list(ps, ida))
#' @references
#' 1. Le, T.D., Zhang, J., Liu, L., and Li, J. (2015) Ensemble Methods for miRNA Target Prediction from Expression Data, Plos ONE.
#' 
#' 2. Marbach, D., Costello, J.C., Kuffner, R., Vega, N.M., Prill, R.J., Camacho, D.M., Allison, K.R. and DREAM5 Consortium (2012). Wisdom of crowds for robust gene network inference. Nat. Methods, 9, 796-804.
#' @export

Borda=function(listCEmatrices){
noMethods=length(listCEmatrices)
effectnames=rownames(listCEmatrices[[1]])
causenames=colnames(listCEmatrices[[1]])
res=matrix(nrow=length(effectnames), ncol=length(causenames), 0)
colnames(res)=causenames
rownames(res)=effectnames

for(i in 1:noMethods){ # for each CEmatrix from a method
	for (j in 1:length(causenames)){ #for each miRNA
		cormat=listCEmatrices[[i]][,j] #extract the corelation values
		cormat=cormat[order(-abs(cormat))] # sort based on absolute values
		for(k in 1:length(cormat)){cormat[k]=k} # change the correlation values by the ranking
		rn=names(cormat) # take the genenames
		
		for(gene in rn){ #for each gene in the sorted matrix
			res[gene, j]=res[gene, j]+cormat[gene] #add up the current rankings of the possition (gene, miRNA)
		}
	}
}
res=res/noMethods #take the average rankings
res=length(effectnames)/res #revert the rankings as bRank will sort the rankings as if correlation.
}

#' Ensemble method for miRNA target prediction using Borda count election with topk targets
#' 
#' Use the Borda count election method to integrate the rankings from different miRNA target prediction methods, but only topk targets of each miRNA are included
#' in the calculation. The targets outside the topk will be assigned a large and fixed rank, e.g. number of genes in the dataset.
#' @param listCEmatrices a list of matrices that include the correlation/causal effects/scores resulting from a target prediction method
#' @param topk number of targets of a miRNA to be included in the calculation (Borda count election)
#' @return a matrix of ranking scores (averaging all the rankings from different methods). Columns are miRNAs and rows are mRNAs
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' ps=Pearson(dataset, cause=1:3, effect=4:18)
#' ida=IDA(dataset, cause=1:3, effect=4:18)
#' borda=BordaTopk(list(ps, ida), topk=10)
#' @references
#' Le, T.D., Zhang, J., Liu, L., and Li, J. (2015) Ensemble Methods for miRNA Target Prediction from Expression Data, Plos ONE.
#' @export
BordaTopk=function(listCEmatrices, topk){
noMethods=length(listCEmatrices)
effectnames=rownames(listCEmatrices[[1]])
causenames=colnames(listCEmatrices[[1]])
res=matrix(nrow=length(effectnames), ncol=length(causenames), 0)
colnames(res)=causenames
rownames(res)=effectnames
noGenes=nrow(listCEmatrices[[1]])
for(i in 1:noMethods){ # for each CEmatrix from a method
	for (j in 1:length(causenames)){ #for each miRNA
		cormat=listCEmatrices[[i]][,j] #extract the corelation values
		cormat=cormat[order(-abs(cormat))] # sort based on absolute values
		for(k in 1:length(cormat)){cormat[k]=k} # change the correlation values by the ranking
		rn=names(cormat) # take the genenames
		
		for(gene in rn){ #for each gene in the sorted matrix
			if(cormat[gene]>topk) cormat[gene]=noGenes
			res[gene, j]=res[gene, j]+cormat[gene] #add up the current rankings of the possition (gene, miRNA)
		}
	}
}
res=res/noMethods #take the average rankings
res=length(effectnames)/res #revert the rankings as bRank will sort the rankings as if correlation.
}



ReOrder=function(topkList){
   
   ####### preprocessing the list of topk results #######
   topkList = as.matrix(topkList, ncol=3);
   if(nrow(topkList)<1 || length(topkList[,3]>0)==0 || length(topkList[,3]<0)==0)   return(data.frame(topkList));
   
   ####### sort the values in increasing order ########
   val = as.numeric(topkList[,3]);  
   val = sort(val, index.return=TRUE, decreasing=FALSE)

   ####### obtain the negative values ########
   ni = val$ix[which(val$x<0)] # the index of negative values in Val;
   pi = val$ix[length(val$x):(length(ni)+1)] # the index of positive values in Val

   return(data.frame(topkList[c(ni, pi), ]))
}


#' Extract topk predicted targets of a miRNA
#' Rank all the targets of a miRNA and extract the topk targets
#' @param CEmatrix the matrix of correlation/causal effect/score results with columns are miRNAs and rows are mRNAs
#' @param causeIndex the column index of the miRNA that we would like to extract
#' @param topk the number of targets being extracted
#' @param downreg if TRUE the negative correlation/causal effect/score will be on the top of the ranking. This is to
#' favour the negative regulations. 
#' @return a matrix with 3 columns, where the first column contains the miRNA, the second column contains the mRNAs and the last column contains the correlations/causal effects/scores
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' ps=Pearson(dataset, cause=1:3, effect=4:18)
#' miR200aTop10 = bRank(ps, 3, 10, TRUE)
#' @export

bRank=function(CEmatrix, causeIndex, topk, downreg=TRUE){

miRNAnames=colnames(CEmatrix)
mRNAnames=rownames(CEmatrix)

c1=rep(miRNAnames[causeIndex], length(mRNAnames))
corMat=CEmatrix[,causeIndex]
result=cbind(c1,mRNAnames, corMat)
rownames(result)=NULL
colnames(result)=c("miRNA", "mRNA", "Correlation")
#remove quotes
result=data.frame(result)
result[,3]=as.numeric(as.character(result[,3]))

#cat("numberof genes in result:", nrow(result), "\n")
result=result[order(-(abs(result[,3]))),]
result=result[!duplicated(result[,2]),]
#cat("number of genes in the ranking list after removing dulicates", nrow(result), "\n")

result=result[1:topk,]
result[, 1] = gsub("\\.", "-", result[, 1])  # replace "." with "-" for miRNA
if(substr(result[ ,1], nchar(result[,1]), nchar(result[,1]))[1]=="-") {
	substring(result[,1], nchar(result[,1]), nchar(result[,1]))="*" 
}
# if the last character is - then change to *, e.g. hsa-miR-7-1*
#result[, 2] = gsub("\\.", "-", result[, 2])  # replace "." with "-" for mRNA
if(downreg) result=ReOrder(result)
#print(result[1:5,])
return(result)
}


############# Validation function ###### 
#' Validate the targets of a miRNA
#' 
#' Given the predicted target of a miRNA, the function returns a list of targets that are experimentally confirmed
#' based on the provided ground truth. Users can provide their own ground truth or use the built-in ground truth 
#' which is the union of Tarbase, miRTarbase, miRecords, and miRWalk. 
#' @param topkList a matrix with 3 columns. The first column is the miRNA name, the second contains the target mRNAs, and the third contains the correlation values/ causal effects/ scores
#' @param datacsv the ground truth for the validation. The ground truth is a matrix with 2 columns, where the first column
#' is the miRNA and the second is the mRNA.
#' @return a matrix in the same format of the input matrix put only contains the confirmed interactions.
#' @examples 
#' dataset=system.file("extdata", "ToyEMT.csv", package="miRLAB")
#' ps=Pearson(dataset, cause=1:3, effect=4:18)
#' miR200aTop10=bRank(ps, 3, 10, TRUE)
#' groundtruth=system.file("extdata", "Toygroundtruth.csv", package="miRLAB")
#' miR200aTop10Confirmed = Validation(miR200aTop10, groundtruth)
#' @export
Validation=function(topkList, datacsv){
	
####### preprocessing the list of topk results #######
	topkList = as.matrix(topkList, ncol=3);
	if(nrow(topkList)<1) stop("No data in the input")
    
####### read the validation data from file ########
        dt=read.csv(datacsv, header=TRUE, sep=",")
	
####### get the validation lists from the data ######
	        
        dt = paste(dt[, 1], dt[, 2], sep=" ");
        tmp= paste(topkList[, 1], topkList[, 2], sep=" ");	  	
	result=topkList[which(tmp %in% dt), ] 
        
	
	if(is.matrix(result)){
		result=data.frame(result)
		result[,3]=as.numeric(as.character(result[,3]))
		NoConfimred=nrow(result)
	} else {
		result=as.matrix(result)
		result=t(result)
		result=data.frame(result)
		result[,3]=as.numeric(as.character(result[,3]))
		NoConfimred=nrow(result)
	}
	
	return(list(result, NoConfimred))
}

#' Validate the targets of a miRNA using transfection data
#' 
#' Given the predicted target of a miRNA, the function returns a list of targets that are  confirmed
#' based on the curated transfection data. Users need to download the file logFC.imputed.rda from nugget.unisa.edu.au/Thuc/miRLAB/ and place it in the working directory (this file is obtained from the TargetScoreData package)
#' @param topkList a matrix with 3 columns. The first column is the miRNA name, the second contains the target mRNAs, and the third contains the correlation values/ causal effects/ scores
#' @param LFC the log fold change threshold. The targets that have the absolute value of log fold change greater than the LFC
#' will be regarded as the confirmed targets.
#' @return a matrix in the same format of the input matrix put only contains the confirmed interactions.
#' @examples 
#dataset=system.file("extdata", "EMT35.csv", package="miRLAB")
#' print("ps=Pearson(dataset, cause=1:35, effect=36:1189)")
#' print("miR200aTop100=bRank(ps, 11, 100, TRUE)")
#' print("miR200aTop100Confirmed = ValidationT(miR200aTop100, 1.0)")
#' @export
#' @references
#' 1. Le, T.D., Zhang, J., Liu, L., and Li, J. (2015) Ensemble Methods for miRNA Target Prediction from Expression Data, under review.
#' 
#' 2. Li Y, Goldenberg A, Wong K and Zhang Z (2014). A probabilistic approach to explore human microRNA targetome using microRNA-overexpression data and sequence information. Bioinformatics, 30(5), pp. 621-628. http://dx.doi.org/10.1093/bioinformatics/btt599.


ValidationT=function(topkList, LFC){
 #get all transfection data from TargetScoreData package and present in the format of Tarbase.
 #the rows with LFC <= LFC will be discarded
 dt=system.file("extdata", "Transfection_data_summary.csv", package="miRLAB")
 Transfection=read.csv(dt, header=TRUE, sep=",")
 logFC.imputed=c()
 if(!file.exists("logFC.imputed.rda"))
   stop("Please download the transfection data from nugget.unisa.edu.au/Thuc/miRLAB/logFC.imputed.rda for validation. 
       If you would like to use experimentally confirmed data only, please use the Validation function")
 load("./logFC.imputed.rda")
 td=logFC.imputed
 
 topkList = as.matrix(topkList, ncol=3);
 stopifnot(nrow(topkList)>0)
 miN = unique(topkList[, 1]) # get the unique names of miRNA
 mind= NULL;
 for(i in 1:length(miN))
	mind = c(mind, which(Transfection[,1]==miN[i]))  #get the index of miRNA from the validation data   
 #print(mind)
 if(length(mind)==0){
	#cat("This miRNA is not in the tranfection database", "\n")
	return(list(NULL, 0))
}	
 rn=rownames(td) # take the genenames
 td = td[,mind]      # get a small validation data from the original one, a new list
 
 if(length(mind)>1) td=rowMeans(td)	 # take the average of LFCs
 		 
#decorate the table
 groundtruth=cbind(rep(miN, length(rn)), rn, td)
 rownames(groundtruth)=NULL
 colnames(groundtruth)=c("miRNA", "mRNA", "LFC")
 groundtruth=data.frame(groundtruth)
 groundtruth[,3]=as.numeric(as.character(groundtruth[,3]))
 groundtruth=groundtruth[abs(groundtruth[,3])>LFC,]       #LFC<-1 using groundtruth=groundtruth[groundtruth[,3]<-LFC,]
 groundtruth=groundtruth[,1:2]
 
 groundtruth = paste(groundtruth[, 1], groundtruth[, 2], sep=" ");
 tmp= paste(topkList[, 1], topkList[, 2], sep=" ");
 result=topkList[which(tmp %in% groundtruth), ]
 
 #Decorate the result table, if only one row it is not a matrix. If it is a matrix we just remove the "" in data
 if(is.matrix(result)){
	result=data.frame(result)
	result[,3]=as.numeric(as.character(result[,3]))
	NoConfimred=nrow(result)
  } else {
		result=as.matrix(result)
		result=t(result)
		result=data.frame(result)
		result[,3]=as.numeric(as.character(result[,3]))
		NoConfimred=nrow(result)
  }
  #cat("Number of confirmed genes for  ",miN, "is: ", NoConfimred, "\n")
	
  return(list(result, NoConfimred))

 
}

#' Validate the targets of all miRNA using both experimentally confirmed and transfection data
#' 
#' Given the predicted target of all miRNA, the function returns a list of targets of each miRNA that are  confirmed
#' based on the experimentally validated interactions or curated transfection data. Users need to download the file logFC.imputed.rda from nugget.unisa.edu.au/Thuc/miRLAB/ and place it in the working directory (this file is obtained from the TargetScoreData package)
#' @param CEmatrix the matrix of correlation/causal effects/scores with columns are miRNAs and rows are mRNAs
#' @param topk the number of targets of each miRNA that are being validated. 
#' @param groundtruth the csv file containing the ground truth.
#' @param LFC the log fold change threshold for the transfection data. The targets that have the absolute value of log fold change greater than the LFC
#' will be regarded as the confirmed targets.
#' @param downreg if TRUE the negative correlation/causal effect/score values will be ranked on the top of the ranking. This is to favour the down regulations.
#' @return a list of matrices that contains the confirmed interactions by both provided ground truth and built-in transfection data.
#' @examples 
#' print("ps=Pearson(dataset, cause=1:3, effect=4:18)")
#' print("results=ValidateAll(ps, 10, groundtruth, LFC=0.5, downreg=TRUE)")
#' @export
# \dontrun{
# dataset=system.file("extdata", "ToyEMT35.csv", package="miRLAB")
# ps=Pearson(dataset, cause=1:3, effect=4:18)
# groundtruth=system.file("extdata", "groundtruth.csv", package="miRLAB")
# results=ValidateAll(ps, 10, groundtruth, LFC=0.5, downreg=TRUE)
# }
ValidateAll=function(CEmatrix, topk, groundtruth, LFC, downreg=TRUE){
#CEmatrix: the results from a computational method in a matrix format. columns are miRNAs. Rows are genes.
#causes: the column indices of the causes in the dataset or in the CEMatrix.
#Top k gene of each miRNA we would like to extract for validation.
#Groundtruth is the experimentally validated database in .csv format.
#LFC: log2 fold-change threshold for identifying the groundtruth using miRNA transfection data.
if(!file.exists("logFC.imputed.rda"))
  stop("Please download the transfection data from nugget.unisa.edu.au/Thuc/miRLAB/logFC.imputed.rda for validation. 
       If you would like to use experimentally confirmed data only, please use the Validation function")
 causes=1:ncol(CEmatrix)
 NoExpConfirmed=c()
 NoTransConfirmed=c()
 names=c('miRNA','mRNA','Correlation')
 K1=c() #ground truth experiment
 ResultK1=matrix(,ncol=3)
 colnames(ResultK1)=names
 K2=c() #ground truth transfection
 ResultK2=matrix(,ncol=3)
 colnames(ResultK2)=names
 pE=c() #pvalue for experiment
 pT=c() #pvalue for transfection
 S=nrow(CEmatrix)
	for (i in causes){
		miRtopk=bRank(CEmatrix, i, topk, downreg)
		miRall=bRank(CEmatrix, i, S)
		#cat("Validate the prediction using experimentally confirmed database ", "\n")
		temp1=Validation(miRtopk, groundtruth)
		temp2=Validation(miRall, groundtruth)
		pvalE=1-phyper((temp1[[2]]-1), temp2[[2]], (S-temp2[[2]]), topk)
		pE=c(pE, pvalE)
		
		cat("EXPERIMENT:  ", colnames(CEmatrix)[i], ": ", temp1[[2]], "with ", temp2[[2]], " in the groundtruth. pvalue: ",pvalE, "\n")
		NoExpConfirmed=c(NoExpConfirmed,temp1[[2]]) # sum will be x in hypergeometric test.
		K1=c(K1,temp2[[2]])
                if(temp1[[2]]>0)  ResultK1=rbind(ResultK1,temp1[[1]])
                     
 		
		#cat("Validate the prediction using transfection data ", "\n")
		temp3=ValidationT(miRtopk, LFC)
		temp4=ValidationT(miRall, LFC)
		pvalT=1-phyper((temp3[[2]]-1), temp4[[2]], (S-temp4[[2]]), topk)
		pT=c(pT, pvalT)
		cat("TRANSFECTION:  ", colnames(CEmatrix)[i], ": ", temp3[[2]], "with ", temp4[[2]], " in the groundtruth. pvalue: ", pvalT, "\n")
		NoTransConfirmed=c(NoTransConfirmed,temp3[[2]]) # sum will be x in the hypergeometric test.
		K2=c(K2,temp4[[2]]) 
                
		if(temp3[[2]]>0) ResultK2=rbind(ResultK2,temp3[[1]])
		
	}
	pvalueE=1-phyper((sum(NoExpConfirmed)-1), sum(K1), (S*ncol(CEmatrix)-sum(K1)), topk*ncol(CEmatrix))
	pvalueT=1-phyper((sum(NoTransConfirmed)-1), sum(K2), (S*ncol(CEmatrix)-sum(K2)), topk*ncol(CEmatrix))
	
	cat("number of confirms by experiments: ", sum(NoExpConfirmed), ", pvalue: ", pvalueE, ". By transfection ", sum(NoTransConfirmed), ", pvalue: ", pvalueT, "\n")
	cat("groundtruths in experiments: ", sum(K1), ", and in transfection ", sum(K2), "\n")
	result=list(ResultK1[2:nrow(ResultK1),], cbind(K1, NoExpConfirmed, pE), ResultK2[2:nrow(ResultK2),], cbind(K2, NoTransConfirmed, pT))
}


## Read external results ##
#' Read results from other methods
#' 
#' Read the results predicted by external methods (methods that are not in this package and may not be implemented in R). Consequently, we can compare the results
#' predicted by the external methods and results predicted by the methods in the miRLAB package.
#' @param datacsv the input dataset in csv format
#' @param cause the column range that specifies the causes (miRNAs), e.g. 1:35
#' @param effect the column range that specifies the effects (mRNAs), e.g. 36:2000
#' @param ExtCEcsv score matrix predicted by an external matrix with columns are miRNAs and rows are mRNAs.
#' @return a matrix of scores predicted by an external matrix and ready for further validation and comparison tasks.
#' @examples 
#' print("GenemiR=ReadExtResult(dataset, cause=1:3, effect=4:18, 'genemirresults.csv')")
#' @export
ReadExtResult=function(datacsv, cause, effect,  ExtCEcsv){
	dataset=read.csv(datacsv, header=TRUE, sep=",")
	genenames=colnames(dataset)
	causenames=genenames[cause]
	effectnames=genenames[effect]
	Extmatrix=read.csv(ExtCEcsv, header=FALSE, sep=",")
	colnames(Extmatrix)=causenames
	rownames(Extmatrix)=effectnames
	return(Extmatrix)
	
}

#delete experiment function

## Compare the validation results of 13 built-in methods ##
filterAndCompare=function(allresults, noVal){
	#allresults: the results from all methods generated from experiment function. This is a list
	#noVal: number of confirmed targets in each method (threshold) to filter. Records (miRNA) with less than this will be removed
	ExpResult=allresults[[1]]
	TransResult=allresults[[2]]
	temp1=apply(ExpResult, 1, function(x) all(x[c(2,4,6,8,10,12,14,16,18,20,22,24,26)]>(noVal-1)))
	ExpResult=ExpResult[temp1,]
	temp2=apply(TransResult, 1, function(x) all(x[c(2,4,6,8,10,12,14,16,18,20,22,24,26)]>(noVal-1)))
	TransResult=TransResult[temp2,]
	###If else to incase only one record. In that case the result is not a matrix
	if(is.matrix(ExpResult)){
		ExpResult=ExpResult[,c(2,4,6,8,10,12,14,16,18,20,22,24,26)]
		colnames(ExpResult)=c( "Pearson", "Spearman", "Kendall", " Dcov", "Hoeffding", "MI", "IDA", "MIC", "RDC", "Lasso", "Elastic", "Z-score", "ProMISe")
	} else {
		tt=as.matrix(ExpResult)
		ExpResult=t(tt)
		ExpResult=ExpResult[,c(2,4,6,8,10,12,14,16,18,20,22,24,26)]
		names(ExpResult)=c( "Pearson", "Spearman", "Kendall", " Dcov", "Hoeffding", "MI", "IDA", "MIC", "RDC", "Lasso", "Elastic", "Z-score", "ProMISe")
	}
	
	if(is.matrix(TransResult)){
		TransResult=TransResult[,c(2,4,6,8,10,12,14,16,18,20,22,24,26)]
		colnames(TransResult)=c( "Pearson", "Spearman", "Kendall", " Dcov", "Hoeffding", "MI", "IDA", "MIC", "RDC", "Lasso", "Elastic", "Z-score", "ProMISe")
	} else {
		tt=as.matrix(TransResult)
		TransResult=t(tt)
		TransResult=TransResult[,c(2,4,6,8,10,12,14,16,18,20,22,24,26)]
		#print(TransResult)
		names(TransResult)=c( "Pearson", "Spearman", "Kendall", " Dcov", "Hoeffding", "MI", "IDA", "MIC", "RDC", "Lasso", "Elastic", "Z-score", "ProMISe")
	}
	if(is.matrix(ExpResult)){
		numberExpResult=apply(ExpResult, 2, sum)
		ranking=apply(ExpResult, 1, function(i) rank(i))
		ranking=t(ranking)
		rankExpResult=apply(ranking, 2, sum)
	}
	if(is.matrix(TransResult)){
		numberTransResult=apply(TransResult, 2, sum)	
		rankingT=apply(TransResult, 1, function(i) rank(i))
		rankingT=t(rankingT)
		rankTransResult=apply(rankingT, 2, sum)
	}
	if(is.matrix(ExpResult)){
		Exp=list(ExpResult, numberExpResult, rankExpResult)
	} else Exp=ExpResult
	
	if(is.matrix(TransResult)){
		Trans=list(TransResult, numberTransResult, rankTransResult)
	} else Trans=TransResult
	
	result=list(Exp, Trans)
}

## Enrichment analysis including GO and KEGG enrichment analysis using two functions: GOBPenrichment and KEGGenrichment ##
# The input is a list of gene symbols. For example: Genes <- c("AREG", "FKBP5", "CXCL13", "KLF9", "ZC3H12A", "P4HA1", "TLE1", "CREB3L2", "TXNIP", "PBX1", "GJA1", "ITGB8", "CCL3", "CCND2", "KCNJ15", "CFLAR", "CXCL10", "CYSLTR1", "IGFBP7", "RHOB", "MAP3K5", "CAV2", "CAPN2", "AKAP13", "RND3", "IL6ST", "RGS1", "IRF4", "G3BP1", "SEL1L", "VEGFA", "SMAD1", "CCND1", "CLEC3B", "NEB", "AMD1", "PDCD4", "SCD", "TM2D3", "BACH2", "LDLR", "BMPR1B", "RFXAP", "ASPH", "PTK2B", "SLC1A5", "ENO2", "TRPM8", "SATB1", "MIER1", "SRSF1", "ATF3", "CCL5", "MCM6", "GCH1", "CAV1", "SLC20A1")
#@import GSEABase @import Category
## GO BP enrichment analysis, Cutoff is normally set to 0.05 ##
#' Functional enrichment analysis
#' 
#' GO BP enrichment analysis for a gene list
# @importMethodsFrom AnnotationDbi mget Lkeys get
# @importFrom org.Hs.eg.db org.Hs.egSYMBOL2EG org.Hs.egGO org.Hs.egSYMBOL
# @import GOstats 
# @import Category 
#' @param Genes a list of gene symbols
#' @param Cutoff the significant level, e.g. 0.05
#' @examples
#'  print("result = GOBPenrichment(genelist, 0.05)")
#' @return a list of GO terms for the genes
#' @export
#' @references
#' Ashburner, M., Ball, C.A., Blake, J.A., Botstein, D., Butler, H., Cherry, J.M., Davis, A.P., Dolinski, K., Dwight, S.S., Eppig, J.T., Harris, M.A., Hill, D.P., Issel-Tarver, L., Kasarskis, A., Lewis, S., Matese, J.C., Richardson, J.E., Ringwald, M., Rubin, G.M. and Sherlock, G. (2000) Gene Ontology: tool for the unification of biology. Nat. Genet., 25, 25-29.
GOBPenrichment <- function(Genes, Cutoff){
  
if(requireNamespace("AnnotationDbi", quitely=TRUE)) {
  EntrezIDs <- AnnotationDbi::mget(Genes, org.Hs.eg.db::org.Hs.egSYMBOL2EG, ifnotfound=NA)
  EntrezIDs <- as.character(EntrezIDs)
  GOAnnotation <- AnnotationDbi::get("org.Hs.egGO")
  Universe <- AnnotationDbi::Lkeys(GOAnnotation)

  Params <- new("GOHyperGParams",
                  geneIds=EntrezIDs,
                  universeGeneIds=Universe,
                  annotation="org.Hs.eg.db",
                  ontology="BP",
                  pvalueCutoff=Cutoff,
                  conditional=FALSE,
                  testDirection="over")
}
if(requireNamespace("GOstats", quitely=TRUE)) {
  Over <- GOstats::hyperGTest(Params)
}
if(requireNamespace("Category", quitely=TRUE)) {
  Glist <- Category::geneIdsByCategory(Over)
}
if(requireNamespace("AnnotationDbi", quitely=TRUE)){
Glist <- sapply(Glist, function(.ids) {
 	.sym <- AnnotationDbi::mget(.ids, envir=org.Hs.eg.db::org.Hs.egSYMBOL, ifnotfound=NA)
 	.sym[is.na(.sym)] <- .ids[is.na(.sym)]
 	paste(.sym, collapse=";")
 	})
BP <- summary(Over)
BP$Symbols <- Glist[as.character(BP$GOBPID)]
# Adjust p-value using Benjamini & Hochberg (BH) method
BP$adjustPvalue <-p.adjust(BP$Pvalue, "BH", length(BP$Pvalue))
# write.csv(BP,'BPResult.csv')
}

return(BP)
}

## KEGG enrichment analysis, Cutoff is normally set to 0.05 ##
## GO BP enrichment analysis, Cutoff is normally set to 0.05 ##
#' Functional enrichment analysis
#' KEGG enrichment analysis for a gene list
# @importMethodsFrom AnnotationDbi mget Lkeys get
# @import GOstats 
# @import Category 
# @importFrom org.Hs.eg.db org.Hs.egSYMBOL2EG org.Hs.egPATH org.Hs.egSYMBOL
#' @param Genes a list of gene symbols
#' @param Cutoff the significant level, e.g. 0.05
#' @examples
#'  print("result = KEGGenrichment(genelist, 0.05)") 
#' @return a list of pathways for the genes
#' @export
#' @references
#' Kanehisa, M. and Goto, S. (2000) KEGG: kyoto encyclopedia of genes and genomes. Nucleic Acids Res., 28, 27-30.

KEGGenrichment <- function(Genes, Cutoff){

  if(requireNamespace("AnnotationDbi", quitely=TRUE)){
EntrezIDs <- AnnotationDbi::mget(Genes, org.Hs.eg.db::org.Hs.egSYMBOL2EG, ifnotfound=NA)
EntrezIDs <- as.character(EntrezIDs)
KEGGAnnotation <- AnnotationDbi::get("org.Hs.egPATH")
Universe <- AnnotationDbi::Lkeys(KEGGAnnotation)
Params <- new("KEGGHyperGParams", 
                     geneIds=EntrezIDs, 
                     universeGeneIds=Universe, 
                     annotation="org.Hs.eg.db", 
                     categoryName="KEGG", 
                     pvalueCutoff=Cutoff,
                     testDirection="over")
}

if(requireNamespace("GOstats", quitely=TRUE)){
  Over <- GOstats::hyperGTest(Params)
 KEGG <- summary(Over)
}
if(requireNamespace("Category", quitely=TRUE)){
  Glist <- Category::geneIdsByCategory(Over)
}
if(requireNamespace("AnnotationDbi", quitely=TRUE)){
Glist <- sapply(Glist, function(.ids) {
 	.sym <- AnnotationDbi::mget(.ids, envir=org.Hs.eg.db::org.Hs.egSYMBOL, ifnotfound=NA)
 	.sym[is.na(.sym)] <- .ids[is.na(.sym)]
 	paste(.sym, collapse=";")
 	})
KEGG$Symbols <- Glist[as.character(KEGG$KEGGID)]
# Adjust p-value using Benjamini & Hochberg (BH) method
KEGG$adjustPvalue <-p.adjust(KEGG$Pvalue, "BH", length(KEGG$Pvalue))
# write.csv(KEGG,'KEGGResult.csv')
}

return(KEGG)
}

#' Function for validate the results from all 12 methods.
#' @param allmethods A list of results (matrix with columns are miRNA and rows are mRNAs). 
#' @param topk Top k targets of each miRNA that will be extracted for validation
#' @param Expgroundtruth The ground truth in .csv file for validation
#' @param LFC log fold-change for validating the results using transfection experiments
#' @param downreg If set to TRUE the negative effects will have higher ranks than the positives.
#' @return The validation results for all 12 methods 
#' @export
experiment=function(allmethods, topk, Expgroundtruth, LFC, downreg){
  
  psv=ValidateAll(allmethods[[1]], topk, Expgroundtruth, LFC, downreg)
  spv=ValidateAll(allmethods[[2]], topk, Expgroundtruth, LFC, downreg)
  kendallv=ValidateAll(allmethods[[3]], topk, Expgroundtruth, LFC, downreg)
  dcovv=ValidateAll(allmethods[[4]], topk, Expgroundtruth, LFC, downreg)
  hoeffdingv=ValidateAll(allmethods[[5]], topk, Expgroundtruth, LFC, downreg)
  miv=ValidateAll(allmethods[[6]], topk, Expgroundtruth, LFC, downreg)
  idav=ValidateAll(allmethods[[7]], topk, Expgroundtruth, LFC, downreg)
 # micv=ValidateAll(allmethods[[8]], topk, Expgroundtruth, LFC, downreg, TargetBinding)
  rdcv=ValidateAll(allmethods[[8]], topk, Expgroundtruth, LFC, downreg)
  lassov=ValidateAll(allmethods[[9]], topk, Expgroundtruth, LFC, downreg)
  elasticv=ValidateAll(allmethods[[10]], topk, Expgroundtruth, LFC, downreg)
  zsv=ValidateAll(allmethods[[11]], topk, Expgroundtruth, LFC, downreg)
  promisev=ValidateAll(allmethods[[12]], topk, Expgroundtruth, LFC, downreg)
  
  #############genenames
  miRs=colnames(allmethods[[1]])
  #########decorate and return
  result1= cbind(psv[[2]], spv[[2]][,2:3], kendallv[[2]][,2:3], dcovv[[2]][,2:3], hoeffdingv[[2]][,2:3], miv[[2]][,2:3], idav[[2]][,2:3], rdcv[[2]][,2:3], lassov[[2]][,2:3], elasticv[[2]][,2:3],zsv[[2]][,2:3], promisev[[2]][,2:3])
  rownames(result1)=miRs
  result2= cbind(psv[[4]], spv[[4]][,2:3], kendallv[[4]][,2:3], dcovv[[4]][,2:3], hoeffdingv[[4]][,2:3], miv[[4]][,2:3], idav[[4]][,2:3], rdcv[[4]][,2:3], lassov[[4]][,2:3], elasticv[[4]][,2:3],zsv[[4]][,2:3], promisev[[4]][,2:3])
  rownames(result2)=miRs
  result=list(result1, result2)
  return(result)
}

## Compare the validation results of 13 built-in methods ##
#' Filter and compare the validation results from 12 methods
#' Keep the miRNAs that have at least noVal confirmed targets and compare the validation results from all methods.
#' @param allresults the results from all methods generated from experiment function. This is a list.
#' @param noVal Number of confirmed targets in each method (threshold) to filter. Records (miRNA) with less than this will be removed
#' @return the validation results of all methods
#' @examples
#' print("result=filterAndCompare(allresults, 2)")
#' @export 
filterAndCompare=function(allresults, noVal){
  #allresults: the results from all methods generated from experiment function. This is a list
  #noVal: number of confirmed targets in each method (threshold) to filter. Records (miRNA) with less than this will be removed
  ExpResult=allresults[[1]]
  TransResult=allresults[[2]]
  temp1=apply(ExpResult, 1, function(x) all(x[c(2,4,6,8,10,12,14,16,18,20,22,24)]>(noVal-1)))
  ExpResult=ExpResult[temp1,]
  temp2=apply(TransResult, 1, function(x) all(x[c(2,4,6,8,10,12,14,16,18,20,22,24)]>(noVal-1)))
  TransResult=TransResult[temp2,]
  ###If else to incase only one record. In that case the result is not a matrix
  if(is.matrix(ExpResult)){
    ExpResult=ExpResult[,c(2,4,6,8,10,12,14,16,18,20,22,24)]
    colnames(ExpResult)=c( "Pearson", "Spearman", "Kendall", " Dcov", "Hoeffding", "MI", "IDA", "RDC", "Lasso", "Elastic", "Z-score", "ProMISe")
  } else {
    tt=as.matrix(ExpResult)
    ExpResult=t(tt)
    ExpResult=ExpResult[,c(2,4,6,8,10,12,14,16,18,20,22,24)]
    names(ExpResult)=c( "Pearson", "Spearman", "Kendall", " Dcov", "Hoeffding", "MI", "IDA", "RDC", "Lasso", "Elastic", "Z-score", "ProMISe")
  }
  
  if(is.matrix(TransResult)){
    TransResult=TransResult[,c(2,4,6,8,10,12,14,16,18,20,22,24)]
    colnames(TransResult)=c( "Pearson", "Spearman", "Kendall", " Dcov", "Hoeffding", "MI", "IDA", "RDC", "Lasso", "Elastic", "Z-score", "ProMISe")
  } else {
    tt=as.matrix(TransResult)
    TransResult=t(tt)
    TransResult=TransResult[,c(2,4,6,8,10,12,14,16,18,20,22,24)]
    #print(TransResult)
    names(TransResult)=c( "Pearson", "Spearman", "Kendall", " Dcov", "Hoeffding", "MI", "IDA", "RDC", "Lasso", "Elastic", "Z-score", "ProMISe")
  }
  if(is.matrix(ExpResult)){
    numberExpResult=apply(ExpResult, 2, sum)
    ranking=apply(ExpResult, 1, function(i) rank(i))
    ranking=t(ranking)
    rankExpResult=apply(ranking, 2, sum)
  }
  if(is.matrix(TransResult)){
    numberTransResult=apply(TransResult, 2, sum)	
    rankingT=apply(TransResult, 1, function(i) rank(i))
    rankingT=t(rankingT)
    rankTransResult=apply(rankingT, 2, sum)
  }
  if(is.matrix(ExpResult)){
    Exp=list(ExpResult, numberExpResult, rankExpResult)
  } else Exp=ExpResult
  
  if(is.matrix(TransResult)){
    Trans=list(TransResult, numberTransResult, rankTransResult)
  } else Trans=TransResult
  
  result=list(Exp, Trans)
}

#' Convert miRNA symbols from a miRBase version to another 
#' 
#' This function convert the miRNAs in the input file from the "source" miRBase version to the "Target" version. 
#' If users do not know the miRBase version of the input file, please set the source version to 0. The function will match the 
#' miRNAs in the input file to all miRBase versions to find the most likely miRBase version. Currently, we have versions 16-21.
#' @param miRNAListFile the input file containing a list of miRNA symbols in csv format
#' @param sourceV the miRBase version of the input miRNAs, e.g. 16. If users do not know the version, use 0.
#' @param targetV the miRBase version that we want to convert into, e.g. 21.
#' @return A csv file in the working directory containing the converted miRNA symbols.
#' @examples 
#' miRs=system.file("extdata", "ToymiRs.csv", package="miRLAB")
#' convert(miRs, 17, 21) 
#' @export 
convert = function (miRNAListFile,sourceV,targetV) {
  load(system.file("extdata", "database.RData", package="miRLAB"))
  miRNAList = as.matrix( read.csv( miRNAListFile ,header = FALSE) )
  sourceName = miRNAList
  sourceVersion = c()
  targetName = c()
  targetVersion = c()
  
  if (sourceV != 0) # have the source version
  {
    location = match( miRNAList, all[,sourceV-14] )
    isNA = is.na(location)
    targetVersion[which(!isNA)] = targetV
    targetVersion[which(isNA)] = NA
    targetName = all[location, targetV-14]
    sourceVersion = rep(sourceV,length(miRNAList))
  }else 
  {
    allVersionList = c(all[,2],all[,3],all[,4],all[,5],all[,6],all[,7])
    location = match(miRNAList, allVersionList)
    sourceVersion = 16 + (location %/% 2602)
    isNA = is.na(location)
    targetVersion[which(!isNA)] = targetV
    targetVersion[which(isNA)] = NA
    location = location %% 2602
    location[which(location == 0)] = 2602
    targetName = all[location, targetV-14]
  }
  
  res = cbind(sourceName, sourceVersion, targetName, targetVersion)
  colnames(res) = c("sourceName","sourceVersion","targetName","targetVersion")
  write.table(res, file="resOfConvert.csv", sep=",",row.names = FALSE,col.names = TRUE)
}

# A function for preparing data to classify cancer subtypes
#' @importFrom utils write.table
prepareDataForPam50 = function(d, dataDir = "bioclassifier_data", inputFileName = "inputFile.txt") {
  
  d <- t(d)
  noOfRow <- nrow(d)
  noOfCol <- ncol(d)
  temp <- matrix(0, nrow = (noOfRow + 1), ncol = noOfCol)
  row.names(temp) <- c("T", row.names(d))
  colnames(temp) <- colnames(d)
  temp[2:(noOfRow + 1), 1:noOfCol] <- d[, ]
  dir.create(file.path(".", dataDir), showWarnings = FALSE)
  write.table(temp, paste("./", dataDir, "/", inputFileName, sep = ""), col.names=NA, sep = "\t")
}

#' Identify miRNA targets by ICP and PAM50
#' 
#' This function identifies miRNA targets by ICP and PAM50.
#' @importFrom utils read.table
#' @param d A matrix of expression of miRNAs and mRNAs with columns being miRNA or mRNA names and rows being samples
#' @param nmiR Number of miRNAs
#' @param nmR Number of mRNAs
#' @param fiftymRNAsData A matrix of expression of 50 mRNAs in PAM50 with columns being mRNA names and rows being samples
#' @return The matrix of causal effects of miRNAs and mRNAs with columns being miRNAs and rows being mRNAs
#' @references 
#' 1. Parker, J. S., et al. (2009). "Supervised Risk Predictor of Breast Cancer Based on Intrinsic Subtypes." Journal of Clinical Oncology 27(8): 1160-1167.
#' @export 
ICPPam50 = function(d, nmiR, nmR, fiftymRNAsData) {
  
  # Set parameters
  ## Folder including input and output files used to categorise samples into cancer subtypes using Pam50
  dataDir = "bioclassifier_data"
  ## Input file used to categorise samples into cancer subtypes using Pam50
  inputFileName = "inputFile.txt"
  
  # Classify samples based on cancer subtypes using Pam50
  # Reference: Parker, J. S., et al. (2009). "Supervised Risk Predictor of Breast Cancer Based on Intrinsic Subtypes." Journal of Clinical Oncology 27(8): 1160-1167.
  ## Prepare data
  prepareDataForPam50(fiftymRNAsData, dataDir, inputFileName)
  ## Run the assignment algorithm
  source(system.file(package = 'miRLAB', 'bioclassifier_R', 'subtypePrediction_functions.R'))
  source(system.file(package = 'miRLAB', 'bioclassifier_R', 'subtypePrediction_distributed.R'))
  
  ## Read the result
  result <- read.table(paste("./", dataDir, "/", "outputFile_pam50scores.txt", sep = ""))
  result <- result[-1, -1]
  
  # Define experimental settings
  ExpInd <- c(rep(0, nrow(d)))
  ExpInd[which(result[, 6] == "Basal")] <- 1
  ExpInd[which(result[, 6] == "Her2")] <- 2
  ExpInd[which(result[, 6] == "LumA")] <- 3
  ExpInd[which(result[, 6] == "LumB")] <- 4
  ExpInd[which(result[, 6] == "Normal")] <- 5
  
  # Estimate causal effects
  standardizedData <- scale(d)
  temp = matrix(nrow = nmR, ncol = nmiR)
  for (i in 1:nmR) {
    Y <- standardizedData[, nmiR+i]
    X <- standardizedData[, c(1:nmiR)]
    icp <- InvariantCausalPrediction::hiddenICP(X, Y, ExpInd, alpha = 0.1, mode = "asymptotic", intercept=FALSE)
    for (k in 1:nmiR) {
      temp[i,k] <- icp$betahat[k]
    }
  }
  colnames(temp) <- colnames(d)[1:nmiR]
  row.names(temp) <- colnames(d)[(nmiR + 1):(nmiR + nmR)]
  
  return(temp)
}

#' Identify the top miRNA targets by ICP and PAM50
#' 
#' This function identifies the top miRNA targets by ICP and PAM50.
#' @param d A matrix of expression of miRNAs and mRNAs with columns being miRNA or mRNA names and rows being samples
#' @param nmiR Number of miRNAs
#' @param nmR Number of mRNAs
#' @param fiftymRNAsData A matrix of expression of 50 mRNAs in PAM50 with columns being mRNA names and rows being samples
#' @param top 1 if getting the top of all miRNAs and 2 if getting the top of each miRNA
#' @param topk Number of the top to get
#' @return The top k miRNA targets
#' @references 
#' 1. Parker, J. S., et al. (2009). "Supervised Risk Predictor of Breast Cancer Based on Intrinsic Subtypes." Journal of Clinical Oncology 27(8): 1160-1167.
#' @export 
identifymiRTargetsByICPPam50 = function(d, nmiR, nmR, fiftymRNAsData, top = 1, topk = 500) {
  
  temp <- ICPPam50(d, nmiR, nmR, fiftymRNAsData)
  
  # Get the result
  miRTargets = NULL
  if(top == 1) {# Top for all miRNAs
    miRTargets = Extopk(temp, topk)
  } else { # Top for each miRNA
    for(i in 1:nmiR) {
      miRiTopk = bRank(temp, i, topk, FALSE)
      miRTargets <- rbind(miRTargets, miRiTopk)
    }
  }
  
  return(miRTargets)
}

#' Identify the top miRNA targets by an ensemble method with ICP-PAM50, Pearson and Lasso
#' 
#' This function identifies the top miRNA targets by an ensemble method with ICP-PAM50, Pearson and Lasso.
#' @param d A matrix of expression of miRNAs and mRNAs with columns being miRNA or mRNA names and rows being samples
#' @param nmiR Number of miRNAs
#' @param nmR Number of mRNAs
#' @param fiftymRNAsData A matrix of expression of 50 mRNAs in PAM50 with columns being mRNA names and rows being samples
#' @param top 1 if getting the top of all miRNAs and 2 if getting the top of each miRNA
#' @param topk Number of the top to get
#' @return The top k miRNA targets
#' @references 
#' 1. Parker, J. S., et al. (2009). "Supervised Risk Predictor of Breast Cancer Based on Intrinsic Subtypes." Journal of Clinical Oncology 27(8): 1160-1167.
#' @export 
identifymiRTargetsByEnsemble = function(d, nmiR, nmR, fiftymRNAsData, top = 1, topk = 500) {
  
  # Get the results of ICPPam50, Pearson and Lasso
  hid <- ICPPam50(d, nmiR, nmR, fiftymRNAsData)
  standardizedData <- scale(d)
  dir.create(file.path(".", "temp_data"), showWarnings = FALSE)
  write.table(standardizedData, file = "./temp_data/input.csv", sep = ",", row.names = FALSE)
  pea = Pearson("./temp_data/input.csv", cause = 1:nmiR, effect = (nmiR+1):(nmiR+nmR))
  las = Lasso("./temp_data/input.csv", cause = 1:nmiR, effect = (nmiR+1):(nmiR+nmR))
  
  row.names(pea) <- row.names(hid)
  row.names(las) <- row.names(hid)
  
  # Call the ensemble method
  borda = Borda(list(hid, pea, las))
  
  # Get the result
  miRTargets = NULL
  if(top == 1) {# Top for all miRNAs
    miRTargets = Extopk(borda, topk)
  } else { # Top for each miRNA
    for(i in 1:nmiR) {
      miRiTopk = bRank(borda, i, topk, FALSE)
      miRTargets <- rbind(miRTargets, miRiTopk)
    }
  }
  
  return(miRTargets)
}

				

Try the miRLAB package in your browser

Any scripts or data that you put into this service are public.

miRLAB documentation built on Nov. 8, 2020, 5:45 p.m.