CONSTANTS <- list(
uniProt2HGNC = list(
fileURL='http://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=md_prot_id&status=Approved&status_opt=2&where=&order_by=gd_hgnc_id&format=text&limit=&submit=submit',
downloadedFilename='uniProt2HGNC.csv'
),
reactome = list(
pathways = list(
fileURL = 'http://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt',
downloadedFilename = 'UniProt2Reactome_All_Levels.txt',
speciesMap = list(HomoSapiens = 'Homo sapiens')
),
interactions = list(
HomoSapiens = list(
mitabURL = 'http://www.reactome.org/download/current/homo_sapiens.mitab.interactions.txt.gz',
downloadedFilename = 'homo_sapiens.mitab.interactions.txt.gz')
)
)
)
#'Get a mapping table between UniProt ID's and HGNC Gene Symbols from
#'genenames.org
#'
#'This function downloads data from genenames.org to create a mapping table
#'between Universal Protein Resource (UniProt) Identifiers to HUGO Gene
#'Nomenclature Committee; gene symbols - which are the default ID's used in this
#'package. This is particularly useful in the case of reactome data, in which
#'interactions as well as pathway membership use UniProt ID's. It can reuse
#'downloaded files if the downloaded folder is passed as an argument
#'
#'@param data_folder - The folder where the data resides, or
#' where it should be downloaded if it doesnt exist
#'
#'@export
#' @references \url{http://www.genenames.org/cgi-bin/download}
#' @examples
#' \dontrun{
#' data_folder<-tempdir()
#' usmap<-getUniProtToHGNCSymbolMapping(data_folder)
#' }
getUniProtToHGNCSymbolMapping <- function(data_folder) {
dFilePath <-
file.path(data_folder,CONSTANTS$uniProt2HGNC$downloadedFilename)
if(!file.exists(dFilePath)){
utils::download.file(CONSTANTS$uniProt2HGNC$fileURL,dFilePath);
}
us <- utils::read.csv(
dFilePath
,sep = "\t",header = FALSE,skip = 1,stringsAsFactors = FALSE
);
us <- subset(us[,c(2,3)],us$V3 != '')
colnames(us) <- c('HGNCSymbol','uniProtID')
return(us)
}
#' Create graph object from gene interaction list
#'
#' Convert a data frame containing the gene interaction list into a form usable
#' by the other functions in the library. This is just a wrapper around the
#' igraph graph.edgelist function, which creates and returns an igraph object
#' with a data frame containing the in and outdegrees.
#'
#'
#'
#' @param geneInteractions - a data frame that contains the gene interaction
#' graph in two columns, the first being the source HGNCSymbol and the second
#' being the target HGNCSymbol of an interaction. See
#' \code{\link{getGeneInteractionsFromReactomeMITAB}},
#' \code{\link{getGeneInteractionsFromIRefMITAB}} for more details
#'
#' @return A list containing igraph_object - the igraph object and vertex_map -
#' a frame containing node attributes currently indegree and outdegree
#' @export
#'
#' @references \url{http://igraph.org/r/doc/graph_from_edgelist.html}
#' @seealso \code{\link{getGeneInteractionsromReactomeMITAB}},
#' \code{\link{getGeneInteractionsFromIRefMITAB}}
#'
#' @examples
#' \dontrun{
#' data_folder=tempdir()
#' usmap<-getUniProtToHGNCSymbolMapping(data_folder)
#' mitab <- downloadReactomeInteractionsMITAB(data_folder)
#' geneInteraction <- getGeneInteractionListFromReactomeMITAB(mitab,usmap)
#' ppiGraph<-createIGraphObject(geneInteraction)
#' }
#'
createIGraphObject <- function(geneInteractions) {
g <- igraph::graph.edgelist(as.matrix(geneInteractions))
vmap <-
data.frame(indegree = igraph::degree(g,mode = "in"),
outdegree = igraph::degree(g,mode = "out"))
row.names(vmap) <- igraph::V(g)$name
return(list(igraph_object = g,vertex_map = vmap))
}
#' Download current Pathway Data from Reactome
#'
#' Downloads current pathway data (protein-to-pathway mapping) from reactome,
#' saves it in a local directory and returns a data frame usable by the library.
#' It can reuse downloaded files if the downloaded folder is passed as an
#' argument
#'
#' @param data_folder - The folder where the data resides, or where it should be
#' downloaded if it doesnt exist
#' @param species - The species being analysed. Currently only supports the
#' string 'HomoSapiens'
#'
#' @return a data frame containing three columns, uniProtID, reactomeID,
#' reactomeDescription which stand for the protein identifier, pathway
#' identifier and the pathway description
#' @export
#'
#' @references \url{http://www.reactome.org/pages/download-data/}
#' @examples
#' \dontrun{
#' dataFolder <- tempdir()
#' pathwayData<-downloadReactomePathways(dataFolder)
#' }
downloadReactomePathways <- function(data_folder,species='HomoSapiens') {
dFilePath <-
file.path(data_folder,CONSTANTS$reactome$pathways$downloadedFilename)
if(!file.exists(dFilePath)){
utils::download.file(CONSTANTS$reactome$pathways$fileURL,dFilePath);
}
pdata <- utils::read.csv(
dFilePath,sep = '\t',header = FALSE,skip = 1,stringsAsFactors = FALSE
)
hpdata <-
subset(pdata,
pdata[,6] == CONSTANTS$reactome$pathways$speciesMap[species]
)[,c(1,2,4)]
colnames(hpdata) <-
c("uniProtID","reactomeID","reactomeDescription")
return(hpdata)
}
#' Download PPI Reactions from Reactome in MITAB format
#'
#' Downloads current PPI data (protein-to-protein interactions) in MITAB format
#' from reactome, saves it in a local directory and returns a data frame usable
#' by the library. It can reuse downloaded files if the downloaded folder is
#' passed as an argument
#'
#' @param data_folder - The folder where the data resides, or where it should be
#' downloaded if it doesnt exist
#' @param species - The species being analysed. Currently only supports the
#' string 'HomoSapiens'
#'
#' @return a data frame that results when a mitab file is read
#' @export
#' @references \url{http://www.reactome.org/pages/download-data/},
#' \url{http://www.psidev.info/groups/molecular-interactions}
#' @examples
#' \dontrun{
#' data_folder <- tempdir()
#' reactomeMITAB<-downloadReactomeInteractionsMITAB(data_folder)
#' }
downloadReactomeInteractionsMITAB <- function(data_folder,species='HomoSapiens')
{
dFilePath <-
file.path(data_folder,
get(species,CONSTANTS$reactome$interactions)$downloadedFilename)
if(!file.exists(dFilePath)){
utils::download.file(get(species,CONSTANTS$reactome$interactions)$mitabURL,
dFilePath);
}
return(utils::read.csv(
gzfile(dFilePath),header = FALSE,sep = "\t",skip = 1,
stringsAsFactors = FALSE
));
}
#' Get a gene interaction data frame from a reactome ppi mitab data frame
#'
#' @param mitab - The mitab data frame containing PPI's
#' See \code{\link{downloadReactomeInteractionsMITAB}}
#'
#' @param usmap - A mapping file between UniProt ID's and HGNC Symbols
#' See \code{\link{getUniProtToHGNCSymbolMapping}}
#'
#' @return a data frame that contains the gene interaction
#' graph in two columns, the first being the source HGNCSymbol and the second
#' being the target HGNCSymbol of an interaction
#' @export
#' @seealso \code{\link{downloadReactomeInteractionsMITAB}}
#'
#' @examples
#' \dontrun{
#' data_folder=tempdir()
#' usmap<-getUniProtToHGNCSymbolMapping(data_folder)
#' mitab <- downloadReactomeInteractionsMITAB(data_folder)
#' geneInteractions <- getGeneInteractionsFromReactomeMITAB(mitab,usmap)
#' }
getGeneInteractionsFromReactomeMITAB <- function(mitab,usmap) {
idata <- mitab[,c(1,2)]
idata[,1] <- gsub('uniprotkb:','',gsub('-[0-9]','',idata[,1]));
idata[,2] <- gsub('uniprotkb:','',gsub('-[0-9]','',idata[,2]));
colnames(idata) <- c("aup","bup")
idata <- subset(idata,idata$aup != idata$bup);
mapgraphA <- merge(
x = idata,y = usmap,by.x = "aup",by.y = "uniProtID"
)[,c(3,2)]
mapgraph <- merge(
x = mapgraphA,y = usmap,by.x = "bup",by.y = "uniProtID"
)[,c(2,3)]
colnames(mapgraph) <- c("A","B")
geneInteractions <- subset(unique(mapgraph),mapgraph$A != mapgraph$B)
return(geneInteractions[complete.cases(geneInteractions),])
}
#' Get a gene interaction data frame from a iRef ppi mitab data frame
#'
#' @param mitab @param mitab - The mitab data frame containing PPI's e.g
#' Obtained using
#' \url{http://www.inside-r.org/packages/cran/iRefR/docs/get_irefindex}
#'
#' @return a data frame that contains the gene interaction
#' graph in two columns, the first being the source HGNCSymbol and the second
#' being the target HGNCSymbol of an interaction
#' @export
#'
#' @examples
#' \dontrun{
#' library(iRefR)
#' iref_mitab=get_irefindex(tax_id="9606",data_folder="/tmp/",iref_version = "13.0")
#' irefGeneInteractions = getGeneInteractionListFromIRefMITAB(iref_mitab)
#' }
getGeneInteractionsFromIRefMITAB <- function(mitab) {
factori <- sapply(mitab, is.factor)
mitab[factori] <- lapply(mitab[factori], as.character)
idata <- unique(subset(mitab[,c('aliasA','aliasB')],
(
stringr::str_count(mitab$aliasA,'hgnc') == 1 &
stringr::str_count(mitab$aliasB,'hgnc') == 1
)))
idata$aliasA <- gsub('hgnc:','',
grep('hgnc:',
unlist(strsplit(
idata$aliasA,'\\|'
)),value = TRUE))
idata$aliasB <- gsub('hgnc:','',
grep('hgnc:',
unlist(strsplit(
idata$aliasB,'\\|'
)),value = TRUE))
idata <- unique(idata)
colnames(idata) <- c("A","B")
idataE <- unique(subset(mitab[,c('aliasA','aliasB')],
(
stringr::str_count(mitab$aliasA,'hgnc') > 1 &
stringr::str_count(mitab$aliasB,'hgnc') > 1
)))
list_function <-
function(cvec) {
return(gsub('hgnc:','',grep('hgnc:',cvec,value = TRUE)))
}
idataE$aliasA <-
lapply(stringr::str_split(idataE$aliasA,'\\|'),list_function)
idataE$aliasB <-
lapply(stringr::str_split(idataE$aliasB,'\\|'),list_function)
idataES <- do.call(rbind,apply(idataE,1,
function(item) {
return(merge(
x = unlist(item[1]),
y = unlist(item[2]),
all = TRUE,
stringsAsFactors = FALSE
))
}))
colnames(idataES) <- c("A","B")
geneInteractions<-unique(rbind(idata,idataES))
return(subset(geneInteractions,
geneInteractions$A != geneInteractionList$B))
}
#' Load pathway data from Reactome
#'
#' @param pdata a data frame containing three columns, uniProtID, reactomeID,
#' reactomeDescription which stand for the protein identifier, pathway
#' identifier and the pathway description. See
#' \code{\link{downloadReactomePathways}},for more details
#' @param usmap A mapping file between UniProt ID's and HGNC Symbols
#' See \code{\link{getUniProtToHGNCSymbolMapping}}
#'
#' @return A list containing two elements -
#' pathways- a single column data frame with description
#' string for each unique pathway id
#' memberships - a list of character vectors of gene ids
#' indexed by pathway id, containing the list of hgnc symbols
#' of genes in each pathway.
#' @export
#'
#' @examples
#' \dontrun{
#' dataFolder <- tempdir()
#' rawPathwayData<-downloadReactomePathways(dataFolder)
#' pdata<-loadPathwayDataReactome(pdata)
#' }
loadPathwayDataReactome <- function(pdata,usmap) {
memberships = unique(merge(pdata,usmap,by = "uniProtID")[,c(4,2)])
pathwayMembership = split(memberships,memberships$reactomeID)
pathways <- unique(pdata[,c(2,3)])
row.names(pathways) = pathways$reactomeID
pathways = pathways[,c("reactomeDescription"),drop = FALSE]
listFunction <- function(df) {
return (df[,c("HGNCSymbol")])
}
pathwayMembership = lapply(pathwayMembership, listFunction)
pathways = pathways[names(pathwayMembership),,drop = FALSE]
return(list(pathwayInfo = pathways,
pathwayMembership = pathwayMembership))
}
#' Combine probes by mean expression value
#'
#' Multiple probes may map to the same HGNC Symbol, and this function
#' that is to be used as input to the \code{\link{loadExpressionData}} combines
#' these probes by taking the mean of the expression values
#' @param A string containing comma separated probe names
#' @param expdata a matrix of expression values returned from functions such as
#' \url{'http://svitsrv25.epfl.ch/R-doc/library/Biobase/html/exprs.html'}
#'
#' @return a single value representing the gene expression
#' @export
#'
probeCombinerMean <- function(pls,expdata) {
probes = unlist(strsplit(pls,','))
if (length(probes) > 1) {
return(colMeans(expdata[probes,]))
}else{
return(expdata[probes,])
}
}
#' Select only _at and _a_at probe sets for analysis
#'
#' There are probe sets with different suffixes depending on probe behavior,(See
#' \url{http://www.affymetrix.com/support/help/faqs/hgu133/index.jsp} for more
#' information) and this function that is to be used as input to the
#' \code{\link{loadExpressionData}} selects only the expression values
#' corresponding probe set suffixes _at and _a_at for analysis
#' @param probeName the name of the probe set
#'
#' @return a binary value indicating if probeName is to be selected
#' @export
#'
probeSelectorATSAT <- function(probeName){
return( grepl("[0-9]+_at", probeName) |
grepl("[0-9]+_a_at", probeName))
}
#' Load experiment data
#'
#' @param expdata a probeName x sample matrix of expression values
#' expression data
#' @param hgnc2probe a data frame with two columns probeName and HGNCSymbol
#' for mapping probeNames to HGNC Symbols
#' @param selectorFunction a function that accepts a probeName string and
#' returns true if the probe is to be considered. See
#' \code{\link{probeSelectorATSAT}} for an example.
#' @param combinerFunction a function that accepts a comma separated string and
#' returns true if the probe is to be considered. See
#' \code{\link{probeCombinerMean}} for an example.
#'
#' @return A data frame with columns as sample names, and rows as HGNC Symbols
#' containing the expression values for analysis
#' @export
#'
#'
loadExperimentData <- function(expdata,hgnc2probe,
selectorFunction=probeSelectorATSAT,
combinerFunction=probeCombinerMean) {
dataRowNames <- data.frame(probeName = row.names(expdata))
selectedProbes <- subset(hgnc2probe,
(
selectorFunction(hgnc2probe$probeName)
) & !grepl(" ",hgnc2probe$HGNCSymbol))
selectedProbes <-
merge(x = selectedProbes,y = dataRowNames,by = "probeName")[,c(1,2)]
hgncProbeList <- sqldf::sqldf(
"select a.HGNCSymbol, group_concat(a.probeName) as
probes from hgnc2probe a inner join selectedProbes b
on b.probeName=a.probeName where a.HGNCSymbol != ''
and a.HGNCSymbol is not null and
a.probeName is not null group by a.HGNCSymbol"
)
data <- t(sapply(hgncProbeList$probes,combinerFunction,expdata))
row.names(data) <- hgncProbeList$HGNCSymbol
return(data)
}
#' Import expression data and probe mappings from GEO dataset
#'
#' This function processes the expression set and gpl files obtained using the
#' GEOQuery package
#'
#' @param eset An expression set, for more details refer to
#' \url{https://www.bioconductor.org/packages/3.3/bioc/vignettes/Biobase/inst/doc/ExpressionSetIntroduction.pdf}
#' and \url{http://svitsrv25.epfl.ch/R-doc/library/GEOquery/html/GDS2MA.html}
#'
#' @param gpl GEO Platform entity, refer
#' \url{http://svitsrv25.epfl.ch/R-doc/library/GEOquery/html/GPL-class.html}
#' for more details
#'
#' @param selectorFunction a function that accepts a probeName string and
#' returns true if the probe is to be considered. See
#' \code{\link{probeSelectorATSAT}} for an example.
#' @param combinerFunction a function that accepts a comma separated string and
#' returns true if the probe is to be considered. See
#' \code{\link{probeCombinerMean}} for an example.
#'
#' @return A data frame with columns as sample names, and rows
#' containing the expression values of each as HGNC Symbol.
#' @export
#'
#' @examples
#' \dontrun{
#' library(GEOquery)
#' data_folder = tempdir()
#' gds = getGEO('GDS3837',AnnotGPL = TRUE, getGPL = TRUE,destdir = data_folder)
#' gpl = getGEO(Meta(gds)$platform,destdir = data_folder)
#' eset = GDS2eSet(gds, GPL=gpl,do.log2=FALSE)
#' gdata=importExpressionDataGEO(eset,gpl)
#' }
importExpressionDataGEO <- function(eset,gpl,selectorFunction=probeSelectorATSAT,combinerFunction=probeCombinerMean) {
expdata <- Biobase::exprs(eset)
hgnc2probe <- GEOquery::Table(gpl)[,c("ID","Gene Symbol")]
names(hgnc2probe) <- c("probeName","HGNCSymbol")
return(loadExperimentData(expdata,hgnc2probe,selectorFunction,combinerFunction))
}
#' Paired t-test based test function
#'
#' Perform a paired t-test on expression data for a single gene
#' It is meant to be used with \code{\link{runTestOnData}}
#' This function is used to obtain a p-value on the fold change between normal
#' and disease samples for a single gene. If the data is already logarithm
#' transformed in this function, then the exponent must be passed in the list
#' extraArgs
#'
#' @param nData A vector with numSamples/2 normal expression values for a gene
#' @param dData A vector with numSamples/2 disease expression values for a gene
#' @param hgncSymbol The HGNC Symbol of the gene
#' @param extraArgs #' @param extraArgs A list of named extra arguments, the
#' only name supported is exponent (assumed to be 1 if not provided for non
#' log transformed data). An exponent not equal to 1 assumes that the
#' expression data has been log transformed with base exponent
#'
#' @return A single row data frame with two columns foldChange and pValue
#' @export
#'
logPairedTTestFunction <-
function(nData,dData,hgncSymbol,extraArgs) {
if (is.null(nData) && is.null(dData) && is.null(hgncSymbol)) {
return(
data.frame(
foldChange = numeric(0),
pValue = numeric(0)
)
)
}
if (is.null(extraArgs$exponent)) {
exponent = 1
}else{
exponent = extraArgs$exponent
}
if (exponent == 1) {
nDataLog = log(1 + nData)
dDataLog = log(1 + dData)
result = stats::t.test(dDataLog,nDataLog,paired = TRUE)
fc = exp(result$estimate)
}else{
nDataLog = log(1 + nData)
dDataLog = log(1 + dData)
result = stats::t.test(dData,nData,paired = TRUE)
fc = exponent ^ result$estimate
}
fc[fc < 1] = 1 / fc[fc < 1]
return(data.frame(
foldChange = fc,pValue = result$p.value,row.names = c(hgncSymbol)
))
}
#' Simple fold change test function
#'
#' Computes fold change for two sample expression data for a single gene.
#' It is meant to be used with \code{\link{runTestOnData}}.
#' @param nData The scalar normal expression values for a gene
#' @param dData The scalar disease expression values for a gene
#' @param hgncSymbol The HGNC Symbol of the gene
#' @param extraArgs A list of named extra arguments, the only name supported is
#' exponent (assumed to be 1 if not provided for non log transformed data). An
#' exponent not equal to 1 assumes that the expression data has been log
#' transformed with base exponent
#'
#' @return A single row data frame with one column foldChange
#' @export
#'
foldChangeFunction <- function(nData,dData,hgncSymbol,extraArgs) {
if (is.null(nData) && is.null(dData) && is.null(hgncSymbol)) {
return(data.frame(foldChange = numeric(0)))
}
if (is.null(extraArgs$exponent)) {
exponent = 1
}else{
exponent = extraArgs$exponent
}
if (exponent == 1) {
fc = dData / nData;
}else{
fc = dData - nData;
}
fc = exponent ^ fc
fc[fc < 1] = 1 / fc[fc < 1]
fc[is.na(fc)] = 1
return(data.frame(foldChange = fc,row.names = c(hgncSymbol)))
}
#' Test expression data
#'
#' Runs the specified test function on the expression data to obtain fold change
#' and p-values.
#'
#' @param data A data frame with columns as sample names, and rows containing
#' the expression values of each as HGNC Symbol. For example the output of
#' \code{\link{importExpressionDataGEO}}
#' @param normalSampleIndexes The indices of the columns which contain the
#' normal samples
#' @param diseaseSampleIndexes The indices of the columns which contain the
#' disease samples
#' @param testFunction The test function - see
#' \code{\link{logPairedTTestFunction}}
#' @param testFunctionExtraArgs Extra arguments to the test function
#'
#' @return A data frame with as many rows as HGNC Gene Symbols, with the
#' columns defined by the output of testFunction.
#' @export
#'
#' @examples
#' \dontrun{
#' gds = getGEO('GDS3837',AnnotGPL = TRUE, getGPL = TRUE,destdir = "/tmp")
#' gpl = getGEO(Meta(gds)$platform,destdir = "/tmp")
#' eset = GDS2eSet(gds, GPL=gpl,do.log2=FALSE)
#' numPairs = dim(pData(eset))[1]/2
#' expressionDataGEO=importExpressionDataGEO(eset,gpl,defaultProbeSelector,probeCombinerMean)
#' experimentDataGEO=runTestOnData(expressionDataGEO,1:60,61:120,logPairedTTestFunction)
#' }
runTestOnData <-
function (data,normalSampleIndexes,diseaseSampleIndexes,testFunction,
testFunctionExtraArgs) {
hgncSymbols = row.names(data)
normalData = t(data[,normalSampleIndexes])
diseaseData = t(data[,diseaseSampleIndexes])
result = testFunction(NULL,NULL,NULL);
for (i in 1:length(hgncSymbols)) {
result = rbind(
result,testFunction(
normalData[,i],diseaseData[,i],hgncSymbols[i],
testFunctionExtraArgs
)
)
}
row.names(result) = hgncSymbols
return(result)
}
#' Outdegree normalized fold change based scoring
#'
#' Uses the fold change values and the outdegree of the nodes to define a score
#' function for use with \code{\link{computePersonalizationVectors}}
#'
#' @param experimentalData A data frame with as many rows as HGNC Gene Symbols,
#' with the columns defined by the output of testFunction. The output of
#' \code{\link{runTestOnData}}.
#' @param PPIGraph A list containing igraph_object - the igraph object and
#' vertex_map - a frame containing node attributes currently indegree and
#' outdegree. The output of \code{\link{createIGraphObject}}
#' @param extraArgs NA
#'
#' @return A data frame with two columns npv - normal personalization vector
#' and dpv - disease personalization vector. There are as many rows as number
#' of HGNC Symbols in PPIGraph
#' @export
#'
outdegreeNormalizedFCScoreFunction <-
function(experimentalData,PPIGraph,extraArgs) {
npv = PPIGraph$vertex_map$outdegree / sum(PPIGraph$vertex_map$outdegree)
dpv = PPIGraph$vertex_map$outdegree * experimentalData$foldChange
dpv = dpv / sum(dpv)
result = data.frame(normal = npv,disease = dpv)
row.names(result) = row.names(experimentalData)
return(result)
}
#' Outdegree normalized fold change and p-value based scoring
#'
#' Uses the fold change, p-values and the outdegree of the nodes to define a
#' score function for use with \code{\link{computePersonalizationVectors}}
#'
#' @param experimentalData experimentalData A data frame with as many rows as
#' HGNC Gene Symbols, with the columns defined by the output of testFunction.
#' The output of \code{\link{runTestOnData}}.
#' @param PPIGraph PPIGraph A list containing igraph_object - the igraph object
#' and vertex_map - a frame containing node attributes currently indegree and
#' outdegree. The output of \code{\link{createIGraphObject}}
#' @param extraArgs NA
#'
#' @return A data frame with two columns npv - normal personalization vector and
#' dpv - disease personalization vector. There are as many rows as number of
#' HGNC Symbols in PPIGraph
#'
#' @export
#'
outdegreeNormalizedFCOneMinusPScoreFunction <-
function(experimentalData,PPIGraph,extraArgs) {
npv = PPIGraph$vertex_map$outdegree / sum(PPIGraph$vertex_map$outdegree)
dpv = PPIGraph$vertex_map$outdegree * experimentalData$foldChange *
(1 - experimentalData$pValue)
dpv = dpv / sum(dpv)
result = data.frame(normal = npv,disease = dpv)
row.names(result) = row.names(experimentalData)
return(result)
}
#' Compute the personalization vectors based on experimental data based on a
#' score function
#'
#' This function computes the personalization vectors from the experiment data
#' based on an input score function based on the test parameters.
#'
#' @param experimentalData experimentalData A data frame with as many rows as
#' HGNC Gene Symbols, with the columns defined by the output of testFunction.
#' The output of \code{\link{runTestOnData}}.
#' @param PPIGraph PPIGraph A list containing igraph_object - the igraph object
#' and vertex_map - a frame containing node attributes currently indegree and
#' outdegree. The output of \code{\link{createIGraphObject}}
#' @param defaults A data frame corresspoding to the default experimental values
#' for genes missing from the experiment. This must be of the same format as
#' the output of the scoreFunction.
#' @param scoreFunction A score function that accepts the experimentalData and
#' the PPI Graph to return the normal and disease personalization vectors. Foe
#' example see \code{\link{outdegreeNormalizedFCScoreFunction}} or
#' \code{\link{outdegreeNormalizedFCOneMinusPScoreFunction}}
#' @param scoreFunctionExtraArgs A list of extra arguments that are to be passed
#' to the score function if any.
#'
#' @return A data frame with two columns npv - normal personalization vector and
#' dpv - disease personalization vector. There are as many rows as number of
#' HGNC Symbols in PPIGraph
#' @export
#'
#' @examples
#' \dontrun{
#' gds = getGEO('GDS3837',AnnotGPL = TRUE, getGPL = TRUE,destdir = "/tmp")
#' gpl = getGEO(Meta(gds)$platform,destdir = "/tmp")
#' eset = GDS2eSet(gds, GPL=gpl,do.log2=FALSE)
#' numPairs = dim(pData(eset))[1]/2
#' expressionDataGEO=importExpressionDataGEO(eset,gpl,defaultProbeSelector,probeCombinerMean)
#' experimentDataGEO=runTestOnData(expressionDataGEO,1:60,61:120,logPairedTTestFunction)
#' personalizationVectorsGEO=computePersonalizationVectors(experimentDataGEO,reactomePPIGraph,data.frame(foldChange=1,pValue=1),outdegreeNormalizedFCOneMinusPScoreFunction)
#' }
computePersonalizationVectors <-
function (experimentalData,PPIGraph,defaults,
scoreFunction,scoreFunctionExtraArgs) {
common = intersect(row.names(experimentalData),
row.names(PPIGraph$vertex_map))
absent = setdiff(row.names(PPIGraph$vertex_map),
row.names(experimentalData))
if (ncol(experimentalData) == 1) {
experimentalDataSubset = experimentalData[common,1,drop = FALSE]
experimentalDataSubset[absent,1] = defaults
}else{
experimentalDataSubset = experimentalData[common,]
experimentalDataSubset[absent,] = defaults
}
experimentalDataSubset =
experimentalDataSubset[row.names(PPIGraph$vertex_map),,drop =
FALSE]
result = scoreFunction(experimentalDataSubset,PPIGraph,
scoreFunctionExtraArgs)
row.names(result) = row.names(experimentalDataSubset)
return(result)
}
#' Compute the pageRanks for the genes.
#'
#' This function computes the pageRanks for the genes based on the input
#' personalization vectors constructed from experimental data.
#'
#' @param personalizationVectors A data frame with two columns npv - normal
#' personalization vector and dpv - disease personalization vector. There are
#' as many rows as number of HGNC Symbols in experimentData
#' @param PPIGraph PPIGraph PPIGraph A list containing igraph_object - the
#' igraph object and vertex_map - a frame containing node attributes currently
#' indegree and outdegree. The output of \code{\link{createIGraphObject}}
#' @param alpha The damping factor
#'
#' @return A data frame with two columns containing the normal page rank and the
#' disease page rank. There are as many rows as number of HGNC Symbols in PPI
#' Graph
#' @export
#' @examples
#' \dontrun{
#' gds = getGEO('GDS3837',AnnotGPL = TRUE, getGPL = TRUE,destdir = "/tmp")
#' gpl = getGEO(Meta(gds)$platform,destdir = "/tmp")
#' eset = GDS2eSet(gds, GPL=gpl,do.log2=FALSE)
#' numPairs = dim(pData(eset))[1]/2
#' expressionDataGEO=importExpressionDataGEO(eset,gpl,defaultProbeSelector,probeCombinerMean)
#' experimentDataGEO=runTestOnData(expressionDataGEO,1:60,61:120,logPairedTTestFunction)
#' personalizationVectorsGEO=computePersonalizationVectors(experimentDataGEO,reactomePPIGraph,data.frame(foldChange=1,pValue=1),outdegreeNormalizedFCOneMinusPScoreFunction)
#' pageRanksGEO=computePageRanks(personalizationVectorsGEO,reactomePPIGraph,0.7)
#' }
computePageRanks <-
function(personalizationVectors,PPIGraph,alpha) {
pvs = personalizationVectors[row.names(PPIGraph$vertex_map),]
npr = igraph::page.rank(PPIGraph$igraph_object,
personalized = pvs$normal,damping = alpha)
ppr = igraph::page.rank(PPIGraph$igraph_object,
personalized = pvs$disease,damping = alpha)
result = data.frame(normalPageRank = npr$vector,
diseasePageRank = ppr$vector)
return(result)
}
#' Mean Absolute Deviation between two vectors.
#'
#' Computes the mean absolute deviation between two vectors. This is meant
#' to be used as a distance function for \code{\link{computePathwayScores}}
#'
#' @param A A vector of real values
#' @param B A vector of real values of the same size as A
#' @param extraArgs NA
#'
#' @return A data frame containing the mean absolute deviation
#' between A and B as a single value
#' @export
#'
meanAbsoluteDeviationDistanceFunction <-
function(A,B,extraArgs) {
return(data.frame(meanAbsoluteDeviation = sum(
abs(B - A)
) / length(A)))
}
#' KL Divergence between two vectors.
#'
#' Computes the KL divergence between two vectors. This is meant
#' to be used as a distance function for \code{\link{computePathwayScores}}
#'
#' @param A A vector of real values
#' @param B A vector of real values of the same size as A
#' @param extraArgs NA
#'
#' @return A data frame containing the KL Divergence
#' between A and B as a single value
#' @export
#'
KLDivergenceDistanceFunction <-
function(A,B,extraArgs) {
return(
data.frame(
KLDivergence =
entropy::KL.plugin(A,B)))
}
#' Compute the pathway scores from the pageRanks.
#'
#' This function computes the pathway scores for the given pathways and the
#' pageRanks using the passed distanceFunction
#'
#' @param pageRanks A data frame with two columns containing the normal page
#' rank and the disease page rank. There are as many rows as number of HGNC
#' Symbols in PPI Graph. The output from \code{\link{computePageRanks}}
#' @param pathwayMembership A list containing two elements - pathways- a single
#' column data frame with description string for each unique pathway id
#' memberships - a list of character vectors of gene ids indexed by pathway
#' id, containing the list of hgnc symbols of genes in each pathway. The
#' output of \code{\link{loadPathwayDataReactome}}
#' @param distanceFunction A function to compare the normal and disease pagerank
#' vectors. See \code{\link{KLDivergenceDistanceFunction}} or
#' \code{\link{meanAbsoluteDeviationDistanceFunction}} foe examples
#' @param distanceFunctionExtraArgs A list of extra arguments to be passed to
#' the distance function if any.
#'
#' @return A data frame containing the deviation score for each pathway.
#' @export
#' @examples
#' \dontrun{
#' gds = getGEO('GDS3837',AnnotGPL = TRUE, getGPL = TRUE,destdir =
#' "/tmp") gpl = getGEO(Meta(gds)$platform,destdir = "/tmp") eset =
#' GDS2eSet(gds, GPL=gpl,do.log2=FALSE) numPairs = dim(pData(eset))[1]/2
#' expressionDataGEO=importExpressionDataGEO(eset,gpl,defaultProbeSelector,probeCombinerMean)
#' experimentDataGEO=runTestOnData(expressionDataGEO,1:60,61:120,logPairedTTestFunction)
#' personalizationVectorsGEO=computePersonalizationVectors(experimentDataGEO,reactomePPIGraph,data.frame(foldChange=1,pValue=1),outdegreeNormalizedFCOneMinusPScoreFunction)
#' pageRanksGEO=computePageRanks(personalizationVectorsGEO,reactomePPIGraph,0.7)
#' pathwayScoresGEO=computePathwayScores(pageRanksGEO,reactomePathwayData$pathwayMembership,meanAbsoluteDeviationDistanceFunction,list())
#' }
computePathwayScores <-
function(pageRanks,pathwayMembership,distanceFunction,
distanceFunctionExtraArgs) {
listFunction <- function(item,pageranks,analysis,analysisparam) {
return(data.frame(
analysis(pageranks[item,"normalPageRank"],
pageranks[item,"diseasePageRank"],
analysisparam),stringsAsFactors = FALSE
))
}
return(do.call(
"rbind",lapply(
pathwayMembership,listFunction,
pageranks = pageRanks,
analysis = distanceFunction,
analysisparam = distanceFunctionExtraArgs
)
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.