# queries to retrieve and analyze chemical classes
#' Returns chemical class information comparing a metabolite subset to a larger metabolite population.
#'
#' @param mets a list object of source prepended metaboite ids, representing a metabolite set of interest
#' @param background an optional list of source prepended metaboite ids to be used as the background reference of
#' metabolites for enrichment. The background can be either a list of ids, a file name containing the id list,
#' one id per column (no file header row) or a specificed biospecimen type (available biospecimen types: "Blood",
#' "Adipose tissue", "Heart", "Urine", "Brain", "Liver", "Kidney","Saliva", or "Feces").
#' @param backgroundType one of 'database' (all analytes in the RaMP Database), 'list' (a list of input ids),
#' or 'file' in which case the background parameter will be a file path, or 'biospecimen' where the specified background parameter is
#' a RaMP HMDB metabolite ontology term (see background parameter, above, for the most common biospecimen background values).
#' @param includeRaMPids include internal RaMP identifiers (default is "FALSE")
#' @param inferIdMapping if FALSE, the method only reports on class annotations made directly on the input ids.
#' If inferIdMapping is set to TRUE, the input ids are cross-referenced or mapped to other existing ids that contain metabolite class annotations.
#' Following id cross references can expand coverage if the input type is other than HMDB ids or LIPIDMAPS ids.
#' The default value is FALSE.
#' @param db a RaMP database object
#' @return Returns chemical class information data including class count tallies and comparisons between metabolites of interest and the metabolite population,
#' metabolite mappings to classes, and query summary report indicating the number of input metabolites that were resolved and listing those metabolite ids
#' that are not found in the database.
#'
#' The returned object (return_obj below) contains three or four main result areas. Use str(return_obj) to see the structure described here.
#'
#' \strong{return_obj$count_summary} contains a dataframe for each category of class annotations (e.g. class, sub_class)
#' This count summary contains:
#' \describe{
#' \item{category or level name}{The chemical class category, e.g. class, sub_class, super_class}
#' \item{class name}{The chemical class name, e.g. Organooxygen compounds}
#' \item{pop_counts and met_counts}{Population and metabolite list counts for each class}
#' \item{fract_of_pop}{The fraction of the met list's metabolites compared to the population class hits}
#' \item{fract_within_pop}{The fraction of population metabolites within the class describing how common the class is within the population.}
#' \item{fract_within_mets}{The fraction of met list metabolites witin the class describing how common the class is within the metabolite list.}
#'}
#'
#' \strong{return_obj$met_classes} metabolite classes for each input metabolite list id
#'
#' \strong{return_obj$pop_classes} metabolite classes for each input population metabolite if a population is provided.
#'
#' \strong{return_obj$query_report} this reports on the query list size, the number of input ids
#' that were found in the database and a list of metabolite ids that were not found in the database.
#' There are two sections, one for the metabolite list and a second when an optional population list is provided.
#'@examples
#'\dontrun{
#' # metabolite list of interest
#' metabolites.of.interest = c("pubchem:64969",
#' "chebi:16958",
#' "chemspider:20549",
#' "kegg:C05598",
#' "chemspider:388809",
#' "pubchem:53861142",
#' "hmdb:HMDB0001138",
#' "hmdb:HMDB0029412")
#'
#' metClassResult <- getChemClass(mets = metabolites.of.interest)
#'
#' # show structure
#' utils::str(metClassResult)
#'
#' # show a count summary, metabolite class mappings and query report
#' metClassResult$count_summary$class
#' metClassResult$met_classes
#' metClassResult$query_report
#'}
#' @export
getChemClass <- function(mets, background = "database", backgroundType="database", includeRaMPids = FALSE, inferIdMapping = FALSE, db = RaMP()){
print("Starting Chemical Class Survey")
if(backgroundType == "file") {
bkgrnd <- utils::read.table(background, header=F)[,1]
filteredMets <- mets[mets %in% bkgrnd]
print(paste0("Number of input query ids: ",length(mets)))
print(paste0("Number of input query ids found in supplied biospecimen background: ",length(filteredMets)))
print(paste0("Excluded input query ids that are NOT found in supplied biospecimen background: ",(length(mets)-length(filteredMets))))
if((length(filteredMets)) < 1) {
warning("All input query mets were not found in the specified background ID list.")
return(list())
}
mets <- filteredMets
} else if(backgroundType == "list") {
bkgrnd = background
filteredMets <- mets[mets %in% bkgrnd]
print(paste0("Number of input query ids: ",length(mets)))
print(paste0("Number of input query ids found in supplied biospecimen background: ",length(filteredMets)))
print(paste0("Excluded input query ids that are NOT found in supplied biospecimen background: ",(length(mets)-length(filteredMets))))
if((length(filteredMets)) < 1) {
warning("All input query mets were not found in the specified background ID list.")
return(list())
}
mets <- filteredMets
} else if(backgroundType == "database") {
# use the full database as background
bkgrnd = 'database'
} else if (backgroundType == "biospecimen") {
print(paste0("Biospecimen background specified: ", background))
bg = db@api$getMetaboliteSourceIdsForOntology(biospecimen = background)
# cases check if bg is empty (suggest to query for biospecimen types in ramp)
if(is.null(bg) || nrow(bg) == 0) {
warning("The input biospecimen type is not represented in the RaMP Database. Use RaMP::getOntologies() to see available terms.")
return(list())
}
# check that all input query ids are inside the biospecimen type list
# make sure that query ids are culled for ids outside the specified biospeciment background
filteredMets <- mets[mets %in% bg$sourceId]
print(paste0("Number of input query ids: ",length(mets)))
print(paste0("Number of input query ids found in supplied biospecimen background: ",length(filteredMets)))
print(paste0("Excluded input query ids that are NOT found in supplied biospecimen background: ",(length(mets)-length(filteredMets))))
# check that we still have some query metabolites
if((length(filteredMets)) < 1) {
warning(paste0("All input query mets were not found in the specified biospecimen background: ",background))
return(list())
}
# try to make a set of source ids assoicated with unique rampids
# drop this, as it limits the source ids such that there is only one random soruce id per ramp id
# bg <- bg[!duplicated(bg$rampId),]
# use unique list of source ids for the biospecimen type
# note that rampId counts will be used downstream for statistics, reducing redundancy
bkgrnd <- unique(unlist(bg$sourceId))
print(paste0("Number of metabolites in biospecimen background: ", length(bkgrnd)))
# set mets to the mets contained within the background
mets <- filteredMets
}
else {
stop("backgroundType was not specified correctly. Please specify one of the following options: 'database', 'file', 'list', or 'biospecimen'.")
}
# note that for enrichment analysis the inferIdMapping for the class survey is set to FALSE
# This means that only ids that have direct id-to-class annotations will contribute to results.
if(backgroundType == "database"){
res <- getChemicalClassRampIdsFullPopConn(db = db, mets=mets, inferIdMapping=inferIdMapping)
} else {
res <- getChemicalClassRampIdsConn(db = db, mets=mets, pop=bkgrnd, inferIdMapping=inferIdMapping)
}
print("Finished Chemical Class Survey")
# It's possible that the chemical class survey result is empty, no hits for metabolites
# or all metabolites are not in the background
if(is.null(res) || length(res) == 0) {
warning("Empty Chemical Class Result")
return(list())
}
if(includeRaMPids){
return(res)
}else{
if(!is.null(res$met_classes)) {
res$met_classes <- cleanup(data = res$met_classes)
}
if(!is.null(res$pop_classes)) {
res$pop_classes <- cleanup(data = res$pop_classes)
}
return(res)
}
}
#'Returns chemical class information comparing a metabolite subset to a metabolite population, including Fisher Exact Test
#'enrichment p-values and FDR values.
#'
#'The function performs enrichment analysis for Classyfire classes, sub-classess, and super-classes, and for #'LipidMaps categories, main classess, and sub classes.
#'
#' @param mets a vector of source prepended metabolite ids
#' @param background an optional list of source prepended metaboite ids to be used as the background reference of
#' metabolites for enrichment. The background can be either a list of ids, a file name containing the id list,
#' one id per column (no file header row) or a specificed biospecimen type (available biospecimen types: "Blood",
#' "Adipose tissue", "Heart", "Urine", "Brain", "Liver", "Kidney","Saliva", or "Feces").
#' @param backgroundType one of 'database' (all analytes in the RaMP Database), 'list' (a list of input ids),
#' or 'file' in which case the background parameter will be a file path, or 'biospecimen' where the specified background parameter is
#' a RaMP HMDB metabolite ontology term (see background parameter, above. for the most common biospecimen background values).
#' @param inferIdMapping if FALSE, the method only reports on class annotations made directly on the input ids.
#' If inferIdMapping is set to TRUE, the input ids are cross-referenced or mapped to other existing ids that contain metabolite class annotations.
#' Following id cross references can expand coverage if the input type is other than HMDB ids or LIPIDMAPS ids.
#' The default value is FALSE.
#' @param db a RaMP database object
#' @return a list of dataframes, each holding chemical classs enrichment statistics for specific chemical classification systems,
#' such as HMDB Classyfire class categories and LIPIDMAPS class categories. The results list chemical classes, metabolite hits counts,
#' Fisher Exact p-values and Benjamini-Hochberg corrected p-values (FDR estimates)
#'
#'@examples
#'\dontrun{
#' # metabolite list of interest
#' metabolites.of.interest = c("pubchem:64969",
#' "chebi:16958",
#' "chemspider:20549",
#' "kegg:C05598",
#' "chemspider:388809",
#' "pubchem:53861142",
#' "hmdb:HMDB0001138",
#' "hmdb:HMDB0029412")
#'
#' chemical.enrichment <- runEnrichChemClass(mets = metabolites.of.interest, db = rampDB)
#'}
#' @export
runEnrichChemClass <- function( mets, background = "database", backgroundType = "database", inferIdMapping=F, db = RaMP() ) {
print("Starting Chemical Class Enrichment")
# note that inferIdMapping is set to FALSE
# enrichment will only be reported for input ids that have
# a direct association with the chemical class.
classData <- getChemClass(db = db, mets = mets,
background = background,
backgroundType = backgroundType,
includeRaMPids = TRUE, inferIdMapping = inferIdMapping)
# Quit if empty survey result
if(is.null(classData) || length(classData) < 1) {
warning("Empty survey result. Please see other warnings.")
return(list())
}
enrichmentStat <- list()
# helper method to summarize information on classes based on chemical class survey
totalCountInfo <- getTotalFoundInCategories(classData = classData, inferIdMapping = inferIdMapping)
for (category in names(classData$count_summary)) {
categoryData <- classData$count_summary[[category]]
# only run if we have data for that class category
# ANNNND we have that category represented in $mets
if(nrow(categoryData) > 0 && category %in% totalCountInfo$mets$Var1) {
resultRow <- 1
metsInfo <- totalCountInfo$mets
popInfo <- totalCountInfo$pop
# some summary methods returned Integer64, cast as integer
# these values are the counts for the annotation category for the metabolite set and the population
totMetCnt <- as.integer(metsInfo[metsInfo$Var1 == category,2])
totPopCnt <- as.integer(popInfo[popInfo$Var1 == category,2])
contingencyMat <- matrix(nrow=2, ncol=2)
resultMat <- data.frame(matrix(ncol=8))
colnames(resultMat) <- c("category", "class_name", "met_hits", "pop_hits",
"met_size", "pop_size", "Pval","OR")
for (i in 1:nrow(categoryData)) {
if(categoryData[i,'mets_count'] >= 1) {
contingencyMat[1,1] <- as.integer(categoryData[i,'mets_count'])
contingencyMat[1,2] <- totMetCnt - contingencyMat[1,1]
contingencyMat[2,1] <- as.integer(categoryData[i,'pop_count']) - contingencyMat[1,1]
contingencyMat[2,2] <- totPopCnt - contingencyMat[2,1] - contingencyMat[1,2]
className <- categoryData[i,'class_name']
testres <- stats::fisher.test(contingencyMat, alternative = "greater")
p <- testres$p.value
oddsratio <- testres$estimate
row = list(as.character(category), as.character(className), contingencyMat[1,1], contingencyMat[1,1] + contingencyMat[2,1],
totMetCnt, totPopCnt, p, oddsratio)
resultMat[resultRow, ] <- row
resultRow <- resultRow + 1
}
}
resultMat <- resultMat[order(resultMat$`Pval`),]
bhPvals <- stats::p.adjust(resultMat$`Pval`, method = "BH")
resultMat$Pval_FDR <- bhPvals
holm <- stats::p.adjust(resultMat$`Pval`, method = "holm")
resultMat$Pval_Holm <- holm
enrichmentStat[[as.character(category)]] <- resultMat
}
}
enrichmentStat[['result_type']] <- 'chemical_class_enrichment'
print("Finished Chemical Class Enrichment")
return(enrichmentStat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.