R/core_functions.R

Defines functions run_shuffle randomize_net compare_met gene_contributions single_gene_cmp plot_ref_mets_by_cmps plot_mantel_results run_all_metabolites2 run_all_metabolites make_pairwise_met_matrix get_cmp_scores generate_genomic_network filter_currency_metabolites generate_network_template_kegg get_kegg_reaction_info get_kegg_reaction_links run_all_metabolites_FBA2 run_mimosa1

Documented in compare_met filter_currency_metabolites gene_contributions generate_genomic_network generate_network_template_kegg get_cmp_scores get_kegg_reaction_info make_pairwise_met_matrix randomize_net run_all_metabolites run_all_metabolites2 run_all_metabolites_FBA2 run_mimosa1 run_shuffle single_gene_cmp

#core_functions.R
#workflow of functions for MIMOSA analysis

#' Run full MIMOSA1 analysis (2019)
#'
#' @import data.table
#' @param species processed species table
#' @param mets processed metabolite table
#' @param species_file File prefix from species data or other ID for saving
#' @param picrust_file_paths paths to picrust1 files
#' @param rxn_path Path to processed mapformula reaction table
#' @param simulated Whether this is simulated data
#' @param config_table A MIMOSA2 configuration table of analysis settings (required only for using simulation data)
#' @return Output of run_all_metabolites
#' @examples
#' run_mimosa1(species, mets)
#' @export
run_mimosa1 = function(species, mets, species_file = "mimosa1", picrust_file_paths = c("data/picrustGenomeData/16S_13_5_precalculated.tab.gz", "data/picrustGenomeData/indivGenomes/", "_genomic_content.tab"),
                       rxn_path = "data/KEGG/full_rxn_table.txt", simulated = F, config_table = NULL){
  if(!simulated){
    contrib_table = generate_contribution_table_using_picrust(species, picrust_file_paths[1], picrust_file_paths[2], picrust_file_paths[3], copynum_column = T)
    print(contrib_table)
    genes = contrib_table[,sum(contribution), by=list(Sample, Gene)]
    genes = dcast(genes, Gene~Sample, value.var = "V1", fill = 0)
    setnames(genes, "Gene", "KO")
    if("compound" %in% names(mets)) setnames(mets, "compound", "KEGG")
    setkey(mets, "KEGG")
    #if("contribution") 
    setnames(contrib_table, c("contribution", "copy_number"), c("CountContributedByOTU", "GeneCountPerGenome"))
    file_id = gsub(".txt", "", basename(species_file))
    file_prefix = file_id
    rxn_table = fread(rxn_path)
    results1 = run_all_metabolites(genes, mets, file_prefix = file_prefix, id_met = F, met_id_file = NULL,
                                   net_method = "KeggTemplate", net_file = NULL, rxn_table_source = rxn_table,
                                   correction = "fdr", degree_filter = 30, minpath_file = NULL, cor_method = "spearman", nperm = 1000, nonzero_filter = 4, save_out = F)
    #load(paste0(file_prefix, "_out.rda"))
  } else {
    spec_codes = data.table(Species = sort(unique(species[,OTU])), Code = sort(unique(species[,OTU])))
    network_noRev = build_metabolic_model(species, config_table = config_table, manual_agora = T, degree_filt = 0)[[1]]
    results1 = run_all_metabolites_FBA2("HMPsims", fake_spec = species, fake_mets = mets, network = network_noRev, nperm = 1000, spec_codes = spec_codes)
  }
  return(results1)
}

#' Run full MIMOSA1 analysis on simulation data (2019)
#'
#' @import data.table
#' @param run_prefix 
#' @param fake_spec processed species table
#' @param fake_mets processed metabolite table
#' @param network output of build_metabolic_model
#' @param spec_codes Table of species IDs
#' @param cor_method spearman or pearson
#' @param correction fdr or bonferroni
#' @param nperm Number of Mantel permutations
#' @param nonzero_filter Number of samples that must be nonzero for a metabolite to be included
#' @param kegg_translate Compound ID table
#' @return Output of run_all_metabolites
#' @examples
#' run_all_metabolites_FBA2("sim1", species, mets, network, spec_codes)
#' @export
run_all_metabolites_FBA2 = function(run_prefix, fake_spec, fake_mets, network, spec_codes, cor_method = "spearman", 
                                    correction = "fdr", nperm = 10000, nonzero_filter = 4,  kegg_translate = ""){
  subjects = intersect(names(fake_spec), names(fake_mets))
  if(length(subjects) < 2) stop("Sample names not consistent between genes and metabolites")
  
  species_cmps = get_species_cmp_scores(fake_spec, network, manual_agora = T)
  tot_cmps = species_cmps[,sum(CMP), by=list(compound, Sample)]
  cmp_mat = dcast(tot_cmps, compound~Sample, value.var = "V1", fill = 0)
  #cmp_mat[,compound:=gsub("[e]", "[env]", compound, fixed = T)]
  #cmp_mat = get_cmp_scores(emm, norm_kos)
  
  #get mets
  if(!"medium" %in% names(fake_mets) & !"compound" %in% names(fake_mets)){
    fake_mets = merge(fake_mets, kegg_translate, by="Metabolite", all.x=T, all.y=F)
  }
  if("compound" %in% names(fake_mets)){
    metIDs = fake_mets[,compound] 
    met_name = "compound"
  } else {
    metIDs = fake_mets[,medium]
    met_name = "medium"
  }
  shared_mets = metIDs[metIDs %in% cmp_mat[,unique(compound)]] 
  setkey(cmp_mat, compound)
  setkey(fake_mets, compound)
  all_comparisons = vector("list",length(shared_mets))
  
  for(j in 1:length(shared_mets)){
    #What is happening??
    good_subs = intersect(names(fake_mets)[which(!is.na(unlist(fake_mets[shared_mets[j]])))], names(cmp_mat)[which(!is.na(unlist(cmp_mat[shared_mets[j],subjects,with=F])))])
    good_subs = good_subs[good_subs != "compound"]
    prmt_vector = unlist(cmp_mat[shared_mets[j],good_subs,with=F])
    met_vector = unlist(fake_mets[shared_mets[j],good_subs,with=F])
    #check for too many 0s
    if(length(met_vector[met_vector!=0 & !is.na(met_vector)]) <= nonzero_filter | length(prmt_vector[prmt_vector!=0]) <= nonzero_filter | length(unique(met_vector)) < 2 | length(unique(prmt_vector)) < 2){
      all_comparisons[[j]] = NA
    }else{
      met_mat = make_pairwise_met_matrix(shared_mets[j], cmp_mat[,c(good_subs, "compound"),with=F])
      metabol_mat = make_pairwise_met_matrix(shared_mets[j], fake_mets[,c(good_subs,met_name),with=F])
      test = mantel_2sided(met_mat,metabol_mat,method=cor_method,permutations = nperm, direction = "pos")
      test_n = mantel_2sided(met_mat,metabol_mat,method=cor_method,permutations = nperm, direction = "neg")
      all_comparisons[[j]] = list(ID = shared_mets[j], PRMT = met_mat, Mets = metabol_mat, Mantel = list(test,test_n))
    }
  }
  failed_mets = shared_mets[which(is.na(all_comparisons))]
  shared_mets = shared_mets[which(!is.na(all_comparisons))]
  all_comparisons = all_comparisons[which(!is.na(all_comparisons))]
  
  correction = "fdr"
  cors_s = sapply(all_comparisons,function(x){return(x$Mantel[[1]]$statistic)})
  pvals_s = sapply(all_comparisons,function(x){return(x$Mantel[[1]]$signif)})
  pvals2_s = correct(pvals_s, method = correction)
  
  cors_n = sapply(all_comparisons,function(x){return(x$Mantel[[2]]$statistic)})
  pvals_n = sapply(all_comparisons,function(x){return(x$Mantel[[2]]$signif)})
  pvals2_n = correct(pvals_n, method = correction)
  
  node_data = data.table::data.table(compound = shared_mets, Correlation = cors_s, PValPos = pvals_s, QValPos = pvals2_s, PValNeg = pvals_n, QValNeg = pvals2_n)
  setkey(node_data,compound)
  
  node_data[,PredictionType:=ifelse(QValPos < 0.1, "Consistent", "Inconsistent")]
  node_data[,PredictionType:=ifelse(QValNeg < 0.1, "Contrasting", PredictionType)]
  
  #save melted version for easier post processing
  cmp_mat_save = melt(cmp_mat, id.var = "compound", variable.name = "Sample", variable.factor = F)
  return(list(failed_mets, fake_mets, cmp_mat_save, all_comparisons, node_data))
}

get_kegg_reaction_links = function(rxns){
  foo = vector("list", length(rxns))
  for(j in 1:length(rxns)){ ##Using lapply here causes weird timeouts with curl
    foo[[j]] = tryCatch(KEGGREST::keggLink(rxns[j], target="ko"))
  }
  return(foo)
}

#' Get basic KEGG reaction and KO info in a unified format
#'
#' @import data.table
#' @param kos_to_rxns_method EITHER "KEGGREST" indicating to use the KEGGREST API to link KOs and reactions and get reaction info, OR a file path to the KEGG file genes/ko/ko_reaction.list
#' @param reaction_info_file If kos_to_rxns_method is a file path, additional file containing full reaction info from the KEGG database
#' @param save_out whether to save output as an Rdata file named "KeggReactions.rda"
#' @param kolist Optionally, a vector of KO IDs. Will create an all_kegg object containing information only on those KOs and the reactions linked to them.
#' @return An R object with 4 components: a list of KOs, a list of Reaction IDs, a list of associated reaction info, and a table linking KOs to reactions
#' @examples
#' get_kegg_reaction_info("KEGGREST")
#' get_kegg_reaction_info("KEGG/genes/ko/ko_reaction.list", "KEGG/ligand/reaction/reaction")
#' @export
get_kegg_reaction_info = function(kos_to_rxns_method, reaction_info_file = "", save_out = T, kolist = ""){
#### Option 1: Access most recent KEGG reaction annotations with the KEGGREST API
  if(kos_to_rxns_method=="KEGGREST"){
      all_kegg = list(KOs = gsub("ko:","",names(KEGGREST::keggList("ko")), fixed=T),
                      Reactions = gsub("rn:", "", names(KEGGREST::keggList("reaction")), fixed = T))
    kos_to_rxns = get_kegg_reaction_links(all_kegg$Reactions)
    all_kegg$Reactions = all_kegg$Reactions[sapply(kos_to_rxns, length) != 0]
    kos_to_rxns = kos_to_rxns[which(sapply(kos_to_rxns, length) != 0)]
    kos_to_rxns = data.table(Rxn = gsub("rn:","", unlist(sapply(kos_to_rxns, function(x){ return(names(x))})), fixed = T), KO = gsub("ko:","",unlist(kos_to_rxns), fixed = T))
    if(!identical(kolist, "")){ #if we only want specific KOs)
      kos_to_rxns = kos_to_rxns[KO %in% kolist]
      all_kegg$KOs = all_kegg$KOs[all_kegg$KOs %in% kolist]
      all_kegg$Reactions = all_kegg$Reactions[all_kegg$Reactions %in% kos_to_rxns[,Rxn]]
    }
    all_kegg$Reaction_info = lapply(all_kegg$Reactions, function(x){
      return(KEGGREST::keggGet(x)[[1]]) }) #This is slow!
    cat("Done downloading reaction info!\n")
  } else {
  ##### Option 2: Read reaction info and KO links from KEGG database file
    kos_to_rxns = fread(kos_to_rxns_method, header=F)
    if(ncol(kos_to_rxns) != 2) stop("KO-reaction linking file should only have 2 columns!")
    if(any(grepl("rn", kos_to_rxns[,V1]))){ #some KEGG versions have rxn 1st
      setnames(kos_to_rxns, c("Rxn", "KO"))
    } else {
      setnames(kos_to_rxns, c("KO", "Rxn"))
    }
    kos_to_rxns[,KO:=gsub("ko:","",KO)]
    kos_to_rxns[,Rxn:=gsub("rn:","", Rxn)]
    all_kegg = list(KOs = kos_to_rxns[,sort(unique(KO))], Reactions = kos_to_rxns[order(Rxn)][,unique(Rxn)])
    if(!requireNamespace("readr", quietly = T)){
      stop("readr required to process KEGG reaction file, please install", call. = F)
    }
    reaction_info = strsplit(readr::read_file(reaction_info_file), split = "///\n", fixed = T)[[1]]
    assoc_ids = trimws(gsub(".*ENTRY(.*)Reaction.*","\\1",reaction_info))
    all_kegg$Reaction_info = reaction_info[match(all_kegg$Reactions, assoc_ids)]
    all_kegg$Reaction_info = lapply(all_kegg$Reaction_info, function(x){
      x = strsplit( gsub("(\n[A-Z])","~\\1",x), "~\n", fixed = T)[[1]]
      new_ob = list()
      for(j in 1:length(x)){
        foo = strsplit(x[j], split = "  +")[[1]]
        new_ob[[foo[1]]] = foo[2:length(foo)]
      }
      return(new_ob)
    })
  }
  all_kegg$kos_to_rxns = kos_to_rxns
  #if(save_out) save(all_kegg, file = "KeggReactions.rda")
  return(all_kegg)
}

#' Generate community metabolic network template from the KEGG database, using reaction_mapformula.lst
#'
#' @import data.table
#' @param mapformula_file file path to "reaction_mapformula.lst" from the KEGG database, specifying reaction IDs annotated in pathways
#' @param all_kegg Output of get_kegg_reaction_info, OR vector of 2 file paths to input to get_kegg_reaction_info
#' @param write_out Whether to save template to file
#' @return A table specifying a community metabolic network based on KEGG pathways with the following columns: Rxn (reaction ID),	KO (Gene ID),	Reac (Reactant ID),	Prod (Product ID),	Path (Pathway ID),	ReacProd (Original reaction specification),	stoichReac (Reactant stoichiometry coefficient),	stoichProd (Product stoichiometry coefficient)
#' @examples
#' generate_network_template_kegg("KEGG/ligand/reaction/reaction_mapformula.lst", all_kegg)
#' @export
generate_network_template_kegg = function(mapformula_file, all_kegg, write_out = T, remove_extra_pathways = T){
  if(is.vector(all_kegg) & length(all_kegg)==2){
    all_kegg = get_kegg_reaction_info(kos_to_rxns_method = all_kegg[1], reaction_info_file = all_kegg[2], save_out = F)
  }
  mapformula = fread(mapformula_file, colClasses = "character", sep = ":") #get mapformula pathway annotations of reactions
  setnames(mapformula, c("Rxn","Path","ReacProd"))
  #Remove generic pathways where everything is reversible
  mapformula = mapformula[Path!=" 01100" & Path != 1100 & Path !="01100"] #If numeric
  if(remove_extra_pathways){
    mapformula = mapformula[!Path %in% c(" 01120", 1120, "01120", " 01110", 1110, "01110")]
  }
  #Process Reacs and Prods, flip reversible reactions, etc
  mapformula[,Reac:=lapply(ReacProd,function(x){ return(unlist(strsplit(x,"="))[1])})]
  mapformula[,Reac:=lapply(Reac, strsplit, " ")]
  mapformula[,Prod:=lapply(ReacProd,function(x){ return(unlist(strsplit(x,"="))[2])})]
  mapformula[,Prod:=lapply(Prod, strsplit, " ")]
  mapformula[,Reac:=sapply(Reac, unlist)]
  mapformula[,Prod:=sapply(Prod, unlist)]
  mapformula[,Reac:=sapply(Reac, function(x){ return(x[grepl("[A-Za-z]",x)])})]
  mapformula[,Prod:=sapply(Prod, function(x){ return(x[grepl("[A-Za-z]",x)])})]
  for(j in 1:length(mapformula[,Rxn])){
    submap = mapformula[j]
    if(grepl("<=>",submap[,ReacProd])){ #add other way if flipped
      setnames(submap, c("Reac","Prod"),c("Prod","Reac"))
      submap = submap[,list(Rxn,Path,ReacProd,Reac,Prod)]
      mapformula = rbind(mapformula, submap)
    }
    if(grepl("<= ",submap[,ReacProd])){ #flip if rxn backward
      setnames(submap, c("Reac","Prod"),c("Prod","Reac"))
      submap = submap[,list(Rxn,Path,ReacProd,Reac,Prod)]
      mapformula[j] = submap
    }
  }
  new_mapformula = data.table(Rxn=rep('',0),Path=rep('',0), ReacProd=rep('',0), Reac=rep('',0),Prod=rep('',0))
  for(j in 1:length(mapformula[,Rxn])){ #add extra rows for multiple reacs and prods
    reacs = mapformula[j,Reac][[1]]
    prods = mapformula[j,Prod][[1]]
    submap = mapformula[j]
    for(k in 1:length(reacs)){
      for(m in 1:length(prods)){
        submap[,Reac:=reacs[k]]
        submap[,Prod:=prods[m]]
        new_mapformula = rbind(new_mapformula, submap)
      }
    }
  }
  mapformula = new_mapformula
  mapformula = mapformula[Path!=" 01100" & Path != 1100 & Path !="01100"] #If numeric
  mapformula = merge(mapformula, all_kegg$kos_to_rxns, by="Rxn", all.x=T, all.y=F, allow.cartesian=T)
  mapformula = mapformula[!is.na(KO)]
  setkey(mapformula, NULL)
  rxn_table = unique(mapformula)
  rxn_table_sub = unique(rxn_table[,list(Rxn,KO,Reac,Prod)])
  ## Exclude information that is not consistent with reaction annotation file
  rxn_table_sub[,Check:=sapply(1:length(Rxn), function(x){
    ind = which(all_kegg$Reactions==rxn_table_sub[x,Rxn])
    if(length(ind) < 1){ return("noMatch")
    } else if(!(rxn_table_sub[x,KO] %in% names(all_kegg$Reaction_info[[ind]]$ORTHOLOGY)|rxn_table_sub[x,KO] %in% all_kegg$Reaction_info[[ind]]$ORTHOLOGY | rxn_table_sub[x,KO] %in% gsub("KO: ", "", all_kegg$Reaction_info[[ind]]$ORTHOLOGY))){
      return("noKOmatch")
    }
    else if(!(any(grepl(rxn_table_sub[x,Reac], all_kegg$Reaction_info[[ind]]$EQUATION)) & any(grepl(rxn_table_sub[x,Prod], all_kegg$Reaction_info[[ind]]$EQUATION)))){
      return("noCmpdmatch")
    }
    else return("good")
  })]
  if(rxn_table_sub[,sum(Check=="good")==0]) stop("Problem with reaction table formatting")
  rxn_table_sub = rxn_table_sub[Check=="good"]
  rxn_table_sub[,Check:=NULL]
  rxn_table = merge(rxn_table, rxn_table_sub, by=c("Rxn","KO","Reac","Prod"), all.x=F)

  #get stoichiometry too
  goodrxns = match(rxn_table[,Rxn], all_kegg$Reactions)
  stoichReac = rep(1, length(rxn_table[,KO]))
  stoichProd = rep(1, length(rxn_table[,KO]))
  #parse KEGG Reaction for stoichiometry
  for(j in 1:length(goodrxns)){
    rxn_info = all_kegg$Reaction_info[[goodrxns[j]]]
    if(!is.null(rxn_info$EQUATION)){
      eqn = strsplit(rxn_info$EQUATION, split = " <=> ", fixed=T) #get chemical formula for reaction
      comps = unlist(lapply(eqn, strsplit, split = " \\+ "))
      coefs = unique(comps[grepl(" ", comps)])
    } else {
      coefs = c()
    }
    if(length(coefs) > 0){
      coefs_split = strsplit(coefs, split = " ",fixed=T)
      comp = sapply(coefs_split, function(x){ return(x[2])})
      reac_coef = unlist(coefs_split[comp==rxn_table[j,Reac]])[1]
      if(length(unlist(reac_coef)) > 0){
        if(!is.na(as.numeric(reac_coef))) stoichReac[j] = as.numeric(reac_coef)
      }
      prod_coef = unlist(coefs_split[comp==rxn_table[j,Prod]])[1]
      if(length(unlist(prod_coef)) > 0)
        if(!is.na(as.numeric(prod_coef))) stoichProd[j] = as.numeric(prod_coef)
    }
  }
  rxn_table[,stoichReac:=stoichReac]
  rxn_table[,stoichProd:=stoichProd]
  rxn_table[,Path:=gsub(" ","",Path)]
  if(write_out) write.table(rxn_table, file = "network_template.txt", quote=F, row.names = F, sep = "\t")
  return(rxn_table)
}

#' Remove metabolites linked to many reactions from a metabolic network.
#'
#' @import data.table
#' @param rxn_table Network table of reactions
#' @param degree_filter Compounds involved in more than this number of reactions will be removed
#' @return Network table without compounds found in reactions greater than the cutoff
#' @examples
#' filter_currency_metabolites(rxn_table, degree_filter = 50)
#' @export
filter_currency_metabolites = function(rxn_table, degree_filter = 30){
  setkey(rxn_table, NULL)
  rxn_table = unique(rxn_table)
  cmpds = unique(c(rxn_table[,Prod], rxn_table[,Reac]))
  if(degree_filter != 0){
    #cat("Filtering currency metabolites\n")
    degree = sapply(cmpds, function(x){
      rxn_table[Prod==x | Reac==x, length(unique(KO))]
    })
    cmpds = cmpds[degree < degree_filter]
    degree = degree[degree < degree_filter]
    rxn_table = rxn_table[(Prod %in% cmpds) & (Reac %in% cmpds)]
  }
  return(rxn_table)
}

#' Create a community metabolic network model using a few different methods.
#'
#' @import data.table
#' @param kos Genes to include in network (KEGG Orthology IDs)
#' @param keggSource source of network information, currently can be "KeggTemplate", "loadNet", or "metacyc"
#' @param degree_filter Compounds connected to this number of KOs or more will be filtered from the network
#' @param minpath_file file of minimal pathways to use for core network
#' @param normalize Whether to normalize output matrix to show relative impacts of each gene on each compound
#' @param rxn_table If keggSource is "KeggTemplate", must supply a data.table with the format of
#' @param networkFile If keggSource is "loadNet", file that template network should be loaded from, should have same format as output of generate_network_template_kegg
#' @param return_mats Whether to return stoichiometric matrices or just data.table list of reactions
#' @return List containing 3 different versions of the same network: an adjacency matrix, an adjacency matrix that differentiates between genes with a neutral effect on a compound and no effect (0 vs NA), and an edge list.
#' @examples
#' generate_genomic_network(kos, "KeggTemplate", degree_filter = 30)
#' @export
generate_genomic_network = function(kos, keggSource = "KeggTemplate", degree_filter = 0,
                                    minpath_file = '', normalize = T, rxn_table = "", networkFile="", return_mats = T){
  #keggSource must be either "labKegg", or "loadNet" for if the network has already been generated, or "metacyc"
  #degree filter says whether to filter hub compounds with a degree greater than this value
  if(!keggSource %in% c("loadNet", "KeggTemplate", "metacyc")){
    stop("Invalid network method selected")
    } else if(keggSource == "loadNet"){
    load(networkFile)
    return(allnet)
  } else if(keggSource == "KeggTemplate") { #load network template and just grab subset
    if(identical(rxn_table, "")){
      stop("Must supply a community network reaction table, for example the output of generate_network_template_kegg")
    }
    #rxn_table = fread("ko_rxn_map_all_info_filtered.txt", colClasses = c(rep("character",6), rep("numeric",2)))
    rxn_table = rxn_table[KO %in% kos]
    if(nrow(rxn_table) == 0){
      warning("No reactions for this KO set")
    }
    #rxn_table[,rxn_id:=rxn_ids2]
    # if(minpath_file!=''){
    #   minpaths = fread(minpath_file, colClasses="character")
    #   setnames(minpaths,"Path")
    #   #for reactions in the minpath set we are only saving the info from those minimal pathways,
    #   #but we are also saving the reactions not in the minpath set
    #   rxn_table2 = rxn_table[Path %in% minpaths[,Path]]
    #   rxn_table_extra = rxn_table[!(Rxn %in% rxn_table2[,Rxn])]
    #   rxn_table = rbind(rxn_table2, rxn_table_extra)
    # }
    #No longer need extra info
    if("Path" %in% names(rxn_table)) rxn_table[,Path:=NULL]
    if("ReacProd" %in% names(rxn_table)) rxn_table[,ReacProd:=NULL]
    if("Rxn" %in% names(rxn_table)) rxn_table[,Rxn:=NULL]
    setkey(rxn_table, NULL)
    rxn_table = unique(rxn_table)
    cmpds = unique(c(rxn_table[,Prod], rxn_table[,Reac]))
    if(degree_filter != 0){
      #cat("Filtering currency metabolites\n")
      degree = sapply(cmpds, function(x){
        rxn_table[Prod==x | Reac==x, length(unique(KO))]
      })
      cmpds = cmpds[degree < degree_filter]
      degree = degree[degree < degree_filter]
      rxn_table = rxn_table[(Prod %in% cmpds) & (Reac %in% cmpds)]
      #redefine cmpds
      cmpds = unique(c(rxn_table[,Prod], rxn_table[,Reac]))
    }
    goodkos = unique(rxn_table[,KO])
    #expand into adjacency and stoichiometry matrices
    #cat("Making stoichiometric matrix\n")
    if(return_mats){
      rxn_table = get_non_rev_rxns(rxn_table, by_species = F)
      if(nrow(rxn_table[Reversible==0]) != 0){
        net_prods = unique(rxn_table[Reversible==0,list(KO, Prod, stoichProd)])
        net_reacs = unique(rxn_table[Reversible==0,list(KO, Reac, stoichReac)])
        #Redefine goodkos and cmps after removing reversible
        goodkos = unique(c(net_prods[,KO], net_reacs[,KO]))
        cmpds = unique(c(net_prods[,Prod], net_reacs[,Reac]))
        network_mat = matrix(rep(0), nrow = length(cmpds), ncol = length(goodkos))
        stoich_mat = matrix(rep(NA), nrow = length(cmpds), ncol = length(goodkos))
        
        for(j in 1:length(net_prods[,KO])){
          foo2 = match(net_prods[j,Prod], cmpds)
          fooko = match(net_prods[j,KO], goodkos)
          #network_mat[foo1, fooko] = network_mat[foo1, fooko] - rxn_table[j,stoichReac]
          network_mat[foo2, fooko] = network_mat[foo2, fooko] + net_prods[j, stoichProd]
          #if(is.na(stoich_mat[foo1,fooko])) stoich_mat[foo1,fooko] = -1*rxn_table[j,stoichReac] else stoich_mat[foo1,fooko] = stoich_mat[foo1, fooko] - rxn_table[j,stoichReac]
          if(is.na(stoich_mat[foo2,fooko])) stoich_mat[foo2,fooko] = 1*net_prods[j,stoichProd] else stoich_mat[foo2,fooko] = stoich_mat[foo2, fooko] + net_prods[j,stoichProd] ##Luckily this doesn't affect anything
        }
        for(j in 1:length(net_reacs[,KO])){
          foo1 = match(net_reacs[j,Reac], cmpds)
          fooko = match(net_reacs[j,KO], goodkos)
          network_mat[foo1, fooko] = network_mat[foo1, fooko] - net_reacs[j, stoichReac]
          #if(is.na(stoich_mat[foo1,fooko])) stoich_mat[foo1,fooko] = -1*rxn_table[j,stoichReac] else stoich_mat[foo1,fooko] = stoich_mat[foo1, fooko] - rxn_table[j,stoichReac]
          if(is.na(stoich_mat[foo1,fooko])) stoich_mat[foo1,fooko] = -1*net_reacs[j,stoichReac] else stoich_mat[foo1,fooko] = stoich_mat[foo1, fooko] - net_reacs[j,stoichReac] ##Luckily this doesn't affect anything
          
        }
        if(normalize){
          if(length(cmpds) > 1 & length(goodkos) > 1){
            network_mat = t(as.matrix(apply(network_mat, 1, function(x){
              negkos = which(x < 0)
              poskos = which(x > 0)
              x[negkos] = x[negkos]/abs(sum(x[negkos]))
              x[poskos] = x[poskos]/sum(x[poskos])
              return(x)
            })))
          } else {
            network_mat = sapply(network_mat, function(x){
              negkos = which(x < 0)
              poskos = which(x > 0)
              x[negkos] = x[negkos]/abs(sum(x[negkos]))
              x[poskos] = x[poskos]/sum(x[poskos])
              return(x)
            })
            dim(network_mat) = c(length(cmpds), length(goodkos))
          }
        }
        network_mat = data.frame(network_mat)
        if(ncol(network_mat) != length(goodkos)) stop("Problem with KO list for network")
        if(nrow(network_mat) != length(cmpds)) stop("Problem with compound list for network")
        names(network_mat) = goodkos
        row.names(network_mat) = cmpds
        stoich_mat = data.frame(stoich_mat)
        names(stoich_mat) = goodkos
        row.names(stoich_mat) = cmpds
      } else {
        warning("Reversible rxn issues")
        print(rxn_table)
        return(list(NULL, NULL, rxn_table))
      }

      return(list(network_mat, stoich_mat, rxn_table))
    } else {
      return(rxn_table)
    }
  } else if(keggSource == "metacyc"){
    #Get KEGG reaction IDs for these KOs
    goodrxns = sapply(all_kegg$Reaction_info, function(x){ any(names(x$ORTHOLOGY) %in% kos) | any(x$ORTHOLOGY %in% kos)})
    rxns = all_kegg$Reactions[goodrxns]
    rxn_ids = rxns
    rxn_info = all_kegg$Reaction_info[goodrxns]

    #Load in metacyc rxn information, save those with associated KEGG ids
    metacyc_rxns = fread("metacyc/good_reactions.txt")
    metacyc_rxns = unique(metacyc_rxns[,list(`UNIQUE-ID`, Category, Value)])
    klinks = metacyc_rxns[Category=="DBLINKS" & grepl("LIGAND-RXN", Value)]
    klinks[,RID:=regmatches(Value, regexpr("R[0-9]+", Value))]
    klinks = klinks[RID %in% rxn_ids]
    metacyc_rxns = merge(metacyc_rxns, klinks[,list(`UNIQUE-ID`, RID)], by="UNIQUE-ID")
    setkey(metacyc_rxns, NULL)
    metacyc_rxns = metacyc_rxns[!duplicated(metacyc_rxns[,list(Category, Value, RID)])]
    #only save rxns with assoc metacyc info
    rxn_info = rxn_info[rxn_ids %in% metacyc_rxns[,RID]]
    rxn_ids = rxn_ids[rxn_ids %in% metacyc_rxns[,RID]]
    rxn_kos = unique(unlist(sapply(rxn_info, function(x){ return( names(x$ORTHOLOGY))})))
    good_kos = kos[kos %in% rxn_kos]

    #Get compound ID info
    compound_key = fread("metacyc/good_compounds.txt")
    compound_key = unique(compound_key[,`UNIQUE-ID`, Value])
    setkey(compound_key, `UNIQUE-ID`)
    compound_key = compound_key[`UNIQUE-ID` %in% metacyc_rxns[,Value]]
    klinks = compound_key[grepl("LIGAND-CPD",Value)]
    #Get linked KEGG Compound IDs
    klinks[,CID:=toupper(substr(Value, 15, 20))]
    compound_key = unique(klinks[,list(`UNIQUE-ID`, CID)])
    setnames(compound_key, c("Value", "CID"))
    compounds = sort(unique(compound_key[,CID]))

    ##get stoichiometry and directionality from metacyc
    stoich_mat = matrix(rep(NA), nrow = length(compounds), ncol = length(good_kos))
    net_rxns = c()
    net_kos = c()
    net_prods = c()
    net_reacs = c()
    net_dir = c()
    for(j in 1:length(rxn_ids)){ #get detailed info on each rxn
      rxn = rxn_ids[j]
      meta_info = metacyc_rxns[RID==rxn_ids[j]]
      meta_compounds = meta_info[Category %in% c("LEFT","RIGHT")]
      meta_compounds = merge(meta_compounds, compound_key, all.x=T, all.y=F, by="Value")[!is.na(CID)]
      #get reactants and products
      kos_involved = names(rxn_info[[j]]$ORTHOLOGY)
      kos_involved = kos_involved[kos_involved %in% good_kos]
      ko_inds = match(kos_involved,good_kos)
      reac = meta_compounds[Category=="LEFT", CID]
      prod = meta_compounds[Category=="RIGHT", CID]
      reac = reac[(!(reac %in% prod)) & (reac %in% compounds)] #get rid of things on both sides of equation and only save interesting compounds
      prod = prod[(!(prod %in% reac)) & (prod %in% compounds)]
      if(length(reac)==0 | length(prod)==0) next
      coefs = rep(1, length(reac)) #get stoichiometry
      for(k in 1:length(reac)){
        meta_id = meta_compounds[CID==reac[k], Value]
        ind = which(meta_info[,Value]==meta_id & meta_info[,Category=="LEFT"]) #coefficient always follows in dataset
        if(meta_info[ind+1,Category]=="^COEFFICIENT") coefs[k] = as.numeric(meta_info[ind+1, Value])
      }
      coefs2 = rep(1, length(prod))
      for(k in 1:length(prod)){
        meta_id = meta_compounds[CID==prod[k], Value]
        ind = which(meta_info[,Value]==meta_id & meta_info[,Category]=="RIGHT")
        if(ind < dim(meta_info)[1]){
          if(meta_info[ind+1,Category]=="^COEFFICIENT") coefs2[k] = as.numeric(meta_info[ind+1, Value])
        }
      }
      #get direction
      direction = meta_info[Category=="REACTION-DIRECTION", Value]
      if(length(direction)==0) dir = 0 else if(all(grepl("LEFT-TO-RIGHT",direction))) dir = 1 else if(direction=="REVERSIBLE") dir = 0 else if(all(grepl("RIGHT-TO-LEFT",direction))) dir = -1
      #still saving reversible rxn links
      for(k in 1:length(reac)){
        cmpd = grep(reac[k],compounds)
        if(length(cmpd) > 0){
          #slots that were NA before
          stoich_mat[cmpd, ko_inds][is.na(stoich_mat[cmpd, ko_inds])] = -1*dir*coefs[k]
          #non-NA slots
          stoich_mat[cmpd, ko_inds][!is.na(stoich_mat[cmpd, ko_inds])] = stoich_mat[cmpd, ko_inds][!is.na(stoich_mat[cmpd, ko_inds])] - 1*dir*coefs[k]
        }
      }
      #repeat for products
      for(k in 1:length(prod)){
        cmpd = grep(prod[k],compounds)
        if(length(cmpd) > 0){
          #slots that were NA before
          stoich_mat[cmpd, ko_inds][is.na(stoich_mat[cmpd, ko_inds])] = dir*coefs2[k]
          #non-NA slots
          stoich_mat[cmpd, ko_inds][!is.na(stoich_mat[cmpd, ko_inds])] = stoich_mat[cmpd, ko_inds][!is.na(stoich_mat[cmpd, ko_inds])] + dir*coefs2[k]
        }
      }
      n_subrxns = length(reac)*length(prod)*length(kos_involved)
      net_rxns = c(net_rxns, rep(rxn, n_subrxns))
      net_dir = c(net_dir, rep(dir, n_subrxns))
      for(i in 1:length(kos_involved)){
        for(p in 1:length(reac)){
          for(m in 1:length(prod)){
            net_kos = c(net_kos, kos_involved[i])
            net_reacs = c(net_reacs, reac[p])
            net_prods = c(net_prods, prod[m])
          }
        }
      }
    }
    network_table = data.table(Rxn = net_rxns, KO = net_kos, Reac = net_reacs, Prod = net_prods, Direction = net_dir)
    network_table = network_table[Prod != Reac]
    degree = apply(stoich_mat, 1, function(x){ length(x[!is.na(x)])})
    compounds = compounds[degree != 0]
    stoich_mat = stoich_mat[degree != 0,]
    degree = degree[degree != 0]
    if(degree_filter != 0){
      cat("Filtering currency metabolites\n")
      stoich_mat = stoich_mat[degree < degree_filter,]
      compounds = compounds[degree < degree_filter]
      degree = degree[degree < degree_filter]
    }
    degree = apply(stoich_mat, 1, function(x){ length(x[!is.na(x)])}) #double check
    compounds = compounds[degree != 0]
    stoich_mat = stoich_mat[degree != 0,]
    network_table = network_table[Reac %in% compounds & Prod %in% compounds]

    network_mat = stoich_mat
    network_mat[is.na(network_mat)] = 0
    if(normalize){
      negsums = apply(network_mat, 1, function(x){ sum(x[x < 0])})
      possums = apply(network_mat, 1, function(x){ sum(x[x > 0])})
      metlen = dim(network_mat)[1]
      for(j in 1:metlen){
        network_mat[j,][network_mat[j,] < 0] = network_mat[j,][network_mat[j,] < 0]/abs(negsums[j])
        network_mat[j,][network_mat[j,] > 0] = network_mat[j,][network_mat[j,] > 0]/possums[j]
      }
    }
    network_mat = data.frame(network_mat)
    names(network_mat) = good_kos
    row.names(network_mat) = compounds
    return(list(network_mat, stoich_mat, network_table))
  }
}

#' Calculate CMP scores based on community network and gene abundances
#'
#' @import data.table
#' @param emm Stoichiometric network matrix produced by generate_genomic_network
#' @param norm_kos Data.table of gene abundances
#' @return Data.table of CMP scores
#' @examples
#' get_cmp_scores(ko_net[[1]], gene_abunds)
#' @export
get_cmp_scores = function(emm, norm_kos){
  metlen = dim(emm)[1]
  nsamp = dim(norm_kos)[2]-1
  norm_kos_sub = norm_kos[KO %in% names(emm)]
  subjects = names(norm_kos)[names(norm_kos)!="KO"]
  if(!is.null(metlen)){
    cmp = matrix(rep(NA),nrow = metlen,ncol = nsamp)
    emm = emm[,match(norm_kos_sub[,KO], names(emm)), drop = F]
    if(all(names(emm)==norm_kos_sub[,KO])){
      for(m in 1:nsamp){
        cmp[,m] =  1000*as.matrix(emm) %*% unlist(norm_kos_sub[,subjects[m],with=F])
      }
    }else if(all(sort(names(emm))==sort(norm_kos_sub[,KO]))){ #Just out of order
      emm = emm[,order(names(emm))]
      norm_kos_sub = norm_kos_sub[order(KO)]
    } else {
      stop("Double check you are using the correct metabolic network! Gene KOs do not equal network KOs")
    }
    cmp = data.table(cmp,row.names(emm))
    setnames(cmp,c(subjects,"compound"))
    setkey(cmp,compound)
    return(cmp)
  } else {
    warning("Empty network")
    return(NULL)
  }
}

#' Make a data vector into a pairwise difference matrix (can be used for either cmp scores or metabolite concentrations)
#'
#' @import data.table
#' @param metabolite metabolite ID
#' @param met_mat Abundances or scores for that metabolite across samples
#' @param diff_function Difference by default, could be fold_change
#' @return Data.frame of score or abundance pairwise differentials
#' @examples
#'
#' @export
make_pairwise_met_matrix = function(metabolite, met_mat, diff_function = "difference"){
  #diff_function options: 'difference', 'fold_change'
  #met_mat can be either cmp scores or metabolomic abundances
  nsamp = ncol(met_mat) - 1
  subjects = names(met_mat)[1:nsamp]
  met_mat_small = unlist(met_mat[metabolite,subjects,with=F])
  met_matrix = matrix(rep(NA),nrow = nsamp,ncol = nsamp)
  if(diff_function == "difference"){
    for(j in 1:nsamp){
      for(k in 1:nsamp){
        met_matrix[j,k] = met_mat_small[k] - met_mat_small[j]
      }
    }
  }
  met_matrix = data.frame(met_matrix)
  names(met_matrix) = subjects
  row.names(met_matrix) = subjects
  return(met_matrix)
}


#' Runall function to do complete predictions and comparison for all shared metabolites
#'
#' @import data.table
#' @param genes Gene abundances
#' @param mets Metabolite abundances
#' @param file_prefix Prefix for output files
#' @param correction Type of multiple hypothesis correction to perform (bonferroni or fdr)
#' @param cutoff Q/P-value cutoff for significance
#' @param net_method Network generation method (see generate_genomic_network)
#' @param id_met Whether metabolites have putative identifications that need to be processed
#' @param met_id_file If id_met, file of metabolite identifications
#' @param degree_filter Threshold for filtering currency metabolites
#' @param minpath_file Optional file of pathways to filter network to
#' @param cor_method Either "spearman" or "pearson", default is Spearman
#' @param net_file file containing network template to use
#' @param nperm Number of permutations for Mantel test, default is 20000
#' @param nonzero_filter Minimum number of samples required to have nonzero concentrations and nonzero metabolic potential scores in order for metabolite to be evaluated, default is 3
#' @param norm Whether to normalize the network model coefficients by the total number of synthesis and degradation reactions for each metabolite
#' @param save_out Whether to save output
#' @return No return, writes output to files.
#' @examples
#'
#' @export
run_all_metabolites = function(genes, mets, file_prefix = 'net1', correction = "fdr", cutoff = 0.1,
                               net_method = "load", rxn_table_source = "", id_met = F, met_id_file = '',
                               degree_filter = 0, minpath_file = '', cor_method = "spearman",
                               net_file = "", nperm = 20000, nonzero_filter=3, norm = T, save_out = T){
  #must be data.tables keyed by KOs/KEGG IDs in first column, all other columns are subject IDs
  #correction must be either "bonferroni" or "fdr", cutoff is q value cutoff
  #id_mets specifies whether to use network for improved metabolite identification (i.e. for Braun datasets)
  #if true, text file with list of putative met ids from Metabosearch must be included
  kos = genes[,KO]
  nsamp = dim(genes)[2] - 1
  subjects = intersect(names(genes), names(mets))
  ko_net = generate_genomic_network(kos, keggSource = net_method, degree_filter = degree_filter, minpath_file = minpath_file, rxn_table = rxn_table_source, networkFile = net_file, normalize = norm)
  ko_network_mat = ko_net[[1]]
  stoich_mat = ko_net[[2]]
  ko_net_table = ko_net[[3]]
  if(id_met){
    cat("Selecting best metabolite identifications\n")
    net_compounds = row.names(ko_net[[1]])
    met_id_table = fread(met_id_file, sep = "\t", header=T)
    met_id_table = met_id_table[met_id_table$Delta!="-"] #get rid of mets with no possible KEGG ID
    #met_id_list = strsplit(unlist(met_id_list), split = " ")
    #if(grepl("swedish",file_prefix)){
    new_mets = select_best_id2(met_id_table, mets, net_compounds)
    mets = new_mets
    cat("Done identifying metabolites!\n")
  }

  #get mets
  metIDs = mets[,KEGG]
  shared_mets = metIDs[metIDs %in% row.names(ko_network_mat)]
  #get cmp scores
  norm_kos = genes[,c(subjects,"KO"),with=F]
  cmp_mat = get_cmp_scores(ko_network_mat, norm_kos)

  #do all comparisons
  all_comparisons = vector("list",length(shared_mets))
  for(j in 1:length(shared_mets)){
    good_subs = intersect(names(mets)[which(!is.na(unlist(mets[shared_mets[j],subjects,with=F])))], names(cmp_mat)[which(!is.na(unlist(cmp_mat[shared_mets[j],subjects,with=F])))])
    cmp_vector = unlist(cmp_mat[shared_mets[j],good_subs,with=F])
    met_vector = unlist(mets[shared_mets[j],good_subs,with=F])
    #check for too many 0s or all equal values
    if(length(met_vector[met_vector!=0]) < nonzero_filter | length(cmp_vector[cmp_vector!=0]) < nonzero_filter | length(unique(met_vector)) < 2 | length(unique(cmp_vector)) < 2){
      all_comparisons[[j]] = NA
    }else{
      met_mat = make_pairwise_met_matrix(shared_mets[j], cmp_mat[,c(good_subs, "compound"),with=F])
      metabol_mat = make_pairwise_met_matrix(shared_mets[j], mets[,c(good_subs,"KEGG"),with=F])
      test = mantel_2sided(met_mat,metabol_mat,method=cor_method,permutations = nperm, direction = "pos")
      test_n = mantel_2sided(met_mat,metabol_mat,method=cor_method,permutations = nperm,
                             direction = "neg")
      all_comparisons[[j]] = list(ID = shared_mets[j], cmp = met_mat, Mets = metabol_mat, Mantel = list(test,test_n))
    }
  }

  #remove ones that didn't pass filter
  shared_mets = shared_mets[which(!is.na(all_comparisons))]
  all_comparisons = all_comparisons[which(!is.na(all_comparisons))]

  #whole set analysis
  cors_s = sapply(all_comparisons,function(x){return(x$Mantel[[1]]$statistic)})
  pvals_s = sapply(all_comparisons,function(x){return(x$Mantel[[1]]$signif)})
  if(length(pvals_s) > 1) pvals2_s = correct(pvals_s, method = correction) else pvals2_s = pvals_s

  cors_n = sapply(all_comparisons,function(x){return(x$Mantel[[2]]$statistic)})
  pvals_n = sapply(all_comparisons,function(x){return(x$Mantel[[2]]$signif)})
  if(length(pvals_n) > 1) pvals2_n = correct(pvals_n, method = correction) else pvals2_n = pvals_n

  node_data = data.table(compound = shared_mets, Correlation = cors_s, PValPos = pvals_s, QValPos = pvals2_s, PValNeg = pvals_n, QValNeg = pvals2_n, Metabolite = met_names(shared_mets))
  setkey(node_data,compound)

  #save everything
  #write edge file
  write.table(ko_net_table,file=paste(file_prefix,'_edges.txt',sep=''),sep="\t",quote=F,row.names=F)
  #write node attribute file
  write.table(node_data,file = paste(file_prefix,'_nodes.txt',sep=''),sep="\t",quote=F,row.names=F)

  signif_pos = node_data[!is.na(QValPos) & QValPos < cutoff & PValPos < 0.05]
  signif_neg = node_data[!is.na(QValNeg) & QValNeg < cutoff & PValNeg < 0.05]
  write.table(signif_pos, file = paste(file_prefix,'_signifPos.txt',sep = ''), sep = "\t", quote = F, row.names = F)
  write.table(signif_neg, file = paste(file_prefix,'_signifNeg.txt',sep = ''), sep = "\t", quote = F, row.names = F)
  write.table(cmp_mat, file = paste0(file_prefix, '_cmpAll.txt'), quote=F, row.names = F, sep = "\t")
  if(save_out) save(norm_kos, mets, ko_net, all_comparisons, node_data, file = paste(file_prefix,'_out.rda',sep=''))
  return(list(norm_kos, mets, ko_net, all_comparisons, node_data, cmp_mat))
}


#' Runall function to do complete predictions and evaluate classification of metabolites as high or low abundance
#'
#' @import data.table
#' @param genes Gene abundances
#' @param mets Metabolite abundances
#' @param file_prefix Prefix for output files
#' @param correction Type of multiple hypothesis correction to perform (bonferroni or fdr)
#' @param cutoff Q/P-value cutoff for significance
#' @param net_method Network generation method (see generate_genomic_network)
#' @param id_met Whether metabolites have putative identifications that need to be processed
#' @param met_id_file If id_met, file of metabolite identifications
#' @param degree_filter Threshold for filtering currency metabolites
#' @param minpath_file Optional file of pathways to filter network to
#' @param net_file file containing network template to use
#' @param quant Quantile above which a metabolite is "elevated", default is 0.5
#' @param plot_rank Whether to generate plots of ranks of metabolite concentrations and scores, default is F
#' @param plot_continuous Whether to generate plots of metabolite concentrations and scores, default is F
#' @param nonzero_filter Minimum number of samples required to have nonzero concentrations and nonzero metabolic potential scores in order for metabolite to be evaluated, default is 3
#' @return No return, writes output to file
#' @examples
#'
#' @export
run_all_metabolites2 = function(genes, mets, file_prefix = 'net1', correction = "fdr", cutoff = 0.1,
                               net_method = "load", rxn_table_source = "", id_met = F, met_id_file = '',
                               degree_filter = 0, minpath_file = '',
                               net_file = "", quant = 0.5, plot_rank = F, plot_continuous=F, nonzero_filter=3, rel_abund_mets = F){
  #must be data.tables keyed by KOs/KEGG IDs in first column, all other columns are subject IDs
  #correction must be either "bonferroni" or "fdr", cutoff is q value cutoff
  #id_mets specifies whether to use network for improved metabolite identification (i.e. for Braun datasets)
  #if true, text file with list of putative met ids from Metabosearch must be included
  #can predict above/below median or other threshold with quant argument
  kos = genes[,KO]
  nsamp = dim(genes)[2] - 1
  subjects = intersect(names(genes), names(mets))
  ko_net = generate_genomic_network(kos, keggSource = net_method, degree_filter = degree_filter, minpath_file = minpath_file,
                                    rxn_table = rxn_table_source, networkFile = net_file)
  ko_network_mat = ko_net[[1]]
  stoich_mat = ko_net[[2]]
  ko_net_table = ko_net[[3]]
  if(id_met){
    cat("Selecting best metabolite identifications\n")
    net_compounds = row.names(ko_net[[1]])
    met_id_table = read.table(met_id_file, sep = "\t", header=T)
    #met_id_table = met_id_table[met_id_table$Delta!="-",] #get rid of mets with no possible KEGG ID
    #met_id_list = strsplit(unlist(met_id_list), split = " ")
    if(grepl("swedish",file_prefix)){
      new_mets = select_best_id2(met_id_table, mets, net_compounds)
    } else new_mets = select_best_id(met_id_table, mets, net_compounds)
    mets = new_mets
    cat("Done identifying metabolites!")
  }
  metIDs = mets[,KEGG]
  norm_kos = normalize_ko_counts(genes,subjects, logdata = F) #this actually does nothing right now
  #norm_kos = norm_kos[KO %in% names(ko_network_mat)]
  cmp_mat = get_cmp_scores(ko_network_mat, norm_kos)
  shared_mets = metIDs[metIDs %in% row.names(ko_network_mat)]
  all_comparisons = vector("list",length(shared_mets))
  #The idea here is to simply classify samples as higher or lower than the median and just look at classification error
  #rather than Mantel test
  for(j in 1:length(shared_mets)){
    cmps = unlist(cmp_mat[shared_mets[j],subjects,with=F])
    met1 = unlist(mets[shared_mets[j],subjects, with=F]) #these have very different distributions - I wonder how often this is the case
    if(length(met1[met1!=0]) < nonzero_filter | length(cmps[cmps!=0]) < nonzero_filter){
      all_comparisons[[j]] = NA
    }else{
      med_met = quantile(met1, probs = quant)
      if(any(met1==med_met)) med_cmp = median(cmps[met1==med_met]) else med_cmp = mean(cmps[which(abs(met1-med_met) <= min(abs(met1-med_met)) + 0.001)]) #median of two closest from which median was calculated
      higher_pred = ifelse(cmps > med_cmp,1,0)
      higher_obs = ifelse(met1 > med_met, 1, 0)
      error = sum(abs(higher_pred-higher_obs))/length(higher_obs)
      #need to clarify - this is different from not having any information
      if(plot_rank){
        requireNamespace("ggplot2", quietly = TRUE)
        preds = data.frame(Prediction = factor(higher_pred), cmp = cmps, Value = met1, Rank = rank(met1))
        ggplot2::ggplot(preds, ggplot2::aes(x=Rank,y=Value, col = Prediction)) + ggplot2::geom_point(size=3) + ggplot2::xlim(c(0,max(preds$Rank)))+ ggplot2::theme_bw() + ggplot2::scale_color_manual(values=c("purple","orange")) +
          ggplot2::annotate("text",x=0.3*max(preds$Rank), y=0.8*max(preds$Value),label=met_names(shared_mets[j]))+
          ggplot2::annotate("text",x=0.3*max(preds$Rank), y=0.6*max(preds$Value), label=paste(length(preds$Value[preds$Value==min(preds$Value) & preds$Prediction==0]), length(preds$Value[preds$Value==min(preds$Value) & preds$Prediction==1]), sep = "  "))
        ggplot2::ggsave(file=paste0(file_prefix,"_",shared_mets[j],".png"))
      }
      if(plot_continuous){
        if(any(cmps!=0)){
          plot_ref_mets_by_cmps(met1, cmps,shared_mets[j], file_prefix)
        }
      }
      if(any(cmps!=0)) {
        if(any(higher_pred != 0) & any(higher_pred != 1) & any(higher_obs !=0) & any(higher_obs !=1)){
          ftest = fisher.test(higher_pred, higher_obs, alternative = "greater")
          ftest2 = fisher.test(higher_pred, higher_obs, alternative = "less")
        } else {
          ftest = 1
          ftest2 = 1
        }
      }else {
        ftest = NA
        ftest2 = NA
      }
      all_comparisons[[j]] = list(ID = shared_mets[j], Error = error, Test = ftest, Test2 = ftest2, cmp_med = med_cmp, Met_med = med_met, Pred=higher_pred, Obs=higher_obs)
    }
  }

  #remove ones that didn't pass filter
  shared_mets = shared_mets[which(!is.na(all_comparisons))]
  all_comparisons = all_comparisons[which(!is.na(all_comparisons))]

  #whole set analysis
  pvals = sapply(all_comparisons, function(x){ if(is.na(x$Test)) return(NA) else if(is.numeric(x$Test)) return(1) else return(x$Test$p.value)})
  pvals_c = correct(pvals, method = correction)
  sensitivity = sapply(all_comparisons, function(x){ length(x$Pred[x$Pred==x$Obs & x$Pred==1])/length(x$Obs[x$Obs==1])})
  specificity = sapply(all_comparisons, function(x){ length(x$Pred[x$Pred==x$Obs & x$Pred==0])/length(x$Obs[x$Obs==0])})
  precision = sapply(all_comparisons, function(x){ length(x$Pred[x$Pred==1 & x$Pred==x$Obs])/length(x$Pred[x$Pred==1])})
  #cors_s = sapply(all_comparisons,function(x){return(x$Mantel[[1]]$statistic)})
  #pvals_s = sapply(all_comparisons,function(x){return(x$Mantel[[1]]$signif)})
  #pvals2_s = correct(pvals_s, method = correction)
  pvals_n = sapply(all_comparisons, function(x){if(is.na(x$Test)) return(NA) else if(is.numeric(x$Test2)) return(1) else return(x$Test2$p.value) })
  pvals_nc = correct(pvals_n, method = correction)
  accuracy = 1 - sapply(all_comparisons, function(x){ return(x$Error)})

  node_data = data.table(compound = shared_mets, PValS = pvals, QValPos = pvals_c,
                         #CorrP = cors_p, PValP = pvals_p, QValP = pvals2_p,
                         PValN = pvals_n, QValNeg = pvals_nc, Accuracy = accuracy, Sensitivity = sensitivity, Specificity = specificity, Precision = precision)
  setkey(node_data,compound)
  #write to network file
  write.table(ko_net_table,file=paste(file_prefix,'_edges.txt',sep=''),sep="\t",quote=F,row.names=F)
  #write node attribute file
  write.table(node_data,file = paste(file_prefix,'_nodes2.txt',sep=''),sep="\t",quote=F,row.names=F)
  #plot_mantel_results(all_comparisons, node_data, file_prefix)
  signif_pos = node_data[!is.na(node_data$QValPos) & (node_data$QValPos < cutoff),]
  signif_neg = node_data[!is.na(node_data$QValNeg) & node_data$QValNeg < cutoff,]
  #write.table(signif_pos, file = paste(file_prefix,'_signifPos.txt',sep = ''), sep = "\t", quote = F, row.names = F)
  #write.table(signif_neg, file = paste(file_prefix,'_signifNeg.txt',sep = ''), sep = "\t", quote = F, row.names = F)
  save(norm_kos, mets, ko_net, all_comparisons, node_data, file = paste(file_prefix,'_out2.rda',sep=''))
  return(NULL)
  #return(list(all_comparisons = all_comparisons, met_table = node_data))
}


plot_mantel_results = function(metabolite_list, node_data, file_prefix){
  pdf(file = paste(file_prefix,"_qval_hist.pdf", sep = ''), width = 10)
  par(mfrow = c(1,3))
  hist(node_data$QValPos, main = "Spearman correlation Mantel", ylim = c(0, 0.6*length(metabolite_list)), breaks = 25)
  abline(v = 0.05, col = "red")
  #hist(node_data$QValP, main = "Pearson correlation Mantel", ylim = c(0, 0.6*length(metabolite_list)), breaks = 25)
  #abline(v = 0.05, col = "red")
  hist(node_data$QValNeg, main = "Negative correlation Mantel", ylim = c(0, 0.6*length(metabolite_list)), breaks = 25)
  abline(v = 0.05, col = "red")
  dev.off()
}


plot_ref_mets_by_cmps = function(met, cmps, id, file_prefix){
  requireNamespace("ggplot2", quietly = TRUE)
  med_met = quantile(met, probs = quant)
  if(any(met==med_met)) med_cmp = median(cmps[met==med_met]) else med_cmp = mean(cmps[which(abs(met-med_met) <= min(abs(met-med_met)) + 0.001)]) #median of two closest from which median was calculated
  ref_met = met - med_met
  ref_cmp = cmps - med_cmp
  higher_pred = ifelse(cmps > med_cmp,1,0)
  higher_obs = ifelse(met > med_met, 1, 0)
  plot_data = data.frame(cmp=ref_cmp, Met=ref_met, HigherPred=factor(higher_pred),HigherObs=factor(higher_obs))
  x0=ggplot2::ggplot(plot_data,ggplot2::aes(x=cmp,y=Met))+ggplot2::geom_point(size=4)+ggplot2::xlab("Score reference difference")+ ggplot2::ylab("Metabolite reference difference") + ggplot2::theme_bw() + ggplot2::geom_vline(xintercept=med_cmp, linetype=2) + ggplot2::geom_hline(ylintercept=med_met, linetype=2)#+
  print(med_met)
  print(med_cmp)
    #annotate("text",x=0.3*max(plot_data$cmp), y=0.8*max(plot_data$Met),label=met_names(id))
  return(x0)
  #ggsave(file=paste0(file_prefix,id,".png"))
}



#' Calculate scores based on only a single gene.
#'
#' @import data.table
#'
#'
#'
#' @export
single_gene_cmp = function(compound, gene, norm_kos, ko_net){


}

#' Identify potential important gene contributors for each metabolite.
#'
#' @import data.table
#' @param j index (from lapply usually)
#' @param cmps_sub_good CMP scores for metabolites to be analyzed
#' @param all_rxns list of reaction tables for each compound
#' @param subjects vector of subject names
#' @param norm_kos gene abundance matrix
#' @param ko_net full network
#' @return list of information on contributors. Item 1 is the table of correlations between scores with and without each gene (also written to a file). Item 2 is a table of the associated reactions for each of the contributor genes. Item 3 is an integer equal to 1 if the metabolite scores are predicted primarily by synthesis, -1 if scores are predicted primarily by degradation, or 0 if neither. Item 4 is a vector of the number of genes and contributor genes involved in synthesis and degradation reactions.
#' @examples
#' lapply(1:length(good_mets), cmp_contributions, cmps_sub_good = cmps_sub_good, all_rxns = all_rxns[[j]],
#' subjects=subjects, norm_kos = norm_kos, ko_net = ko_net)
#' @export
gene_contributions = function(j, cmps_sub_good, all_rxns, subjects, norm_kos, ko_net){
  #index (from sapply usually), cmp matrix, list of reaction tables for each compound, subjects, gene matrix, full network
  if(!is.null(all_rxns[[j]])){
    compound = cmps_sub_good[j,compound]
    kos_involved = unique(all_rxns[[j]][Reversible==0,KO])
    ko_vals = norm_kos[KO %in% kos_involved]
    #plan - look at correlation between old cmp score and cmp score w/out each KO - least correlated is most impactful KO
    ko_cmp_cors = sapply(kos_involved, function(x){
      vals_without = ko_vals[KO != x]
      cmp_without = data.matrix(ko_net[[1]][compound,vals_without[,KO]])%*%data.matrix(vals_without[,subjects,with=F])
      return(cor(as.vector(unlist(cmps_sub_good[compound,subjects,with=F])),as.vector(cmp_without), method="spearman"))
    })
    ko_cors = data.table(KO=kos_involved, Cor=ko_cmp_cors)
    #save KOs that have a major effect
    ko_good = ko_cors[is.na(Cor)|(Cor < 0.5),KO]
    net_primary = all_rxns[[j]][KO %in% ko_good & Reversible==0]
    if(dim(net_primary)[1]==0){ #if none have a major effect, look at all reactions to get primary predicting force
      if(all(all_rxns[[j]][Reversible==0,Prod]==compound)){ primary_make = 1} else if(all(all_rxns[[j]][Reversible==0,Reac]==compound)){ primary_make = -1} else primary_make = 0
    } else if(all(net_primary[,Prod]==compound)) {
      primary_make = 1
      } else if(all(net_primary[,Reac]==compound)) {
        primary_make = -1 } else { primary_make = 0 }
    nprod = length(unique(all_rxns[[j]][Reversible==0 & compound==Prod, KO]))
    nreac = length(unique(all_rxns[[j]][Reversible==0 & compound==Reac, KO]))
    if(nrow(net_primary)==0){
      nkey_prod = length(unique(all_rxns[[j]][Reversible==0 & compound==Prod, KO]))
      nkey_reac = length(unique(all_rxns[[j]][Reversible==0 & compound==Reac, KO]))
    } else{
      nkey_prod = length(unique(net_primary[Reversible==0 & compound==Prod, KO]))
      nkey_reac = length(unique(net_primary[Reversible==0 & compound==Reac, KO]))
    }
    #write.table(ko_cors, file = paste0("geneContribAnalysis_", compound, ".txt", ))
    ko_cors = merge(ko_cors,  all_rxns[[j]][KO %in% kos_involved & Reversible==0], by = "KO")
    ko_cors[,compound:=compound]
    compound_info = data.table(compound = compound, PrimaryMake = primary_make, nKOReac = nreac, nKOProd = nprod, nkeyKOReac = nkey_reac, nkeyKOProd = nkey_prod)
    return(list(ko_cors, compound_info))
  } else stop(paste0("No reactions for compound ", j))
}

#' Compare a single set each of CMP scores and metabolite concentrations
#'
#' @import data.table
#' @param met_met metabolite to use from metabolite concentration data
#' @param met_cmp metabolite to use from CMP score data
#' @param met_all matrix of metabolite concentrations
#' @param cmp_all matrix of CMP scores
#' @param posneg Whether to test for positive or negative concentration
#' @param cor_method spearman or pearson
#' @param nperm number of permutations for Mantel test
#' @return p-value from Mantel test
#' @examples
#' compare_met("C00001", "C00002", mets, cmp_mat, "pos")
#' @export
compare_met = function(met_met, met_cmp, met_all, cmp_all, posneg="pos", cor_method="spearman", nperm=15000){
  good_subs = names(met_all)[which(!is.na(unlist(met_all[met_met])) & names(met_all)!="KEGG")]
  met_mat = make_pairwise_met_matrix(met_cmp, cmp_all[,c(good_subs,"compound"),with=F])
  metabol_mat = make_pairwise_met_matrix(met_met, met_all[,c(good_subs,"KEGG"),with=F])
  if(posneg=="pos") test = mantel_2sided(met_mat,metabol_mat,method=cor_method,permutations = nperm, direction = "pos")
  else test = mantel_2sided(met_mat,metabol_mat,method=cor_method,permutations = nperm, direction = "neg")
  return(test$signif)
}


#' Randomize a metabolic network edge list by randomly sampling 2 edges and if it works, switching products
#'
#' @import data.table
#' @param netw Network edge list, format of the 3rd output item of generate_genomic_network
#' @param n_reps Number of successful edge switches to perform (default = 5000)
#' @return Network edge list in the same format with the same compounds and degree distribution, but with randomized edges
#' @examples
#' randomize_net(edge_list, 3000)
#' @export
randomize_net=function(netw, n_reps = 5000){
  revnet = get_non_rev_rxns(netw, all_rxns=F)
  network2=revnet #we'll expand the reversible edges back out later
  #Don't want the switching to be biased towards reversible edges
  n_edges = dim(revnet)[1]
  m = 1
  while(m < n_reps){
    rand_is=sample(n_edges,size=2)
    rand_edges=network2[rand_is]
    if(all(rand_edges[,Reversible]==0) | all(rand_edges[,Reversible]==1)){ ##if both are single edges or both are reversible
      ##check that switched connection does not already exist
      comps=unique(c(rand_edges[,Prod], rand_edges[,Reac]))
      check=network2[Reac %in% comps & Prod %in% comps & KO %in% rand_edges[,KO]]
      if(dim(check)[1]==dim(rand_edges)[1] & length(comps)==4){
        #make the switch
        ind1 = which(network2[,KO]==rand_edges[1,KO] & network2[,Reac]==rand_edges[1,Reac] & network2[,Prod]==rand_edges[1,Prod])
        network2[ind1, Prod:=rand_edges[2,Prod]]
        network2[ind1, stoichProd:=rand_edges[2,stoichProd]]
        ind2 = which(network2[,KO]==rand_edges[2,KO] & network2[,Reac]==rand_edges[2,Reac] & network2[,Prod]==rand_edges[2,Prod])
        network2[ind2, Prod:=rand_edges[1,Prod]]
        network2[ind2, stoichProd:=rand_edges[1,stoichProd]]
        m=m+1
      }
    }
  }
  #Add back reversible complements
  for(j in 1:length(network2[,KO])){
    if(network2[j,Reversible]==1){
      network2 = rbind(network2, data.table(KO=network2[j,KO], Reac=network2[j,Prod], Prod=network2[j,Reac], stoichReac=network2[j,stoichProd], stoichProd=network2[j,stoichReac], Reversible=1))
    }
  }
  network2[,stoichReac:=as.numeric(stoichReac)]
  network2[,stoichProd:=as.numeric(stoichProd)]
  return(network2)
}

#' Run one iteration of MIMOSA analysis with a randomized metabolic network. Run this function many times to generate a null distribution of metabolite comparisons.
#'
#' @import data.table
#' @param out_file File path to .Rda file of MIMOSA output from run_all_metabolites
#' @param id_num ID number of shuffle (if running many iterations)
#' @param n_iter Number of edge switches to generate randomized network, default is 5000
#' @param nonzero_filter Minimum number of samples required to have nonzero concentrations and nonzero metabolic potential scores in order for metabolite to be evaluated, default is 3
#' @param qval_thresholds 1 or more significance thresholds above which to count the number of metabolites
#' @return None, writes two tables to file - one of the total counts of metabolites meeting each threshold, and one of the results for each metabolite
#' @examples
#' run_shuffle("Dataset2_bv_out.rda", id_num = 1)
#' @export
#'
run_shuffle = function(out_file, id_num = 1, n_iter = 5000, nonzero_filter = 3, qval_thresholds = c(0.1, 0.01)){
  load(out_file)
  rxn_table = ko_net[[3]]
  random_net = randomize_net(rxn_table, n_iter)
  net_mats = make_network_matrix(random_net)
  cmps = get_cmp_scores(net_mats[[1]], norm_kos)

  metIDs = mets[,KEGG]
  shared_mets = metIDs[metIDs %in% row.names(net_mats[[1]])]
  mets_shared_only = mets[shared_mets]
  cmps_shared_only = cmps[shared_mets]

  good_data = which(apply(cmps_shared_only, 1, function(x){ length(x[as.numeric(x)!=0]) >= nonzero_filter }) & apply(mets_shared_only, 1, function(x){ length(x[as.numeric(x)!=0]) >= nonzero_filter }))
  mets_shared_only = mets_shared_only[good_data]
  cmps_shared_only = cmps_shared_only[good_data]
  shared_mets = shared_mets[good_data]

  compare_pos = sapply(1:length(shared_mets), function(x){
    compare_met(shared_mets[x], shared_mets[x], mets_shared_only, cmps_shared_only, posneg = "pos", nperm = 10000)
  })
  compare_neg = sapply(1:length(shared_mets), function(x){
    compare_met(shared_mets[x], shared_mets[x], mets_shared_only, cmps_shared_only, posneg = "neg", nperm = 10000)
  })

  corrected_pos = correct(compare_pos, method="fdr")
  corrected_neg = correct(compare_neg, method = "fdr")
  total_pos = c()
  total_neg = c()
  for(j in 1:length(qval_thresholds)){
    total_pos[j] = length(corrected_pos[corrected_pos < qval_thresholds[j] & compare_pos < qval_thresholds[j] & !is.na(corrected_pos)])
    total_neg[j] = length(corrected_neg[corrected_neg < qval_thresholds[j] & compare_neg < qval_thresholds[j] & !is.na(corrected_neg)])
  }

  met_table = data.table(compound=shared_mets, Pos=corrected_pos, Neg=corrected_neg, Iter=rep(id_num, length(shared_mets)))

  file_prefix = paste0(gsub("_out.rda", "", out_file), id_num)

  write.table(data.table(t(total_pos), t(total_neg)), file = paste0(file_prefix, "_shuff_network.txt"), sep = "\t", quote=F, row.names=F, col.names=F)
  write.table(met_table, file = paste0(file_prefix,"_shuff_network_mets.txt"), sep = "\t", quote=F, row.names=F, col.names=F)
}
borenstein-lab/mimosa2 documentation built on Dec. 19, 2024, 12:44 a.m.