R/ReturnPathwaysEnrich_InputAnalytes.R

Defines functions findCluster getCustomPathwayFromAnalyte getPathwayFromAnalyte runCombinedFisherTest runFisherTest

Documented in findCluster getCustomPathwayFromAnalyte getPathwayFromAnalyte runCombinedFisherTest runFisherTest

#' Do fisher test for only one pathway from search result
#' clicked on highchart
#' @param analytes a vector of analytes (genes or metabolites) that need to be searched
#' @param NameOrIds whether input is "names" or "ids" (default is "ids", must be the same for analytes and background)
#' @param total_genes number of genes analyzed in the experiment (e.g. background) (default is 20000, with assumption that analyte_type is "genes")
#' @param analyte_type "metabolites" or "genes" (default is "metabolites")
#' @param MCall T/F if true, all pathways are used for multiple comparison corrections; if false, only pathways covering user analytes will be used (default is "F")
#' @param alternative alternative hypothesis test passed on to fisher.test().  Options are two.sided, greater, or less (default is "less")
#' @param min_path_size the minimum number of pathway members (genes and metabolites) to include the pathway in the output (default = 5)
#' @param max_path_size the maximum number of pathway memnbers (genes and metaboltes) to include the pathway in the output (default = 150)
#' @param background_type type of background that is input by the user.  Opions are "database" if user wants all
#' analytes from the RaMP database will be used; "file", if user wants to input a file with a list of background
#' analytes; "list", if user wants to input a vector of analyte IDs; "biospecimen", if user wants to specify a
#' biospecimen type (e.g. blood, adipose tissue, etc.) and have those biospecimen-specific analytes used.  For genes,
#' only the "database" option is used.
#' @param background background to be used for Fisher's tests.  If parameter 'background_type="database"', this parameter
#' is ignored (default="database"); if parameter 'background_type= "file"', then 'background' should be a file name (with
#' directory); if 'background_type="list"', then 'background' should be a vector of RaMP IDs; if 'backgroud_type="biospecimen"'
#' then users should specify one of the following: "Blood", "Adipose tissue", "Heart", "Urine", "Brain", "Liver", "Kidney",
#' "Saliva", and "Feces"
#' @param pathway_definitions If "RaMP" (default), use pathway definitions within RaMP-DB. Else, supply path to gmx file containing custom pathway definitions. GMX files are a tab-separated format that contain one analyte set per column, with the name of the set in the first row, and constituent analytes in subsequent rows
#' @return a dataframe with columns containing pathway ID, fisher's p value, user analytes in pathway, and total analytes in pathway

runFisherTest <- function(analytes,
                          total_genes = 20000,
                          NameOrIds = "ids",
                          analyte_type = "metabolites",
                          MCall = F, alternative = "less", min_path_size = 5, max_path_size = 150,
                          background_type = "database", background = "database",
                          pathway_definitions = "RaMP") {
  if(analyte_type == "genes"){
    background_type = "database"
    print("Using database background for genes")
  }
  now <- proc.time()
  print("Fisher Testing ......")
  if (pathway_definitions != "RaMP") {
    pathwaydf <- getCustomPathwayFromAnalyte(analytes,
      pathway_definitions,
      analyte_type = analyte_type
    )
  } else {
    pathwaydf <- getPathwayFromAnalyte(analytes,
      includeRaMPids = TRUE,
      NameOrIds = NameOrIds,
      find_synonym = FALSE
    )
  }

  pathwayRampId <- rampId <- c()

  pathwayRampId <- rampId <- c()

  if (analyte_type == "metabolites") {
    pathwaydf <- pathwaydf[grep("RAMP_C_", pathwaydf$rampId), ]
  } else if (analyte_type == "genes") {
    pathwaydf <- pathwaydf[grep("RAMP_G_", pathwaydf$rampId), ]
  }


  # moved this check until we determine if we have analytes of a given type.
  if (nrow(pathwaydf) == 0) {
    return(NULL)
  }

#  if(class(background_type)=="list"){
   if(is(background_type, "list")){
    background = unlist(background)
  }

  if (pathway_definitions == "RaMP"){
    if (background_type == "list" & analyte_type == "metabolites") {
      backgrounddf <- getPathwayFromAnalyte(background,
                                            includeRaMPids = TRUE,
                                            NameOrIds = NameOrIds
                                            )
      print("Custom background specified, genes will be discarded")

    } else if (background_type=="file" & analyte_type == "metabolites") {
      userbkg <- utils::read.table(background, header=F)[,1]
      backgrounddf <- getPathwayFromAnalyte(userbkg,
                                            includeRaMPids = TRUE,
                                            NameOrIds = NameOrIds
                                            )
      print("Custom background specified, genes will be discarded")
    } else if (background_type == "biospecimen" & analyte_type == "metabolites") {
      biospecimen <- background
      if (biospecimen == "Adipose") {
        biospecimen <- "Adipose tissue"
      }

      # we don't need all fields from all tables joined
      # Get metabolites that belong to a specific biospecimen
      # query <- paste0(
      #   "SELECT analytehasontology.*, ontology.*, analytehaspathway.* from analytehasontology, ontology, analytehaspathway where ontology.commonName in ('",
      #   biospecimen,
      #   "') and ontology.rampOntologyId = analytehasontology.rampOntologyId and analytehasontology.rampCompoundId = analytehaspathway.rampId"
      # )

      # less data pull back
      query <- paste0(
        "SELECT analytehaspathway.* from analytehasontology, ontology, analytehaspathway where ontology.commonName in ('",
        biospecimen,
        "') and analytehasontology.rampOntologyId = ontology.rampOntologyId and analytehasontology.rampCompoundId = analytehaspathway.rampId"
      )

      con <- connectToRaMP()
      backgrounddf <- RMariaDB::dbGetQuery(con, query)
      RMariaDB::dbDisconnect(con)
      if (nrow(backgrounddf) == 0) {
        stop("Biospecimen background not found. Choices are 'Blood', 'Adipose', 'Heart', 'Urine', 'Brain', 'Liver', 'Kidney', 'Saliva', and 'Feces'")
      }
      # only keep the input metabolites (converted into pathwaydf in line above) that are in the biospecimen type specified
      pathwaydf <- with(pathwaydf, {
        pathwaydf %>%
          dplyr::filter(rampId %in% backgrounddf$rampId)
      })
      if (nrow(pathwaydf) == 0) {
        stop("There are no metabolites in your input that map to your selected biospecimen")
      }
    } else if (background_type == "database") {
    # do nothing, it's handled down below in if statements
    } else {
      stop("background_type was not specified correctly.  Please specify one of the following options: database, file, list, biospecimen")
    }
  }else{
    if (background_type == "list" & analyte_type == "metabolites") {
      backgrounddf <- getCustomPathwayFromAnalyte(background,
                                                  pathway_definitions,
                                                  analyte_type = analyte_type
                                                  )
      print("Custom background specified, genes will be discarded")
    } else if (background_type == "file" & analyte_type == "metabolites") {
      userbkg <- read.table(background, header = F)[, 1]
      backgrounddf <- getCustomPathwayFromAnalyte(userbkg,
                                                  pathway_definitions,
                                                  analyte_type = analyte_type
                                                  )
      print("Custom background specified, genes will be discarded")
    }else if(analyte_type == "genes"){
      # do nothing, it's handled down below in if statements
    }else{
      stop("Only custom backgrounds are supported for custom pathway definitions. Please provide a 'list' or 'file' containing the analyte background")
    }
  }

  ## Check that all metabolites of interest are in the background
  if (background_type != "database") {
    if (length(setdiff(pathwaydf$rampId, backgrounddf$rampId) != 0)) {
      stop("All analytes in set of interest must also be in background")
    }
  }

  ## Initialize empty contingency table for later
  contingencyTb <- matrix(0, nrow = 2, ncol = 2)
  colnames(contingencyTb) <- c("In Pathway", "Not In Pathway")
  rownames(contingencyTb) <- c("All Metabolites", "User's Metabolites")
  ## Get pathway ids that contain the user analytes
  pid <- unique(pathwaydf$pathwayRampId)
  list_pid <- sapply(pid, shQuote)
  list_pid <- paste(list_pid, collapse = ",")

  # Get the total number of metabolites that are mapped to pathways in RaMP (that's the default background)
  # added conditional to not pull hmdb ids
  query <- "select distinct rampId, pathwaySource from analytehaspathway where pathwaySource != 'hmdb';"
  con <- connectToRaMP()
  allids <- RMariaDB::dbGetQuery(con, query)

  # Close connection, then deduplicate id list
  RMariaDB::dbDisconnect(con)
  allids <- allids[!duplicated(allids), ]


  if ((analyte_type == "metabolites")) {

    # JCB replaced these lines. Reducing to a source, extracting compound indices, then applying to the full set of rampIds
    # caused an error in the tally of analytes
    #
    # wiki_totanalytes <- length(unique(allids$rampId[grep("RAMP_C",allids[which(allids$pathwaySource=="wiki"),"rampId"])]))
    # react_totanalytes <- length(unique(allids$rampId[grep("RAMP_C",allids[which(allids$pathwaySource=="reactome"),"rampId"])]))
    # kegg_totanalytes <- length(unique(allids$rampId[grep("RAMP_C",allids[which(allids$pathwaySource=="kegg"),"rampId"])]))

    # first extract source-specific ids, then select for compound ids from the source-specific ids
    sourceIds <- allids[which(allids$pathwaySource == "wiki"), "rampId"]
    wiki_totanalytes <- length(unique(sourceIds[grep("RAMP_C", sourceIds)]))

    sourceIds <- allids[which(allids$pathwaySource == "reactome"), "rampId"]
    react_totanalytes <- length(unique(sourceIds[grep("RAMP_C", sourceIds)]))

    ## sourceIds <- allids[which(allids$pathwaySource=="kegg"),"rampId"]
    ## kegg_totanalytes <- length(unique(sourceIds[grep("RAMP_C",sourceIds)]))

    sourceIds <- allids[which(allids$pathwaySource == "kegg"), "rampId"]
    kegg_totanalytes <- length(unique(sourceIds[grep("RAMP_C", sourceIds)]))
  } else if (analyte_type == "genes") {

    # for now we're using a fixed population size for genes
    # this can be enhanced to take a list of all measured genes
    # or use a subset of genes having pathway annotations within each source
    wiki_totanalytes <- react_totanalytes <- kegg_totanalytes <- total_genes
  } else {
    print("analyte_type must be 'metabolites' or 'genes'")
  }

  ## Input_RampIds is a table of all analytes included in pathways represented in the user set
  ## "User" refers to significant analytes
  input_RampIds <- buildFrequencyTables(pathwaydf, pathway_definitions, analyte_type)

  if (is.null(input_RampIds)) {
    stop("Data doesn't exist")
  } else {
    segregated_id_list <- segregateDataBySource(input_RampIds)
  }
  # Loop through each pathway, build the contingency table, and calculate Fisher's Exact
  # test p-value
  pval <- totinpath <- userinpath <- pidused <- c()

  pidCount = 0

  for (i in pid) {
    ids_inpath <- pathwaydf[which(pathwaydf$pathwayRampId == i), "rampId"]

    pidCount = pidCount + 1

    if (analyte_type == "metabolites") {
      # Check to make sure that this pathway does have metabolites
      if (length(grep("RAMP_C", ids_inpath)) == 0) {
        user_in_pathway <- 0
      } else {
        user_in_pathway <- length(unique(grep("RAMP_C", ids_inpath, value = TRUE)))
        if (background_type != "database") {
          ids_inpath_bg <- backgrounddf[which(backgrounddf$pathwayRampId == i), "rampId"]
          bg_in_pathway <- length(unique(grep("RAMP_C", ids_inpath_bg, value = TRUE)))
        }
      }
      if(pidCount == 1) {
        inputkegg <- segregated_id_list[[1]][1][[1]]
        inputreact <- segregated_id_list[[1]][2][[1]]
        inputwiki <- segregated_id_list[[1]][3][[1]]
        inputcustom <- segregated_id_list[[1]][[4]]
        tot_user_analytes <- length(grep("RAMP_C", unique(pathwaydf$rampId)))
        if (background_type != "database") {
          tot_bg_analytes <- length(grep("RAMP_C", unique(backgrounddf$rampId)))
        }
      }
    } else { # if genes
      # Check to make sure that this pathway does have genes
      if (length(grep("RAMP_G", ids_inpath)) == 0) {
        user_in_pathway <- 0
      } else {
        user_in_pathway <- length(unique(grep("RAMP_G", ids_inpath, value = TRUE)))
      }
      if(pidCount == 1) {
        inputkegg <- segregated_id_list[[2]][1][[1]]
        inputreact <- segregated_id_list[[2]][2][[1]]
        inputwiki <- segregated_id_list[[2]][3][[1]]
        inputcustom <- segregated_id_list[[2]][[4]]
        tot_user_analytes <- length(grep("RAMP_G", unique(pathwaydf$rampId)))
      }
      ## tot_bg_analytes <- length(grep("RAMP_G", unique(backgrounddf$rampId)))
    }
    if ((!is.na(inputkegg$pathwayRampId[1])) && i %in% inputkegg$pathwayRampId) {
      tot_in_pathway <- inputkegg[which(inputkegg[, "pathwayRampId"] == i), "Freq"]
      total_pathway_analytes <- kegg_totanalytes
    } else if ((!is.na(inputwiki$pathwayRampId[1])) && i %in% inputwiki$pathwayRampId) {
      tot_in_pathway <- inputwiki[which(inputwiki[, "pathwayRampId"] == i), "Freq"]
      total_pathway_analytes <- wiki_totanalytes
    } else if ((!is.na(inputreact$pathwayRampId[1])) && i %in% inputreact$pathwayRampId) {
      tot_in_pathway <- inputreact[which(inputreact[, "pathwayRampId"] == i), "Freq"]
      total_pathway_analytes <- react_totanalytes
    } else if ((!is.na(inputcustom$pathwayRampId[1])) && i %in% inputcustom$pathwayRampId) {
      tot_in_pathway <- inputcustom[which(inputcustom[, "pathwayRampId"] == i), "Freq"]
      if(analyte_type == "metabolites"){
        total_pathway_analytes <- 0
      }else{
        total_pathway_analytes <- kegg_totanalytes
      }
    } else {
      tot_in_pathway <- 0
    }

    if (tot_in_pathway == 0 || user_in_pathway == 0) {
      pval <- c(pval, NA)
    } else {
      tot_out_pathway <- total_pathway_analytes - tot_in_pathway
      # fill the rest of the table out

      ## user_in_pathway <- length(unique(pathwaydf[which(pathwaydf$pathwayRampId==i),"rampId"]))
      if (background_type != "database") {
        bg_in_pathway <- length(unique(backgrounddf[which(backgrounddf$pathwayRampId == i), "rampId"]))
      }
      # EM - Corrected the following line that initially counted all input analytes without regard as to whether
      # whether they were genes or metabolites.
      # user_out_pathway <- length(unique(pathwaydf$rampId)) - user_in_pathway
      user_out_pathway <- tot_user_analytes - user_in_pathway

      if (background_type != "database") {
        bg_in_pathway <- length(unique(backgrounddf[which(backgrounddf$pathwayRampId == i), "rampId"]))
        bg_out_pathway <- tot_bg_analytes - bg_in_pathway
      }

      if (background_type == "database") {
        contingencyTb[1, 1] <- tot_in_pathway - user_in_pathway
      } else {
        contingencyTb[1, 1] <- bg_in_pathway
      }

      if (background_type == "database") {
        contingencyTb[1, 2] <- tot_out_pathway - user_out_pathway
      } else {
        contingencyTb[1, 2] <- bg_out_pathway
      }
      contingencyTb[2, 1] <- user_in_pathway
      contingencyTb[2, 2] <- user_out_pathway
      # Put the test into a try catch in case there's an issue, we'll have some details on the contingency matrix
      tryCatch(
        {
          result <- stats::fisher.test(contingencyTb, alternative = alternative)
        },
        error = function(e) {
          print(toString(e))
          print(i)
          print(contingencyTb)
          print(tot_in_pathway)
          print(tot_out_pathway)
          print(user_in_pathway)
          print(user_out_pathway)
          print(analyte_type)
          print(pathwaydf)
        }
      )

      pval <- c(pval, result$p.value)
    } # End else tot_in_pathway is not zero

    userinpath <- c(userinpath, user_in_pathway)
    totinpath <- c(totinpath, tot_in_pathway)
    pidused <- c(pidused, i)
  } # end for loop

  print("")
  print(now - proc.time())
  print("before optional MCall")
  print("")

  # Now run fisher's tests for all other pids (all pathways not covered in dataset)
  if (MCall == T) {
    # Now run fisher's tests for all other pids
    query <- "select distinct(pathwayRampId) from analytehaspathway where pathwaySource != 'hmdb';"
    con <- connectToRaMP()
    allpids <- RMariaDB::dbGetQuery(con, query)
    RMariaDB::dbDisconnect(con)
    pidstorun <- setdiff(allpids[, 1], pid)
    pidstorunlist <- sapply(pidstorun, shQuote)
    pidstorunlist <- paste(pidstorunlist, collapse = ",")

    query2 <- paste0(
      "select rampId,pathwayRampId from analytehaspathway where pathwayRampId in (",
      pidstorunlist, ")"
    )
    con <- connectToRaMP()
    restcids <- RMariaDB::dbGetQuery(con, query2) # [[1]]
    RMariaDB::dbDisconnect(con)

    # modify to not take hmdb pathways
    query1 <- paste0("select rampId,pathwayRampId from analytehaspathway where pathwaySource != 'hmdb';")

    con <- connectToRaMP()
    allcids <- RMariaDB::dbGetQuery(con, query1) # [[1]]
    RMariaDB::dbDisconnect(con)

    # We're collecting p-values for all pathways, now those with no analyte support at all - JCB:?

    # calculating p-values for all other pathways
    kegg_metab <- kegg_metab
    kegg_gene <- kegg_gene
    wiki_metab <- wiki_metab
    wiki_gene <- wiki_gene
    reactome_metab <- reactome_metab
    reactome_gene <- reactome_gene
    hmdb_metab <- hmdb_metab
    hmdb_gene <- hmdb_gene
    count <- 1
    pval2 <- userinpath2 <- totinpath2 <- c()

    for (i in pidstorun) {
      if ((count %% 100) == 0) {
        print(paste0("Processed ", count))
      }
      count <- count + 1
      user_in_pathway <- 0
      if (analyte_type == "metabolites") {
        if (i %in% kegg_metab$pathwayRampId) {
          tot_in_pathway <- kegg_metab[which(kegg_metab[, "pathwayRampId"] == i), "Freq"]
          total_analytes <- kegg_totanalytes
        } else if (i %in% wiki_metab$pathwayRampId) {
          tot_in_pathway <- wiki_metab[which(wiki_metab[, "pathwayRampId"] == i), "Freq"]
          total_analytes <- wiki_totanalytes
        } else if (i %in% reactome_metab$pathwayRampId) {
          tot_in_pathway <- reactome_metab[which(reactome_metab[, "pathwayRampId"] == i), "Freq"]
          total_analytes <- react_totanalytes
        } else if (i %in% hmdb_metab$pathwayRampId) {
          tot_in_pathway <- hmdb_metab[which(hmdb_metab[, "pathwayRampId"] == i), "Freq"]
          total_analytes <- NULL
        } else {
          tot_in_pathway <- 0
          total_analytes <- NULL
        }
      } else {
        if (i %in% kegg_gene$pathwayRampId) {
          tot_in_pathway <- kegg_gene[which(kegg_gene[, "pathwayRampId"] == i), "Freq"]
          total_analytes <- kegg_totanalytes
        } else if (i %in% wiki_gene$pathwayRampId) {
          tot_in_pathway <- wiki_gene[which(wiki_gene[, "pathwayRampId"] == i), "Freq"]
          total_analytes <- wiki_totanalytes
        } else if (i %in% reactome_gene$pathwayRampId) {
          tot_in_pathway <- reactome_gene[which(reactome_gene[, "pathwayRampId"] == i), "Freq"]
          total_analytes <- react_totanalytes
        } else if (i %in% hmdb_gene$pathwayRampId) {
          tot_in_pathway <- hmdb_gene[which(hmdb_gene[, "pathwayRampId"] == i), "Freq"]
          total_analytes <- NULL
        } else {
          tot_in_pathway <- 0
          total_analytes <- NULL
        }
      }

      # Check that the pathway being considered has your analyte type, if not, move on

      if (is.null(total_analytes)) {
        next
      }
      tot_out_pathway <- total_analytes - tot_in_pathway
      # fill the rest of the table out

      # JCB: Another issue 10/7/2020
      # This line was used for user_out_pathway
      # This section of code is for all pathways that have no analyte support.
      user_out_pathway <- length(unique(pathwaydf$rampId))

      # This line was commented out in production *but* now we have total_analytes set properly
      # not sure why this line was changed.
      # user_out_pathway <- total_analytes - user_in_pathway

      contingencyTb[1, 1] <- tot_in_pathway - user_in_pathway
      contingencyTb[1, 2] <- tot_out_pathway - user_out_pathway
      contingencyTb[2, 1] <- user_in_pathway
      contingencyTb[2, 2] <- user_out_pathway
      # Added try catch
      tryCatch(
        {
          result <- stats::fisher.test(contingencyTb, alternative = alternative)
        },
        error = function(e) {
          print(toString(e))
          print(i)
          print(contingencyTb)
        }
      )

      pval2 <- c(pval2, result$p.value)
      userinpath2 <- c(userinpath2, user_in_pathway)
      totinpath2 <- c(totinpath2, tot_in_pathway)
      # pidused <- c(pidused,i)
    } # end for loop
    keepers <- intersect(
      which(c(totinpath, totinpath2) >= min_path_size),
      which(c(totinpath, totinpath2) < max_path_size)
    )

    print(paste0("Calculated p-values for ", length(c(pval, pval2)), " pathways"))
    out <- data.frame(
      pathwayRampId = c(pidused, pidstorun)[keepers],
      Pval = c(pval, pval2)[keepers], # FDR.Adjusted.Pval=fdr,
      # Holm.Adjusted.Pval=holm,
      Num_In_Path = c(userinpath, userinpath2)[keepers],
      Total_In_Path = c(totinpath, totinpath2)[keepers]
    )
  } # end if MCall is T and all pathways are being calculated, even ones that do not represent input analytes

  else {

    # only keep pathways that have >= min_path_size or < max_path_size compounds
    keepers <- intersect(
      which(c(totinpath) >= min_path_size),
      which(c(totinpath) < max_path_size)
    )

    # hist(totinpath,breaks=1000)
    print(paste0("Keeping ", length(keepers), " pathways"))
    # fdr <- stats::p.adjust(c(pval,pval2)[keepers],method="fdr")
    # holm <- stats::p.adjust(c(pval,pval2)[keepers],method="holm")
    print(paste0("Calculated p-values for ", length(pval), " pathways"))

    # format output (retrieve pathway name for each unique source id first
    out <- data.frame(
      pathwayRampId = pidused[keepers],
      Pval = pval[keepers], # FDR.Adjusted.Pval=fdr,
      # Holm.Adjusted.Pval=holm,
      Num_In_Path = userinpath[keepers],
      Total_In_Path = totinpath[keepers]
    )
  }
  # End else if MCall (when False)

  # Remove duplicate pathways between wikipathways and reactome, only perfect overlaps
  # only make the dup list if it doesn't exist from a previous run in the session
  if( !exists('duplicate_pathways')) {
    duplicate_pathways <<- findDuplicatePathways()
  }
  if (any(out$pathwayRampId %in% duplicate_pathways)) {
    out <- out[-which(out$pathwayRampId %in% duplicate_pathways), ]
  }

  out <- out[!duplicated(out), ]

  # for user is the output needed, based on what user input
  return(list(out, pathwaydf))
}

#' Do fisher test for only one pathway from search result
#' clicked on highchart
#' @param analytes a vector of analytes (genes or metabolites) that need to be searched
#' @param NameOrIds whether input is "names" or "ids" (default is "ids", must be the same for analytes and background)
#' @param total_genes number of genes analyzed in the experiment (e.g. background) (default is 20000, with assumption that analyte_type is "genes")
#' @param min_analyte if the number of analytes (gene or metabolite) in a pathway is
#' < min_analyte, do not report
#' @param MCall T/F if true, all pathways are used for multiple comparison corrections; if false, only pathways covering user analytes will be used (default is "F")
#' @param alternative alternative hypothesis test passed on to fisher.test().  Options are two.sided, greater, or less (default is "less")
#' @param min_path_size the minimum number of pathway members (genes and metabolites) to include the pathway in the output (default = 5)
#' @param max_path_size the maximum number of pathway memnbers (genes and metaboltes) to include the pathway in the output (default = 150)
#' @param includeRaMPids include internal RaMP identifiers (default is "FALSE")
#' @param background_type type of background that is input by the user.  Opions are "database" if user wants all
#' analytes from the RaMP database to be used as background; "file", if user wnats to input a file path with a list of background
#' analytes; "list", if user wants to input a vector of analyte IDs; "biospecimen", if user wants to specify a
#' biospecimen type (e.g. blood, adipose tissue, etc.) and have those biospecimen-specific analytes used.  For genes,
#' only the "database" option is used.
#' @param background background to be used for Fisher's tests.  If parameter 'background_type="database"', this parameter
#' is ignored (default="database"); if parameter 'background_type= "file"', then 'background' should be a file name (with
#' directory); if 'background_type="list"', then 'background' should be a vector of RaMP IDs; if 'backgroud_type="biospecimen"'
#' then users should specify one of the following: "Blood", "Adipose tissue", "Heart", "Urine", "Brain", "Liver", "Kidney",
#' "Saliva", and "Feces"
#' @param pathway_definitions If "RaMP" (default), use pathway definitions within RaMP-DB. Else, supply path to gmx file containing custom pathway definitions. GMX files are a tab-separated format that contain one analyte set per column, with the name of the set in the first row, and constituent analytes in subsequent rows. Please supply a .xls or .xlsx file. If supplying pathway definitions for genes and metabolites, ensure that metabolite definitions are on tab 1, and gene definitions are on tab2.
#' @return a list containing two entries: [[1]] fishresults, a dataframe containing pathways with Fisher's p values
#' (raw and with FDR and Holm adjustment), number of user analytes in pathway, total number of analytes in pathway,
#' and pathway source ID/database. [[2]] analyte_type, a string specifying the type of analyte input into the function ("genes", "metabolites", or "both")
#' @examples
#' \dontrun{
#' pkg.globals <- setConnectionToRaMP(
#'   dbname = "ramp2", username = "root",
#'   conpass = "", host = "localhost"
#' )
#' analyte.list <- c(
#'   "chebi:15344", "chebi:10983", "chebi:15351",
#'   "uniprot:Q86V21", "uniprot:Q02338", "uniprot:Q9BUT1"
#' )
#'
#' fisher.results <- runCombinedFisherTest(analytes = analyte.list, NameOrIds = "ids")
#' }
#' @export
runCombinedFisherTest <- function(analytes,
                                  NameOrIds = "ids",
                                  total_genes = 20000,
                                  min_analyte = 2,
                                  MCall = F,
                                  alternative = "less",
                                  min_path_size = 5,
                                  max_path_size = 150,
                                  includeRaMPids = FALSE,
                                  background_type = "database",
                                  background = "database",
                                  pathway_definitions = "RaMP") {
  G <- M <- 0

  # Grab pathways that contain metabolites to run Fisher on metabolites
  # This will return all pathways that have at 8-120 metabolites/genes in them
  ## fishmetab <- pathwaydf[grep("RAMP_C_", pathwaydf$rampId), ]

  print("Running Fisher's tests on metabolites")

  outmetab <- runFisherTest(
    analytes = analytes,
    analyte_type = "metabolites",
    total_genes = total_genes,
    MCall = MCall,
    min_path_size = min_path_size,
    max_path_size = max_path_size,
    background_type = background_type,
    background = background,
    pathway_definitions = pathway_definitions
  )
  pathwaydf_metab <- outmetab[[2]]
  outmetab <- outmetab[[1]]
  if (!is.null(outmetab)) {
    M <- 1
  }


  ## Grab pathways that contain genes to run Fisher on genes
  ## fishgene <- pathwaydf[grep("RAMP_G_", pathwaydf$rampId), ]
  ## Genes are not evaluated if custom background is specified
  if (background_type == "database" & pathway_definitions == "RaMP") {
    print("Running Fisher's tests on genes")
    outgene <- runFisherTest(
      analytes = analytes,
      analyte_type = "genes",
      total_genes = total_genes,
      MCall = MCall,
      min_path_size = min_path_size,
      max_path_size = max_path_size
    )
    pathwaydf_gene <- outgene[[2]]
    outgene <- outgene[[1]]
  } else if (pathway_definitions != "RaMP") {
    outgene <- runFisherTest(
      analytes = analytes,
      analyte_type = "genes",
      total_genes = total_genes,
      MCall = MCall,
      min_path_size = min_path_size,
      max_path_size = max_path_size,
      background_type = background_type,
      background = background,
      pathway_definitions = pathway_definitions
    )
    pathwaydf_gene <- outgene[[2]]
    outgene <- outgene[[1]]
  } else {
    outgene <- NULL
    pathwaydf_gene <- NULL
  }

  # if no ids map to pathways, return an empty result.
  if ((is.null(pathwaydf_metab) || nrow(pathwaydf_metab) < 1) &&
    (is.null(pathwaydf_gene) || nrow(pathwaydf_gene) < 1)) {
    print("input analyte names did not map to RaMP pathways. Returning empty result.")
    return(list(fishresults = data.frame(), analyte_type = "none_empty_pathway_mapping", result_type = "pathway_enrichment"))
  }

  pathwaydf <- rbind(pathwaydf_metab, pathwaydf_gene)

  if (!is.null(outgene)) {
    G <- 1
  }
  if (is.null(outgene) & !is.null(outmetab)) {
    out <- outmetab
    fdr <- stats::p.adjust(out$Pval, method = "fdr")
    out <- cbind(out, fdr)
    colnames(out)[ncol(out)] <- "Pval_FDR"
    holm <- stats::p.adjust(out$Pval, method = "holm")
    out <- cbind(out, holm)
    colnames(out)[ncol(out)] <- "Pval_Holm"
    keepers <- which(out$Num_In_Path >= min_analyte)
    out2 <- merge(
      pathwaydf_metab[, c(
        "pathwayName", "pathwayRampId", "pathwayId",
        "pathwaySource"
      )],
      out[keepers, ],
      by = "pathwayRampId"
    )
  } else if (!is.null(outgene) && is.null(outmetab)) {
    out <- outgene
    fdr <- stats::p.adjust(out$Pval, method = "fdr")
    out <- cbind(out, fdr)
    colnames(out)[ncol(out)] <- "Pval_FDR"
    holm <- stats::p.adjust(out$Pval, method = "holm")
    out <- cbind(out, holm)
    colnames(out)[ncol(out)] <- "Pval_Holm"
    keepers <- which(out$Num_In_Path >= min_analyte)
    out2 <- merge(
      pathwaydf_gene[, c(
        "pathwayName", "pathwayRampId", "pathwayId",
        "pathwaySource"
      )],
      out[keepers, ],
      by = "pathwayRampId"
    )
  } else {
    # merge the results if both genes and metabolites were run
    G <- M <- 1
    allfish <- merge(outmetab, outgene,
      by = "pathwayRampId", all.x = T, all.y = T
    )
    colnames(allfish)[which(colnames(allfish) == "Pval.x")] <- "Pval_Metab"
    colnames(allfish)[which(colnames(allfish) == "Pval.y")] <- "Pval_Gene"
    colnames(allfish)[which(colnames(allfish) == "Total_In_Path.x")] <- "Total_In_Path_Metab"
    colnames(allfish)[which(colnames(allfish) == "Total_In_Path.y")] <- "Total_In_Path_Gene"
    colnames(allfish)[which(colnames(allfish) == "Num_In_Path.x")] <- "Num_In_Path_Metab"
    colnames(allfish)[which(colnames(allfish) == "Num_In_Path.y")] <- "Num_In_Path_Gene"

    # Calculate combined p-values for pathways that have both genes and metabolites
    gm <- intersect(which(!is.na(allfish$Pval_Metab)), which(!is.na(allfish$Pval_Gene)))
    combpval <- stats::pchisq(-2 * (log(allfish$Pval_Metab[gm]) + log(allfish$Pval_Gene[gm])),
      df = 2, lower.tail = FALSE
    )

    g <- which(is.na(allfish$Pval_Metab))
    gpval <- allfish$Pval_Gene[g]
    m <- which(is.na(allfish$Pval_Gene))
    mpval <- allfish$Pval_Metab[m]

    out <- rbind(allfish[gm, ], allfish[g, ], allfish[m, ])
    out <- cbind(out, c(combpval, gpval, mpval))
    colnames(out)[ncol(out)] <- "Pval_combined"
    fdr <- stats::p.adjust(out$Pval_combined, method = "fdr")
    out <- cbind(out, fdr)
    colnames(out)[ncol(out)] <- "Pval_combined_FDR"
    holm <- stats::p.adjust(out$Pval_combined, method = "holm")
    out <- cbind(out, holm)
    colnames(out)[ncol(out)] <- "Pval_combined_Holm"

    ## keepers <- intersect(
    ##   c(
    ##     which(out$Num_In_Path_Metab >= min_analyte),
    ##     which(is.na(out$Num_In_Path_Metab))
    ##   ),
    ##   c(
    ##     which(out$Num_In_Path_Gene >= min_analyte),
    ##     which(is.na(out$Num_In_Path_Gene))
    ##   )
    ## )

    keepers <- unique(
      c(
        which(out$Num_In_Path_Metab >= min_analyte),
        which(out$Num_In_Path_Gene >= min_analyte)
      )
    )

    # Now that p-values are calculated, only return pathways that are in the list
    # of pathways that contain user genes and metabolites
    ## pathwaydf <- getPathwayFromAnalyte(analytes,
    ##   includeRaMPids = TRUE,
    ##   NameOrIds = NameOrIds
    ##   )
    if(pathway_definitions!="RaMP"){
      pathwaydf$pathwayName = pathwaydf$pathwayRampId
    }
    out2 <- merge(
      pathwaydf[, c(
        "pathwayName", "pathwayRampId", "pathwayId",
        "pathwaySource"
      )],
      out[keepers, ],
      by = "pathwayRampId"
    )
  } # end merging when genes and metabolites were run
  out2 <- out2[!duplicated(out2), ]

  analyte_type <- c()
  if (G == 1 && M == 1) {
    analyte_type <- "both"
  } else if (G == 1 && M == 0) {
    analyte_type <- "genes"
  } else if (G == 0 && M == 1) {
    analyte_type <- "metabolites"
  }

  out2$analytes <- apply(out2, 1, function(x) {
    pathwayid <- x["pathwayRampId"]
    sigpathwaydf <- pathwaydf[which(pathwaydf$pathwayRampId == pathwayid), ]

    analytes <- sigpathwaydf[, "commonName"] %>%
      paste0(collapse = ";")
    return(analytes)
  })
  if (includeRaMPids) {
    return(list(fishresults = out2, analyte_type = analyte_type, result_type = "pathway_enrichment"))
  } else {
    return(list(fishresults = out2 %>% cleanup(), analyte_type = analyte_type, result_type = "pathway_enrichment"))
  }
}


#' Function that search analytes (gene or compounds)  or a list of analytes and
#' returns associated pathways
#'
#' @param analytes a vector of analytes (genes or metabolites) that need to be searched
#' @param find_synonym find all synonyms or just return same synonym (T/F)
#' @param NameOrIds whether input is "names" or "ids" (default is "ids")
#' @param includeRaMPids include internal RaMP identifiers (default is "FALSE")
#' @return a list contains all metabolites as name and pathway inside.
#' @examples
#' \dontrun{
#' pkg.globals <- setConnectionToRaMP(
#'   dbname = "ramp2", username = "root",
#'   conpass = "", host = "localhost"
#' )
#' mypath <- getPathwayFromAnalyteV2(analytes = c("2-hydroxyglutarate", "glutamate"))
#' }
#' @export
getPathwayFromAnalyte <- function(analytes = "none",
                                  find_synonym = FALSE,
                                  NameOrIds = "ids",
                                  includeRaMPids = FALSE) {

  rampId <- pathwayRampId <- c()

  print("Starting getPathwayFromAnalyte()")
  if (is.null(analytes) || length(analytes) == 0) {
    warning("Input analyte list is NULL or empty. Aborting getPathwayFromAnalyte()")
    return(NULL)
  }

  if (!(NameOrIds %in% c("ids", "names"))) {
    warning(paste0(
      "NameOrIds must have a value in c('ids','names')\n",
      "Supplied NameOrIds falue = ('", NameOrIds, "')\nAborting getPathwayFromAnlyte()"
    ))
    return(NULL)
  }

  list_metabolite <- analytes
  list_metabolite <- sapply(list_metabolite, shQuote)
  list_metabolite <- paste(list_metabolite, collapse = ",")
  if (list_metabolite == "") {
    warning("Unable to retrieve metabolites")
    return(NULL)
  }

  if (NameOrIds == "ids") {
    print("Working on ID List...")
    sql <- paste0("select p.pathwayName, p.type as pathwaySource, p.sourceId as pathwayId, s.sourceId as inputId, group_concat(distinct s.commonName order by s.commonName separator '; ') as commonName, s.rampId, p.pathwayRampId from
                  source s,
                  analytehaspathway ap,
                  pathway p
                  where
                  s.sourceId in (", list_metabolite, ")
                  and
                  ap.rampId = s.rampId
                  and
                  p.pathwayRampId = ap.pathwayRampId
                  and
                  p.type != 'hmdb'
                  group by inputId, rampId, pathwayId, p.pathwayName, p.type, p.pathwayRampId
                  order by pathwayName asc")
  } else {
    print("Working on analyte name list...")
    sql <- paste0(
      "select p.pathwayName, p.type as pathwaySource, p.sourceId as pathwayId, lower(asyn.Synonym) as inputCommonName, group_concat(distinct s.sourceId order by s.sourceId separator '; ') as sourceIds, s.rampId, p.pathwayRampId
    from
    source s,
    analytesynonym asyn,
    analytehaspathway ap,
    pathway p
    where
    asyn.Synonym in (", list_metabolite, ")
    and
    s.rampId = asyn.rampId
    and
    ap.rampId = s.rampId
    and
    p.pathwayRampId = ap.pathwayRampId
    and
    p.type != 'hmdb'
    group by inputCommonName, s.rampId, pathwayId, p.pathwayName, p.type, p.pathwayRampId
    order by pathwayName asc
  "
    )
  }

  con <- connectToRaMP()
  df2 <- RMariaDB::dbGetQuery(con, sql)


  if (find_synonym && nrow(df2) > 0) {
    rampIds <- df2[, "rampId"]

    rampIds <- sapply(rampIds, shQuote)
    rampIds <- paste(rampIds, collapse = ",")

    sql <- paste0("select rampId as rampId, group_concat(distinct Synonym order by Synonym separator '; ')
     as synonyms from analytesynonym
     where rampId in (", rampIds, ") group by rampId")

    synonymsDf <- RMariaDB::dbGetQuery(con, sql)

    if (nrow(synonymsDf) > 0) {
      df2 <- merge(df2, synonymsDf, by = "rampId")
    }
  }

  RMariaDB::dbDisconnect(con)

  if (!includeRaMPids && nrow(df2) > 0) {
    df2 <- subset(df2, select = -c(rampId, pathwayRampId))
  }

  print("finished getPathwayFromAnalyte()")
  print(paste0("Found ", nrow(df2), " associated pathways."))

  return(df2)
}

#' Reads an excel file containing metabolite and gene annotations and filters pathways
#' to those pathways represented by the input analytes.
#'
#' @param analytes a vector of analytes (genes or metabolites) that need to be searched
#' @param pathways If "RaMP" (default), use pathway definitions within RaMP-DB. Else, supply path to gmx file containing custom pathway definitions. GMX files are a tab-separated format that contain one analyte set per column, with the name of the set in the first row, and constituent analytes in subsequent rows
#' @param analyte_type "genes" or "metabolites"
#' @return A pathwaydf compatible with runFisherTest
#' @author Andrew Patt
getCustomPathwayFromAnalyte <- function(analytes, pathways, analyte_type) {
  print("Starting getCustomPathwayFromAnalyte()")
  tryCatch(
    {
      if (analyte_type == "metabolites") {
        pathway_definitions <- readxl::read_excel(pathways, sheet = 1)
      } else if (analyte_type == "genes") {
        pathway_definitions <- readxl::read_excel(pathways, sheet = 2)
      }
    },
    error = function(e) {
      print("Pathway file could not be found or is improperly formatted. Please supply path to GMX file for custom pathway definitions")
    }
  )

  pathwaydf <- data.frame(Analyte = character(), Pathway = character())

  for (i in analytes) {
    for (j in 1:ncol(pathway_definitions)) {
      if (i %in% unlist(pathway_definitions[, j])) {
        pathwaydf <- rbind(pathwaydf, data.frame(
          Analyte = i,
          Pathway = colnames(pathway_definitions)[j]
        ))
      }
    }
  }

  print(paste0(
    "Found ", length(unique(pathwaydf$Analyte)),
    " out of ", length(unique(analytes)),
    " input analytes in pathway definitions file"
  ))

  pathwaydf$pathwayname <- pathwaydf$pathwayId <- pathwaydf$pathwayRampId <- pathwaydf$Pathway
  pathwaydf$inputId <- pathwaydf$commonName <- pathwaydf$Analyte
  if (analyte_type == "metabolites") {
    pathwaydf$rampId <- paste0("RAMP_C_", pathwaydf$inputId)
  } else if (analyte_type == "genes") {
    pathwaydf$rampId <- paste0("RAMP_G_", pathwaydf$inputId)
  }
  pathwaydf$pathwaySource <- "custom"
  pathwaydf <- subset(pathwaydf, select = -c(Pathway, Analyte))

  return(pathwaydf)
}



#' Perform fuzzy multiple linkage partitioning clustering on pathways
#' identified by Fisher's test
#'
#' @param fishers_df The full result object generated by runCombinedFisherTest
#' @param perc_analyte_overlap Minimum overlap for pathways to be considered similar
#' (Default = 0.5)
#' @param min_pathway_tocluster Minimum number of 'similar' pathways required to start
#' a cluster (medoid) (Default = 3)
#' @param perc_pathway_overlap Minimum overlap for clusters to merge (Default = 0.5)
#'
#' @return list:[[1]] Pathway enrichment result with dataframe having a cluster assignment column added
#' [[2]] analyte type
#' [[3]] cluster assignment in the list form
#' @examples
#' \dontrun{
#' pkg.globals <- setConnectionToRaMP(
#'   dbname = "ramp2", username = "root",
#'   conpass = "", host = "localhost"
#' )
#' pathwaydf <- getPathwayFromAnalyte(c(
#'   "ensembl:ENSG00000135679", "hmdb:HMDB0000064",
#'   "hmdb:HMDB0000148", "ensembl:ENSG00000141510"
#' ))
#' fisher.results <- runCombinedFisherTest(pathwaydf = pathwaydf)
#' clustered.fisher.results <- findCluster(fisher.results)
#' }
#' @export
findCluster <- function(fishers_df, perc_analyte_overlap = 0.5,
                        min_pathway_tocluster = 2, perc_pathway_overlap = 0.5) {
  print("Clustering pathways...")

  if (perc_analyte_overlap <= 0 || perc_analyte_overlap >= 1 ||
    perc_pathway_overlap <= 0 || perc_pathway_overlap >= 1) {
    warning("No Clustering. perc_analyte_overlap and percent_pathway_overlap must bee in the range of (0,1), exclusive (not exactly 0 or 1).")
    return(fishers_df)
  }

  if (is.null(fishers_df$fishresults) || nrow(fishers_df$fishresults) < 1) {
    warning("The contained input pathway dataframe is empty (fishers_df$fishresults). Returning input result without clustering.")
    return(fishers_df)
  }

  analyte_type <- fishers_df$analyte_type
  fishers_df <- fishers_df$fishresults
  list_pathways <- fishers_df %>% dplyr::pull("pathwayId")
  list_pathways <- sapply(list_pathways, shQuote)
  list_pathways <- paste(list_pathways, collapse = ",")
  query <- paste0(
    "SELECT pathwayRampId, sourceId from pathway where sourceId in (",
    list_pathways,
    ")"
  )
  con <- connectToRaMP()
  idkey <- RMariaDB::dbGetQuery(con, query) %>%
    dplyr::rename("pathwayId" = "sourceId") ##  %>%
  ## dplyr::rename("rampId" = "pathwayRampId")

  rampToSource <- function(x) {
    out <- with(idkey, {
      idkey %>%
        dplyr::filter(pathwayRampId == x) %>%
        dplyr::pull("pathwayId")
    })
    return(out)
  }

  RMariaDB::dbDisconnect(con)

  fishers_df <-
    fishers_df %>%
    dplyr::left_join(idkey, by = "pathwayId")
  if (nrow(fishers_df) == 0) {
    return(NULL)
  } else if (nrow(fishers_df) == 1) {
    fishers_df$cluster_assignment <- "Did not cluster"
    fishers_df$rampids <- fishers_df$pathwayRampId
    fishers_df$pathwayRampId <- NULL
    output <- list(fishresults = fishers_df, analyte_type = analyte_type, cluster_list = "Did not cluster")
    return(output)
  } else {
    # similarity_matrix_list<-loadOverlapMatrices()
    similarity_matrix_gene <- genes_result
    similarity_matrix_analyte <- analyte_result
    similarity_matrix_metab <- metabolites_result
    if (analyte_type == "both") {
      # similarity_matrix = similarity_matrix_list[["analyte"]]
      similarity_matrix <- similarity_matrix_analyte
    } else if (analyte_type == "metabolites") {
      # similarity_matrix = similarity_matrix_list[["metab"]]
      similarity_matrix <- similarity_matrix_metab
    } else if (analyte_type == "genes") {
      # similarity_matrix = similarity_matrix_list[["gene"]]
      similarity_matrix <- similarity_matrix_gene
    } else {
      stop("analyte_type should be 'genes' or metabolites'")
    }
    pathway_list <- fishers_df[, "pathwayRampId"]

    pathway_indices <- match(pathway_list, rownames(similarity_matrix))

    if (length(which(is.na(pathway_indices))) > 0) {
      pathway_indices <- pathway_indices[-which(is.na(pathway_indices))]
    }

    pathway_matrix <- similarity_matrix[pathway_indices, pathway_indices]
    unmerged_clusters <- apply(pathway_matrix, 1, function(x) {
      # if(length(which(x>=perc_analyte_overlap))>(min_pathway_tocluster+1)){
      if (length(which(x >= perc_analyte_overlap)) > (min_pathway_tocluster - 1)) {
        return(colnames(pathway_matrix)[which(x >= perc_analyte_overlap)])
      } else {
        return(NA)
      }
    })
    # Remove the unmerged clusters
    if (length(which(is.na(unmerged_clusters))) > 0) {
      unmerged_clusters <- unmerged_clusters[-which(is.na(unmerged_clusters))]
    }

    if (length(unmerged_clusters) == 0) {
      # stop("No medoids found, make perc_analyte_overlap or min_pathway_tocluster smaller")
      cluster_list <- rep("Did not cluster", times = nrow(fishers_df))
    } else {
      # Evaluate similarity between clusters
      cluster_similarity <- matrix(0, ncol = length(unmerged_clusters), nrow = length(unmerged_clusters))
      for (i in 1:length(unmerged_clusters)) {
        for (j in 1:length(unmerged_clusters)) {
          cluster_similarity[i, j] <- length(intersect(unmerged_clusters[[i]], unmerged_clusters[[j]])) /
            length(unique(c(unmerged_clusters[[i]], unmerged_clusters[[j]])))
        }
      }
      colnames(cluster_similarity) <- rownames(cluster_similarity) <- names(unmerged_clusters)
      unmerged_cluster_similarity <- cluster_similarity

      cluster_list <- unmerged_clusters

      # Merge Clusters
      count <- 1
      while (length(which(cluster_similarity >= perc_pathway_overlap)) > nrow(cluster_similarity)) {
        cluster_similarity_mod <- cluster_similarity
        for (i in 1:nrow(cluster_similarity_mod)) {
          cluster_similarity_mod[i, i] <- 0
        }

        clusters_to_merge <- which(cluster_similarity_mod == max(cluster_similarity_mod), arr.ind = TRUE)
        clusters_to_merge <- unique(t(apply(clusters_to_merge, 1, sort)))

        for (i in 1:nrow(clusters_to_merge)) {
          # print(" ")
          # if(length(!is.na(cluster_list[[clusters_to_merge[i, 1]]])) > 1) {
          #   print(paste0("cluster list 1 is greater than 1 ", length(!is.na(cluster_list[[clusters_to_merge[i, 1]]]))))
          #   print(cluster_list[[clusters_to_merge[i, 1]]])
          # } else {
          #   print("cluster list 1 length is 1")
          # }
          #
          # if(length(!is.na(cluster_list[[clusters_to_merge[i, 1]]])) > 1) {
          #   print(paste0("cluster list 2 is greater than 1 ", length(!is.na(cluster_list[[clusters_to_merge[i, 2]]]))))
          #   print(cluster_list[[clusters_to_merge[i, 2]]])
          # } else {
          #   print("cluster list 2 length is 1")
          # }

          if (all(!is.na(cluster_list[[clusters_to_merge[i, 1]]])) && all(!is.na(cluster_list[[clusters_to_merge[i, 2]]]))) {
            cluster_list[[clusters_to_merge[i, 1]]] <- unique(unlist(cluster_list[c(clusters_to_merge[i, 1], clusters_to_merge[i, 2])]))
            cluster_list[[clusters_to_merge[i, 2]]] <- NA
          }
        }

        if (length(which(is.na(cluster_list))) > 0) {
          cluster_list <- cluster_list[-which(is.na(cluster_list))]
        }

        cluster_similarity <- matrix(0, ncol = length(cluster_list), nrow = length(cluster_list))
        for (i in 1:length(cluster_list)) {
          for (j in 1:length(cluster_list)) {
            cluster_similarity[i, j] <- length(intersect(cluster_list[[i]], cluster_list[[j]])) /
              length(unique(c(cluster_list[[i]], cluster_list[[j]])))
          }
        }

        if (nrow(cluster_similarity) == 1) {
          # stop("Clusters converged, use larger perc_pathway_overlap")
          # return(rep(1,times = nrow(fishers_df)))
          cluster_list <- rep("Did not cluster", times = nrow(fishers_df))
        }
        count <- count + 1
        if (count == length(unmerged_clusters) + 1) {
          stop("ERROR: while loop failed to terminate")
          # return(rep(1,times = nrow(fishers_df)))
          # cluster_list<-rep("Did not cluster",times = nrow(fishers_df))
        }
      }
      if (length(unique(cluster_list)) != 1) {
        colnames(cluster_similarity) <- rownames(cluster_similarity) <- paste0("cluster_", c(1:length(cluster_list)))
      }
    }
    # return(cluster_list)

    # Reformat cluster list to embed into results file
    rampids <- as.vector(fishers_df$pathwayRampId)
    # fishers_df$pathwayRampId<-NULL

    if (length(cluster_list) > 1) {
      cluster_assignment <- sapply(rampids, function(x) {
        pathway <- x
        clusters <- ""
        for (i in 1:length(cluster_list)) {
          if (pathway %in% cluster_list[[i]]) {
            clusters <- paste0(clusters, i, sep = ", ", collapse = ", ")
          }
        }
        if (clusters != "") {
          clusters <- substr(clusters, 1, nchar(clusters) - 2)
        } else {
          clusters <- "Did not cluster"
        }
        return(clusters)
      })
      fishers_df <- cbind(fishers_df, cluster_assignment)
    } else {
      fishers_df <- cbind(fishers_df, rep("Did not cluster", times = nrow(fishers_df)))
    }

    ## fishers_df$rampids <- rampids
    fishers_df <- cleanup(fishers_df)
    rownames(fishers_df) <- NULL

    ## Remove RaMP ids
    colnames(pathway_matrix) <- rownames(pathway_matrix) <- sapply(
      rownames(pathway_matrix),
      rampToSource
    )
    cluster_list <- lapply(cluster_list, function(x) {
      out <- sapply(x, rampToSource)
      names(out) <- NULL
      return(out)
    })
    names(cluster_list) <- paste("Cluster", 1:length(cluster_list))
    output <- list(fishresults = fishers_df, analyte_type = analyte_type, cluster_list = cluster_list, pathway_matrix = pathway_matrix)
    print("Finished clustering pathways...")
    return(output)
  }
}
ncats/RaMP-DB documentation built on Oct. 28, 2023, 8:12 a.m.