R/MultiLayerFunctions.R

Defines functions getColorTable cytoscapeVis getNodeList getEdgeList writeMultiLayer makeBarPlot makeFeqTable calculateMultiLayerStats makeMultiLayer processMappings buildCorrNet buildSpecSimNet buildMassDiffNet buildExpNet loadInputData

Documented in buildExpNet calculateMultiLayerStats loadInputData makeMultiLayer writeMultiLayer

#' @name loadInputData
#'
#' @aliases loadInputData
#'
#' @title Load the input data
#'
#' @description
#' The function `loadInputData` loads the input data, i.e., the files needed
#' to build the networks
#'
#' @param peakListF
#' `Path`, path to the TSV file that contains the peak list (one peak per row)
#' in a MetaboLights-like format. The first row is the header (i.e., the list
#' of columns' names). The file should contain at least 4 columns:
#' "database_identifier" (i.e., ChEBI), "metabolite_identification" (i.e.,
#' metabolite name, if identified), "mass_to_charge", and "retention_time". It
#' is to note that these column names are fixed. The TSV file can also contain
#' intensity (or abundance) values. In this case, the column names are free
#' (e.g., can be named after the samples), but no blank spaces are allowed and
#' the first character must be a letter. In addition, all the abundances must
#' be placed at the end of the table, i.e., in the last columns. NOTE. Special
#' characters, such as single quotes ("'"), and hashtag ("#"), are forbidden
#' and the peak list file should not contain any of them
#'
#' @param intCol
#' `numeric`, number of the first column containing the intensities or of the
#' subset of columns to take the intensity values from. The default value is
#' 23, as in all datasets available in MetaboLights
#' (https://www.ebi.ac.uk/metabolights)
#'
#' @param transF
#' `path`, (optional) path to the CSV file containing the transformation list.
#' It must contain at  least the columns: "name", "formula", and "mass". This
#' file is needed if the mass difference network is to be built
#'
#' @param spectraF
#' `path`, (optional) path to the MSMS data in MGF format, if available
#'
#' @param gsmnF
#' `path`, path to the compound graph in GML format, as generated by Met4J
#' (https://forgemia.inra.fr/metexplore/met4j). The compound graphs of some
#' organisms are available in the data folder
#'
#' @param spectraSS
#' `numeric`, (optional) samples the spectra dataset, according to the given
#' sample size. This would speed up the process and it is useful when testing
#' your data and/or this package
#'
#' @param resPath
#' `path`, path to the folder where the results will be stored
#'
#' @param met2NetDir
#' `path`, path to the directory where the Python package Metabolomics2Network
#' is stored
#'
#' @param configF
#' `path`, path to the file that contains the column names (i.e., alias) of the
#' idenMetF file and the information they contain, such as name, chebi,
#' formula, etc. See an example in
#' extdata/MTBLS1586/Metabolomics2NetworksData/JsonConf.txt.
#' NOTE. This information will be used to do the metabolite mapping to the GSMN
#' (using metabolomics2Network)
#'
#' @param idenMetF
#' `path`, path to the TSV file that contains the list of experimental features
#' that were identified and that have a CHEBI id associated. The header of this
#' file must match the aliases from the configuration file (configF)
#'
#' @param metF
#' `path`, path to the TSV file containing the list of metabolites in the GSMN
#' of the organism of interest. They must contain at least two columns: ID and
#' Chebi. It is important that all the metabolites have a chebi ID, that there
#' is a single chebi ID per metabolite, and that it corresponds to the main
#' chebi ID. If you are not sure that your list of chebi IDs fits these
#' requirement, set the `cleanMetF` parameter to TRUE.
#' An example of a correct list of metabolites can be found in
#' extdata/MTBLS1586/Metabolomics2NetworksData/WormJamMetWithMasses.tsv
#'
#' @param cleanMetF
#' `boolean`, set it to TRUE if  you want to clean your metabolites' file, as
#' previously described
#'
#' @import igraph Spectra
#'
#' @importFrom readr read_tsv read_csv
#' @importFrom MsBackendMgf MsBackendMgf
#'
#' @return
#' Named list containing all the data (peakList, spectra, transformations, and
#' gsmn)
#'
#' @author Elva Novoa, \email{elva-maria.novoa-del-toro@@inrae.fr}
#'
#' @examples
#' # See the MultiLayerNetwork vignette
#'
#' @export

loadInputData <- function(peakListF, intCol = 23, transF = NULL,
  spectraF = NULL, gsmnF, spectraSS = NULL, resPath, met2NetDir, configF,
  idenMetF, metF, cleanMetF = TRUE) {

  # read peak list
  data <- read.table(peakListF, sep = "\t", header = TRUE, quote = "")
  rownames(data) <- data$id

  # accordingt to readMaf
  colnames(data)[colnames(data) == "mass_to_charge"] <- "mz"
  colnames(data)[colnames(data) == "retention_time"] <- "rtime"

  # check column position of the first intensity values
  if(length(intCol) == 1) {
    if (is.na(intCol)) {
      intCol <- 23:ncol(data)
    } else {
      intCol <- intCol:ncol(data)
    }
  }

  # create QFeatures object containing peak list
  peakList <-
    readQFeatures(data, ecol = intCol, fnames = which(colnames(data) == "id"))

  # read peak list (as tibble)
  data <- readr::read_tsv(peakListF)


  # keep only identified metabolites
  identifiedMet <-
    data[!is.na(data$database_identifier), c("id", "database_identifier")]

  # check if there is a fragmentation spectra file
  if (!is.null(spectraF)) {
    # read fragmentation spectra
    spectra <-
      Spectra::Spectra(
        spectraF,
        source = MsBackendMgf::MsBackendMgf(),
        backend = Spectra::MsBackendDataFrame()
      )

    # check if sampling is to be done
    if (exists("spectraSS") && !is.null(spectraSS)) {

      # check if sample size < current spectra size
      if (spectraSS < length(spectra)) {

        # sample the fragmentation spectra
        keep <- sample(seq_len(length(spectra)), spectraSS)
        spectra <- spectra[keep]
      }
    }

    # sanity checks
    checkSpectra(spectra)
    checkIdNamespace(peakList, spectra)

  } else {
    spectra <- NULL
  }


  # check if there is a transformations file
  if (!is.null(transF)) {
    # read transformations
    transformations <- readr::read_csv(transF, col_names = TRUE)
  } else {
    transformations <- NULL
  }

  # read GSMN
  gsmn <- igraph::as.undirected(igraph::read_graph(gsmnF, format = "gml"))

  ############################################################# TO REMOVE ????
  # remove compartments from the label of the nodes
  labels <-
    unlist(
      lapply(igraph::get.vertex.attribute(gsmn, "label"), function(X) {
        n <- nchar(X)
        substr(X, 1, n-2)
      }
      )
    )
  # set the name of the metabolites
  gsmn <- igraph::set.vertex.attribute(gsmn, name = "name", value = labels)
  ############################################################# TO REMOVE ????

  # create results' path
  dir.create(resPath, recursive = TRUE)

  # check whether the metabolites' file is to be cleaned
  if (cleanMetF == TRUE) {
    # build command to run python code
    com <-
      paste0(
        "python3 ",
        find.package("MetClassNetR"),
        "/Python/cleanMetaboliteTable.py ",
        metF
      )

    # execute code
    system(com)

    # update metabolites' file name
    metF <-
      paste0(
        substr(metF, 1, nchar(metF) - 4),
        "_Processed.tsv"
      )
  }

  return (
    list(
      peakList = peakList,
      allComp = data$id,
      identifiedMet = identifiedMet,
      spectra = spectra,
      transformations = transformations,
      gsmn = gsmn,
      resPath = resPath,
      configF = configF,
      idenMetF = idenMetF,
      metF = metF,
      met2NetDir = met2NetDir
      )
    )
}


#' @name buildExpNet
#'
#' @aliases buildExpNet
#'
#' @title Build experimental networks
#'
#' @description
#' The function `buildExpNet` builds experimental networks, based on spectral
#' similarity, correlation, and mass difference
#'
#' @param inputData
#' `list`, list returned by the `loadInputData` function
#'
#' @param net2Build
#' `list`, list of experimental networks to build, according to the following
#' options:
#'    "all" - generates 3 experimental networks: mass difference, correlation,
#'    and spectral similarity,
#'    "m" - builds the mass difference network,
#'    "s" - builds the spectral similarity network,
#'    "c" - builds the correlation network.
#'    It is to note that combinations of m, s, and c are possible, writing them
#'    separated by commas, e.g., c("m", "c") would generate the mass
#'    difference, and correlation networks. The default value is "all"
#'
#' @param directed
#' `boolean`, boolean value (TRUE/FALSE). If TRUE then the networks that are
#' generated will be directed, and undirected otherwise. The default value is
#' FALSE (i.e., undirected network)
#'
#' @param ppmMass
#' `numeric`, (optional) allowed error for mass differences calculus. It is
#' only needed if the mass difference network is to be built. The default value
#' is 10
#'
#' @param ppmSpec
#' `numeric`, (optional) relative allowed error for spectral similarity
#' calculus. It is only needed if the spectral similarity network is to be
#' built. The default value is 0
#'
#' @param tol
#' `numeric`, (optional) absolute tolerance for spectral similarity calculus.
#' It is only needed if the spectral similarity network is to be built. The
#' default value is 0.005
#'
#' @param corrModel
#' `vector`, (optional) character vector containing the model(s) to be used for
#' the correlation calculus in MetNet. Please check the available models in the
#' current version of MetNet. There are 10 models available in version 1.14.0:
#' "lasso", "randomForest", "clr", "aracne", "pearson", "pearson_partial",
#' "spearman", "spearman_partial", "ggm", and "bayes". This parameter is only
#' needed if the correlation network is to be built. The default value is
#' "pearson"
#'
#' @param corrThresh
#' `numeric`, (optional) floating point number indicating the correlation
#' threshold to consider that two features are correlated. It is only needed if
#' the correlation network is to be built. The default value is 0.25 (i.e., at
#' least 25% of correlation between the abundance values, either positive or
#' negative)
#'
#' @param specSimThresh
#' `numeric`, (optional) floating point number indicating the spectral
#' similarity threshold to consider that two features have similar spectra. It
#' is only needed if the spectral similarity network is to be built. The
#' default value is 0.7
#'
#' @import igraph MetNet Spectra
#'
#' @return
#' List of experimental networks as igraph objects
#'
#' @author Elva Novoa, \email{elva-maria.novoa-del-toro@@inrae.fr}
#'
#' @examples
#' # See the MultiLayerNetwork vignette
#'
#' @export
buildExpNet <- function(inputData, net2Build = "all", directed = FALSE,
  ppmMass = 10, ppmSpec = 0, tol = 0.005, corrModel = "pearson",
  corrThresh = 0.25, specSimThresh = 0.7) {

    # empty list to save the experimental networks
    expNet <- list()

    # check if mass difference network is to be built
    if (any(c("m", "all") %in% net2Build)) {

      # build mass difference network
      res <- buildMassDiffNet(inputData, ppmMass, directed)

      # add network to the list
      expNet[["m"]] <- res$net

      # save the mass difference matrix
      massDiff <- res$massDiff
    }

    # check if spectral similarity network is to be built
    if (any(c("s", "all") %in% net2Build)) {

      # check if the mass difference matrix does not already exists
      if (!exists("massDiff")) {

        # build mass difference network
        res <- buildMassDiffNet(inputData, ppmMass, directed)

        # save the mass difference matrix
        massDiff <- res$massDiff
      }

      # build spectral similarity network
      net <-
        buildSpecSimNet(
          inputData, tol, ppmSpec, specSimThresh, massDiff, directed
          )

      # add network to the list
      expNet[["s"]] <- net
    }

    # check if correlation network is to be built
    if (any(c("c", "all") %in% net2Build)) {

      # build correlation network
      net <- buildCorrNet(inputData, directed, corrModel, corrThresh)

      # add network to the list
      expNet[["c"]] <- net
    }

    return(expNet)
  }


# Function to build the mass difference network
# INPUTS:
#  inputData - list returned by the loadInputData function
#  ppmMass   - allowed error for mass differences calculus
#  directed  - boolean value (TRUE/FALSE). If TRUE then the networks that are
#              generated will be directed, and undirected otherwise. The
#              default value is FALSE (i.e., undirected network)
# OUTPUT: mass difference network as an igraph object
buildMassDiffNet <- function(inputData, ppmMass, directed) {

  data <-
    as.data.frame(rowData(inputData$peakList)[[names(inputData$peakList)]])
  trans <- as.data.frame(inputData$transformations)

  # create mass difference adjacency matrix
  massDiff  <-
    MetNet::structural(
      data,
      trans,
      var = c("name", "mass"),
      ppm = ppmMass,
      directed = directed
      )

  # add row data
  rowData(massDiff) <-
    as.data.frame(rowData(inputData$peakList[[names(inputData$peakList)]]))

  # get edges from the adjacency matrix
  massDiffDF <- as.data.frame(massDiff) |> filter(binary == 1)

  # create igraph object
  net <-
    igraph::graph_from_data_frame(
      massDiffDF,
      directed = directed,
      vertices = inputData$allComp
      )

  return(list(net = net, massDiff = massDiff))
}


# Function to build the spectral similarity network
# INPUTS:
#  inputData     - list returned by the loadInputData function
#  tol           - absolute tolerance for spectral similarity calculus
#  ppmSpec       - relative allowed error for spectral similarity calculus. It
#                  is only needed if the spectral similarity network is to be
#                  built. The default value is 0
#  specSimThresh - spectral similarity threshold. Similarities equal or higher
#                  than this value will be kept
#  massDiff      - mass difference adjacency matrix as generated by MetNet
#  directed      - boolean value (TRUE/FALSE). If TRUE then the networks that
#                  are generated will be directed, and undirected otherwise. The
#                  default value is FALSE (i.e., undirected network)
# OUTPUT: spectral similarity network as an igraph object
buildSpecSimNet <-
  function(inputData, tol, ppmSpec, specSimThresh, massDiff, directed) {
  # filter spectra 10 % of intensity to speed up calculus
  filteredSpectra <-
    Spectra::filterIntensity(
      inputData$spectra,
      function(x) { x > max(x, na.rm = TRUE) / 10 }
      )

  # calculate spectral similarity
  spectralSim <-
    spec_molNetwork(
      filteredSpectra,
      MAPFUN = Spectra::joinPeaksGnps,
#      methods = "gnps",
      methods = "ndotproduct",
      tolerance = tol,
      ppm = ppmSpec,
      type = "outer"
    )

  # add spectral similarity to the structural adjacency matrix from MetNet
  spectralSimA <-
    MetNet::addSpectralSimilarity(
      am_structural = massDiff,
      ms2_similarity = spectralSim
      )

  # get edges from the adjacency matrix
  spectralSimDF <-
    as.data.frame(spectralSimA) |>
    filter(!is.na(ndotproduct)) |>
    filter(Row != Col) |>
    filter(ndotproduct >= specSimThresh)

  # create igraph object
  net <-
    igraph::graph_from_data_frame(
      spectralSimDF,
      directed = directed,
      vertices = inputData$allComp
      )

  return(net)
}


# Function to build the correlation network
# INPUTS:
#  inputData - list returned by the loadInputData function
#  directed  - boolean value (TRUE/FALSE). If TRUE then the networks that are
#              generated will be directed, and undirected otherwise. The
#              default value is FALSE (i.e., undirected network)
#  corrModel - character vector containing the model(s) to be used for the
#              correlation calculus
# corrThresh - floating point number indicating the correlation threshold to
#              consider that two features are correlated
# OUTPUT: correlation network as an igraph object
buildCorrNet <- function(inputData, directed, corrModel, corrThresh) {

  # calculate correlation
  corr <-
    qfeat_statistical(
      x = inputData$peakList,
      assay_name = names(inputData$peakList),
      model = corrModel,
      na.omit = TRUE,
      p.adjust = "BH"
    )

  # get edges from the adjacency matrix
  corrDF <- as.data.frame(corr)

  # filter correlation to keep only those above threshold
  corrDF <- corrDF[abs(corrDF[[grep("coef", names(corrDF))]]) >= corrThresh, ]

  # create igraph object
  net <-
    igraph::graph_from_data_frame(
      corrDF,
      directed = directed,
      vertices = inputData$allComp
      )

  return(net)
}

# mapMetToGSMN was moved to mapMetToGSMN.R


# Function to process the mapping data to clean it, i.e., remove empty
# mappings, collapse multi-mappings and remove the ";"
# INPUTS:
#  identMetF      - file name containing the identified metabolites
#  pathToMappings - path to the folder that contains the mappings generated
#                   by map2network (Python tool)
#  resFile        - name of the file to process
# OUTPUT: nothing, but it generates 2 files: one with the original mappings
#         and another one with the processed mappings
processMappings <- function(identMetF, pathToMappings, resFile) {

  # read mapping results
  mapRes <- read.csv(paste0(pathToMappings, resFile), sep = "\t")

  # remove rows that have no mapping to the GSMN and keep only interesting cols
  mapRes <- mapRes[mapRes$mapped.on.id != "", 1:5]

  # open file with the original annotations
  anno <- read.csv(identMetF, sep = "\t")

  colnames(anno)[2] <- c("originalChebi")

  # merge mappings and original annotations data frames
  mapRes <- merge(mapRes, anno, by.x = "metabolite.name", by.y = "id")

  # check if feature names had a complement (i.e., "_N") and remove it
  mapRes[, "metabolite.name"] <-
    sapply(
      mapRes[, "metabolite.name"],
      function(featName) {
        if (length(grep("_[0-9]{1,2}$", featName)) > 0) {
          substr(featName, 1, nchar(featName)-2)
        } else {
          featName
        }
      }
    )

  # empty data frame to save the multi-mappings
  multiMappings <- mapRes[0, ]

  # check for multi-mappings and collapse them
  for (i in seq_len(nrow(mapRes))) {

    # collapse all mappings of current feature
    collapsedMappings <-
      lapply(
        mapRes[i, 2:ncol(mapRes)],
        function(X) {
          X <- as.character(X)
          strsplit(X, ";")
          }
        )

    # add multi-mappings
    multiMappings <-
      rbind(
        multiMappings,
        cbind(
          data.frame(
            metabolite.name =
              rep(mapRes[i, 1], length(collapsedMappings[[1]]))
          ),
          as.data.frame(
            lapply(
              collapsedMappings,
              function(X) {
                X[[1]][1:length(X[[1]])]
              }
            )
          )
        )
      )
  }

  # save original mappings with a different name
  write.table(
    mapRes,
    paste0(
      pathToMappings,
      gsub("[.].{3}$", "", resFile),
      "_OriginalRawMappings.txt"
    ),
    row.names = FALSE,
    quote = FALSE,
    sep = "\t"
  )
  print(paste0("Original mappings saved to ",pathToMappings,
    gsub("[.].{3}$", "", resFile),
    "_OriginalRawMappings.txt"))

  # save multi-mappings in the original final
  write.table(
    unique(multiMappings),
    paste0(pathToMappings, resFile),
    row.names = FALSE,
    quote = FALSE,
    sep = "\t"
  )
  print(paste0("Multimappings saved to ",pathToMappings, resFile))

  return()
}


#' @name makeMultiLayer
#'
#' @aliases makeMultiLayer
#'
#' @title Make a multi-layer network
#'
#' @description
#' Function to make the multi-layer network by connecting the Genome-Scale
#' Metabolic Network (GSMN) and the experimental networks
#'
#' @param inputData
#' `list`, list returned by the `loadInputData` function
#'
#' @param expNetworks
#' `list`, list returned by the buildExpNet function
#'
#' @param pathToMappings
#' `path`, path to folder containing the file(s) with the mapping between the
#' metabolites from the  experimental networks and those from the GSMN. Such
#' mapping  can be obtained with tools such as Metabolomics2Networks (see
#' the MultiLayerNetwork vignette). The file must contain at least 4 columns,
#' separated by tabs: "metabolite name" (name of the features from the peak
#' list), "mapped on id" (id of the corresponding metabolite from the GSMN),
#' "distance" (distance between the feature and its corresponding metabolite.
#' Note. The distance is equal to zero when the mapping is exact), and "chebi"
#' (ChEBI id of the corresponding metabolite from the GSMN)
#'
#' @return
#' Multi-layer network in list format containing 3 named elements:
#  "layers" (list of igraph objects), "type" (type of layer: Exp/GSMN),
#  "interLayerEdges" (data frame with 3 columns: expNode, gsmnNode, distance)
#'
#' @author Elva Novoa, \email{elva-maria.novoa-del-toro@@inrae.fr}
#'
#' @examples
#' # See the MultiLayerNetwork vignette
#'
#' @export
makeMultiLayer <- function(inputData, expNetworks, pathToMappings) {

  # get list of files in the mapping folder
  mappingFiles <- list.files(pathToMappings)


  # remove original mappings and mapping files containing all the information
  # (i.e., several extra columns)
  mappingFiles <-
    mappingFiles[
      grep(
        mappingFiles,
        pattern = "(OriginalRawMappings[.]txt$)|(.+_AllInfo.csv$)",
        invert = TRUE
        )
    ]

  # create an empty data frame
  interLayerEdges <-
    data.frame(
      expNode = character(),
      gsmnNode = character(),
      distance = character(),
      chebi    = character(),
      mapType  = character(),
      msi      = numeric(),
      mx1      = numeric(),
      mx2      = numeric(),
      bipN     = numeric()
    )

  # loop through mapping files
  for (mapF in mappingFiles) {
    # read table
    mapping <-
      read.table(paste0(pathToMappings, mapF), header = TRUE, sep = "\t")

    # loop to collapse multiple mappings and get all the inter-layer edges
    for (i in seq_len(nrow(mapping))) {
      feat <- as.character(mapping[i, "metabolite.name"])
      met <- mapping[i, "mapped.on.id"]
      dis <- mapping[i, "distance"]
      chebi <- mapping[i, "chebi"]

      msi <- ifelse(any(colnames(mapping) == "MSI"), mapping$MSI[i], -1)


      # get mapping type from the file name (e.g., ManualAnnotation) or from
      # the table
      mapType <-
        ifelse(
          any(colnames(mapping) == "annotationType"),

          mapping$annotationType[i],

          gsub(".*_(.*)[.]txt$", "\\1", mapF, perl = TRUE)
          )

      # add current inter-layer edges
      interLayerEdges <-
        rbind(
          interLayerEdges,
          data.frame(
            expNode  = feat,
            gsmnNode = met,
            distance = dis,
            chebi    = chebi,
            mapType  = mapType,
            msi      = msi,
            mx1      = 1,   # multiplex # 1 contains the experimental networks
            mx2      = 2,   # multiplex # 2 is the GSMN and has 1 layer
            bipN     = 1    # bipartite interactions number
          )
        )
    }

  }

  # make a multi-layer network with the GSMN and the experimental networks
  multiLayer <-
    list(
      layers = c(expNetworks, gsmn = list(inputData$gsmn)),
      type = c(rep("Exp", length(expNetworks)), "GSMN"),
      interLayerEdges = interLayerEdges
      )

  return(multiLayer)
}


#' @name calculateMultiLayerStats
#'
#' @aliases calculateMultiLayerStats
#'
#' @title Calculate some statistics of the multi-layer network
#'
#' @description
#' Function to calculate some statistics of the multi-layer network
#'
#' @param multiLayer
#' `list`, list containing the multi-layer network as returned by the
#' `makeMultiLayer` function
#'
#' @param inputData
#' `list`, list returned by the `loadInputData` function
#'
#' @import ggplot2
#'
#' @return
#' Nothing, but it creates several plots in the resPath directory
#'
#' @author Elva Novoa, \email{elva-maria.novoa-del-toro@@inrae.fr}
#'
#' @examples
#' # See the MultiLayerNetwork vignette
#'
#' @export
calculateMultiLayerStats <- function(multiLayer, inputData) {

  # verify if the results folder does not exist
  if (!file.exists(inputData$resPath)) {

    # create folder
    dir.create(inputData$resPath)
  }

  # make table of frequencies of the mapped GSMN nodes
  t <- makeFeqTable(multiLayer$interLayerEdges$gsmnNode, FALSE, "GSMN_node")

  # make plot
  makeBarPlot(
    resPath = inputData$resPath,
    data = t,
    xAxis = "GSMN_node",
    yAxis = "Freq",
    title = "GSMN nodes mapping frequency",
    vertical = FALSE
    )

  # make table of frequencies of the mapped GSMN nodes distances
  t <- makeFeqTable(multiLayer$interLayerEdges$distance, FALSE, "Distance")

  # make plot
  makeBarPlot(
    resPath = inputData$resPath,
    data = t,
    xAxis = "Distance",
    yAxis = "Freq",
    title = "Ontology distance of the mapped GSMN nodes",
    vertical = TRUE
  )

  return()
}


# Function to make a frequency table
# INPUTS:
#  data       - data to use
#  decreasing - if TRUE the frequencies will be sorted in descending order, or
#               in ascending order, otherwise
#  name       - name to give to the column where the values of the input data
#               will be stored
# OUTPUT: table of frequencies

makeFeqTable <- function(data, decreasing, name, sort = TRUE) {
  if (sort == TRUE) {
    # make table of frequencies
    t <- as.data.frame(sort(table(data), decreasing = decreasing))
  } else {
    # make table of frequencies
    t <- as.data.frame(table(data))
  }


  # check if the resulting data frame has more than 1 column, which is expected
  if (ncol(t) > 1) {
    colnames(t)[1] <- name
  } else {
    # no plot can be generated if there is a single column, so we need to make
    # a new data frame. There is a single column when the table function returns
    # a single value
    t <- data.frame(
      V1   = rownames(t),
      Freq = t[, 1]
    )

    # set column name
     colnames(t)[1] <- name
  }

  return(t)
}


# Function to make and save a bar plot
# INPUTS:
#  resPath  - path to the folder where the results will be stored
#  data     - data frame containing the data to plot
#  xAxis    - column name of the data for the x-axis
#  yAxis    - column name of the data for the y-axis
#  title    - title of the plot
#  vertical - if TRUE, the bars will be vertical, otherwise, they will be
#             horizontal
# OUTPUT: none, but it saves the plot in the resPath directory
makeBarPlot <-
  function(resPath, data, xAxis, yAxis, title, vertical = TRUE) {

  # make plot
  p <-
    ggplot2::ggplot(
      data = data,
      ggplot2::aes(x = eval(as.symbol(xAxis)), y = eval(as.symbol(yAxis)))
      ) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::ggtitle(title) +
    ggplot2::xlab(xAxis) +
    ggplot2::ylab(yAxis) +
    ggplot2::theme(
      plot.title = ggplot2::element_text(hjust = 0.5, face = "bold", size = 16),
      axis.text = ggplot2::element_text(size = 8)
      )

  # check orientation
  if (vertical == TRUE) {
    p <-
      p +
      ggplot2::geom_text(
        ggplot2::aes(label = eval(as.symbol(yAxis))), vjust = -0.3, size = 3.5)
  } else {
    p <-
      p +
      ggplot2::geom_text(
        ggplot2::aes(label = eval(as.symbol(yAxis))), hjust = -0.3, size = 3.5
        ) +
      ggplot2::coord_flip()
  }

  # save plot
  ggplot2::ggsave(
    paste0(resPath, stringr::str_replace_all(title, " ", "_"), ".png"),
    plot = p,
    dpi = 300
  )

  return()
}


#' @name writeMultiLayer
#'
#' @aliases writeMultiLayer
#'
#' @title Save and visualize multi-layer network
#'
#' @description
#' Function to save the list of edges and nodes from the multi-layer network
#' to visualize it in Cytoscape
#'
#' @param inputData
#' `list`, list returned by the `loadInputData` function
#'
#' @param multiLayer
#' `list`, list containing the multi-layer network as returned by the
#' `makeMultiLayer` function
#'
#' @param visualize
#' `boolean`, if == TRUE, the multi-layer layer will be visualized in Cytoscape
#' (NOTE. Cytoscape needs to be open). The default value is FALSE
#'
#' @import igraph RCy3
#'
#' @return
#' Nothing, but it generates files with the list of nodes, edges, and the
#' Cytoscape visualization (if visualize == TRUE)
#'
#' @author Elva Novoa, \email{elva-maria.novoa-del-toro@@inrae.fr}
#'
#' @examples
#' # See the MultiLayerNetwork vignette
#'
#' @export
writeMultiLayer <- function(inputData, multiLayer, visualize = FALSE) {

  # get list of nodes
  allNodes <- getNodeList(multiLayer)

  # save list of nodes
  write.csv(
    allNodes,
    paste0(inputData$resPath, "MultiLayer4Cytoscape_NodeList.csv"),
    row.names = FALSE,
    quote = FALSE
    )

  # get list of edges
  allEdges <- getEdgeList(multiLayer, allNodes)

  # save list of edges
  write.csv(
    allEdges,
    paste0(inputData$resPath, "MultiLayer4Cytoscape_EdgeList.csv"),
    row.names = FALSE,
    quote = FALSE
  )

  # save multilayer network in a format compatible with multixrank
  # -- multiplex networks first
  # get list of multiplex networks
  mx <- unique(allEdges$mxNum)

  # loop through all the multiplex networks
  for (mxN in mx[mx > 0]) {
    # set folder's path
    dirPath <-
      paste0(inputData$resPath, "multiXrankFolder/multiplex/", mxN, "/")


    # create folder, if needed
    if (dir.exists(dirPath) == FALSE) {
      dir.create(dirPath, recursive = TRUE)
    }


    # get list of layers of current multiplex network
    layers <- unique(allEdges$layerNum[allEdges$mxNum == mxN])

    # loop through the layers
    for (layerN in layers) {
      # get edges from current layer
      edgesLayer <-
        allEdges[allEdges$layerNum == layerN & allEdges$mxNum == mxN, ]

      # get node type of current layer
      nodeType <-
        unique(
          allNodes$nodeType[
            allNodes$node %in% rbind(edgesLayer$source, edgesLayer$target)
          ]
        )

      # get nodes from current multiplex
      nodesMx <- allNodes$numericId[allNodes$nodeType == nodeType]

      # add artificial self-loops to each node so that they are retrievable
      # by the random walk
      edgesLayerNum <-
        edgesLayer[, c("sourceNum", "targetNum")] %>%
        rbind(data.frame(sourceNum = nodesMx, targetNum = nodesMx))

      # save layer (only numeric source and target)
      write.table(
        edgesLayerNum,
        paste0(dirPath, unique(edgesLayer$interaction), ".tsv"),
        sep = "\t",
        quote = FALSE,
        row.names = FALSE,
        col.names = FALSE
        )
    }
  }

  # -- bipartite interactions
  # get list of multiplex networks
  bp <- unique(allEdges$bipartiteNum)

  # loop through all the multiplex networks
  for (bpN in bp[bp > 0]) {
    # set folder's path
    dirPath <-
      paste0(inputData$resPath, "multiXrankFolder/bipartite/")

    # create folder, if needed
    if (dir.exists(dirPath) == FALSE) {
      dir.create(dirPath, recursive = TRUE)
    }

    # get list of edges from current bipartite network
    edgesBip <- allEdges[allEdges$bipartiteNum == bpN, ]

    # save bipartite interactions
    write.table(
      edgesBip[, c("sourceNum", "targetNum")],
      paste0(
        dirPath,
        unique(edgesBip$mx1),
        "_",
        unique(edgesBip$mx2),
        ".tsv"
        ),
      sep = "\t",
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE
    )
  }

  # check whether the network is to be visualized
  if (visualize == TRUE) {
    # get inter-layer edges
    inter <- allEdges[allEdges[[3]] == "InterLayer", ]

    # get nodes connected by inter-layer edges
    nodes <- allNodes[allNodes[[1]] %in% unique(c(inter[[1]], inter[[2]])), ]

    # get all edges among the nodes that have  inter-layer edges
    edges <-
      allEdges[allEdges[[1]] %in% nodes$node & allEdges[[2]] %in% nodes$node, ]

    # Cytoscape visualization
    cytoscapeVis(
      nodes = nodes, edges = edges, resPath = inputData$resPath,
      mainType = "GSMN", fixedColors = TRUE
      )
  }

  return()
}


# Function to get the list of edges in a multi-layer network
# INPUTS:
#   multiLayer - multi-layer network
#   allNodes   - data frame containing all nodes (as returned by getNodeList)
# OUTPUT: list of edges, including inter-layer ones
getEdgeList <- function(multiLayer, allNodes) {
  allEdges <-
    data.frame(
      source       = character(),
      target       = character(),
      interaction  = character(),
      sourceNum    = numeric(),
      targetNum    = numeric(),
      layerNum     = numeric(),
      mxNum        = numeric(),
      bipartiteNum = numeric(),
      mx1          = numeric(),
      mx2          = numeric()
      )

  # get edge list of all the networks and paste it in a single data frame
  for (i in seq_len(length(multiLayer$layers))) {
    # initialize variables
    if (i == 1) {
      currentType <- multiLayer$type[i]
      layerNum <- 1
      mxNum <- 1
    } else {
      if (multiLayer$type[i] == currentType) {
        layerNum <- layerNum + 1
      } else {
        currentType <- multiLayer$type[i]
        layerNum <- 1
        mxNum <- mxNum + 1
      }
    }

    # get edge list
    edges <- igraph::as_edgelist(multiLayer$layers[[i]])

    # verify if there are any edges in the current network
    if (nrow(edges) > 0) {

      # add edges to general list
      allEdges <-
        rbind(
          allEdges,
          data.frame(
            source = edges[, 1],
            target = edges[, 2],
            interaction =
              paste(multiLayer$type[i], names(multiLayer$layers)[i], sep = "_"),
            sourceNum =
              sapply(
                edges[, 1],
                function(X) {
                  allNodes$numericId[allNodes$node == X]
                  }
                ),
            targetNum =
              sapply(
                edges[, 2],
                function(X) {
                  allNodes$numericId[allNodes$node == X]
                }
              ),
            layerNum = rep(layerNum, nrow(edges)),
            mxNum = rep(mxNum, nrow(edges)),
            bipartiteNum = rep(0, nrow(edges)),
            mx1 = rep(0, nrow(edges)),
            mx2 = rep(0, nrow(edges))
          )
        )
    }
  }

  # get numeric ID of target node. NOTE. This can be NULL if the corresponding
  # node was removed from the network because it is a side compound
  targetNum <-
    sapply(
      multiLayer$interLayerEdges[, 2],
      function(X) {

        compounds <- read.csv(inputData$metF, sep = "\t")

        if (any(allNodes$node == X)) {
          allNodes$numericId[allNodes$node == X]
        }
      }
    )

  # replace NULL values with NAs to be able to keep them after unlisting
  targetNum[sapply(targetNum, is.null)] <- NA

  # create data frame with inter-layer edges
  interLayerE <-
    data.frame(
      source       = multiLayer$interLayerEdges[, 1],
      target       = multiLayer$interLayerEdges[, 2],
      interaction  = rep("InterLayer", nrow(multiLayer$interLayerEdges)),
      sourceNum    =
        sapply(
          multiLayer$interLayerEdges[, 1],
          function(X) {
            allNodes$numericId[allNodes$node == X]
          }
        ),
      targetNum    = unlist(targetNum),
      # layer and multiplex numbers are 0 for the inter-layer edges
      layerNum     = rep(0, nrow(multiLayer$interLayerEdges)),
      mxNum        = rep(0, nrow(multiLayer$interLayerEdges)),
      bipartiteNum = multiLayer$interLayerEdges$bipN,
      mx1          = multiLayer$interLayerEdges$mx1,
      mx2          = multiLayer$interLayerEdges$mx2
    )

  # remove inter-layer edges that include a side compound, i.e., NAs
  if (any(is.na(interLayerE$targetNum))) {
    interLayerE <- interLayerE[-which(is.na(interLayerE$targetNum)), ]
  }


  # add remaining inter-layer edges to the list
  allEdges <-
    rbind(
      allEdges,
      interLayerE
    )

  return(allEdges)
}


# Function to get the list of nodes in a multi-layer network
# INPUT: multi-layer network
# OUTPUT: list of nodes
getNodeList <- function(multiLayer) {

  # get types of layers
  type <- unique(multiLayer$type)

  # initialize empty variables
  nodes <- list()
  nodeType <- list()
  numericId <- list()

  # loop through the types of layers
  for (t in type) {
    # get nodes from current type
    n <-
      unique(
        unlist(
          lapply(
            multiLayer$layers[multiLayer$type == t],
            function(X) names(V(X))
            )
          )
        )

    # add numeric id
    numericId <- c(numericId, seq_len(length(n)) + length(nodes))

    # add nodes to the list
    nodes <- c(nodes, n)

    # add node type
    nodeType <- c(nodeType, rep(t, length(n)))

  }

  # create data frame
  allNodes <-
    data.frame(
      node = unlist(nodes),
      nodeType = unlist(nodeType),
      numericId = unlist(numericId)
    )

  return(allNodes)
}


# Definition of a function to visualize in Cytoscape the subnetworks of the
# experimental nodes that map to the same metabolite node in the GSMN.
# NOTE. Cytoscape needs to be open
# INPUTS:
#   allNodes    - list of nodes
#   allEdges    - list of edges
#   resPath     - path where the results will be stored
#   mainType    - a subnetwork will be generated for each of the nodes from
#                 this node type so that can observe the connection patterns
#                 between the different edges connecting all the nodes from the
#                 other types that map to each of the nodes from the main type.
#                 The default value is "GSMN"
#   fixedColors - boolean value (TRUE/FALSE), if == TRUE, a set of predefined
#                 colors will be used to style the nodes and edges, as follows:
#                 NODES: {blue = experimental, orange = GSMN},
#                 EDGES: {yellow = correlation, orange = mass difference,
#                         green = spectral similarity, black = GSMN,
#                         blue = inter-layer}.
#                 If fixedColors == FALSE, the colors of the nodes and edges
#                 will be assigned automatically. The default value is TRUE
# OUTPUT: None, but it generates a Cytoscape file with the visualization
cytoscapeVis <- function(nodes, edges, resPath, mainType = "GSMN",
  fixedColors = TRUE) {

    # verify that Cytoscape is launched
    RCy3::cytoscapePing()

    # close any open session
    RCy3::closeSession(save.before.closing = FALSE)

    # create aggregated version of the multi-layer network
    aggregated <-
      igraph::graph_from_data_frame(edges, directed = FALSE, vertices = nodes)

    # visualize aggregated network in Cytoscape
    RCy3::createNetworkFromIgraph(
      aggregated,
      title = "MultiLayer",
      collection = "MultiLayerNetwork"
    )
    RCy3::layoutNetwork(layout.name = "force-directed", network = "MultiLayer")

    # get types of nodes and edges in the network
    nodeTypes <- unique(get.vertex.attribute(aggregated, "nodeType"))
    edgeTypes <- unique(get.edge.attribute(aggregated, "interaction"))

    # get colors to use
    colors <- getColorTable(nodeTypes, edgeTypes, fixedColors)

    # color the nodes
    RCy3::setNodeColorMapping(
      table.column = "nodeType", table.column.values = nodeTypes,
      colors =
        sapply(
          nodeTypes,
          function(X) colors$color[colors$type == "Node" & colors$name == X]
        ),
      mapping.type = "d", network = "MultiLayer", style.name = "default"
    )

    # color the edges
    RCy3::setEdgeColorMapping(
      table.column = "interaction", table.column.values = edgeTypes,
      colors =
        sapply(
          edgeTypes,
          function(X) colors$color[colors$type == "Edge" & colors$name == X]
        ),
      mapping.type = "d", network = "MultiLayer", style.name = "default"
    )

    # get inter-layer edges and nodes connected by them
    inter <- edges[edges[[3]] == "InterLayer", ]

    # get nodes connected by inter-layer edges
    nodesIL <-nodes[nodes[[1]] %in% unique(c(inter[[1]], inter[[2]])), ]

    # get nodes from main node type
    nodesILT <- nodes[nodes$nodeType == mainType, ]

    # loop to generate sub-networks
    for (n in nodesILT$node) {
      # get edges connecting current node
      curEd <- edges[edges$target == n | edges$source == n, ]

      # remove edges from main type
      omit <- grep(mainType, curEd$interaction)
      if (length(omit) > 0) {
        curEd <-  curEd[-omit, ]
      }

      # get list of all nodes connected to current node
      nCon <- unique(c(curEd$source, curEd$target))

      # create subnetwork
      subnet <-
        igraph::subgraph(
          aggregated, vids = which(names(V(aggregated)) %in% nCon))

      # visualize subnetwork in Cytoscape
      RCy3::createNetworkFromIgraph(
        subnet,
        title = paste0(n, "_AllLayers"),
        collection = n
      )
      RCy3::layoutNetwork(
        layout.name = "force-directed", network = paste0(n, "_AllLayers")
        )

      # get types of edges connecting the nodes of current subnetwork
      edgeTypes <- as.vector(unique(edge_attr(subnet, "interaction")))

      # remove inter-layer edges
      edgeTypes <- edgeTypes[- which(edgeTypes == "InterLayer")]

      # loop to generate a subnetwork per edge type
      for(eType in edgeTypes) {
        # select edges of current type
        RCy3::selectEdges(
          edges = eType,
          by.col = "interaction",
          preserve.current.selection = FALSE,
          network = paste0(n, "_AllLayers")
        )

        # select all nodes connected by selected edges
        RCy3::selectNodesConnectedBySelectedEdges(
          network = paste0(n, "_AllLayers"))

        # create subnetwork
        RCy3::createSubnetwork(
          nodes = "selected",
          edges = "selected",
          subnetwork.name = paste0(n, "_", eType),
          network = paste0(n, "_AllLayers"),
        )

        # apply layout
        RCy3::layoutNetwork(
          layout.name = "force-directed", network = paste0(n, "_", eType))
      }
    }

    RCy3::saveSession(paste0(resPath, "Subnetworks_", Sys.Date()))

    return()
  }


# Function to get the table of colors for the nodes and edges
# INPUTS:
#   nodeTypes   - types of nodes in the network to color
#   edgeTypes   - types of edges in the network to color
#   fixedColors - whether to return fixed colors or not
# OUTPUT:
#   data frame of three columns: type, name, and color, containing the colors
#   for the nodes and edges
getColorTable <- function(nodeTypes, edgeTypes, fixedColors) {

  # check whether predefined list of colors is to be used
  if (fixedColors == TRUE) {
    colors <-
      data.frame(
        type = c(rep("Edge", 5), rep("Node", 2)),
        name = c("Exp_c", "Exp_m", "Exp_s", "GSMN_gsmn", "InterLayer", "Exp",
                 "GSMN"),
        # code colors:
        #   #F0E442 = yellow edges (Exp_c), #D55E00 = orange edges (Exp_m),
        #   #009E73 = green edges (Exp_s), #000000 = black edges (GSMN_gsmn),
        #   #0072B2 = blue edges (InterLayer), #56B4E9 = blue nodes (Exp),
        #   #E69F00 = orange nodes (GSMN)
        color = c("#F0E442", "#D55E00", "#009E73", "#000000", "#0072B2",
                  "#56B4E9", "#E69F00")
      )
  } else { # set the colors automatically
    # get number of types of edges and nodes
    edgeTypesNo <- length(edgeTypes)
    nodeTypesNo <- length(nodeTypes)

    # pick blind-color friendly colors
    palette <-
      palette.colors(palette = "Okabe-Ito")[1:(edgeTypesNo + nodeTypesNo)]

    # make data frame with the colors for each type of node and edge
    colors <-
      data.frame(
        type = c(rep("Edge", edgeTypesNo), rep("Node", nodeTypesNo)),
        name = c(edgeTypes, nodeTypes),
        color = palette
      )
  }

  return(colors)
}
MetClassNet/MetClassNetR documentation built on June 30, 2023, 2:12 p.m.