# R/make_distance_matrices_list.R

# make_distance_matrices_list.R

# Purpose: This is a helper function for clusterSystems and is unlikely to be useful elsewhere.
#
# Parameters:
# distances           Character vector indicating the distance metrics to be used.
# all_genes           Character vector of HGNC symbols with which to make distance matrices
#
# Value:  A list of distance objects, one per entry of `distances` and named accordingly.

# Author: Rachel Silverstein


make_distance_matrices_list <- function (distances, all_genes) {
  # Build one distance object over `all_genes` for every requested metric.
  #
  # Parameters:
  # distances   Character vector of metric names. Recognized values are
  #             "expression_profile", "transcription_factor",
  #             "network_jaccard" and "network_distance".
  # all_genes   Character vector of gene symbols. It is handed to
  #             make_matrix() and used as the vertex selection for the
  #             STRING graph.
  #
  # Value:  A list with one element per entry of `distances`, in the same
  #         order, named by `distances`.
  #
  # Errors: Stops if any entry of `distances` is not a recognized metric.

  known_metrics <- c("expression_profile", "transcription_factor",
                     "network_jaccard", "network_distance")

  # Reject unrecognized metrics (NA included) before any data is fetched, so
  # a typo late in `distances` does not waste the work done for earlier
  # entries. The first offending entry is reported.
  unrecognized <- which(!(distances %in% known_metrics))
  if (length(unrecognized) > 0) {
    msg <- paste(c("\n", distances[unrecognized[1]], " is not a recognized distance metric."), collapse = "")
    stop(msg)
  }

  # Preallocate the result instead of growing it inside the loop.
  distanceMatrices <- vector("list", length(distances))

  # The STRING graph depends only on `all_genes`, so it is built at most once
  # (on first use) and shared by both network metrics.
  STRINGgraph <- NULL

  for (i in seq_along(distances)) {
    metric <- distances[i]

    if (metric == "expression_profile") {
      GEO <- fetchData("GEOprofiles")
      result <- make_matrix(all_genes, dist_fn = expr_dist, data_source = GEO)
    } else if (metric == "transcription_factor") {
      GTRD <- fetchData("GTRDgeneTFs")
      result <- make_matrix(all_genes, dist_fn = tf_dist, data_source = GTRD)
    } else {
      # Validation above guarantees this is one of the two network metrics.
      if (is.null(STRINGgraph)) {
        STRING <- fetchData("STRINGedges0.8")
        # convert the STRINGedges object into an igraph object
        STRINGgraph <- igraph::graph_from_edgelist(as.matrix(STRING[, 1:2]))
        # The edge list only contains genes that take part in an interaction,
        # so requested genes without any annotated interaction are missing
        # from the graph. Add them as unconnected vertices.
        string_genes <- unique(c(STRING$a, STRING$b))
        missing_genes <- setdiff(all_genes, string_genes)
        STRINGgraph <- igraph::add_vertices(STRINGgraph,
                                            nv = length(missing_genes),
                                            name = missing_genes)
      }

      if (metric == "network_jaccard") {
        result <- make_matrix(all_genes, dist_fn = jaccard_dist, data_source = STRINGgraph)
      } else {
        # "network_distance": pairwise path lengths between the requested genes
        string_matrix <- igraph::distances(STRINGgraph, v = all_genes, to = all_genes)
        # Unconnected pairs come back as Inf. Replace them with a value one
        # larger than the largest finite distance in the matrix.
        large_num <- max(string_matrix[is.finite(string_matrix)]) + 1
        string_matrix[is.infinite(string_matrix)] <- large_num
        # rescale so that the largest distance is 1
        string_matrix <- string_matrix / large_num
        # put it into the distance format recognized by clustering functions
        result <- stats::as.dist(string_matrix)
      }
    }

    # Assign as a one-element list so that a NULL result would still occupy
    # its slot rather than deleting it.
    distanceMatrices[i] <- list(result)
  }

  # add names to each of the distance matrices
  names(distanceMatrices) <- distances
  distanceMatrices
}
# hyginn/BCB420.2019.ESA documentation built on May 29, 2019, 1:23 p.m.