R/main_functions.R

Defines functions EvaluateConnections GenerateNetworkPaths GenerateEdgeWeights

Documented in EvaluateConnections GenerateEdgeWeights GenerateNetworkPaths

#######################################################################
### Generates files of cluster:ligand:receptor:cluster edge weights ###
#######################################################################
#'
#' Generates files of cluster:ligand:receptor:cluster edge weights
#'
#' For a Seurat object, calculates all edge weights connecting source cell populations to ligands,
#' ligands to receptors and receptors to target cell populations. Cell-ligand and receptor-cell edge weights are
#' defined as the log2 fold-change difference in expression of the gene in that cell type relative to all other
#' cell types in the Seurat object - as defined by the active identities. The ligand:receptor edge weights
#' are assigned using protein:protein association scores from the STRING database.
#'
#' @param seurat.object a list of genes
#' @param file.label a Seurat object with cluster identities
#' @param species species of the experiment - currently includes 'human' and 'mouse'
#' @param populations.use threshold of percentage of cells expressing the gene in a cluster for it to be considered expressed
#' @param pct.threshold percentage of cells in a cluster expressing a gene for it to be considered
#' @param use.string.offline whether to use packaged STRING scores (default) or retrieve online from STRINGdb
#' @param string.dir directory for storing retrieved online STRING data (defaults to working directory)
#' @param string.ver version of online STRING data-base to use. Not set by default.
#' @param string.receptors a list of receptor names that STRING recognises (if cannot map defaults)
#' @param string.ligands a list of ligand names that STRING recegnises (if cannot map defaults)
#' @param string.input.map named vector of STRING-compatable gene symbols with names corresponding to genes in dataset
#' @param string.evidence.class what STRING evidence class to use. Defaults to 'combined_score'. Alternatives are: "neighborhood",
#'  "neighborhood_transferred", "fusion", "cooccurence", "homology", "coexpression", "coexpression_transferred", "experiments",
#'  "experiments_transferred", "database", "database_transferred", "textmining" and "textmining_transferred"
#'
#' @param verbose whether to print additional information about run (default: FALSE)
#'
#' @return NULL - results written to file
#'
#'
#' @export
#'
GenerateEdgeWeights <- function(seurat.object,
                                file.label,
                                species,
                                populations.use = NULL,
                                pct.threshold = 0.1,
                                use.string.offline = TRUE,
                                string.dir = NULL,
                                string.ver = NULL,
                                string.receptors = NULL,
                                string.ligands = NULL,
                                string.input.map = NULL,
                                string.evidence.class = "combined_score",
                                verbose = FALSE) {

  if (is.null(populations.use)) {
    populations.use <- names(table(Idents(seurat.object)))
  }

  extdata.path <- system.file("extdata", package = "scTalk")

  ## Read in the ligand-receptor pairs
  ## These were taken from Ramilowski et al. (2015) Nature Communications
  ligand.receptor.pairs = read.csv(paste0(extdata.path, "/All.Pairs-Table 1.csv"),
                                   header=TRUE, row.names=1, stringsAsFactors = FALSE)

  if (species == "human") {
    ## Don't need to map, but make a dummy map to use same code as for mouse
    gene.names <- c(ligand.receptor.pairs$Ligand.ApprovedSymbol, ligand.receptor.pairs$Receptor.ApprovedSymbol)
    gene.names <- unique(gene.names)
    mappings <- data.frame(Human.gene.name = gene.names, Gene.name = gene.names, stringsAsFactors = FALSE)
  } else if (species == "mouse") {
    mapping.file <- paste0(extdata.path, "/human_mouse_ensembl_gene_mappings.txt")
    mappings = read.csv(mapping.file, header=TRUE, stringsAsFactors = FALSE)
    mappings %>% dplyr::distinct(Human.gene.name, .keep_all = TRUE) -> mappings
  } else{
    stop(paste("species", species, "not currently a valid option: please specify mouse or human"))
  }

  ## Keep pairs that are either literature supported or putative but filter out those annotated as being incorrect
  ligand.receptor.pairs = ligand.receptor.pairs[ligand.receptor.pairs[ ,"Pair.Evidence"] %in% c("literature supported", "putative"), ]
  receptors = as.character(ligand.receptor.pairs[, 3])
  ligands = as.character(ligand.receptor.pairs[, 1])
  all.genes = c(receptors, ligands)

  ## If mouse, maps the gene names to human. If human, mapping doesn't do anything.
  ligand.receptor.mappings = mappings[mappings[, 1] %in% all.genes, ]
  gene.names = rownames(seurat.object)
  ligand.receptor.mappings = ligand.receptor.mappings[as.character(ligand.receptor.mappings[, 2]) %in% gene.names, ]

  ### If species-mapped, removes non-unique human -> mouse mappings
  counts = table(ligand.receptor.mappings[, 1])
  counts = counts[counts==1]
  ligand.receptor.mappings = ligand.receptor.mappings[as.character(ligand.receptor.mappings[, 1]) %in% names(counts), ]
  rownames(ligand.receptor.mappings) = ligand.receptor.mappings[ , 1]

  ### Overlap pairs with the genes expressed in the data
  ligand.receptor.pairs = ligand.receptor.pairs[, c(1, 3)]
  ligand.receptor.pairs.expressed = ligand.receptor.pairs[as.character(ligand.receptor.pairs[, 1]) %in% rownames(ligand.receptor.mappings)
                                                          & as.character(ligand.receptor.pairs[, 2]) %in% rownames(ligand.receptor.mappings), ]

  ### Produce a table of weighted edges for the following:
  ### Cluster -> Ligand
  ### Ligand -> Receptor
  ### Receptor -> Cluster

  unique.genes = unique(c(as.character(ligand.receptor.pairs.expressed[,1]), as.character(ligand.receptor.pairs.expressed[, 2])))
  unique.input.genes = unlist(lapply(unique.genes, function(x) as.character(ligand.receptor.mappings[x, 2])))
  print("Calculating cluster-specific ligand/expression characteristics")
  de.results = calculate_cluster_specific_expression(geneList = unique.input.genes,
                                                     seurat.object = seurat.object,
                                                     threshold=pct.threshold,
                                                     cluster.set = populations.use)

  ## Build a table for each ligand-receptor pair as expressed in each cluster
  print("Identifying all potential cluster:ligand:receptor:cluster paths")
  all.de.results = get_gene_pair_expression_values(ligand.receptor.pairs = ligand.receptor.pairs.expressed,
                                                   mapping.table = ligand.receptor.mappings,
                                                   cluster.names = populations.use,
                                                   gene.de.results = de.results)
  if (verbose) {
    print(paste(nrow(all.de.results), " potential paths in data. Some examples:"))
    print(head(all.de.results))
    }


  ### Pull out connections for clusters of interest and add weights
  indicies = all.de.results$Cluster1 %in% populations.use & all.de.results$Cluster2 %in% populations.use

  ligand.receptor.edges = unique(all.de.results[indicies , c("Gene1", "Gene2")])
  colnames(ligand.receptor.edges) = c("source", "target")
  pair.identifiers = as.character(apply(ligand.receptor.edges, 1, function(x) paste0(x[1], ".", x[2])))
  rownames(ligand.receptor.edges) = pair.identifiers

  ## Get weights using the STRING data-base
  ligands = as.character(ligand.receptor.edges[, 1])
  receptors = as.character(ligand.receptor.edges[, 2])

  if (verbose) print(paste0(length(unique(ligands)), " ligands to score in STRING"))
  if (verbose) print(paste0(length(unique(receptors)), " receptors score in STRING"))

  ### Here use the STRING data-base to give mouse-specific scores to ligand-receptor relationships

  if (use.string.offline) {
    print("Using package STRING scores")
    if (species == "human") {
      lr_score_table <- read.table(paste0(extdata.path, "/STRING_v10_human_ligand-receptor_scores.tsv"),
                                   sep="\t", header=TRUE, row.names=1, stringsAsFactors = FALSE)
    } else if (species == "mouse") {
      lr_score_table <- read.table(paste0(extdata.path, "/STRING_v10_mouse_ligand-receptor_scores.tsv"),
                                   sep="\t", header=TRUE, row.names=1, stringsAsFactors = FALSE)
    } else{
      stop("invalid species input - either use 'mouse' or 'human'")
    }
  } else {
    ## Retrieve information direct from STRINGdb
    lr_score_table = make_STRING_table(ligands = unique(ligands),
                                       receptors = unique(receptors),
                                       dir.path = string.dir,
                                       string.ver = string.ver,
                                       species = species,
                                       string.receptors = string.receptors,
                                       string.ligands = string.ligands,
                                       string.input.map = string.input.map,
                                       verbose = verbose)
  }

  if (string.evidence.class %in% colnames(lr_score_table)) {
    lr_score_table = lr_score_table[, c("Ligand", "Receptor", string.evidence.class)]
  } else {
    stop(paste0("STRING evidence class ", string.evidence.class, " not in table. Available evidence types:",
                colnames(lr_score_table)[3:length(colnames(lr_score_table))]))
  }

  colnames(lr_score_table) <- c("Ligand", "Receptor", "Score")

  if (verbose) print(head(lr_score_table)) ## print out some interactions

  overlapping.genes = intersect(pair.identifiers, rownames(lr_score_table))
  print(paste0(length(overlapping.genes), " overlaps between ligand-receptor map and STRINGdb"))

  weights = lr_score_table[overlapping.genes, ]$Score
  weights = weights/1000

  ligand.receptor.edges.overlap = ligand.receptor.edges[overlapping.genes, ]
  ligand.receptor.edges.overlap$weight = weights
  ligand.receptor.edges.overlap$relationship = "ligand.receptor"
  ligands = as.character(ligand.receptor.edges[, 1])
  receptors = as.character(ligand.receptor.edges[, 2])

  ## Mapping 'source' population to corresponding ligand
  cluster.ligand.edges = unique(all.de.results[indicies , c("Cluster1", "Gene1", "Gene1.Log2_fold_change")])
  colnames(cluster.ligand.edges) = c("source", "target", "weight")
  cluster.ligand.edges$relationship = "cluster.ligand"
  cluster.ligand.edges$source = paste0("S:", cluster.ligand.edges$source)

  ## Mapping 'target' population to corresponding receptor
  receptor.cluster.edges = unique(all.de.results[indicies , c("Gene2", "Cluster2", "Gene2.Log2_fold_change")])
  colnames(receptor.cluster.edges) = c("source", "target", "weight")
  receptor.cluster.edges$relationship = "receptor.cluster"
  receptor.cluster.edges$target = paste0("T:", receptor.cluster.edges$target)

  ## Build a table of edges,
  col.order =  c("source", "target", "relationship", "weight")
  ligand.receptor.edges.overlap = ligand.receptor.edges.overlap[, col.order]
  receptor.cluster.edges = receptor.cluster.edges[, col.order]
  cluster.ligand.edges = cluster.ligand.edges[, col.order]

  all.edges = rbind(as.matrix(ligand.receptor.edges.overlap), trimws(as.matrix(receptor.cluster.edges)), trimws(as.matrix(cluster.ligand.edges)))

  ### Write the network edges to file
  write.csv(all.edges, file = paste0(file.label, "_all_ligand_receptor_network_edges.csv"),
            row.names = FALSE, quote = FALSE)

  ## Also write out just the weights based on expression values
  expression.edges = rbind(trimws(as.matrix(receptor.cluster.edges)), trimws(as.matrix(cluster.ligand.edges)))
  write.csv(expression.edges, file = paste0(file.label, "_ligand_receptor_weights.csv"),
            row.names = FALSE, quote = FALSE)

}

######################################################################
### Generates files of weigted source:ligand:receptor:target paths ###
######################################################################
#'
#' Generates files of weighted source:ligand:receptor:target paths
#'
#' Uses files generated with the GenerateEdgeWeights function to generate
#' weighted paths connecting source(clusters):ligands:receptors:target(clusters).
#' Paths are weighted by summing the three edges - source:ligand, ligand:receptor and
#' receptor:target. Paths passing a minimum weight threshold are retained. For
#' following permuation testing, a 'background' set of paths without thresholding
#' are also written out.
#'
#'
#' @param file.label a list of genes
#' @param populations.use the cell populations to use (default is all)
#' @param min.weight minimum weight for retaining a path
#' @param ncores number of cores for parallelisation
#'
#' @return NULL - results written to file
#'
#' @export
#'
#' @importFrom foreach "%dopar%"
#'
GenerateNetworkPaths <- function(file.label,
                                 populations.use = NULL,
                                 min.weight = 1.5,
                                 ncores = 1)
  {

  edge.score.file <- paste0(file.label, "_all_ligand_receptor_network_edges.csv")
  score.table <- read.csv(edge.score.file, stringsAsFactors = FALSE)

  if (is.null(populations.use)) {
    cluster.source.table <- subset(score.table, relationship == "cluster.ligand")
    source.clusters <- unique(sub(".:(.*)", "\\1", cluster.source.table$source))

    cluster.target.table <- subset(score.table, relationship == "receptor.cluster")
    target.clusters <- unique(sub(".:(.*)", "\\1", cluster.target.table$target))

    populations.use <- union(source.clusters, target.clusters)
  }

  all.weights <- score.table$weight
  all.edges <- score.table[, c("source", "target")]

  populations.test <- populations.use

  # Set up multiple workers
  system.name <- Sys.info()['sysname']
  new_cl <- FALSE
  if (system.name == "Windows") {
    new_cl <- TRUE
    cluster <- parallel::makePSOCKcluster(rep("localhost", ncores))
    doParallel::registerDoParallel(cluster)
  } else {
    doParallel::registerDoParallel(cores=ncores)
  }

  SeuratObjectFilter <- function(x) class(get(x)) == "Seurat"
  seurat.objs <- ls()[unlist(lapply(ls(), SeuratObjectFilter))]

  ## Go through and calculate summed path weights from source to target populations
  ## Minimum weight of 1.5 to select for paths with some up-regulation (in ligand, receptor or both)
  complete.path.table <- foreach::foreach(s.pop = populations.test,
                                   .combine = 'rbind') %dopar%
  {
    target.path.table <- c()
    for (t.pop in populations.test) {
      source.population <- paste0("S:", s.pop)
      target.population <- paste0("T:", t.pop)
      this.path.table <- get_weighted_paths(edge.weight.table = score.table,
                                           source.population = source.population,
                                           target.population = target.population,
                                           min.weight = min.weight)
      target.path.table <- rbind(target.path.table, this.path.table)
    }
    return(target.path.table)
  }

  write.csv(complete.path.table, file = paste0(file.label, "_network_paths_weight", min.weight, ".csv"))

  ## Generate a background set of paths for permutation testing
  ## Set mimimum weight to -100 to capture all paths
  background.path.table <- foreach::foreach(s.pop = populations.test,
                                          .combine = 'rbind',
                                          .noexport = seurat.objs) %dopar%

  {
    target.path.table <- c()
    for (t.pop in populations.test) {
      source.population <- paste0("S:", s.pop)
      target.population <- paste0("T:", t.pop)
      this.path.table <- get_weighted_paths(score.table,
                                            source.population = source.population,
                                           target.population = target.population,
                                           min.weight = -100)
      target.path.table <- rbind(target.path.table, this.path.table)
    }
    return(target.path.table)
  }
  dim(background.path.table)

  if (new_cl) { ## Shut down cluster if on Windows
    ## stop cluster
    parallel::stopCluster(cluster)
  }

  write.csv(background.path.table, file = paste0(file.label, "_background_paths.csv"))

}

####################################################
### Perform permutation testing on network edges ###
####################################################
#'
#' Perform permutation testing on network edges
#'
#' Uses permutation testing to identify cell-cell connections that have
#' path weights greater than would be expected by change.
#'
#'
#' @param file.label label for input and output files
#' @param populations.test set of cell populations to test for (default: all in file)
#' @param num.permutations number of permutations to perform
#' @param return.results whether to return the results table (default: FALSE)
#' @param ncores number of cores to use in parallelisation
#'
#' @return NULL - results written to file
#'
#' @export
#'
#' @importFrom foreach "%dopar%"
#'
EvaluateConnections <- function(file.label,
                                populations.test = NULL,
                                min.weight = 1.5,
                                num.permutations = 100000,
                                return.results = FALSE,
                                ncores = 1) {

  ## Read in the individual weights for the edges in the network
  weights.file = paste0(file.label, "_all_ligand_receptor_network_edges.csv")
  weights.table = read.csv(weights.file, stringsAsFactors = FALSE)

  if (is.null(populations.test)) {
    cluster.source.table <- subset(weights.table, relationship == "cluster.ligand")
    source.clusters <- unique(sub(".:(.*)", "\\1", cluster.source.table$source))

    cluster.target.table <- subset(weights.table, relationship == "receptor.cluster")
    target.clusters <- unique(sub(".:(.*)", "\\1", cluster.target.table$target))

    populations.test <- union(source.clusters, target.clusters)
  }

  ligand.receptor.table = weights.table[weights.table$relationship == "ligand.receptor", ]
  rownames(ligand.receptor.table) = paste0(ligand.receptor.table$source, "_", ligand.receptor.table$target)

  receptor.cluster.table = weights.table[weights.table$relationship == "receptor.cluster", ]
  rownames(receptor.cluster.table) = paste0(receptor.cluster.table$source, "_", receptor.cluster.table$target)

  cluster.ligand.table = weights.table[weights.table$relationship == "cluster.ligand", ]
  rownames(cluster.ligand.table) = paste0(cluster.ligand.table$source, "_", cluster.ligand.table$target)

  ## Read in the background network file
  completeFile = paste0(file.label, "_background_paths.csv")
  background.table = read.csv(completeFile, row.names = 1)
  dim(background.table)

  ## Read in the filtered network file
  thisFile = paste0(file.label, "_network_paths_weight", min.weight, ".csv")
  complete.path.table = read.csv(thisFile, row.names = 1)

  # Set up multiple workers
  system.name <- Sys.info()['sysname']
  new_cl <- FALSE
  if (system.name == "Windows") {
    new_cl <- TRUE
    cluster <- parallel::makePSOCKcluster(rep("localhost", ncores))
    doParallel::registerDoParallel(cluster)
  } else {
    doParallel::registerDoParallel(cores=ncores)
  }

  SeuratObjectFilter <- function(x) class(get(x)) == "Seurat"
  seurat.objs <- ls()[unlist(lapply(ls(), SeuratObjectFilter))]

  ## Permutation testing for determing signifcant cell-cell connections
  ## Iterate through each combination of populations and do random selections
  ## of fold changes for ligands and receptors. Calculate empirical P-value.
  pvalue.table <- foreach::foreach(s.pop = populations.test,
                                   .combine = 'rbind',
                                   .export = c("complete.path.table", "background.table"),
                                   .noexport = seurat.objs) %dopar%
    {
      this.source = paste0("S:", s.pop)
      table.subset <- c()
      for (t.pop in populations.test) {
        this.target = paste0("T:", t.pop)

        ## Add weights together
        s.indicies = which(complete.path.table$Source == this.source)
        t.indicies = which(complete.path.table$Target == this.target)
        sub.table = complete.path.table[intersect(s.indicies, t.indicies), ]

        num.paths = nrow(sub.table)
        path.sum = sum(sub.table$Weight)

        ## Get number of total paths from background table
        s.indicies = which(background.table$Source == this.source)
        t.indicies = which(background.table$Target == this.target)
        sub.table = background.table[intersect(s.indicies, t.indicies), ]

        num_total = nrow(sub.table)

        ## Get weights from the background table
        s.indicies.background = which(background.table$Source == this.source)
        t.indicies.background = which(background.table$Target == this.target)
        combined.indicies = intersect(s.indicies.background, t.indicies.background)
        sub.background.table = background.table[combined.indicies, ]
        ligand.receptor.set = paste0(sub.background.table$Ligand, "_", sub.background.table$Receptor)

        ## Now get the real weights of ligand-receptor connections
        ligand.receptor.sub.table = ligand.receptor.table[ligand.receptor.set, ]
        ppi.weights = ligand.receptor.sub.table$weight

        random.paths = rep(NA, num_total)
        random.weight.sums = rep(NA, num_total)
        for (i in 1:num.permutations) {
          random.weights = randomise_FC_weights(ppi.weights, cluster.ligand.table, receptor.cluster.table)
          random.weights = random.weights[random.weights >= 1.5]

          this.random.weight.sum = sum(random.weights)
          this.random.path.count = length(random.weights)

          random.paths[i] = this.random.path.count
          random.weight.sums[i] = this.random.weight.sum

        }
        p.sum = sum(random.weight.sums >= path.sum)/length(random.weight.sums)

        thisLine = data.frame(Source_population = s.pop,
                              Target_population = t.pop,
                              Num_paths = num.paths,
                              Sum_path = path.sum,
                              Sum_path_pvalue = p.sum)

        table.subset <- rbind(table.subset, thisLine)

      }
      return(table.subset)
  }

  if (new_cl) { ## Shut down cluster if on Windows
    ## stop cluster
    parallel::stopCluster(cluster)
  }

  ## Do P-value adjustment for multiple testing
  pval.adj = p.adjust(pvalue.table$Sum_path_pvalue, method = "BH")
  pvalue.table$Sum_path_padj = pval.adj
  pvalue.table <- pvalue.table[order(pvalue.table$Sum_path_padj, decreasing = FALSE), ]

  ## Write test results to file
  out.file = paste0("Permutation_tests_", file.label, "_network.csv")
  write.csv(pvalue.table, file = out.file, row.names = FALSE)

  if (return.results) return(pvalue.table)

}
VCCRI/scTalk documentation built on June 5, 2021, 6:35 a.m.