R/TFtargetsINcmap.R

Defines functions TFtargetsINcmap plotTFtargetsINcmap showLargestRegulons

Documented in plotTFtargetsINcmap showLargestRegulons TFtargetsINcmap

##' Summarise TF targets enrichment when TF perturbed in Connectivity Map
##' @rdname TFtargetsINcmap
##' @name TFtargetsINcmap
##' @author Vitalii Kleshchevnikov
##' @description \code{TFtargetsINcmap}
##' @param regulons data.table containing TF regulons (TF and target columns)
##' @param alternative indicates alternative hypothesis for \link[stats]{ks.test}. "trt_oe": negative_regulator = "less", positive_regulator = "greater"; "trt_sh.cgs" or anything else: negative_regulator = "greater"; positive_regulator = "less"
##' @param pert_types a character vector of perturbation types, query for available perturbation types using \code{perturbTable(CMap_files, ~ pert_type)}
##' @param cell_ids a character vector of cell line names, query for available cell line names using \code{perturbTable(CMap_files, ~ cell_id)}
##' @param gene_names a character vector of HUGO gene names. Use \code{"all"} to select all perturbations.
##' @param pert_itimes a character vector of perturbation times, query for available perturbation times using \code{perturbTable(CMap_files, ~ pert_itime)}
##' @param CMap_files a list of directories and urls produced by \code{\link{loadCMap}}
##' @param keep_one_oe keep only one overexpression experiment per gene and condition. Applicable only when \code{pert_types = "trt_oe"}. Perturbations where sig_id matches pert_id are retained ("one"). To invert the selection use "other". To select all use "all"
##' @param landmark_only look only at landmark genes. Details: https://docs.google.com/document/d/1q2gciWRhVCAAnlvF2iRLuJ7whrGP6QjpsCMq1yWz7dU/edit
##' @param clustermq_memory When using clustermq: memory requested for each job
##' @param clustermq_job_size When using clustermq: The number of function calls per job
##' @param min_targets_in_cmap minimal number of TF targets in cmap
##' @param force_gene_names look at all \code{gene_names} but less cell lines
##' @return list containing data.table summary of TF perturbation effects on TF targets, object of class 'GCT' (CMap data), summary of genes vs cell lines available in CMap and returned (gene_cell_counts)
##' @export TFtargetsINcmap
##' @import data.table
##' @import clustermq
##' @import gsEasy
##' @seealso \code{\link{regFromCMAP}}, \code{\link{completeCellGeneCombs}}, \code{\link{showLargestRegulons}}
TFtargetsINcmap = function(regulons, alternative = "less",
                           pert_types = c("trt_oe"),
                           cell_ids = c("A375", "A549", "HA1E","HCC515",
                                        "HEPG2", "HT29","MCF7", "PC3"),
                           gene_names = c("NFKB1", "E2F1", "MYC", "TP53",
                                          "YY1", "JUN", "STAT1", "STAT3"),
                           pert_itimes = c("96 h"),
                           CMap_files, keep_one_oe = c("one", "other", "all")[1],
                           landmark_only = F,
                           clustermq_memory = 2000, clustermq_job_size = 1,
                           min_targets_in_cmap = 3, force_gene_names = F
){


  cmap = readCMAPsubset(is_touchstone = "all",
                        pert_types = pert_types,
                        pert_itimes = pert_itimes, cell_ids = cell_ids,
                        gene_names = gene_names, CMap_files = CMap_files,
                        keep_one_oe = keep_one_oe,
                        landmark_only = landmark_only)

  gene_cell_counts = completeCellGeneCombs(cmap, force_gene_names)

  cell_ids = cell_ids[cell_ids %in% colnames(gene_cell_counts$gene_cell_counts)]
  gene_names = gene_names[gene_names %in% rownames(gene_cell_counts$gene_cell_counts)]

  # look only at regulons where target genes were measured in cmap
  regulons = regulons[target_entrezgene %in% rownames(cmap@mat)]
  regulons[, targets_in_cmap := uniqueN(target_entrezgene), by = TF]
  regulons = regulons[targets_in_cmap >= min_targets_in_cmap]
  # TF in cmap
  regulons = regulons[TF %in% cmap@rdesc$pr_gene_symbol]
  # update gene name list
  gene_names = gene_names[gene_names %in% regulons$TF]

  # produce data for plotting
  res = Q(function(cell_line){
    library(regNETcmap)
    library(gsEasy)
    mat = cmap@mat
    res = lapply(gene_names, function(TF_measured){
      res = lapply(gene_names, function(TF_sh){
        TF_ind = cmap@rdesc$pr_gene_symbol %in% regulons[TF %in% TF_measured, TF]
        target_ind = rownames(mat) %in% regulons[TF %in% TF_measured, target_entrezgene]
        cell_line_ind = cmap@cdesc$cell_id %in% cell_line
        TF_sh_ind = cmap@cdesc$pert_iname %in% TF_sh
        vec = mat[, cell_line_ind & TF_sh_ind]
        x = vec[!target_ind]
        y = vec[target_ind]
        TF_val = vec[TF_ind][1]
        w = ks.test(x, y, alternative = alternative)
        if(alternative == "less") GSEA_vec = -vec else GSEA_vec = vec
        GSEA_pval1 = gsEasy::gset(S = which(target_ind), N = NULL,
                                  r = GSEA_vec, p = 1, min_its = 1000,
                                  max_its = 1e+05,
                                  significance_threshold = 1, log_dismiss = -10,
                                  raw_score = FALSE)
        GSEA_pval10 = gsEasy::gset(S = which(target_ind), N = NULL,
                                   r = GSEA_vec, p = 10, min_its = 1000,
                                   max_its = 1e+05,
                                   significance_threshold = 1, log_dismiss = -10,
                                   raw_score = FALSE)
        size = sum(target_ind)
        data.table(target = ifelse(target_ind,"TF_targets", "other_genes"), TF_sh_lab = paste0(TF_sh," ", paste0(pert_types, collapse = "|")),
                   cell_ids = cell_line, TF_measured_lab = paste0(TF_measured, " regulon\ntargets (cmap):", size,"\n total:", regulons[TF %in% TF_measured, unique(size)]),
                   TF_measured = TF_measured, TF_sh = TF_sh,
                   gene_exp = vec, TF_val = TF_val,
                   pval = w$p.value, statistic = w$statistic,
                   GSEA_pval1 = GSEA_pval1, GSEA_pval10 = GSEA_pval10,
                   #GSEA_score1 = GSEA_score1, GSEA_score10 = GSEA_score10,
                   size = size
        )
      })
      Reduce(rbind, res)

    })
    Reduce(rbind, res)
  }, cell_ids,
  export = list(regulons = regulons, gene_names = gene_names,
                cmap = cmap, alternative = alternative,
                pert_types = pert_types),
  job_size = clustermq_job_size, memory = clustermq_memory)
  res = Reduce(rbind, res)
  res[, pvals := paste0("ks: ", signif(pval, 3), "\n",
                        signif(GSEA_pval1, 3), "\n",
                        signif(GSEA_pval10, 3)
                        )]
  list(res = res, cmap = cmap, gene_cell_counts = gene_cell_counts, regulons = regulons)
}

##' Plot TF perturbation effects (CMap) on TF targets
##' @rdname plotTFtargetsINcmap
##' @name plotTFtargetsINcmap
##' @param res data.table containing summary of TF perturbation effects on TF targets returned by \code{\link{TFtargetsINcmap}}
##' @param TFsels which TF to plot, defaults to all in \code{res}
##' @param xlim passed to \link[ggplot2]{xlim}
##' @import data.table
##' @export plotTFtargetsINcmap
plotTFtargetsINcmap = function(res, TFsels = NULL,
                               title = "TF that positively regulate their targets \n(shRNA induces shift to the left)",
                               lab_text_size = 2, lab_text_y = 1.2, lab_text_x = -2.2,
                               strip.text_size = 8, vline_size = 1,
                               xlim = c(-3.3, 3.3), ylim = c(0, 1.7)) {
  if(is.null(TFsels)) TFsels = unique(c(res$TF_sh, res$TF_measured))
  plots = list()
  for (cellline in unique(res$cell_ids)) {
    p = ggplot(res[cell_ids == cellline & TF_sh %in% TFsels & TF_measured %in% TFsels],
               aes(x = gene_exp, color = target)) +
      geom_density() + facet_grid(TF_measured_lab~TF_sh_lab) +
      geom_vline(xintercept = 0, color = "black", size = vline_size) +
      geom_vline(aes(xintercept = TF_val), color = "#00BFC4", size = vline_size) +
      xlim(xlim) + ylim(ylim) +
      geom_text(y = lab_text_y, x = lab_text_x,
                aes(label = pvals), size = lab_text_size) +
      theme(strip.text.y = element_text(angle = 0, size = strip.text_size),
            strip.text.x = element_text(size = strip.text_size)) +
      ggtitle(title, subtitle = cellline) + xlab("z-score")
    plots[[which(unique(res$cell_ids) %in% cellline)]] = p
  }
  plots
}

##' Show largest TF regulons
##' @rdname showLargestRegulons
##' @name showLargestRegulons
##' @param regulons data.table containing TF regulons (TF and target columns)
##' @param N number of largest regulons to display
##' @import data.table
##' @export showLargestRegulons
showLargestRegulons = function(regulons, N = 40) {
  regulons = copy(regulons)
  regulons[, size := uniqueN(target), by = TF]
  regulons = regulons[order(size, decreasing = T)]
  list(regulons = regulons, summary = unique(regulons[order(size, decreasing = T),.(TF, size)])[1:N])
}

##' Find cell lines and perturbations (Connectivity map) where all pert_iname are present in all cell_id
##' @rdname completeCellGeneCombs
##' @name completeCellGeneCombs
##' @param cmap object of class 'GCT' [package "cmapR"] with 7 slots containing z-score matrix, perturbation and feature details of the Connectivity map project. Returned by \code{\link{readCMAP}} or \code{\link{readCMAPsubset}}
##' @param force_gene_names look at all \code{gene_names} but less cell lines
##' @import data.table
##' @export completeCellGeneCombs
completeCellGeneCombs = function(cmap, force_gene_names = F) {
  gene_cell = as.data.table(cmap@cdesc)[, .(pert_iname, cell_id)]
  gene_cell$pert_iname = as.character(gene_cell$pert_iname)
  gene_cell$cell_id = as.character(gene_cell$cell_id)
  gene_cell_counts = table(gene_cell)
  if(ncol(gene_cell_counts) > 1 & nrow(gene_cell_counts) > 1){
    gene_zero_counts = rowMeans(gene_cell_counts == 0)
    cell_zero_counts = colMeans(gene_cell_counts == 0)
    if(max(gene_zero_counts) < max(cell_zero_counts) | force_gene_names){
      gene_cell_counts = gene_cell_counts[,c(!as.logical(cell_zero_counts))]
      gene_zero_counts = rowMeans(gene_cell_counts == 0)
      cell_zero_counts = colMeans(gene_cell_counts == 0)
      gene_cell_counts = gene_cell_counts[!as.logical(gene_zero_counts),]
      gene_cell_counts = gene_cell_counts[,!as.logical(cell_zero_counts)]
    } else {
      gene_cell_counts = gene_cell_counts[!as.logical(gene_zero_counts),]
      gene_zero_counts = rowMeans(gene_cell_counts == 0)
      cell_zero_counts = colMeans(gene_cell_counts == 0)
      gene_cell_counts = gene_cell_counts[!as.logical(gene_zero_counts),]
      gene_cell_counts = gene_cell_counts[,!as.logical(cell_zero_counts)]
    }
  }
  list(gene_cell_counts = gene_cell_counts, gene_cell_counts_requested = table(gene_cell))
}
vitkl/regNETcmap documentation built on Feb. 18, 2020, 3:43 a.m.