R/summary.R

Defines functions summarize_dnw_for_pathway get_alpha_level2 get_perm_needed get_alpha_level get_number_of_tests get_min_alpha summarize_genes summarize_pathways summarize_genes_for_pathway summarize_edges_for_pathway filter_pathways rename_genes

Documented in filter_pathways get_min_alpha rename_genes summarize_dnw_for_pathway summarize_edges_for_pathway summarize_genes summarize_genes_for_pathway summarize_pathways

#' Rename genes in the differential network analysis results
#' 
#' @param results A 'dnapath_list' or 'dnapath' object.
#' @param gene_mat A matrix of key value pairs. The first column should contain
#' current gene names, and the second column the new names. Any genes that are
#' not in this matrix will retain their current names. 
#' @return A 'dnapath_list' object containing only those pathways with differential
#' connectivity p-values below `alpha`.
#' @export
rename_genes <- function(results, gene_mat) {
  if(is.vector(gene_mat) && length(gene_mat) == 2) {
    gene_mat <- matrix(gene_mat, ncol = 2)
  } else if(ncol(gene_mat) != 2) {
    stop("'gene_mat' must be a 2 column matrix or vector of length 2.")
  }
  
  if(class(results) == "dnapath_list") {
    new_gene_names <- results[[1]]$genes
    # Subset gene_mat onto genes contained in the dnapath results.
    new_gene_names <- sapply(new_gene_names, function(x) {
      index <- which(gene_mat[, 1] == x)
      if(length(index) == 0)
        return(x) # No new name found, keep original.
      if(length(index) > 1)
        warning("Duplicate names found for ", x, ".")
      return(gene_mat[index, 2])
    })
    
    for(i in 1:length(results)) {
      results[[i]]$genes <- new_gene_names
    }
  } else if (class(results) == "dnapath") {
    new_gene_names <- results$genes
    # Subset gene_mat onto genes contained in the dnapath results.
    gene_mat <- gene_mat[gene_mat[, 1] %in% results$genes, ]
    # Replace each gene with new gene names.
    new_gene_names[sapply(gene_mat[, 1], 
                          function(x) which(new_gene_names == x))] <-
      gene_mat[, 2]
    
    results$genes <- new_gene_names
  } else {
    stop("'results' must be a ", '"dnapath_list" or "dnapath" object.')
  }
  
  return(results)
}

#' Remove pathways with non-significant DC scores.
#' 
#' @param x A 'dnapath_list' object.
#' @param alpha_pathway Threshold for pathway p-values to determine significance.
#' If NULL, defaults to 0.05 or the minimum possible threshold (based on the
#' number of permutatiosn that were run).
#' @return A 'dnapath_list' object containing only those pathways with differential
#' connectivity p-values below `alpha`.
#' @export
filter_pathways <- function(results, alpha_pathway = NULL) {
  if(class(results) != "dnapath_list")
    stop("'results' must be a 'dnapath_list' object.")
  
  if(is.null(alpha_pathway)) {
    alpha_pathway <- max(0.05, get_min_alpha(results))
  }
  if(alpha_pathway < get_min_alpha(results)) {
    warning("alpha_pathway = ", alpha_pathway, " cannot be met with the number of ",
            "permutations. Setting to ", get_min_alpha(results))
    alpha_pathway <- get_min_alpha(results)
  }
  
  index <- which(sapply(results, function(x) x$p_value_path <= alpha_pathway))
  if(length(index) > 0) {
    results <- subset(results, pathways = index)
  } else {
    return(NULL)
  }
  
  results <- sort(results)
  return(results)
}

#' Summarize differential connections for a pathway
#' 
#' @param results A 'dnapath' object.
#' @param alpha_edge Threshold for p-values of edge DC scores. If NULL,
#' defaults to 0.05 or the minimum possible threshold (based on the
#' number of permutatiosn that were run).
#' @return A tibble summarizing the differential connections in
#' the pathway.
#' @export
summarize_edges_for_pathway <- function(results, alpha_edge = NULL) {
  if(class(results) != "dnapath") 
    stop("'results' must be a 'dnapath' object.")
  
  if(is.null(alpha_edge)) {
    alpha_edge <- max(0.05, get_min_alpha(results))
  }
  if(alpha_edge < get_min_alpha(results)) {
    warning("alpha_edge = ", alpha_edge, " cannot be met with the number of ",
            "permutations. Setting to ", get_min_alpha(results))
    alpha_edge <- get_min_alpha(results)
  }
  
  # Note, d_edges, p_value_edges, nw1, nw2, etc. are lower tri of matrix,
  # and these fill the matrix by column.
  genes <- results$genes[results$pathway_genes]
  edges <- sapply(combn(length(genes), 2, simplify = FALSE), 
                  function(x) paste(genes[x[1]], genes[x[2]], sep = " - "))
  
  # Subset on results with p-values below alpha_edge
  index <- which(results$p_value_edges <= alpha_edge)
  if(length(index) == 0)
    # If no edges meet the threshold, return an empty tibble.
    return(tibble(pathway = character(),
                  edges = character(),
                  dc_score = numeric(),
                  p_value = numeric(),
                  p_value_m = numeric(),
                  nw1 = numeric(),
                  nw2 = numeric()))
  
  if(is.null(results$pathway)) results$pathway <- "unnamed"
  
  df <- tibble(pathway = results$pathway,
               edges = edges[index],
               dc_score = results$d_edges[index],
               p_value = results$p_value_edges[index],
               p_value_m = results$p_value_edges_mono[index],
               nw1 = results$nw1[index],
               nw2 = results$nw2[index])
  
  return(df)
}

#' Summarize differential connectivity of genes in a pathway
#' 
#' @param results A 'dnapath' object on a single pathway.
#' @param alpha_gene Threshold for p-values of gene DC scores, used to
#' subset the results. If NULL (or 1), results for all genes are shown.
#' @return A tibble summarizing the differential connectivity of genes in
#' the pathway.
#' @export
summarize_genes_for_pathway <- function(results, alpha_gene = NULL) {
  if(class(results) != "dnapath") 
    stop("'results' must be a 'dnapath' object.")
  
  if(is.null(alpha_gene)) {
    alpha_gene <- 1
  }
  if(alpha_gene < get_min_alpha(results)) {
    warning("alpha_gene = ", alpha_gene, " cannot be met with the number of ",
            "permutations. Setting to ", get_min_alpha(results))
    alpha_gene <- get_min_alpha(results)
  }
  
  genes <- results$genes[results$pathway_genes]
  
  if(is.null(results$pathway)) results$pathway <- "unnamed"
  
  index <- which(results$p_value_genes <= alpha_gene)
  if(length(index) == 0) 
    # If no genes meet the threshold, return an empty tibble.
    return(tibble(pathway = character(),
                  genes = character(),
                  dc_score = numeric(),
                  p_value = numeric(),
                  p_value_m = numeric(),
                  mean_expr1 = numeric(),
                  mean_expr2 = numeric()))
  
  df <- tibble(pathway = results$pathway,
               genes = genes[index],
               dc_score = results$d_genes[index],
               p_value = results$p_value_genes[index],
               p_value_m = results$p_value_genes_mono[index],
               mean_expr1 = results$mean_expr[1, index],
               mean_expr2 = results$mean_expr[2, index])
  
  return(df)
}



#' Summarize the differential connectivity of pathways.
#' 
#' @param results A 'dnapath_list' object.
#' @param alpha_pathway Threshold for p-values of pathway DC scores. 
#' If NULL (or 1), results for all pathways are shown.
#' @param alpha_gene Threshold for p-values of gene DC scores. Used to determine
#' the number of genes that are differentially connected within each pathway.
#' If NULL,
#' defaults to 0.05 or the minimum possible threshold (based on the
#' number of permutatiosn that were run).
#' @return A tibble summarizing the differential connectivity of genes in
#' the pathway.
#' @export
summarize_pathways <- function(results, alpha_pathway = NULL, alpha_gene = NULL) {
  if(class(results) == "dnapath") {
    # If a single pathway is provided, return a summary of the edges.
    return(summarize_edges_for_pathway(results,))
  } else if(class(results) != "dnapath_list"){
    stop("'results' must be a 'dnapath_list' or 'dnapath' object.")
  }
  
  if(is.null(alpha_gene)) {
    alpha_gene <- max(0.05, get_min_alpha(results))
  }
  if(alpha_gene < get_min_alpha(results)) {
    warning("alpha_gene = ", alpha_gene, " cannot be met with the number of ",
            "permutations. Setting to ", get_min_alpha(results))
    alpha_gene <- get_min_alpha(results)
  }
  
  if(is.null(alpha_pathway)) {
    alpha_pathway <- 1
  }
  if(alpha_pathway < get_min_alpha(results)) {
    warning("alpha_pathway = ", alpha_pathway, " cannot be met with the number of ",
            "permutations. Setting to ", get_min_alpha(results))
    alpha_pathway <- get_min_alpha(results)
  }
  
  p_value <- sapply(results, function(x) x$p_value_path)
  index <- which(p_value <= alpha_pathway)
  
  if(length(index) == 0) 
    # If no genes meet the threshold, return an empty tibble.
    return(tibble(pathway = character(),
                  dc_score = numeric(),
                  p_value = numeric(),
                  n_genes = numeric(),
                  n_dc = numeric(),
                  mean_expr1 = numeric(),
                  mean_expr2 = numeric()))
  
  results <- results[index]
  # Remove any NULL results from the list.
  pathway <- sapply(results, function(x) x$pathway)
  d_pathway <- sapply(results, function(x) x$d_pathway)
  p_value <- p_value[index]
  n_genes <- sapply(results, function(x) length(x$pathway_genes))
  n_genes_dc <- sapply(results, function(x) sum(x$p_value_genes <= alpha_gene))
  mean_expr1 <- sapply(results, function(x)  sum(x$mean_expr[1, ])) / n_genes
  mean_expr2 <- sapply(results, function(x)  sum(x$mean_expr[2, ])) / n_genes
  # Note, mean_expr are divided by n_genes, the total number of possible genes
  # in the pathway. This way, if there are any missing genes from the expression
  # profile, they are counted as having 0 expression.
  
  df <- tibble(pathway = pathway,
               dc_score = d_pathway,
               p_value = p_value,
               n_genes = n_genes,
               n_dc = n_genes_dc,
               mean_expr1 = mean_expr1,
               mean_expr2 = mean_expr2)
  
  alpha_gene <- round(alpha_gene, 3)
  colnames(df)[5] <- paste0("n_dc (", ifelse(alpha_gene == 0, 
                                             "<0.001", alpha_gene), ")")

  return(df)
}


#' Summarize the differential connectivity of genes over all pathways.
#' 
#' @param results A 'dnapath_list' object.
#' @param alpha_gene Threshold for p-values of gene DC scores. Used to determine
#' the number of pathways that each gene is differentially connected in. If NULL,
#' defaults to 0.05 or the minimum possible threshold (based on the
#' number of permutatiosn that were run).
#' @return A tibble summarizing the differential connectivity of genes across
#' all pathways.
#' @export
summarize_genes <- function(results, alpha_gene = NULL) {
  if(class(results) == "dnapath") {
    return(summarize_genes_for_pathway(results,))
  } else if(class(results) != "dnapath_list"){
    stop("'results' must be a 'dnapath_list' or 'dnapath' object.")
  }
  
  if(is.null(alpha_gene)) {
    alpha_gene <- max(0.05, get_min_alpha(results))
  }
  if(alpha_gene < get_min_alpha(results)) {
    warning("alpha_gene = ", alpha_gene, " cannot be met with the number of ",
            "permutations. Setting to ", get_min_alpha(results))
    alpha_gene <- get_min_alpha(results)
  }
  
  
  gene_list <- results[[1]]$genes
  p <- length(gene_list)
  
  n_pathways <- numeric(p)
  n_dc <- numeric(p)
  mean_expr1 <- numeric(p)
  mean_expr2 <- numeric(p)
  
  for(i in 1:length(results)) {
    x <- results[[i]]
    pathway_genes <- x$gene[x$pathway_genes]
    index <- sapply(pathway_genes, function(gene) which(gene_list == gene)[1])

    n_pathways[index] <- n_pathways[index] + 1
    n_dc[index] <- n_dc[index] + (x$p_value_genes <= alpha_gene)
    mean_expr1[index] <- x$mean_expr[1, ]
    mean_expr2[index] <- x$mean_expr[2, ]
  }
  
  df <- tibble(gene = gene_list,
               n_pathways = n_pathways,
               n_dc = n_dc,
               mean_expr1 = mean_expr1,
               mean_expr2 = mean_expr2)
  # Note, pathways containing a gene can be found by using:
  #  > subset(results, genes = gene_name)
  
  alpha_gene <- round(alpha_gene, 3)
  colnames(df)[3] <- paste0("n_dc (", ifelse(alpha_gene == 0, 
                                             "<0.001", alpha_gene), ")")
  
  return(df)
}


#' Get the minimum alpha level for the permutation test
#' 
#' @param results A 'dnapath_list' or 'dnapath' object.
#' @return The minimum alpha level that can be used based on the number
#' of permutations performed in the analysis.
#' @export
get_min_alpha <- function(results) {
  if(class(results) == "dnapath") {
    return(1 / results$n_perm)
  } else if(class(results) == "dnapath_list"){
    return(1 / results[[1]]$n_perm)
  } else {
    return(NA)
  }
}















get_number_of_tests <- function(pathway_list) {
  if(!is.list(pathway_list)) {
    n_tests <- length(pathway_list) * (length(pathway_list) - 1) / 2
  } else {
    n_tests <- sum(sapply(pathway_list, 
                          function(x) length(x) * (length(x) - 1) / 2))
  }
  return(n_tests)
}

get_alpha_level <- function(pathway_list, alpha_overall = 0.05) {
  N <- get_number_of_tests(pathway_list)
  # Significance level for each test (edge, gene, and pathway):
  alpha <- ((1 - (1 - alpha_overall)^(1 / N)) / 2)^(1 / 3)
  return(alpha)
}

get_perm_needed <- function(pathway_list, alpha_overall = 0.05) {
  alpha <- get_alpha_level(pathway_list, alpha_overall)
  n_perm <- ceiling(1 / alpha)
  return(n_perm)
}

get_alpha_level2 <- function(pathway_list, alpha_overall = 0.05,
                              interval = c(0, 0.2)) {
  if(!is.list(pathway_list)) pathway_list <- list(pathway = pathway_list)
  genes <- unique(unlist(pathway_list))
  edges <- matrix(0, nrow = length(genes), ncol = length(genes))
  for(pathway in pathway_list) {
    index <- which(genes %in% pathway)
    edges[index, index] <- edges[index, index] + 1
  }
  edges <- edges[lower.tri(edges)]
  edges <- edges[edges != 0]
  alpha <- alpha_overall^3
  f <- function(x) {
    1 - exp(sum(log(1 - 2 * edges * x^3))) - alpha_overall
  }
  alpha <- uniroot(f, interval = interval)$root
  return(alpha)
}


#' Summarize differential connectivity scores for a pathway
#' 
#' @param results The results from performing [dnapath()] on a single pathway.
#' @param alpha The desired significance level. DC scores with p-values below
#' this threshold will be set to zero.
#' @param monotonized If true, monotonized (i.e. step-down) p-values from the 
#' permutation test will be used. 
#' @return The p by p differential network, with nonzero entries containing
#' the significant DC scores.
summarize_dnw_for_pathway <- function(results, alpha = 0.1, monotonized = FALSE) {
  if(class(results) != "dnapath") 
    stop("'results' must be a 'dnapath' object.")
  
  genes_in_dnw <- results$genes[results$pathway_genes]
  p <- length(genes_in_dnw)
  
  if(p == 0) {
    warning("results pathway does not contain any genes.")
    return(NULL)
  }
  
  dnw <- matrix(0, p, p)
  colnames(dnw) <- genes_in_dnw
  
  d_edges <- results$d_edges
  
  # If p-value is above alpha, set to zero.
  if(monotonized) {
    d_edges[results$p_value_edges_mono > alpha] <- 0
  } else {
    d_edges[results$p_value_edges > alpha] <- 0
  }
  
  dnw[lower.tri(dnw)] <- d_edges
  dnw <- dnw + t(dnw)
  
  return(dnw)
}
tgrimes/dnapath2 documentation built on May 21, 2020, 5:53 p.m.