R/dna.R

Defines functions rev.dnapathpath_list tail.dnapath_list head.dnapath_list `c.dnapath_list` `[<-.dnapath_list` `[.dnapath_list` order.dnapath_list sort.dnapath_list subset.dnapath_list summary.dnapath summary.dnapath_list plot.dnapath print.dnapath print.dnapath_list dnapath

Documented in dnapath head.dnapath_list order.dnapath_list plot.dnapath print.dnapath print.dnapath_list rev.dnapathpath_list sort.dnapath_list subset.dnapath_list summary.dnapath summary.dnapath_list tail.dnapath_list

#' Differential network analysis
#' 
#' @param expr_pair Either (1) a list of two matrices, or objects to be coerced
#' to one, that contain the gene 
#' expression profile from two populations (groups), or (2) a single matrix, or 
#' object to be coerced to one, that contains the expression profiles for both 
#' groups. In either case, the rows of each matrix should
#' correspond to samples and the columns should correspond to individual genes. 
#' If a single matrix is provided, the `groups` argument 
#' must also be set to specify which rows belong to which group.
#' @param pathway_list A single vector or list of vectors containing gene names 
#' to indicate pathway membership. The vectors are used to subset the matrices
#' in 'expr_pair'. 
#' @param groups (Optional) If `expr_pair` is a single matrix, `groups` must be
#' a vector of length equal to the number of rows in `expr_pair`, and it should
#' contain two unique elements (for example, the two group names). This argument
#' is used to identify which samples belong to which group.
#' @param network_inference A function used to infer the pathway network. It
#' should take in an n by p matrix and return a p by p matrix of association 
#' scores. (Built-in options include: \code{\link{run_aracne}}, 
#' \code{\link{run_bc3net}}, \code{\link{runc3net}},
#' \code{\link{run_clr}}, \code{\link{run_corr}}, 
#' \code{\link{run_dwlasso}}, \code{\link{run_genie3}}, 
#' \code{\link{run_glasso}}, \code{\link{run_mrnet}},
#' \code{\link{run_pcor}}, and \code{\link{run_silence}}.) 
#' Defaults to \code{\link{run_pcor}} for partial correlations.
#' @param n_perm The number of random permutations to perform during 
#' permutation testing. If n_perm == 1, the permutation tests are not performed.
#' If n_perm is larger than the number of possible permutations, n_perm will
#' be set to this value with a warning message.
#' @param lp_set A vector of lp values used to compute differential connectivity
#' scores. If multiple values are given, then the results are returned as
#' a list having the same length. (Note: the option of providing a vector of lp 
#' values is available so that network inference methods only need to be
#' run once for each pathway).
#' @param seed (Optional) Used to set.seed prior to permutation test for
#' each pathway. This allows results for individual pathways to be easily 
#' reproduced.
#' @return A list containing results for each pathway in 'pathway_list'. These
#' results include the p-values for differential connectivity of each gene, the 
#' overall differential connectivity of the pathway. Pathways without any significance
#' are excluded from the results.
#' @export
dnapath <- function(expr_pair, pathway_list, groups = NULL,
                network_inference = run_pcor, n_perm = 100, lp_set = 2, 
                seed = NULL, verbose = TRUE) {
  
  #################################################################
  # If a list of pathways are given, perform dnapath over each pathway.
  #################################################################
  if(!is.null(pathway_list) && is.list(pathway_list)) {
    # Subset the input expression data onto only those genes contained
    # within all of the pathways.
    genes <- unique(unlist(pathway_list))
    # Make sure there are no duplicate genes in any pathway.
    pathway_list <- lapply(pathway_list, unique)
    if(inherits(expr_pair, "list")) {
      # Expecting expr_pair to be a list of two matrices.
      if(length(expr_pair) != 2) 
        stop("'expr_pair' list does not contain two matrices.")
      expr_pair[[1]] <- expr_pair[[1]][, colnames(expr_pair[[1]]) %in% genes]
      expr_pair[[2]] <- expr_pair[[2]][, colnames(expr_pair[[2]]) %in% genes]
    } else {
      expr_pair <- expr_pair[, colnames(expr_pair) %in% genes]
    }
    
    results_by_pathway <- lapply(pathway_list, function(pathway) {
      dnapath(expr_pair, pathway, groups, network_inference, n_perm, lp_set, seed)
    })
    index_null <- which(sapply(results_by_pathway, is.null))
    
    # If all pathways returned NULL results, stop here and return NULL.
    if(length(index_null) == length(pathway_list)) {
      warning("No pathways resulted in differential networks.\n")
      return(NULL)
    }
    
    # If multiple lp values used, re-arrange results as a list of
    # dnapath_list objects by lp value.
    if(length(lp_set) > 1) {
      results_by_lp <- vector("list", length(lp_set))
      for(i in 1:length(lp_set)) {
        results_by_lp[[i]] <- lapply(results_by_pathway, 
                                     function(results) results[[i]])
        
        # Add pathway name to each result, if pathway_list is named.
        if(!is.null(names(pathway_list))) {
          for(j in 1:length(results_by_lp[[i]])) {
            results_by_lp[[i]][[j]]$pathway <- names(pathway_list)[j]
          }
        } 
        class(results_by_lp[[i]]) <- "dnapath_list"
      }
      
      # Remove pathways that returned NULL results from both the results list.
      # Remove pathways that returned NULL results from both the results list.
      if(length(index_null) > 0) {
        results_by_lp <- lapply(results_by_lp, function(results) results[-index_null])
      }
      
      names(results_by_lp) <- paste0("lp = ", lp_set)
    } else {
      results_by_lp <- results_by_pathway
      # Add pathway name to each result, if pathway_list is named.
      if(!is.null(names(pathway_list))) {
        for(i in 1:length(results_by_lp)) {
          results_by_lp[[i]]$pathway <- names(pathway_list)[i]
        }
      } 
      
      # Remove pathways that returned NULL results from both the results list.
      if(length(index_null) > 0) {
        results_by_lp <- results_by_lp[-index_null]
      }
      
      class(results_by_lp) <- "dnapath_list"
    }
    
    return(results_by_lp)
    
  } else {
    # pathway_list is a vector of genes (a single pathway). Proceed
    # with the differential network analysis on this pathway.
    pathway <- pathway_list
  }
  
  #################################################################
  # Run differential network analysis on the pathway
  #################################################################
  # Set seed for reproducibility of permutation test, if one is provided.
  if(!is.null(seed)) {
    set.seed(seed)
  }
  
  # Process the expr_pair input. 
  # Handle the list and single matrix cases separately.
  if(inherits(expr_pair, "list")) {
    # Expecting expr_pair to be a list of two matrices.
    if(length(expr_pair) != 2) 
      stop("'expr_pair' list does not contain two matrices.")
    
    # Save the group labels, if there are any.
    group_labels <- names(expr_pair)
    if(is.null(group_labels)) 
      group_labels <- c("1", "2")
    
    # Store the gene names in the original gene expression datasets.
    gene_names <- unique(union(colnames(expr_pair[[1]]), 
                               colnames(expr_pair[[2]])))
    
    # Store the number of samples in each dataset.
    n <- c(nrow(expr_pair[[1]]), nrow(expr_pair[[2]]))
    
    # If a pathway is provided, subset counts.
    if(!is.null(pathway)) {
      index <- which(colnames(expr_pair[[1]]) %in% pathway)
      if(length(index) < 2) {
        if(verbose)
          cat("Fewer than two genes are expressed in one group. Returning NULL.\n")
        return(NULL)
      }
      expr_pair[[1]] <- expr_pair[[1]][, index]
      index <- which(colnames(expr_pair[[2]]) %in% pathway)
      if(length(index) < 2) {
        if(verbose)
          cat("Fewer than two genes are expressed in one group. Returning NULL.\n")
        return(NULL)
      }
      expr_pair[[2]] <- expr_pair[[2]][, index]
    }
    
    # Join the two datasets and coerce the output into a matrix.
    expr_pair <- as.matrix(merge(expr_pair[[1]], expr_pair[[2]], 
                                 all = TRUE, sort = FALSE))
    
  } else {
    # expr_pair must be a matrix.
    # Allow data.frame, tibble, and other objects that are coercible to matrix.
    expr_pair <- tryCatch(as.matrix(expr_pair),
                          error = function(e) 
                            stop("'expr_pair' is not a list and as.matrix() failed."))
    if(is.null(groups)) 
      stop("'groups' must be specified when 'expr_pair' is a single matrix.")
    if(length(unique(groups)) != 2)
      stop("length(unique(groups)) = ", length(unique(groups)), ", ",
           "expected to equal 2. For example, if the first n rows ",
           'are from group "a" and the last m rows are from group "b", then ',
           "'expr_pair' should contain n + m rows and 'group' may be the ",
           'vector c(rep("a", n), rep("b", m)).')
    if(length(groups) != nrow(expr_pair))
      stop("length(groups) = ", length(groups), ", but nrow(expr_pair) = ",
           nrow(expr_pair), ". These must be equal.")
    
    # Store the gene names in the original gene expression dataset.
    gene_names <- colnames(expr_pair)
    
    # If a pathway is provided, subset counts.
    if(!is.null(pathway)) {
      index <- which(colnames(expr_pair) %in% pathway)
      if(length(index) < 2) {
        if(verbose)
          cat("Fewer than two genes are expressed in one group. Returning NULL.\n")
        return(NULL)
      }
      expr_pair <- expr_pair[, index]
    }
    
    # Order rows to ensure that group 1 is first, followed by group 2.
    group_labels <- unique(groups)
    index_group_1 <- which(groups == group_labels[1])
    index_group_2 <- which(groups == group_labels[2])
    expr_pair <- expr_pair[c(index_group_1, index_group_2), ]
    
    # Store the number of samples in each dataset.
    n <- c(length(index_group_1), length(index_group_2))
  }
  
  pathway_genes <- sapply(colnames(expr_pair), 
                          function(gene) which(gene_names == gene)[1])
  
  # Fill in missing values with 0
  index_na <- which(is.na(expr_pair))
  if(length(index_na) > 0) {
    # # Don't give a warning message.
    # warning("Setting", length(index_na), "NA values to 0.")
    expr_pair[which(is.na(expr_pair))] <- 0
  }
  
  N <- nrow(expr_pair) # Total number of observations across both samples.
  p <- ncol(expr_pair) # Total number of genes in union.
  n_lp <- length(lp_set)
  
  # If total number of observations is small, use exact set of permutations. 
  # Order of samples doesn't matter, so keep only first half if sample sizes
  # are the same. If sample sizes are unequal, all permutations are used.
  if((n[1] == n[2]) && (choose(N, n[1]) / 2 <= n_perm)) {
    permutations <- combn(1:N, n[1])
    permutations <- permutations[, 1:(ncol(permutations) / 2)] 
    permutations <- permutations[, -1]
  } else if((n[1] != n[2]) && (choose(N, n[1]) <= n_perm)) {
    permutations <- combn(1:N, n[1])
    permutations <- permutations[, -1]
  } else {
    permutations <- cbind(sapply(1:n_perm, function(x) sample(1:N, n[1])))
  }
  
  if(ncol(permutations) < n_perm) 
    warning("Only ", ncol(permutations), " possible permutations. Setting ",
            "'n_perm' to this value.")
  
  n_perm <- ncol(permutations)
  
  # Initialize lists to store DC scores and p-values for each lp in lp_set.
  # Scores are initialized to 0 and p-values to 1.
  scores_1 <- network_inference(expr_pair[1:n[1], ])
  scores_2 <- network_inference(expr_pair[-(1:n[1]), ])
  
  d_path_list <- lapply(lp_set, function(lp) d_pathwayC(scores_1, scores_2, lp = lp))
  d_gene_list <- lapply(lp_set, function(lp) d_genesC(scores_1, scores_2, lp = lp))
  d_edge_list <- lapply(lp_set, function(lp) d_edgesC(scores_1, scores_2, lp = lp))
  
  # Obtain ranks of original DC scores for monotonization.
  d_path_ranks <- lapply(d_path_list, function(d_path) order(d_path, decreasing = TRUE))
  d_gene_ranks <- lapply(d_gene_list, function(d_gene) order(d_gene, decreasing = TRUE))
  d_edge_ranks <- lapply(d_edge_list, function(d_edge) order(d_edge, decreasing = TRUE))
  
  pval_path_list <- lapply(lp_set, function(lp) 1)
  pval_gene_list <- lapply(lp_set, function(lp) rep(1, p))
  pval_edge_list <- lapply(lp_set, function(lp) rep(1, p * (p - 1) / 2))
  
  pval_path_list_mono <- lapply(lp_set, function(lp) 1)
  pval_gene_list_mono <- lapply(lp_set, function(lp) rep(1, p))
  pval_edge_list_mono <- lapply(lp_set, function(lp) rep(1, p * (p - 1) / 2))
  
  # Save inferred networks.
  nw1 <- scores_1[lower.tri(scores_1)]
  nw2 <- scores_2[lower.tri(scores_2)]
  
  # If inferred networks are empty, then quit without running permutation test.
  if(all(scores_1 == 0) && all(scores_2 == 0)) {
    if(verbose)
      cat("Inferred network is empty. Permutation test not performed.\n")
    n_perm <- 1 # Only original data were considered.
  } else {
    for(i in 1:n_perm) {
      network_1 <- permutations[, i]
      
      scores_1 <- network_inference(expr_pair[network_1, ])
      scores_1[which(is.na(scores_1))] <- 0
      scores_2 <- network_inference(expr_pair[-network_1, ])
      scores_2[which(is.na(scores_2))] <- 0
      
      for(k in 1:n_lp) {
        pval_path_list[[k]] <- pval_path_list[[k]] +
          (d_pathwayC(scores_1, scores_2, lp = lp_set[[k]]) >= d_path_list[[k]])
        pval_gene_list[[k]] <- pval_gene_list[[k]] +
          (d_genesC(scores_1, scores_2, lp = lp_set[[k]]) >= d_gene_list[[k]])
        pval_edge_list[[k]] <- pval_edge_list[[k]] +
          (d_edgesC(scores_1, scores_2, lp = lp_set[[k]]) >= d_edge_list[[k]])
        
        # Compute step-down p-values.
        # Rank index for original scores:
        index <- d_path_ranks[[k]]
        n_index <- length(index)
        # Calculate scores of permuted data.
        d_path_perm <- d_pathwayC(scores_1, scores_2, lp = lp_set[[k]])
        # Take the cumulative max, in rank order, starting from the min.
        d_path_perm <- cummax(d_path_perm[rev(index)])[n_index:1][order(index)]
        # Accumulate number of permuted scores that are larger than original.
        pval_path_list_mono[[k]] <- pval_path_list_mono[[k]] +
          (d_path_perm >= d_path_list[[k]])
        
        # Repeat step-down procedure for genes and edges.
        index <- d_gene_ranks[[k]]
        n_index <- length(index)
        d_gene_perm <- d_genesC(scores_1, scores_2, lp = lp_set[[k]])
        d_gene_perm <- cummax(d_gene_perm[rev(index)])[n_index:1][order(index)]
        pval_gene_list_mono[[k]] <- pval_gene_list_mono[[k]] +
          (d_gene_perm >= d_gene_list[[k]])
        
        index <- d_edge_ranks[[k]]
        n_index <- length(index)
        d_edge_perm <- d_edgesC(scores_1, scores_2, lp = lp_set[[k]])[index]
        d_edge_perm <- cummax(d_edge_perm[rev(index)])[n_index:1][order(index)]
        pval_edge_list_mono[[k]] <- pval_edge_list_mono[[k]] +
          (d_edge_perm >= d_edge_list[[k]])
      }
    }
  }
  
  # If no permutation testing is done, set p-values to NA.
  if(n_perm == 1) {
    pval_path_list <- lapply(pval_path_list, function(pval) NA)
    pval_gene_list <- lapply(pval_gene_list, function(pval) NA)
    pval_edge_list <- lapply(pval_edge_list, function(pval) NA)
    
    pval_path_list_mono <- lapply(pval_path_list_mono, function(pval) NA)
    pval_gene_list_mono <- lapply(pval_gene_list_mono, function(pval) NA)
    pval_edge_list_mono <- lapply(pval_edge_list_mono, function(pval) NA)
  } else {
    pval_path_list <- lapply(pval_path_list, function(pval) pval / (n_perm + 1))
    pval_gene_list <- lapply(pval_gene_list, function(pval) pval / (n_perm + 1))
    pval_edge_list <- lapply(pval_edge_list, function(pval) pval / (n_perm + 1))
    
    for(k in 1:length(lp_set)) {
      index <- d_path_ranks[[k]]
      pval_path_list_mono[[k]] <- 
        cummax(pval_path_list_mono[[k]][index] / (n_perm + 1))[order(index)]
      index <- d_gene_ranks[[k]]
      pval_gene_list_mono[[k]] <- 
        cummax(pval_gene_list_mono[[k]][index] / (n_perm + 1))[order(index)]
      index <- d_edge_ranks[[k]]
      pval_edge_list_mono[[k]] <- 
        cummax(pval_edge_list_mono[[k]][index] / (n_perm + 1))[order(index)]
    }
  }
  
  results <- vector("list", n_lp)
  for(i in 1:n_lp) {
    results[[i]] <- list(genes = gene_names,
                         pathway_genes = pathway_genes,
                         pathway = NULL,
                         p_value_path = pval_path_list[[i]],
                         p_value_genes = drop(pval_gene_list[[i]]),
                         p_value_edges = drop(pval_edge_list[[i]]),
                         p_value_path_mono = pval_path_list_mono[[i]],
                         p_value_genes_mono = pval_gene_list_mono[[i]],
                         p_value_edges_mono = pval_edge_list_mono[[i]],
                         d_pathway = d_path_list[[i]],
                         d_genes = drop(d_gene_list[[i]]),
                         d_edges = drop(d_edge_list[[i]]),
                         nw1 = nw1,
                         nw2 = nw2,
                         mean_expr = rbind(apply(expr_pair[1:n[1], ], 2, mean),
                                           apply(expr_pair[-(1:n[1]), ], 2, mean)),
                         groups = group_labels,
                         n_perm = n_perm,
                         lp = lp_set[i],
                         seed = seed)
    class(results[[i]]) <- "dnapath"
  }
  
  # If only one lp is considered, unlist over lp.
  if(length(results) == 1) {
    results <- results[[1]]
  }
  
  class(results) <- "dnapath"
  return(results)
}

#' Print function for 'dnapath_list' object.
#' 
#' @param x A 'dnapath_list' object.
#' @param ... Additional arguments are ignored.
#' @return Prints a summary of the module.
#' @export
print.dnapath_list <- function(x, ...) {
  if(!is(x, "dnapath_list")) 
    stop(paste0("'", deparse(substitute(x)), 
                "' is not a 'dnapath_list' object."))
  
  groups <- x[[1]]$groups
  
  message <- paste0("Differential network analysis results between, ", groups[1], 
                    " (group 1) and ", groups[2], " (group 2) over ", length(x), 
                    " pathways.\n")
  cat(message)
  summary(x)
}

#' Print function for 'dnapath' object.
#' 
#' @param x A 'dnapath' object.
#' @param ... Additional arguments are ignored.
#' @return Prints a summary of the module.
#' @export
print.dnapath <- function(x, ...) {
  if(!is(x, "dnapath")) 
    stop(paste0("'", deparse(substitute(x)), 
                "' is not a 'dnapath' object."))
  
  pathway <- drop(x$pathway)
  p <- length(x$pathway_genes)
  
  groups <- x$groups
  
  message <- paste0("Differential network analysis between, ", groups[1], 
                    " (group 1) and ", groups[2], " (group 2);")
  
  if(is.null(pathway)) {
    message <- paste0(message, " results for an unnamed ", "pathway containing ", 
                      p, " genes.")
  } else {
    message <- paste0(message, ' results for the pathway "', pathway, 
                      '" containing ', p, " genes.")
  }
  
  # Add pathway p-value to the message. 
  p_value <- round(x$p_value_path, 3)
  if(p_value == 0) {
    p_value <- "< 0.001"
  } else {
    p_value <- paste("=", p_value)
  }
  
  message <- paste0(message, ' Pathway p-value ', p_value, ".")
  
  message <- paste0(message, " The mean expression of genes in the pathway is ",
                    round(mean(x$mean_expr[1, ]), 1), " in group 1 and ",
                    round(mean(x$mean_expr[2, ]), 1), " in group 2.\n")
  
  cat(message)
  summary(x)
}

#' Plot function for 'dnapath' object.
#' 
#' @param x A 'dnapath' object.
#' @param alpha Threshold for p-values to infer differentially connected edges.
#' If NULL (the default) then no edges are removed from the plot.
#' @param monotonized If TRUE, monotonized (i.e. step-down) p-values from the 
#' permutation test will be used.
#' @param ... Additional arguments are passed into the plotting function
#' \code{\link[SeqNet]{plot_network}}.
#' @return Plots the differential network and returns the graph object. 
#' See \code{\link[SeqNet]{plot_network}} for details.
#' @export
plot.dnapath <- function(x, alpha = NULL, monotonized = FALSE, ...) {
  if(!is(x, "dnapath")) 
    stop(paste0("'", deparse(substitute(x)), 
                "' is not a 'dnapath' object."))
  
  # If there are any NA mean expression values, set them to 0.
  if(any(is.na(x$mean_expr))) {
    x$mean_expr[which(is.na(x$mean_expr))] <- 0
  }
  # If any mean expression values are negative, set all values equal
  # to make fold-changes equal to 1.
  if(any(x$mean_expr < 0)) {
    x$mean_expr <- 10
  }
  
  genes_in_dnw <- x$genes[x$pathway_genes]
  p <- length(genes_in_dnw)
  dc_vals <- (x$nw2 - x$nw1)
  
  if(monotonized) {
    p_values <- x$p_value_edges_mono
  } else {
    p_values <- x$p_value_edges
  }
  if(!is.null(alpha)) {
    # If p-value is above alpha, edges will be scaled to zero width.
    p_values[p_values > alpha] <- 1
  }
  # Base edge weights are from -log(p-value), and are further scaled
  # by degree of change in association strength.
  edge_weights <- -log(p_values) *
    abs(dc_vals) / max(c(abs(x$nw1), abs(x$nw2)))
  if(any(is.nan(edge_weights))) {
    # 0 / 0 association scores will result in NaN values. Set these to 0.
    edge_weights[is.nan(edge_weights)] <- 0
  }
  
  fold_change <- log2(x$mean_expr[2, ] / x$mean_expr[1, ])
  if(any(is.nan(fold_change))) {
    # 0 / 0 expression will result in NaN values. Set these to 0.
    fold_change[is.nan(fold_change)] <- 0
  }
  mean_expr <- apply(x$mean_expr, 2, max)
  mean_expr <- mean_expr / max(mean_expr)
  node_weights <- log2(1 + mean_expr) * 31 + 1
  
  # Positive values indicate increase in association from group 1 to group 2.
  # nw2 - nw1
  dnw <- matrix(0, p, p)
  dnw[lower.tri(dnw)] <- dc_vals
  dnw <- dnw + t(dnw)
  colnames(dnw) <- genes_in_dnw
  
  # TODO: set color based on group with higher magnitude of association.
  # what if going from -0.3 to 0.3? No change in magnitude, but largest difference.
  # Maybe just set color = to sign in group 1.
  g <- SeqNet::plot_network(dnw, display_plot = FALSE, ...)
  g$vertex.size <- node_weights
  g$edge.width <- edge_weights * 2.5
  g$vertex.color <- ifelse(fold_change > 0, 
                           rgb(1, 0.19, 0.19, 
                               pmax(0, pmin(1, abs(fold_change) / 2))),
                           rgb(0.31, 0.58, 0.8, 
                               pmax(0, pmin(1, abs(fold_change) / 2))))
  g$vertex.color
  g$vertex.frame.color <- rgb(0.3, 0.3, 0.3, 0.5)
  # Rescale edge weights to be used for color transparency.
  edge_weights <- edge_weights / max(edge_weights)
  if(any(is.nan(edge_weights))) edge_weights <- 0 # No edges are significantly DC.
  g$edge.color <- ifelse(abs(x$nw2) > abs(x$nw1), 
                         rgb(1, 0.19, 0.19, 
                             pmax(0.3, edge_weights)),
                         rgb(0.31, 0.58, 0.8, 
                             pmax(0.3, edge_weights)))
  g$vertex.label.cex <- 1
  
  plot(g)
  
  invisible(g)
}

#' Summary function for 'dnapath_list' object.
#' 
#' @param x A 'dnapath_list' object.
#' @param by_gene If TRUE, summarizes the differential network analysis by genes
#' instead of by pathways.
#' @param alpha_pathway Threshold for p-values of pathway DC scores; used 
#' to subset the results.
#' If NULL (or 1), results for all pathways are shown.
#' @param alpha_gene Threshold for p-values of gene DC scores. Used to determine
#' the number of genes that are differentially connected within each pathway. If NULL,
#' defaults to 0.05 or the minimum possible threshold (based on the
#' number of permutatiosn that were run).
#' @param ... Additional arguments are ignored.
#' @return Summarizes the differential network analysis results.
#' @export
summary.dnapath_list <- function(x, by_gene = FALSE, 
                             alpha_pathway = NULL, alpha_gene = NULL, ...) {
  if(is.null(alpha_gene)) {
    alpha_gene <- max(0.05, get_min_alpha(x))
  }
  if(alpha_gene < get_min_alpha(x)) {
    warning("alpha_gene = ", alpha_gene, " cannot be met with the number of ",
            "permutations. Setting to ", get_min_alpha(x))
    alpha_gene <- get_min_alpha(x)
  }
  
  if(is.null(alpha_pathway)) {
    alpha_pathway <- 1
  }
  if(alpha_pathway < get_min_alpha(x)) {
    warning("alpha_pathway = ", alpha_pathway, " cannot be met with the number of ",
            "permutations. Setting to ", get_min_alpha(x))
    alpha_pathway <- get_min_alpha(x)
  }
  
  if(by_gene) {
    df <- summarize_genes(x, alpha_gene = alpha_gene)
    # df <- dplyr::arrange(df, n_dc / n_pathways, desc(mean_expr1 + mean_expr2))
    print(df)
  } else {
    df <- summarize_pathways(x, alpha_pathway = alpha_pathway, 
                             alpha_gene = alpha_gene)
    # df <- dplyr::arrange(df, p_value, desc(d_pathway))
    print(df)
  }
  
  invisible(df)
}

#' Summary function for 'dnapath' object.
#' 
#' @param x A 'dnapath' object.
#' @param by_gene If TRUE, summarizes the differential network analysis by genes;
#' otherwise, summarizes by gene-gene interactions.
#' @param alpha Threshold for p-values to determine significance; defaults 
#' to 1 and returns all results. If 'by_gene'
#' is FALSE, then 'alpha' is used to filter edges. If 'by_gene' is TRUE,
#' then 'alpha' is used to filter genes. 
#' @param ... Additional arguments are passed into \code{\link{summarize_pathways}}
#' (default) or \code{\link{summarize_genes_over_pathways}} (if by_gene is TRUE).
#' @return Summarizes the differential network analysis result.
#' @export
summary.dnapath <- function(x, by_gene = TRUE, alpha = NULL, ...) {
  if(is.null(alpha)) {
    alpha <- 1
  }
  if(alpha < get_min_alpha(x)) {
    warning("alpha = ", alpha, " cannot be met with the number of ",
            "permutations. Setting to ", get_min_alpha(x))
    alpha <- get_min_alpha(x)
  }
  
  if(by_gene) {
    df <- summarize_genes_for_pathway(x, alpha_gene = alpha)
    df <- dplyr::arrange(df, p_value, desc(dc_score))
    print(df[, -1])
  } else {
    df <- summarize_edges_for_pathway(x, alpha_edge = alpha)
    df <- dplyr::arrange(df, p_value, desc(dc_score))
    print(df[, -1])
  }
  
  invisible(df)
}


#' Subset function for 'dnapath_list' object.
#' 
#' @param x A 'dnapath_list' object.
#' @param pathways A set of pathways to index on. This can be (1) a vector of 
#' character strings, corresponding to 
#' pathway names or regular expressions used to find pathways, (2) a vector of 
#' indices to select pathways, (3) a vector of 
#' negative indices indicating pathways to remove, or (4) a logical (boolean) 
#' vector  that is the same length of current number of pathways in `x`.
#' @param genes A set of gene names to index on; exact matching is used. Only 
#' pathways containing these genes are retained. 
#' @param ... Additional arguments are ignored.
#' @return A subset of the differential network analysis results.
#' @export
subset.dnapath_list <- function(x, pathways = NULL, genes = NULL, ...) {
  # Pathways to keep or remove. These are used so that both pathways and genes
  # can be provided, and the union of results is returned.
  index_keep <- rep(FALSE, length(x))
  index_remove <- rep(FALSE, length(x)) # Only used for -pathway input.
  
  if(!is.null(pathways)) {
    if(is.numeric(pathways)) {
      if(all(pathways > 0)) {
        index_keep[pathways] <- TRUE
      } else if(all(pathways < 0)) {
        index_remove[pathways] <- TRUE
      } else {
        stop("pathway indices must be all postive or all negative.")
      }
    } else if(is.logical(pathways)) {
      if(length(pathways) == length(x)) {
        index_keep[pathways] <- TRUE
      } else {
        stop("pathway is a logical vector but is not same length as the current",
             "number of pathways in 'x'.")
      }
    } else if(is.character(pathways)) {
      for(pathway in pathways) {
        index <- grep(pathway, names(x), ignore.case = TRUE)
        if(length(index) > 0)
          index_keep[index] <- TRUE
      }
    }
  }
  
  if(!is.null(genes)) {
    for(gene in genes) {
      index <- which(sapply(x, function(path) 
        any(path$genes[path$pathway_genes] == gene)))
      if(length(index) > 0)
        index_keep[index] <- TRUE
    }
  }
  
  # If pathways were specified for removal, update index_keep.
  if(sum(index_remove) > 1) {
    if(!is.null(genes)) {
      # If pathways were subset on genes, then update the index_keep array.
      index_keep[index_remove] <- FALSE
    } else {
      # Otherwise, keep all pathways that were not specified for removal.
      index_keep[!index_remove] <- TRUE
    }
  }
  
  x <- x[index_keep]
  
  if(length(x) == 0) {
    return(NULL)
  } else if(length(x) == 1) {
    return(x[[1]])
  } else {
    class(x) <- "dnapath_list"
    return(x)
  }
}


#' Sort function for 'dnapath_list' object.
#' 
#' @param x A 'dnapath_list' object.
#' @param decreasing Logical. If TRUE (the default), results are sorted in 
#' decreasing order.
#' @param by The variable to sort the results by. Must be one of: "mean_expr", 
#' the mean expression of each pathway across both groups; "mean_expr1"
#' or "mean_expr2", the mean expression of each pathway in group
#' 1 or 2, respectively; "dc_score", the differential connectivity score
#' of the pathway; "p_value", the p-value of the dc score; "n_genes", 
#' the number of genes in each pathway; "n_dc" the number of significantly 
#' differentially conncted genes in each pathway.
#' @param ... Additional arguments are ignored.
#' @return The differential network analysis results ordered by DC pathway score.
#' @export
sort.dnapath_list <- function(x, decreasing = TRUE, by = "mean_expr", ...) {
  if(class(x) != "dnapath_list")
    stop("`x` must be a 'dnapath_list' object.")
  
  if(by == "mean_expr") {
    index <- order(sapply(x, function(res) mean(res$mean_expr)), 
                   decreasing = decreasing)
  } else if(by == "mean_expr1") {
    index <- order(sapply(x, function(res) mean(res$mean_expr[1, ])), 
                   decreasing = decreasing)
  } else if(by == "mean_expr2") {
    index <- order(sapply(x, function(res) mean(res$mean_expr[2, ])), 
                   decreasing = decreasing)
  } else if(by == "dc_score") {
    index <- order(sapply(x, function(res) res$d_pathway), 
                   decreasing = decreasing)
  } else if(by == "p_value") {
    index <- order(sapply(x, function(res) res$p_value_path), 
                   decreasing = decreasing)
  } else if(by == "n_genes") {
    index <- order(sapply(x, function(res) length(res$pathway_genes)), 
                   decreasing = decreasing)
  } else if(by == "n_dc") {
    alpha <- max(0.05, get_min_alpha(x))
    index <- order(sapply(x, function(res) sum(res$p_value_genes <= alpha)), 
                   decreasing = decreasing)
  } else {
    stop('`by` must be one of: "mean_expr", the mean expression of ',
         'each pathway across both groups; "mean_expr1" or "mean_expr2", the ',
         'mean expression of each pathway in group 1 or 2, respectively; ',
         '"dc_score", the differential connectivity score of the pathway; ',
         '"p_value", the p-value of the dc score; "n_genes", the number of ',
         'genes in each pathway; "n_dc" the number of significantly ', 
         'differentially conncted genes in each pathway.')
  }

  x <- x[index]
  
  return(x)
}

#' Order function for 'dnapath_list' object.
#' 
#' @param x A 'dnapath_list' object.
#' @param ... Additional arguments are ignored.
#' @return The sorted indices of differential network analysis results,
#' ordered by DC pathway score.
#' @export
order.dnapath_list <- function(x, ...) {
  if(class(x) == "dnapath_list") {
    index <- order(sapply(x, function(res) res$d_pathway), decreasing = TRUE)
    return(index)
  } else {
    return(NULL)
  }
}

#' Extract parts of a 'dnapath_list' object.
#' 
#' @param x A 'dnapath_list' object.
#' @param i The indices of pathways to extract.
#' @param ... Additional arguments are ignored.
#' @return A 'dnapath_list' object containing pathways indexed by 'i'.
#' @export
`[.dnapath_list` <- function(x, i, ...) {
  x <- unclass(x)[i]
  
  # If all elements are removed, return the empty list.
  if(length(x) == 0) return(x)
  
  p <- length(x[[1]]$genes)
  remove_genes <- rep(TRUE, p)
  # x[[j]]$pathway_genes contains indices for x[[j]]$genes that 
  # are correspond to the genes contained in the pathway.
  # First, obtain all indices that are retained in the subset.
  keep <- sort(unique(unlist(sapply(x, function(path) path$pathway_genes))))
  # Check that the selected pathways are not empty.
  if(length(keep) == 0) {
    warning("Selected pathways contained no genes. Returning NULL.")
    return(NULL)
  }
  # Do not remove the gene names that are indexed by the pathways.
  remove_genes[keep] <- FALSE
  # New indices are shifted by the number of genes that were removed
  # before them. For ex. if the first first 3 genes are removed,
  # index 4 will become 4 - 3. If gene 5 is also removed. Index 6
  # would become 6 - (3 + 1) = 2.
  # The cumsum counts the number of genes that were removed prior
  # to each index.
  index_shift <- cumsum(remove_genes)
  newgenes <- x[[1]]$genes[!remove_genes]
  # In the above example, newindex would be the vector 
  # (0, 0, 0, 1, 0, 2, ...). 
  # Indices 4 and 5 are retained and newindex[(4, 5)] provides the
  # new, shifted indices.
  newindex <- ((1:p) - index_shift)
  
  # Loop through each pathway and store the new gene set and indices.
  for(j in 1:length(x)) {
    x[[j]]$genes <- newgenes
    x[[j]]$pathway_genes <- newindex[x[[j]]$pathway_genes]
  }
  
  class(x) <- "dnapath_list"
  return(x)
}
 

#' Replace parts of a 'dnapath_list' object.
#' 
#' @param x A 'dnapath_list' object.
#' @param ... Additional arguments are ignored.
#' @return Replacement is not defined; an error is generated.
#' @export
`[<-.dnapath_list` <- function(x, ...) {
  stop("Replacement undefined for 'dnapath_list' object.")
}

#' Combine two 'dnapath_list' objects.
#' 
#' @param ... 'dnapath_list' objects to be concatenated.
#' @return Concatenation is not defined; an error is generated.
#' @export
`c.dnapath_list` <- function(...) {
  stop("Concatenation is undefined for 'dnapath_list' objects.")
}

#' Return the first five pathways of a 'dnapath_list' object.
#' 
#' @param x A 'dnapath_list' object.
#' @param ... Additional arguments are ignored.
#' @return A 'dnapath_list' object containing the first five pathways in 'x'.
#' @export
head.dnapath_list <- function(x, ...) {
  return(x[1:min(5, length(x))])
}

#' Return the last five pathways of a 'dnapath_list' object.
#' 
#' @param x A 'dnapath_list' object.
#' @param ... Additional arguments are ignored.
#' @return A 'dnapath_list' object containing the last five pathways in 'x'.
#' @export
tail.dnapath_list <- function(x, ...) {
  return(x[max(1, length(x) - 4):length(x)])
}

#' Reverse the order of pathways in a 'dnapath_list' object.
#' 
#' @param x A 'dnapathpath_list' object.
#' @param ... Additional arguments are ignored.
#' @return A 'dnapathpath_list' object containing the pathways in 'x' in reverse order.
#' @export
rev.dnapathpath_list <- function(x, ...) {
  return(x[length(x):1])
}
tgrimes/dnapath2 documentation built on May 21, 2020, 5:53 p.m.