R/coregulstions.R

Defines functions alluvial_enriched generate_count_table alluvial_enrichment_tests draw_alluvial_plot get_comigrations_by_name get_condition_biased_comigrations

Documented in alluvial_enriched alluvial_enrichment_tests draw_alluvial_plot generate_count_table get_comigrations_by_name get_condition_biased_comigrations

##############################################
# The deveroper and the maintainer:          #
#   Lang Ho Lee (lhlee@bwh.harvard.edu)      #
#   Sasha A. Singh (sasingh@bwh.harvard.edu) #
##############################################

# Mute warnings
options(warn=1)

#' @title get_condition_biased_comigrations
#' @description get comigrations that at least one biased cluster is involved in.
#' Biased clusters are defined by
#' @param clustering_result A list containing XINA clustering results.
#' See \link[XINA]{xina_clustering}
#' @param count_table A data frame generated by using \link[plyr]{count}.
#' If count_table is NULL (by default), XINA will consider all the comigrations.
#' @param selected_conditions A vector of condition names used in XINA clustering results.
#' The number of selected conditions should be at least two.
#' @param condition_composition The resulting data frame of 'plot_condition_compositions'.
#' See \link[XINA]{plot_condition_compositions}.
#' @param threshold_percent Default is 50.  The percentage threshold for finding condition-biased clusters
#' @param color_for_null A color for non-condition-biased comigrations. Default is 'gray'
#' @param color_for_highly_matched A color for comigrations that are involved with more than two condition-biased clusters. Default is 'red4'
#' @param cex Size of cluster number on block axis. Default if 0.7. See \link[alluvial]{alluvial}.
#' @param alpha Transparency of alluvia colors. Default is 0.3. See \link[alluvial]{alluvial}.
#' @return An alluvial plot displaying comigrations and the data frame containing condition-biased comigrations.
#' @export
#' @examples
#'
#' # load XINA example data
#' data(xina_example)
#'
#' # get a vector of experimental conditions analyzed in the clustering results
#' conditions <- as.vector(example_clusters$condition)
#'
#' # get condition composition information
#' condition_composition <- plot_condition_compositions(example_clusters)
#'
#' comigrations_size10 <- alluvial_enriched(example_clusters, conditions, comigration_size=10)
#' # Finding condition-biased comigrations by 50% threshold
#' condition_biased_comigrations <-
#' get_condition_biased_comigrations(clustering_result=example_clusters,
#' count_table=comigrations_size10, selected_conditions=conditions,
#' condition_composition=condition_composition)
#'
#' # Finding condition-biased comigrations by 70% threshold
#' condition_biased_comigrations <-
#' get_condition_biased_comigrations(clustering_result=example_clusters,
#' count_table=comigrations_size10, selected_conditions=conditions,
#' condition_composition=condition_composition,
#' threshold_percent=70)
#'
get_condition_biased_comigrations <- function(clustering_result, count_table=NULL,
                                              selected_conditions, condition_composition, threshold_percent=50,
                                              color_for_null='gray', color_for_highly_matched='red4',
                                              cex=0.7, alpha=0.3) {
  Condition <- Percent_ratio <- NULL
  if (is.null(count_table)) {
    count_table <- generate_count_table(clustering_result, selected_conditions, 0)
  }
  ratio_threshold <- threshold_percent
  biased_clusters <- list()
  for (condition in selected_conditions) {
    tmp <- subset(condition_composition, Percent_ratio>=ratio_threshold & Condition==condition)
    biased_clusters[[condition]] <- tmp$Cluster
  }
  alluvia_colors <- c()
  for (i in seq_len(nrow(count_table))) {
    matched_conditions <- c()
    for (condition in selected_conditions) {
      cl <- count_table[i,condition]
      if (!is.na(match(cl, biased_clusters[[condition]]))) {
        matched_conditions <- c(matched_conditions, condition)
      }
    }
    if (length(matched_conditions)==1) {
      alluvia_colors <- c(alluvia_colors, clustering_result$color_for_condition[matched_conditions])
    } else if (length(matched_conditions)>1) {
      alluvia_colors <- c(alluvia_colors, color_for_highly_matched)
    } else {
      alluvia_colors <- c(alluvia_colors, color_for_null)
    }
  }
  cnt_table_4draw <- count_table[!is.na(match(alluvia_colors, c(clustering_result$color_for_condition, color_for_highly_matched))),]
  alluvia_colors <- alluvia_colors[!is.na(match(alluvia_colors, c(clustering_result$color_for_condition, color_for_highly_matched)))]
  if (nrow(cnt_table_4draw)>0){
    return(draw_alluvial_plot(clustering_result, selected_conditions, cnt_table_4draw,
                              alluvia_colors = alluvia_colors, cex=cex, alpha=alpha))
  } else {
    print("No biased comigration was found")
  }
}

#' @title get_comigrations_by_name
#' @description 'get_comigrations_by_name' finds proteins comigrated with the given proteins
#' @param clustering_result A list containing XINA clustering results. See \link[XINA]{xina_clustering}
#' @param selected_conditions A vector of condition names used in XINA clustering results.
#' The number of selected conditions should be at least two.
#' @param protein_list A vector containing gene names.
#' @param cex Size of cluster number on block axis. Default if 0.7. See \link[alluvial]{alluvial}
#' @param alpha Transparency of alluvia colors. Default is 0.3. See \link[alluvial]{alluvial}
#' @return An alluvial plot displaying comigrations and the data frame containing comigrations of the input proteins
#' @export
#' @examples
#'
#' # load XINA example data
#' data(xina_example)
#'
#' # the clustering result table
#' all_proteins  <- as.character(example_clusters$aligned$`Gene name`)
#' # get a vector of experimental conditions analyzed in the clustering results
#' classes <- as.vector(example_clusters$condition)
#'
#' comigrated_prots_all <- get_comigrations_by_name(example_clusters, classes, all_proteins[1:3])
#'
get_comigrations_by_name <- function(clustering_result, selected_conditions, protein_list, cex=0.7, alpha=0.3){
  matched_comigrations_all <- data.frame()
  comigrated_prots_all <- list()
  count_table <- generate_count_table(clustering_result, selected_conditions, 0)
  found_proteins <- c()
  for (prot_name in protein_list) {
    cl_matched <- clustering_result$aligned[clustering_result$aligned$"Gene name"==prot_name, selected_conditions]
    comigrations_matched <- count_table
    comigrated_prots <- clustering_result$aligned
    flag_found <- TRUE
    if (length(cl_matched)==0) {
      flag_found <- FALSE
    } else if (is.na(sum(as.numeric(cl_matched)))) {
      flag_found <- FALSE
    } else if (sum(as.numeric(cl_matched))==0) {
      flag_found <- FALSE
    }
    if (flag_found){
      if (length(selected_conditions)==1){
        names(cl_matched) <- selected_conditions
      }
      for (condition in selected_conditions){
        comigrations_matched <- subset(comigrations_matched, comigrations_matched[condition]==as.integer(cl_matched[condition]))
        comigrated_prots <- subset(comigrated_prots, comigrated_prots[condition]==as.integer(cl_matched[condition]))
      }
      found_proteins <- c(found_proteins, prot_name)
      comigrated_prots_all[[prot_name]] <- comigrated_prots
      if (nrow(matched_comigrations_all)==0) {
        matched_comigrations_all <- comigrations_matched
      } else {
        matched_comigrations_all <- rbind(matched_comigrations_all,comigrations_matched)
      }
    } else {
      print(paste(prot_name," is not found in the your 'selected_conditions':",selected_conditions))
      comigrated_prots_all[[prot_name]] <- NULL
    }
  }
  rownames(matched_comigrations_all) <- found_proteins
  if (length(selected_conditions)>1){
    comigrated_prots_all[["comigrations"]] <- draw_alluvial_plot(clustering_result, selected_conditions, matched_comigrations_all,
                                                                 cex=cex, alpha=alpha)
  } else {
    comigrated_prots_all[["comigrations"]] <- matched_comigrations_all
  }
  return(comigrated_prots_all)
}

#' @title draw_alluvial_plot
#' @description 'draw_alluvial_plot' draw a alluvial plot
#' @param clustering_result A list containing XINA clustering results.
#' See \link[XINA]{xina_clustering}.
#' @param selected_conditions A vector of condition names used in XINA clustering results.
#' The number of selected conditions should be at least two.
#' @param count_table A data frame generated by using \link[plyr]{count}.
#' @param alluvia_colors A vector containing the user-defined colors for each alluvium.
#' @param cex Size of cluster number on block axis. Default if 0.7. See \link[alluvial]{alluvial}.
#' @param alpha Transparency of alluvia colors. Default is 0.3. See \link[alluvial]{alluvial}.
#' @return An alluvial plot displaying comigrations and the data frame containing the input count_table with colors.
#' @import alluvial
#' @export
#' @examples
#'
#' # load XINA example data
#' data(xina_example)
#'
#' # get a vector of experimental conditions analyzed in the clustering results
#' classes <- as.vector(example_clusters$condition)
#'
#' comigrations_size_over5 <- alluvial_enriched(example_clusters, classes, comigration_size=5)
#' draw_alluvial_plot(example_clusters, classes, comigrations_size_over5)
#'
draw_alluvial_plot <- function(clustering_result, selected_conditions, count_table,
                               alluvia_colors=NULL, cex=0.7, alpha=0.3){
  # Set color codes
  if (is.null(alluvia_colors)) {
    alluvia_colors <- c()
    color_codes <- get_colors(clustering_result$nClusters, set='vivid')
    for (i in seq_len(nrow(count_table))){
      cn <- as.numeric(as.vector(count_table[i,1]))
      if (cn == 0){
        alluvia_colors <- c(alluvia_colors, "#BEBEBE")
      } else {
        alluvia_colors <- c(alluvia_colors, color_codes[cn])
      }
    }
  } else {
    if (length(alluvia_colors) > nrow(count_table)){
      alluvia_colors <- alluvia_colors[seq_len(nrow(count_table))]
    } else if (length(alluvia_colors) < nrow(count_table)){
      difference <- nrow(count_table) - length(alluvia_colors)
      for (i in seq_len(difference)) {
        alluvia_colors <- c(alluvia_colors, 'gray')
      }
      warning(paste(difference, "comigrations were colored by gray because alluvial_colors is not enough to color all the comigrations"))
    }
  }
  # For the dimension error when there is only one row in count_table
  if (nrow(count_table)==0){
    stop("No biased comigration was found")
  } else if (nrow(count_table)==1){
    Comigration_size <- c(count_table$Comigration_size,0)
    cols <- c(alluvia_colors,alluvia_colors)
    bds <- c(alluvia_colors,alluvia_colors)
  } else {
    Comigration_size <- count_table$Comigration_size
    cols <- alluvia_colors
    bds <- alluvia_colors
  }
  # Draw an alluvial plot
  # count_table[,1:length(selected_conditions)]
  alluvial(count_table[,selected_conditions],
           freq=Comigration_size,
           col=cols,
           border=bds,
           cex=cex,
           alpha=alpha)
  if (is.null(count_table$"Alluvia_color")) {
    return(cbind(count_table, Alluvia_color=alluvia_colors))
  } else {
    count_table$"Alluvia_color" <- alluvia_colors
    return(count_table)
  }
}


#' @title alluvial_enrichment_tests
#' @description Fisher's exact test to calculate the significance over all comigrations.
#' The following 2x2 table was used to calculate p-value from Fisher's exact test.
#' To evaluate significance of comigrated proteins from cluster #1 in control to cluster #2 in test condition,
#'      \tabular{rll}{
#'            \tab \strong{cluster #1 in control} \tab \strong{other clusters in control}\cr
#'       \strong{cluster #2 in test}      \tab 65 (TP) \tab 175 (FP)\cr
#'       \strong{other clusters in test}  \tab 35 (FN) \tab 979 (TN)\cr
#'      }
#' 'alluvial_enrichment_tests' also provides another statistical methods including Hypergeometric test and Chi-square test.
#' @param count_table A data frame generated by using \link[plyr]{count}.
#' @param c1 A selected cluster in the first condition.
#' @param c2 A selected cluster in the second condition.
#' @param non_cluster The cluster number for proteins that were not detected in a specific sample. Default is 0.
#' @param test_type Enrichment test type.
#' 'fisher' = Fisher's exact test, 'hyper' = Hypergeometric test, 'chisq' = Chi-square test
#' @return P-value of comigration enrichment test and 2x2 table information
alluvial_enrichment_tests <- function(count_table, c1, c2, non_cluster=0, test_type='fisher'){
  matched_in_list <- sum(subset(count_table, count_table[,1]==c1 & count_table[,2]==c2)$Comigration_size)
  non_matched_in_list <- sum(subset(count_table, count_table[,1]==c1 & count_table[,2]!=c2)$Comigration_size)
  matched_in_all <- sum(subset(count_table, count_table[,1]!=c1 & count_table[,2]==c2)$Comigration_size)
  non_matched_in_all <- sum(subset(count_table, count_table[,1]!=c1 & count_table[,2]!=c2)$Comigration_size)
  data_table <- c(matched_in_list, non_matched_in_list, matched_in_all, non_matched_in_all)
  counts = (matrix(data = data_table, nrow = 2))
  pval <- 1
  if (test_type=='fisher') {
    pval <- stats::fisher.test(counts)$p.value
  } else if (test_type=='hyper') {
    pval <- stats::dhyper(matched_in_list, matched_in_all, non_matched_in_all, matched_in_list+non_matched_in_list)
  } else if (test_type=='chisq') {
    pval <-  stats::chisq.test(counts)$p.value
  } else {
    stop("You chose wrong test_type that should be one of c('fisher', 'hyper', 'chisq')")
  }
  return_table <- c(data_table, pval)
  names(return_table) <- c("TP","FP","FN","TN","Pvalue")
  return(return_table)
}

#' @title generate_count_table
#' @description Count the number of comigrated proteins using \link[plyr]{count}
#' @param clustering_result A list containing XINA clustering results. See \link[XINA]{xina_clustering}
#' @param selected_conditions A vector of condition names used in XINA clustering results.
#' @param comigration_size The number of proteins comigrated together in the selected conditions of XINA clustering results. Default is 0.
#' @return A data frame containing comigrations.
generate_count_table <- function(clustering_result, selected_conditions, comigration_size) {
  # Generate count table
  aligned_clusters <- clustering_result$aligned
  selected_data <- data.frame(Gene_name=aligned_clusters$`Gene name`)
  for (i in selected_conditions){
    selected_data <- cbind(selected_data, as.integer(aligned_clusters[i][[1]]))
  }
  colnames(selected_data) <- c("Gene_name",selected_conditions)
  count_table <- plyr::count(selected_data[selected_conditions])
  count_table$"RowNum" <- seq_len(nrow(count_table))
  na_cluster <- count_table
  for (i in selected_conditions){
    if (nrow(na_cluster)>0) {
      na_cluster <- subset(na_cluster, na_cluster[i]==0)
    }
  }
  return_table <- count_table[!(count_table$RowNum %in% na_cluster$RowNum),]
  colnames(return_table) <- c(selected_conditions, "Comigration_size","RowNum")
  return(return_table)
}

#' @title alluvial_enriched
#' @description 'alluvial_enriched’ draws an alluvial plot and finds comigrated proteins.
#' The comigration is a group of proteins that show the same expression pattern,
#' classified and evaluated by XINA clustering, in at least two conditions.
#' XINA can reduce the dataset complexity by filtering based on
#' the number of comigrated proteins (size, ’comigration_size’ parameter) and
#' perform an enrichment test (P-value of Fisher’s exact test, ’pval_threshold’)
#' to determine significance of enriched comigrations.
#' The Fisher’s exact test can only be done for two conditions at a time.
#' The following 2x2 table was used to calculate the P-value from the Fisher’s exact test.
#' To evaluate significance of co-migrated proteins from cluster #1 in control to cluster #2 in test group,
#'      \tabular{rll}{
#'       - \tab cluster #1 in control \tab other clusters in control\cr
#'       cluster #2 in test      \tab 65 (TP) \tab 175 (FP)\cr
#'       other clusters in test  \tab 35 (FN) \tab 979 (TN)\cr
#'      }
#' @param clustering_result A list containing XINA clustering results. See \link[XINA]{xina_clustering}
#' @param selected_conditions A vector of condition names used in XINA clustering results.
#' The number of selected conditions should be at least two.
#' @param comigration_size The number of proteins comigrated together in the selected conditions of XINA clustering results.
#' Default is 0
#' @param pval_threshold This option is avaiable only when you selected two conditions for comigration search.
#' @param pval_method Method for p-value adjustment. See \link[stats]{p.adjust}
#' @param cex Scaling of fonts of category labels. Default if 0.7. See \link[alluvial]{alluvial}
#' @param alpha Transparency of the stripes. Default if 0.3. See \link[alluvial]{alluvial}
#' @return A data frame containing comigrations and an alluvial plot showing comigrations
#' @export
#' @examples
#'
#' # load XINA example data
#' data(xina_example)
#'
#' # Get the experimental conditions in the example data
#' classes <- as.vector(example_clusters$condition)
#'
#' # Get comigrations without any thresholds
#' all_comigrations <- alluvial_enriched(example_clusters, classes)
#'
#' # Get comigrations that have >= 5 size (the number of comigrated proteins)
#' all_cor_enriched <- alluvial_enriched(example_clusters, classes, comigration_size=5)
#'
#' # Get all the comigrations between Control and Stimulus1
#' comigrations_Control_Stimulus1 <- alluvial_enriched(example_clusters,
#' c(classes[1],classes[2]))
#'
#' # Get comigrations between Control and Stimulus1, that have >=5 size
#' comigrations_Control_Stimulus1_over5 <- alluvial_enriched(example_clusters,
#' c(classes[1],classes[2]), comigration_size=5)
#'
#' # Get comigrations between Control and Stimulus1,
#' # that have >= 5 size and enrichment FDR <= 0.01
#' comigrations_Control_Stimulus1_pval0.01_size5 <- alluvial_enriched(example_clusters,
#' c(classes[1],classes[2]), comigration_size=5, pval_threshold=0.01)
#'
#' # Get  comigrations between Control and Stimulus1,
#' # that have >= 5 size and enrichment Benjamini & Yekutieli <= 0.01
#' comigrations_Control_Stimulus1_BY0.01_size5 <- alluvial_enriched(example_clusters,
#' c(classes[1],classes[2]), comigration_size=5, pval_threshold=0.01, pval_method="BY")
#'
alluvial_enriched <- function(clustering_result, selected_conditions, comigration_size=0,
                              pval_threshold=1, pval_method='fdr', cex=0.7, alpha=0.3) {
  Comigration_size <- Pvalue.adjusted <- NULL
  # For debugging
  # selected_conditions <- c(classes[1],classes[2],classes[3])
  # https://cran.r-project.org/web/packages/alluvial/vignettes/alluvial.html#changing-layers
  if (length(selected_conditions)<2){
    stop("'selected_conditions' requires at least two")
  } else if (length(selected_conditions)>2) {
    print("length(selected_conditions) > 2, so XINA can't apply the enrichment filter
            Can't apply the enrichment filter, so pval_threshold is ignored")
    pval_threshold <- 1
  }
  # generate a count_table
  count_table <- generate_count_table(clustering_result, selected_conditions, 0)

  # Do enrichment test
  tp <- c()
  fp <- c()
  fn <- c()
  tn <- c()
  p.values <- c()
  if (length(selected_conditions)==2){
    for (j in seq_len(nrow(count_table))) {
      c1 <- count_table[j,1]
      c2 <- count_table[j,2]
      test_result <- alluvial_enrichment_tests(count_table,c1,c2)
      tp <- c(tp, test_result["TP"])
      fp <- c(fp, test_result["FP"])
      fn <- c(fn, test_result["FN"])
      tn <- c(tn, test_result["TN"])
      p.values <- c(p.values, test_result["Pvalue"])
    }
    pval_adjusted <- stats::p.adjust(p.values,method=pval_method)
  } else {
    p.values <- rep(NA,nrow(count_table))
    pval_adjusted <- rep(NA,nrow(count_table))
    tp <- rep(NA,nrow(count_table))
    fp <- rep(NA,nrow(count_table))
    fn <- rep(NA,nrow(count_table))
    tn <- rep(NA,nrow(count_table))
  }
  count_table_all <- cbind(count_table,
                           PValue=p.values,
                           Pvalue.adjusted=pval_adjusted,
                           TP=tp,FP=fp,FN=fn,TN=tn)
  if (pval_threshold == 1) {
    count_table <- subset(count_table_all, Comigration_size>=comigration_size)
  } else {
    count_table <- subset(count_table_all, Pvalue.adjusted<=pval_threshold & Comigration_size>=comigration_size)
  }
  if (nrow(count_table)>0){
    count_table_colored <- draw_alluvial_plot(clustering_result, selected_conditions, count_table, cex=cex, alpha=alpha)
  } else {
    print("Can't draw alluvial plot because none of features passed the thresholds")
  }
  return(count_table_colored)
}

Try the XINA package in your browser

Any scripts or data that you put into this service are public.

XINA documentation built on Nov. 8, 2020, 5:32 p.m.