QC for brentlab RNAseq data

Documented in create_partitions rank_response_plot sort_rank_mean_expr

# TODO parameterize the column headeing names for the binding/expression dfs
# TODO check that the lists are named, and that the names are unique within/
# between lists

#'
#' Given expresion and binding signal, plot the rank response a la
#' doi:
#'
#' @import foreach
#' @importFrom purrr map
#' @importFrom dplyr mutate
#' @import ggplot2
#'
#' @param expression_df_list a NAMED list of gene expression dataframes. Each
#'   dataframe must have the following columns, at minimum:
#'   c('gene', "log2FoldChange", "padj"). Names of items in the list must
#'   correspond to the data source (eg, 'kemmeren', 'brentlab', etc)
#' @param binding_df_list a NAMED list of gene expression dataframes. Each
#'   dataframe must have the following columns, at minimum:
#'   c('gene', 'binding_signal'). Names of items in the list must
#'   correspond to the data source (eg, 'chip-chip', 'calling-cards', etc)
#' @param tf the name of the TF -- this will be the plot title
#' @param lfc_thres expression log2foldchange threshold. default is 0
#' @param padj_thres expression padj threshold. default is .05
#'
#' @return a ggplot object rank response plot
#'
#' @export
rank_response_plot = function(expression_df_list, binding_df_list,
                              tf, random_ratio, lfc_thres = 0, padj_thres = .05){

  # TODO -- ADD THE BELOW AS AN OPTION, RATHER THAN UNIVERSAL, so that the
  # plots can be looked at separately rather than standardizing all together

  # TODO (CRITICAL)
  #   first, identify responsive set in all expression list using both padj, lfc.
  #   Then rank responsive genes by one criteria (lfc or padj).
  #   then threshold such that the number of responsive genes is the same
  #
  #   Give user optino to use either lfc or padj (or both) for callign a gene DE (responsive)


 message("Summarizing data by rank response...")
  rank_res_df = foreach(
   expr_name = names(expression_df_list),
   .combine = 'rbind'
  ) %do% {
    df_list = map(names(binding_df_list),
        ~sort_rank_mean_expr(expression_df_list[[expr_name]],
                             expr_name,
                             binding_df_list[[.]],
                             .,
                             lfc_thres,
                             padj_thres))

    do.call('rbind', df_list)
  }


  # TODO chance is number of responsive genes / number of genes common between expr and binding (nrow)
  # if this is comparing TFs across expr data, then this would be min(# responsive genes) / common genes (note that you may have to deal with what common genes means )
 rank_res_df %>%
   mutate(expression_source = as.factor(expression_source),
          binding_source = as.factor(binding_source)) %>%
   ggplot() +
     geom_line(aes(rank,
                   response_ratio,
                   color = binding_source,
                   linetype = expression_source)) +
   ggtitle(tf) +
   scale_x_continuous(breaks = seq(0,150,10)) +
   scale_y_continuous(breaks = seq(0,1,.1)) +
   theme(text = element_text(size = 20),
         axis.text.x = element_text(angle = 45, hjust =1))
}

#'
#' Given a vector length adn the size of each parition, create a vector which
#' represents those partitions.
#' @description There may be one more partition of size less than the equally
#' divided parts. eg if the vector length is 14 and the desired partitions are
#' size 3, then there will be 4 partitions of length 3 and one of length 2,
#' 1 1 1 2 2 2 3 3 3 4 4 4 5 5.
#'
#' @import dplyr
#'
#' @param vector_length the total lenght of the partition vector
#' @param equal_parts how large should each partition be?
#'
#'
#' @return a vector of `vector_length` divded into `equal_parts` with possibly
#'   one additional vector of size less than `equal_parts`.
#'
#' @export
create_partitions = function(vector_length, equal_parts = 100){
  c(rep(seq(1,vector_length/equal_parts),
        each=equal_parts),
    rep(floor(vector_length/equal_parts)+1, vector_length%%equal_parts))
}

#'
#' Create a rank-response data frame
#'
#' @inheritParams rank_response_plot
#' @param expression_df a three column dataframe with the column names
#'   c('gene', 'log2FoldChange', 'padj') which describes expression data. It
#'   may have more columns, but those three must exist
#' @param binding_df a two column dataframe dataframe with the columns
#'   c('gene', 'binding_signal'). It may have more columns, but those must exist
#' @param expression_src the source of the data, eg 'kemmeren' or 'brentlab'
#' @param binding_src the source of the data, eg 'chip-chip' or 'calling-cards'
#' @param rank_resolution how to bin ranks. Default is 10, which means that
#'   ranks are binned into groups of 10 (eg the first group are the 10 highest)
#' @param num_rows the number of rows to return. Default is 15.
#'   For instance, if the rank_resolution is 10, and you only want to look at
#'   the first 150 ranked genes, then num_rows is 15.
#'
#' @return a rank_response dataframe
#'
#' @export
sort_rank_mean_expr = function(expression_df, expression_src,
                               binding_df, binding_src,
                               lfc_thres, padj_thres,
                               rank_resolution = 5,
                               num_rows = 30){

  # I think the todo below is already taken care of 20220902
  # TODO remove lfc and padj fltering from here -- do that earlier in rank-response-plot

  expression_cols = c('gene', "log2FoldChange", "padj")
  binding_cols = c('gene', 'binding_signal')

  stopifnot(expression_cols %in% colnames(expression_df))

  expression_df = select(expression_df, all_of(expression_cols)) %>%
    filter(!gene %in% setdiff(binding_df$gene,expression_df$gene))
  # %>%
  #   filter(complete.cases(.))
  stopifnot(binding_cols %in% colnames(binding_df))
  binding_df = select(binding_df, all_of(binding_cols)) %>%
    filter(!gene %in% setdiff(binding_df$gene,expression_df$gene))
  # %>%
  #   filter(complete.cases(.))

  # message(paste0("the overlap between the complete cases in the expression and binding ",
  #                sprintf("gene sets is: %s out of %s in the expression data ",
  #                        length(intersect(expression_df$gene, binding_df$gene)),
  #                        nrow(expression_df)),
  #                sprintf("and %s in the binding data", nrow(binding_df))))

  inner_join(expression_df, binding_df, by = "gene") %>%
    arrange(binding_signal) %>%
    mutate(rank = create_partitions(nrow(.), rank_resolution)*rank_resolution) %>%
    group_by(rank) %>%
    summarise(group_ratio = sum(abs(log2FoldChange) > lfc_thres & padj <= padj_thres)) %>%
    # check that this is the same as if it were done without the binning/grouping
    mutate(response_ratio = (cumsum(group_ratio)/rank)) %>%
    mutate(binding_source = binding_src) %>%
    mutate(expression_source = expression_src) %>%
    head(num_rows)
}