R/PRC_df.R

Defines functions PRC_df

Documented in PRC_df

#'@title Generate data.frame for the precision recall curve.
#'@param result_lst a list generated by the function \code{get_predictive_values}.
#'@param gt_gr a GRanges for the ground truce of positive methylation sites, recommended to be all single based in width.
#'@param N number of points returned for each curve, default = 60.
#'@param group_name a character for the name of the group returned, default = "prc".
#'
#'@examples
#' whistle_predict <- readRDS("predictedResults.rds")
#' whistle_gr <- readRDS("exon_DRACH.rds")
#' whistle_gr$prob <- whistle_predict
#' result_lst <- readRDS("hESC_C_noGC.rds")
#' gt_gr <- whistle_gr[whistle_gr$prob > 0.5]
#' plot_df <- PRC_df(result_lst = result_lst, gt_gr = gt_gr, N = 100, group_name = "no GC")
#'
#'@import magrittr SummarizedExperiment GenomicRanges
#'@export
PRC_df <- function(result_lst, gt_gr, N = 60, group_name = "prc") {

  pc_gr <- rowRanges( result_lst$dds )
  gt_gr <- subsetByOverlaps(gt_gr, pc_gr)
  pc_gr <- resize(pc_gr,201,fix = "center")

  #####################################################
  #        precision-recall for deseq2 p values       #
  #####################################################
  pred_pvalue <- result_lst$DESeq2Results$pvalue
  pred_pvalue[is.na(pred_pvalue)] <- 1
  pred_pvalue <- -log10(pred_pvalue)
  vec_pcc <- rep(NA,N)
  vec_recall <- rep(NA,N)

  pred_pvalue[pred_pvalue == Inf] <- max(pred_pvalue[pred_pvalue != Inf])

  cutoff <- quantile(pred_pvalue, seq(0,1,length.out = N), na.rm = TRUE)

  for(i in 1:N){
    pred_P <- pc_gr[which(pred_pvalue >= cutoff[i])]
    vec_pcc[i] <- mean(pred_P %over% gt_gr)
    vec_recall[i] <- mean(gt_gr %over% pred_P)
  }

  vec_pcc <- c(vec_pcc, 1)
  vec_recall <- c(vec_recall, 0)

  #plot(vec_recall,vec_pcc)

  plot_df <- data.frame(recall = vec_recall,
                        pcc = vec_pcc,
                        value = "deseq_pvalue",
                        group = group_name )

  #####################################################
  #        precision-recall for deseq2 log2FC         #
  #####################################################
  pred_log2FC <- result_lst$DESeq2Results$log2FoldChange
  vec_pcc <- rep(NA,N)
  vec_recall <- rep(NA,N)

  cutoff <- quantile(pred_log2FC, seq(0,1,length.out = N), na.rm = TRUE)

  for(i in 1:N){
    pred_P <- pc_gr[which(pred_log2FC >= cutoff[i])]
    vec_pcc[i] <- mean(pred_P %over% gt_gr)
    vec_recall[i] <- mean(gt_gr %over% pred_P)
  }

  vec_pcc <- c(vec_pcc, 1)
  vec_recall <- c(vec_recall, 0)

  #plot(vec_recall,vec_pcc)


  plot_df <- rbind(plot_df, data.frame(recall = vec_recall,
                                       pcc = vec_pcc,
                                       value = "deseq_lfc",
                                       group = group_name ))

  #####################################################
  #        precision-recall for c-test fdr            #
  #####################################################
  pred_cfdr <- result_lst$cons_pvalue_M
  pred_cfdr[is.na(pred_cfdr)] <- 1
  pred_cfdr <- -log10(pred_cfdr)
  pred_cfdr[pred_cfdr == Inf] <- max(pred_cfdr[pred_cfdr != Inf])

  vec_pcc <- rep(NA,N)
  vec_recall <- rep(NA,N)

  cutoff <- quantile(pred_cfdr, seq(0,1,length.out = N), na.rm = TRUE)

  for(i in 1:N){
    pred_P <- pc_gr[rowSums( cbind( pred_cfdr > cutoff[i] ) ) >= qbinom(0.95,ncol(cbind(pred_cfdr)),0.8)]
    vec_pcc[i] <- mean(pred_P %over% gt_gr)
    vec_recall[i] <- mean(gt_gr %over% pred_P)
  }

  vec_pcc <- c(vec_pcc, 1)
  vec_recall <- c(vec_recall, 0)

  #plot(vec_recall,vec_pcc)
  #plot_df <- plot_df[plot_df$value != "c-test_fdr",]
  plot_df <- rbind(plot_df, data.frame(recall = vec_recall,
                                       pcc = vec_pcc,
                                       value = "c-test_pvalue",
                                       group = group_name ))

  ###########################################################
  #        precision-recall for random positives : )        #
  ###########################################################

  vec_pcc <- rep(NA,N)
  vec_recall <- rep(NA,N)

  n_pos <- round(exp(seq(1,log(length(pc_gr)),length.out = N)),0)

  for(i in 1:N){
    pred_P <- pc_gr[sample.int(length(pc_gr),n_pos[i])]
    vec_pcc[i] <- mean(pred_P %over% gt_gr)
    vec_recall[i] <- mean(gt_gr %over% pred_P)
  }

  #plot_df <- plot_df[plot_df$value != "random",]

  plot_df <- rbind(plot_df, data.frame(recall = vec_recall,
                                       pcc = vec_pcc,
                                       value = "random",
                                       group = group_name ))

  #####################################################
  #        precision-recall for c-test log2FC         #
  #####################################################
  pred_clfc <- result_lst$cons_log2FC_M
  pred_clfc[is.na(pred_clfc)] <- 1

  vec_pcc <- rep(NA,N)
  vec_recall <- rep(NA,N)

  cutoff <- quantile(pred_clfc, seq(0,1,length.out = N), na.rm = TRUE)

  for(i in 1:N){
    pred_P <- pc_gr[rowSums( cbind( pred_clfc > cutoff[i] ) ) >= qbinom(0.95,ncol(cbind(pred_clfc)),0.8)]
    vec_pcc[i] <- mean(pred_P %over% gt_gr)
    vec_recall[i] <- mean(gt_gr %over% pred_P)
  }

  vec_pcc <- c(vec_pcc, 1)
  vec_recall <- c(vec_recall, 0)

  #plot(vec_recall,vec_pcc)
  #plot_df <- plot_df[plot_df$value != "c-test_fdr",]
  plot_df <- rbind(plot_df, data.frame(recall = vec_recall,
                                       pcc = vec_pcc,
                                       value = "c-test_lfc",
                                       group = group_name ))


  ######################################################################
  #        precision-recall for a combination of several metrics       #
  ######################################################################

  indx_ctest_lfc <- rowSums( cbind(result_lst$cons_log2FC_M > 1) ) >= qbinom(0.95, ncol(cbind(result_lst$cons_log2FC_M)), 0.8)

  indx_ctest_pvalue <- rowSums( cbind( pred_cfdr > -log10(0.05) ) ) >= qbinom(0.95, ncol(cbind(pred_cfdr)), 0.8)

  pval_subset <- result_lst$DESeq2Results$pvalue[indx_ctest_lfc&indx_ctest_pvalue]

  pval_subset <- -log10(pval_subset)

  pval_subset[pval_subset == Inf] <- max(pval_subset[pval_subset != Inf])

  vec_pcc <- rep(NA,N)
  vec_recall <- rep(NA,N)

  cutoff <- quantile(pval_subset, seq(0,1,length.out = N), na.rm = TRUE)

  indx_sub <- indx_ctest_lfc & indx_ctest_pvalue

  for(i in 1:N){
    pred_P <- pc_gr[which(pred_pvalue >= cutoff[i] & indx_sub)]
    vec_pcc[i] <- mean(pred_P %over% gt_gr)
    vec_recall[i] <- mean(gt_gr %over% pred_P)
  }

  vec_pcc <- c(vec_pcc, 1)
  vec_recall <- c(vec_recall, 0)

  #plot(vec_recall,vec_pcc)
  #plot_df <- plot_df[plot_df$value != "combinded_cutoff",]

  plot_df <- rbind(plot_df, data.frame(recall = vec_recall,
                        pcc = vec_pcc,
                        value = "combinded_cutoff",
                                       group = group_name ) )

  plot_df[is.na(plot_df)] = 1

  return(plot_df)
}
ZW-xjtlu/exomePeak2Test documentation built on Jan. 6, 2020, 3:36 p.m.