R/utils.R

Defines functions splice_junction_analysis_table make_homopolymer_table_to_plot make_homopolymer_plot method_homopolymer_indels gt_vt_method standardize_genotype add_variant_density_of_a_method add_some_read_statistic_from_bam_metacolumn add_format_tag_from_vcf add_info_tag_from_vcf calculate_precision_recall_for_n_cigar_read_count_intervarls calculate_precision_recall_for_coverage_ranges calculate_precision_recall_for_multi_master_tables2 calculate_precision_recall_for_multi_master_tables precision_recall_curve_per_coverage check_accuracy_per_coverage calc_accuracy_measures igv_batch_screenshots

Documented in add_format_tag_from_vcf add_info_tag_from_vcf add_some_read_statistic_from_bam_metacolumn add_variant_density_of_a_method calc_accuracy_measures calculate_precision_recall_for_coverage_ranges calculate_precision_recall_for_multi_master_tables calculate_precision_recall_for_multi_master_tables2 calculate_precision_recall_for_n_cigar_read_count_intervarls check_accuracy_per_coverage gt_vt_method igv_batch_screenshots make_homopolymer_plot make_homopolymer_table_to_plot method_homopolymer_indels precision_recall_curve_per_coverage splice_junction_analysis_table standardize_genotype

#' Generate igv batch screenshots script
#'
#' @param chrm A vector of string. Chromossome name, e.g. "chrm1".
#' @param pos A vector of integers. Positions to center the screenshot.
#' @param output_dir A 1-length string. Directory where the IGV-batch-screenshot 
#'   script is saved.
#' @param prefix A 1-length string.
#' @param snapshot_path A 1-length string. Directory where the IGV-batch-screenshot 
#'   script will save the screenshots.
#' @param windows_size 1-length integer. Window size of the screenshot.
#' @param screenshot_number 1-length integer. Number of screenshot to generate.
#' @param output_positions If TRUE, output a data.frame with variant positions and 
#'   the reagions to visualize.
#'
#' @return NULL
#' 
#' @importFrom IRanges IRanges resize start end
#' @importFrom rlang .data
#' @importFrom utils write.table
#' @import dplyr
#' 
#' @export
igv_batch_screenshots <- function(chrm, pos, output_dir, prefix, snapshot_path, windows_size=1501, screenshot_number=100, output_positions=FALSE) {
  ### create the bed file that store the area of interest to visualize
  bed <- IRanges(pos, width=1) %>% 
    resize(windows_size, "center") 
  bed <- data.frame(chrm=chrm, start=start(bed), end=end(bed))
  
  ### subset from bed
  if( nrow(bed) < screenshot_number ) screenshot_number <- nrow(bed)
  k <- {1:nrow(bed)} %>% 
    sample(screenshot_number)
  bed <- bed[k,]
  
  ### output
  if(output_positions) {
    output_dataframe_positions <- data.frame( variants= paste0(chrm[k], ":", pos[k]),
                                              regions= paste0(bed$chrm, ":", bed$start, "-", bed$end) )
  }
  
  ### write bed file
  bed_file <- file.path( output_dir, paste0(prefix, ".bed") )
  write.table(bed,
              bed_file,
              sep="\t",
              quote=FALSE,
              row.names=FALSE,
              col.names=FALSE)
  
  ### convert the bed file into an IGV batch script
  batch_file <- file.path(output_dir, paste0(prefix, ".batch"))
  cmds <- gettextf("bedtools igv -path %s -i %s > %s",
                   file.path(snapshot_path, prefix), 
                   bed_file, batch_file)
  system(cmds)
  unlink(bed_file)
  
  if(output_positions) {
    output_dataframe_positions
  }
}







#' Calculate precision, sensitivity and F1-score of a method in a master table
#'
#' @param input_table A data.frame. The master table.
#' @param method_name A 1-length string. The name of the method to calculate the
#'   accuracy measures.
#' @param truth_name A 1-length string. The name of the ground-truth to validate
#'   the method.
#'
#' @return A named vector with the accuracy measures.
#' @export
calc_accuracy_measures <- function(input_table, method_name, truth_name) {
  in_method <- paste0("in_", method_name)
  in_truth <- paste0("in_", truth_name)
  method_classification <- paste0(method_name, "_classification")
  
  tp_count <- sum(input_table[,method_classification] == "TP")
  positive_method <- sum(input_table[,in_method])
  positive_truth <- sum(input_table[,in_truth])
  
  precision <- tp_count/positive_method
  sensitivity <- tp_count/positive_truth
  f1Score <- 2*(sensitivity * precision) / (sensitivity + precision)
  
  c(precision=precision, sensitivity=sensitivity, f1Score=f1Score)
}







#' Barplot of accuracy measures for different methods and coverage
#' 
#' Return a ggplot object for visualization.
#'
#' @param master_table A data.frame. The input master table.
#' @param method_names A vector of strings. The names of the methods to be compared
#'   in the master table.
#' @param output_method_names A vector of strings. How to output the names of the
#'   methods to be compared. The defaut is NULL.
#' @param data_name A 1-length string. The name of the dataset used with the methods 
#'   to be compared.
#' @param truth_name A 1-length string. The name of the ground-truth.
#' @param coverage_threshold A vector of integers. The minimum thresholds to filer 
#'   by read coverage.
#'
#' @return A ggplot object.
#' 
#' @importFrom tibble rownames_to_column
#' @importFrom tidyr gather
#' @importFrom rlang .data
#' @import ggplot2
#' @import dplyr
#' 
#' @export
check_accuracy_per_coverage <- function(master_table, 
                                        method_names,
                                        output_method_names=NULL,
                                        data_name,
                                        truth_name, 
                                        coverage_threshold){
  
  mt_thresholdI_methodJ <- lapply(coverage_threshold, function(coverage_threshold_i) {
    k <- master_table[ ,paste0(data_name, "_coverage") ]
    mt_thresholdI <- master_table[ k>=coverage_threshold_i, ]
    
    accur_thresholdI_methodJ <- sapply(method_names, function(method_names_i) {
      calc_accuracy_measures(mt_thresholdI, method_names_i, truth_name)
    })
    
    k <- data.frame(accur_thresholdI_methodJ)
    k <- rownames_to_column(k, "measure")
    cbind( k, threshold=paste(coverage_threshold_i, collapse="-") )
  })
  
  k <- bind_rows(mt_thresholdI_methodJ) 
  mt_thresholdI_methodJ <- gather(k, "method", "score", -.data$measure, -.data$threshold, factor_key=TRUE)
  
  if( !is.null(output_method_names) ){
    if( length(method_names) != length(output_method_names) ){
      stop("The lengths of method_names and output_method_names must be equal.")
    }
    stopifnot( identical(method_names, levels(mt_thresholdI_methodJ$method)) )
    levels(mt_thresholdI_methodJ$method) <- output_method_names
  }
  
  mt_thresholdI_methodJ$measure <- factor(mt_thresholdI_methodJ$measure)
  
  mt_thresholdI_methodJ$threshold <- factor(mt_thresholdI_methodJ$threshold, 
                                            levels=sort(coverage_threshold),
                                            ordered=TRUE)
  
  ggplot(mt_thresholdI_methodJ, aes(x=.data$method, y=.data$score, fill=.data$method)) +
    facet_grid(.data$measure~.data$threshold) +
    geom_bar(stat="identity") +
    theme(axis.text.x = element_text(angle = 270)) +
    theme(legend.position="bottom") +
    scale_x_discrete(labels=method_names)
  
}










#' Precision-recall curves to compare methods, using different filtering on read
#'   coverage
#'
#' @param master_table A data.frame. The input master table.
#' @param method_names A vector of strings. The names of the methods to be compared.
#' @param output_method_names A vector of strings. How to output the names of the
#'   methods to be compared. The default is NULL.
#' @param data_name A 1-length string. The name of the dataset used with the methods 
#'   to be compared.
#' @param truth_name A 1-length string. The name of the ground-truth.
#' @param coverage_thresholds A vector of integers. The minimum thresholds to filer 
#'   by read coverage. Each element defines a point in the precision-recall curves.
#' @param what A 1-length string. Possible values are: "snps_indels" (default), to 
#'   make curves for SNPs and indels separately; "snps", to make curves only for
#'   SNPs; "indels", to make curves only for indels; "overall", to make curves 
#'   without distinguishing SNPs and indels.
#' 
#' @return A ggplot object.
#' 
#' @import ggplot2
#' @importFrom tibble rownames_to_column
#' @importFrom tidyr gather
#' @importFrom rlang .data
#' @importFrom dplyr bind_rows
#' @importFrom dplyr left_join
#' @importFrom dplyr filter
#' 
#' @export
precision_recall_curve_per_coverage <- function(master_table,
                                                method_names,
                                                output_method_names=NULL,
                                                data_name,
                                                truth_name,
                                                coverage_thresholds,
                                                what){
  
  if( !any(what %in% c("snps_indels", "snps", "indels", "overall")) ){
    stop("`what` argument must be either \"snps_indels\", \"snps\", \"indels\", or \"overall\"")
  }
  
  mt_thresholdI_methodJ <- lapply(coverage_thresholds, function(threshold_i) {
    k <- master_table[ ,paste0(data_name, "_coverage") ]
    mt_thresholdI <- master_table[ k>=threshold_i, ]
    
    if( what %in% c("snps_indels", "snps") ){
      snp_accur_thresholdI <- lapply(method_names, function(method_names_i){
        k <- paste0("is_indel_", method_names_i)
        k <- which( mt_thresholdI[,k] == 0)
        k <- mt_thresholdI[k,]
        calc_accuracy_measures(k, method_names_i, truth_name)
      })
      snp_accur_thresholdI <- do.call(rbind, snp_accur_thresholdI)
      snp_accur_thresholdI <- data.frame(snp_accur_thresholdI)
      snp_accur_thresholdI <- cbind(snp_accur_thresholdI,
                                    variant="snps",
                                    method=method_names,
                                    coverage_thresholds=threshold_i)
    }
    if( what %in% c("snps_indels", "indels") ){
      indel_accur_thresholdI <- lapply(method_names, function(method_names_i){
        k <- paste0("is_indel_", method_names_i)
        k <- which( mt_thresholdI[,k] == 1)
        k <- mt_thresholdI[k,]
        calc_accuracy_measures(k, method_names_i, truth_name)
      })
      indel_accur_thresholdI <- do.call(rbind, indel_accur_thresholdI)
      indel_accur_thresholdI <- data.frame(indel_accur_thresholdI)
      indel_accur_thresholdI <- cbind(indel_accur_thresholdI,
                                      variant="indels",
                                      method=method_names,
                                      coverage_thresholds=threshold_i)
    }
    if( what == "overall" ){
      overall_accur_thresholdI <- lapply(method_names, function(method_names_i){
        k <- mt_thresholdI
        calc_accuracy_measures(k, method_names_i, truth_name)
      })
      overall_accur_thresholdI <- do.call(rbind, overall_accur_thresholdI)
      overall_accur_thresholdI <- data.frame(overall_accur_thresholdI)
      overall_accur_thresholdI <- cbind(overall_accur_thresholdI,
                                        variant="overall",
                                        method=method_names,
                                        coverage_thresholds=threshold_i)
    }
    
    accur_thresholdI <- switch(
      what, 
      "snps_indels"={
        rbind(snp_accur_thresholdI, indel_accur_thresholdI)
      },
      "snps"={
        snp_accur_thresholdI
      },
      "indels"={
        indel_accur_thresholdI
      },
      "overall"={
        overall_accur_thresholdI
      }
    )
  })
  
  dat <- do.call(rbind, mt_thresholdI_methodJ) 
  names(dat) [names(dat) == "sensitivity"] <- "recall"
  dat$variant <- factor(dat$variant)
  dat$method <- factor(dat$method)
  dat$coverage_thresholds <- factor(dat$coverage_thresholds,
                                    levels=sort(coverage_thresholds),
                                    ordered=TRUE)
  if( !is.null(output_method_names) ){
    if( length(method_names) != length(output_method_names) ){
      stop("The lengths of `method_names` and `output_method_names` must be equal.")
    }
    stopifnot( identical(method_names, levels(dat$method)) )
    levels(dat$method) <- output_method_names
  }
  names(dat) [names(dat) == "coverage_thresholds"] <- "coverage >= n"
  
  
  p <- ggplot(dat, aes(.data$recall, .data$precision, fill=.data$method, colour=.data$method))
  if(what == "snps_indels"){
    p <- p +
      geom_point(aes(shape=.data$variant, size=.data$`coverage >= n`), alpha=.5) +
      geom_path(aes(linetype=.data$variant))
  }else{
    p <- p +
      geom_point(aes(size=.data$`coverage >= n`), alpha=.5) +
      geom_path()
  }
  p <- p +
    theme(text = element_text(size = 20))
  p
}








#' Calculate accuracy measures for multiple master tables
#' 
#' Create a data.frame with accuracy measures for multiple master tables.
#'
#' @param ... Master tables.
#' @param experiment_names Vector of strings. Name of the experiments in the same 
#'   order of the master tables.
#' @param method_names Vector or list of strings. Names of the methods to compare.
#'   If all experiments compare the same methods, `method_names` can be a single
#'   vector. Otherwise, `method_names` must be a list, in which each element 
#'   specifies the names of the methods to compare for each experiment.
#' @param output_method_names NULL (default), or a vector or a list of strings.
#'   Names of the methods to output. 
#' @param data_names Vector or list of strings.
#' @param truth_names Vector or list of strings.
#' @param coverage_thresholds Vector or list of integers.
#' @param what Vector or list of strings.
#'
#' @return A data.frame.
#' @export
calculate_precision_recall_for_multi_master_tables <- function(
  ...,
  experiment_names,
  method_names,
  output_method_names=NULL,
  data_names,
  truth_names,
  coverage_thresholds=c(3,15,40,100),
  what
){
  
  master_tables <- list(...)
  
  stopifnot( length(experiment_names) == length(master_tables) )
  
  if( length(master_tables) == 1 ){
    stopifnot( is.vector(method_names) & !is.list(method_names) )
    
    if( !is.null(output_method_names) ){
      stopifnot( is.vector(output_method_names) & !is.list(output_method_names) )
    }
    
    stopifnot( is.vector(data_names) & !is.list(data_names))
    stopifnot( length(data_names)==1 )
    
    stopifnot( is.vector(truth_names) & !is.list(truth_names) )
    stopifnot( length(truth_names)==1 )
    
    stopifnot( is.vector(coverage_thresholds) & !is.list(coverage_thresholds) )
    
    stopifnot( is.vector(what) & !is.list(what) )
    stopifnot( length(what)==1 )
    
    method_names <- list(method_names)
    output_method_names <- list(output_method_names)
    coverage_thresholds <- list(coverage_thresholds)
    
    mt_len <- 1
  }else{
    mt_len <- length(master_tables)
    
    if( is.null(output_method_names) ){
      output_method_names <- rep( list(output_method_names), mt_len )
    }else{
      if( is.vector(output_method_names) & !is.list(output_method_names) ){
        stopifnot( length(output_method_names) == length(method_names) )
        output_method_names <- rep( list(output_method_names), mt_len )
      }else{
        if( is.list(output_method_names) ){
          stopifnot( length(output_method_names) == length(master_tables) )
        }else{
          stop("Check argument output_method_names")
        }
      }
    }
    
    if( is.vector(method_names) & !is.list(method_names) ){
      method_names <- rep( list(method_names), mt_len )
    }else{
      if( is.list(method_names) ){
        stopifnot( length(method_names) == length(master_tables) )
      }else{
        stop("Check argument method_names")
      }
    }
    
    if( is.vector(data_names) & !is.list(data_names) ){
      if( length(data_names) == 1 ){
        data_names <- rep(data_names, mt_len)
      }else{
        stopifnot( length(data_names) == length(data_names) )
      }
    }else{
      stop("Check argument data_names")
    }
    
    if( is.vector(truth_names) & !is.list(truth_names) ){
      if( length(truth_names) == 1 ){
        truth_names <- rep(truth_names, mt_len)
      }else{
        stopifnot( length(truth_names) == length(truth_names) )
      }
    }else{
      stop("Check argument truth_names")
    }
    
    if( is.vector(coverage_thresholds) & !is.list(coverage_thresholds) ){
      coverage_thresholds <- rep( list(coverage_thresholds), mt_len )
    }else{
      if( is.list(coverage_thresholds) ){
        stopifnot( length(coverage_thresholds) == length(master_tables) )
      }else{
        stop("Check argument coverage_thresholds")
      }
    }
  }
  
  if( is.vector(what) & !is.list(what) ){
    if( length(what) == 1 ){
      what <- rep(what, mt_len)
    }else{
      stopifnot( length(what) == length(what) )
    }
  }else{
    stop("Check argument what")
  }
  
  
  
  dat_mts <- mapply(
    function(master_tables_l,
             method_names_l,
             output_method_names_l,
             data_names_l,
             truth_names_l,
             coverage_thresholds_l,
             what_l,
             experiment_names_l){
      
      # master_tables_l <- master_tables[[1]]
      # method_names_l <- method_names[[1]]
      # output_method_names_l <- output_method_names[[1]]
      # data_names_l <- data_names[[1]]
      # truth_names_l <- truth_names[[1]]
      # coverage_thresholds_l <- coverage_thresholds[[1]]
      # what_l <- what[[1]]
      # experiment_names_l <- experiment_names[[1]]
      
      if( !any(what_l %in% c("snps_indels", "snps", "indels", "overall")) ){
        stop("`what` argument must be either \"snps_indels\", \"snps\", \"indels\", or \"overall\"")
      }
      
      mt_thresholdI_methodJ <- lapply(coverage_thresholds_l, function(threshold_i) {
        k <- master_tables_l[ ,paste0(data_names_l, "_coverage") ]
        mt_thresholdI <- master_tables_l[ k>=threshold_i, ]
        
        if( what_l %in% c("snps_indels", "snps") ){
          snp_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            k <- paste0("is_indel_", method_names_i)
            k <- which(mt_thresholdI[,k] == 0)
            k <- mt_thresholdI[k,]
            calc_accuracy_measures(k, method_names_i, truth_names_l)
          })
          snp_accur_thresholdI <- do.call(rbind, snp_accur_thresholdI)
          snp_accur_thresholdI <- data.frame(snp_accur_thresholdI)
          snp_accur_thresholdI <- cbind(snp_accur_thresholdI,
                                        variant="snps",
                                        method=method_names_l,
                                        coverage_thresholds=threshold_i)
        }
        if( what_l %in% c("snps_indels", "indels") ){
          indel_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            k <- paste0("is_indel_", method_names_i)
            k <- which(mt_thresholdI[,k] == 1)
            k <- mt_thresholdI[k,]
            calc_accuracy_measures(k, method_names_i, truth_names_l)
          })
          indel_accur_thresholdI <- do.call(rbind, indel_accur_thresholdI)
          indel_accur_thresholdI <- data.frame(indel_accur_thresholdI)
          indel_accur_thresholdI <- cbind(indel_accur_thresholdI,
                                          variant="indels",
                                          method=method_names_l,
                                          coverage_thresholds=threshold_i)
        }
        if( what_l == "overall" ){
          overall_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            k <- mt_thresholdI
            calc_accuracy_measures(k, method_names_i, truth_names_l)
          })
          overall_accur_thresholdI <- do.call(rbind, overall_accur_thresholdI)
          overall_accur_thresholdI <- data.frame(overall_accur_thresholdI)
          overall_accur_thresholdI <- cbind(overall_accur_thresholdI,
                                            variant="overall",
                                            method=method_names_l,
                                            coverage_thresholds=threshold_i)
        }
        
        accur_thresholdI <- switch(
          what_l, 
          "snps_indels"={
            rbind(snp_accur_thresholdI, indel_accur_thresholdI)
          },
          "snps"={
            snp_accur_thresholdI
          },
          "indels"={
            indel_accur_thresholdI
          },
          "overall"={
            overall_accur_thresholdI
          }
        )
      })
      
      dat <- do.call(rbind, mt_thresholdI_methodJ) 
      names(dat) [names(dat) == "sensitivity"] <- "recall"
      dat$variant <- factor(dat$variant)
      dat$method <- factor(dat$method, levels=method_names_l)
      dat$coverage_thresholds <- factor(dat$coverage_thresholds,
                                        levels=sort(coverage_thresholds_l),
                                        ordered=TRUE)
      if( !is.null(output_method_names_l) ){
        if( length(method_names_l) != length(output_method_names_l) ){
          stop("The lengths of `method_names` and `output_method_names` must be equal.")
        }
        stopifnot( identical(method_names_l, levels(dat$method)) )
        levels(dat$method) <- output_method_names_l
      }
      names(dat) [names(dat) == "coverage_thresholds"] <- "coverage >= n"
      
      dat <- cbind(dat, experiment=experiment_names_l)
      
    },
    master_tables,
    method_names,
    output_method_names,
    data_names,
    truth_names,
    coverage_thresholds,
    what,
    experiment_names,
    SIMPLIFY=FALSE
  )
  dat_mts <- do.call(rbind, dat_mts)
  
  dat_mts
  
  # p <- ggplot(dat_mts, aes(.data$recall, .data$precision,
  #                          fill=.data$method, colour=.data$method)) +
  #   facet_grid(experiment~.)    ####### <<<<<--------=== not using .data$
  # if(what == "snps_indels"){
  #   p <- p +
  #     geom_point(aes(shape=.data$variant, size=.data$`coverage >= n`), alpha=.5) +
  #     geom_path(aes(linetype=.data$variant))
  # }else{
  #   p <- p +
  #     geom_point(aes(size=.data$`coverage >= n`), alpha=.5) +
  #     geom_path()
  # }
  # p <- p +
  #   coord_fixed(ratio=1) +
  #   theme(text = element_text(size = 20))
  # 
  # p
}





#' Calculate accuracy measures for multiple master tables and number of true 
#'   variant per coverage
#' 
#' Create a data.frame with accuracy measures for multiple master tables.
#' 
#' Like `calculate_precision_recall_for_multi_master_tables2`, but also outputs
#'   number of true variants per read coverage threshold.
#'
#' @param ... Master tables.
#' @param experiment_names Vector of strings. Name of the experiments in the same 
#'   order of the master tables.
#' @param method_names Vector or list of strings. Names of the methods to compare.
#'   If all experiments compare the same methods, `method_names` can be a single
#'   vector. Otherwise, `method_names` must be a list, in which each element 
#'   specifies the names of the methods to compare for each experiment.
#' @param output_method_names NULL (default), or a vector or a list of strings.
#'   Names of the methods to output. 
#' @param data_names Vector or list of strings.
#' @param truth_names Vector or list of strings.
#' @param coverage_thresholds Vector or list of integers.
#' @param what Vector or list of strings.
#'
#' @return A data.frame.
#' @export
calculate_precision_recall_for_multi_master_tables2 <- function(
  ...,
  experiment_names,
  method_names,
  output_method_names=NULL,
  data_names,
  truth_names,
  coverage_thresholds=c(3,15,40,100),
  what
){
  
  master_tables <- list(...)
  
  stopifnot( length(experiment_names) == length(master_tables) )
  
  if( length(master_tables) == 1 ){
    stopifnot( is.vector(method_names) & !is.list(method_names) )
    
    if( !is.null(output_method_names) ){
      stopifnot( is.vector(output_method_names) & !is.list(output_method_names) )
    }
    
    stopifnot( is.vector(data_names) & !is.list(data_names))
    stopifnot( length(data_names)==1 )
    
    stopifnot( is.vector(truth_names) & !is.list(truth_names) )
    stopifnot( length(truth_names)==1 )
    
    stopifnot( is.vector(coverage_thresholds) & !is.list(coverage_thresholds) )
    
    stopifnot( is.vector(what) & !is.list(what) )
    stopifnot( length(what)==1 )
    
    method_names <- list(method_names)
    output_method_names <- list(output_method_names)
    coverage_thresholds <- list(coverage_thresholds)
    
    mt_len <- 1
  }else{
    mt_len <- length(master_tables)
    
    if( is.null(output_method_names) ){
      output_method_names <- rep( list(output_method_names), mt_len )
    }else{
      if( is.vector(output_method_names) & !is.list(output_method_names) ){
        stopifnot( length(output_method_names) == length(method_names) )
        output_method_names <- rep( list(output_method_names), mt_len )
      }else{
        if( is.list(output_method_names) ){
          stopifnot( length(output_method_names) == length(master_tables) )
        }else{
          stop("Check argument output_method_names")
        }
      }
    }
    
    if( is.vector(method_names) & !is.list(method_names) ){
      method_names <- rep( list(method_names), mt_len )
    }else{
      if( is.list(method_names) ){
        stopifnot( length(method_names) == length(master_tables) )
      }else{
        stop("Check argument method_names")
      }
    }
    
    if( is.vector(data_names) & !is.list(data_names) ){
      if( length(data_names) == 1 ){
        data_names <- rep(data_names, mt_len)
      }else{
        stopifnot( length(data_names) == length(data_names) )
      }
    }else{
      stop("Check argument data_names")
    }
    
    if( is.vector(truth_names) & !is.list(truth_names) ){
      if( length(truth_names) == 1 ){
        truth_names <- rep(truth_names, mt_len)
      }else{
        stopifnot( length(truth_names) == length(truth_names) )
      }
    }else{
      stop("Check argument truth_names")
    }
    
    if( is.vector(coverage_thresholds) & !is.list(coverage_thresholds) ){
      coverage_thresholds <- rep( list(coverage_thresholds), mt_len )
    }else{
      if( is.list(coverage_thresholds) ){
        stopifnot( length(coverage_thresholds) == length(master_tables) )
      }else{
        stop("Check argument coverage_thresholds")
      }
    }
  }
  
  if( is.vector(what) & !is.list(what) ){
    if( length(what) == 1 ){
      what <- rep(what, mt_len)
    }else{
      stopifnot( length(what) == length(what) )
    }
  }else{
    stop("Check argument what")
  }
  
  
  
  dat_mts <- mapply(
    function(master_tables_l,
             method_names_l,
             output_method_names_l,
             data_names_l,
             truth_names_l,
             coverage_thresholds_l,
             what_l,
             experiment_names_l){
      
      # master_tables_l <- master_tables[[1]]
      # method_names_l <- method_names[[1]]
      # output_method_names_l <- output_method_names[[1]]
      # data_names_l <- data_names[[1]]
      # truth_names_l <- truth_names[[1]]
      # coverage_thresholds_l <- coverage_thresholds[[1]]
      # what_l <- what[[1]]
      # experiment_names_l <- experiment_names[[1]]
      
      if( !any(what_l %in% c("snps_indels", "snps", "indels", "overall")) ){
        stop("`what` argument must be either \"snps_indels\", \"snps\", \"indels\", or \"overall\"")
      }
      
      mt_thresholdI_methodJ <- lapply(coverage_thresholds_l, function(threshold_i) {
        # threshold_i <- coverage_thresholds_l[[1]]
        
        k <- master_tables_l[ ,paste0(data_names_l, "_coverage") ]
        mt_thresholdI <- master_tables_l[ k>=threshold_i, ]
        
        if( what_l %in% c("snps_indels", "snps") ){
          snp_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            # method_names_i <- method_names_l[[1]]
            
            k <- paste0("is_indel_", method_names_i)
            k <- which(mt_thresholdI[,k] == 0)
            k <- mt_thresholdI[k,]
            res <- calc_accuracy_measures(k, method_names_i, truth_names_l)
            
            in_truth <- paste0("in_", truth_names_l)
            nTrueVar <- sum( k[,in_truth] == 1 )
            c(res, nTrueVariants=nTrueVar)
          })
          snp_accur_thresholdI <- do.call(rbind, snp_accur_thresholdI)
          snp_accur_thresholdI <- data.frame(snp_accur_thresholdI)
          snp_accur_thresholdI <- cbind(snp_accur_thresholdI,
                                        variant="snps",
                                        method=method_names_l,
                                        coverage_thresholds=threshold_i)
        }
        if( what_l %in% c("snps_indels", "indels") ){
          indel_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            k <- paste0("is_indel_", method_names_i)
            k <- which(mt_thresholdI[,k] == 1)
            k <- mt_thresholdI[k,]
            res <- calc_accuracy_measures(k, method_names_i, truth_names_l)
            
            in_truth <- paste0("in_", truth_names_l)
            nTrueVar <- sum( k[,in_truth] == 1 )
            c(res, nTrueVariants=nTrueVar)
          })
          indel_accur_thresholdI <- do.call(rbind, indel_accur_thresholdI)
          indel_accur_thresholdI <- data.frame(indel_accur_thresholdI)
          indel_accur_thresholdI <- cbind(indel_accur_thresholdI,
                                          variant="indels",
                                          method=method_names_l,
                                          coverage_thresholds=threshold_i)
        }
        if( what_l == "overall" ){
          overall_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            k <- mt_thresholdI
            res <- calc_accuracy_measures(k, method_names_i, truth_names_l)
            
            in_truth <- paste0("in_", truth_names_l)
            nTrueVar <- sum( k[,in_truth] == 1 )
            c(res, nTrueVariants=nTrueVar)
          })
          overall_accur_thresholdI <- do.call(rbind, overall_accur_thresholdI)
          overall_accur_thresholdI <- data.frame(overall_accur_thresholdI)
          overall_accur_thresholdI <- cbind(overall_accur_thresholdI,
                                            variant="overall",
                                            method=method_names_l,
                                            coverage_thresholds=threshold_i)
        }
        
        accur_thresholdI <- switch(
          what_l, 
          "snps_indels"={
            rbind(snp_accur_thresholdI, indel_accur_thresholdI)
          },
          "snps"={
            snp_accur_thresholdI
          },
          "indels"={
            indel_accur_thresholdI
          },
          "overall"={
            overall_accur_thresholdI
          }
        )
      })
      
      dat <- do.call(rbind, mt_thresholdI_methodJ) 
      names(dat) [names(dat) == "sensitivity"] <- "recall"
      dat$variant <- factor(dat$variant)
      dat$method <- factor(dat$method, levels=method_names_l)
      dat$coverage_thresholds <- factor(dat$coverage_thresholds,
                                        levels=sort(coverage_thresholds_l),
                                        ordered=TRUE)
      if( !is.null(output_method_names_l) ){
        if( length(method_names_l) != length(output_method_names_l) ){
          stop("The lengths of `method_names` and `output_method_names` must be equal.")
        }
        stopifnot( identical(method_names_l, levels(dat$method)) )
        levels(dat$method) <- output_method_names_l
      }
      names(dat) [names(dat) == "coverage_thresholds"] <- "coverage >= n"
      
      dat <- cbind(dat, experiment=experiment_names_l)
      
    },
    master_tables,
    method_names,
    output_method_names,
    data_names,
    truth_names,
    coverage_thresholds,
    what,
    experiment_names,
    SIMPLIFY=FALSE
  )
  dat_mts <- do.call(rbind, dat_mts)
  
  dat_mts
  
  # p <- ggplot(dat_mts, aes(.data$recall, .data$precision,
  #                          fill=.data$method, colour=.data$method)) +
  #   facet_grid(experiment~.)    ####### <<<<<--------=== not using .data$
  # if(what == "snps_indels"){
  #   p <- p +
  #     geom_point(aes(shape=.data$variant, size=.data$`coverage >= n`), alpha=.5) +
  #     geom_path(aes(linetype=.data$variant))
  # }else{
  #   p <- p +
  #     geom_point(aes(size=.data$`coverage >= n`), alpha=.5) +
  #     geom_path()
  # }
  # p <- p +
  #   coord_fixed(ratio=1) +
  #   theme(text = element_text(size = 20))
  # 
  # p
}







#' Calculate accuracy measures, per read coverage ranges, for multiple master
#'   tables, and output number of true variant per range
#' 
#' This is an alternative function for `calculate_precision_recall_for_multi_master_tables2`.
#'   It differs from the latter by using ranges of read coverage, i.e., there are
#'   maximums. 
#'
#' @param ... Master tables.
#' @param experiment_names Vector of strings. Name of the experiments in the same 
#'   order of the master tables.
#' @param method_names Vector or list of strings. Names of the methods to compare.
#'   If all experiments compare the same methods, `method_names` can be a single
#'   vector. Otherwise, `method_names` must be a list, in which each element 
#'   specifies the names of the methods to compare for each experiment.
#' @param output_method_names NULL (default), or a vector or a list of strings.
#'   Names of the methods to output. 
#' @param data_names Vector or list of strings.
#' @param truth_names Vector or list of strings.
#' @param coverage_ranges Data.frame of two columns. The first column stores the
#'   begining of the ranges and the second the end.
#' @param what Vector or list of strings.
#'
#' @return A data.frame.
#' @export
calculate_precision_recall_for_coverage_ranges <- function(
  ...,
  experiment_names,
  method_names,
  output_method_names=NULL,
  data_names,
  truth_names,
  coverage_ranges,
  what
){
  
  master_tables <- list(...)
  
  stopifnot( length(experiment_names) == length(master_tables) )
  
  if( length(master_tables) == 1 ){
    stopifnot( is.vector(method_names) & !is.list(method_names) )
    
    if( !is.null(output_method_names) ){
      stopifnot( is.vector(output_method_names) & !is.list(output_method_names) )
    }
    
    stopifnot( is.vector(data_names) & !is.list(data_names))
    stopifnot( length(data_names)==1 )
    
    stopifnot( is.vector(truth_names) & !is.list(truth_names) )
    stopifnot( length(truth_names)==1 )
    
    stopifnot( is.data.frame(coverage_ranges) )
    
    stopifnot( is.vector(what) & !is.list(what) )
    stopifnot( length(what)==1 )
    
    method_names <- list(method_names)
    output_method_names <- list(output_method_names)
    coverage_ranges <- list(coverage_ranges)
    
    mt_len <- 1
  }else{
    mt_len <- length(master_tables)
    
    if( is.null(output_method_names) ){
      output_method_names <- rep( list(output_method_names), mt_len )
    }else{
      if( is.vector(output_method_names) & !is.list(output_method_names) ){
        stopifnot( length(output_method_names) == length(method_names) )
        output_method_names <- rep( list(output_method_names), mt_len )
      }else{
        if( is.list(output_method_names) ){
          stopifnot( length(output_method_names) == length(master_tables) )
        }else{
          stop("Check argument output_method_names")
        }
      }
    }
    
    if( is.vector(method_names) & !is.list(method_names) ){
      method_names <- rep( list(method_names), mt_len )
    }else{
      if( is.list(method_names) ){
        stopifnot( length(method_names) == length(master_tables) )
      }else{
        stop("Check argument method_names")
      }
    }
    
    if( is.vector(data_names) & !is.list(data_names) ){
      if( length(data_names) == 1 ){
        data_names <- rep(data_names, mt_len)
      }else{
        stopifnot( length(data_names) == length(data_names) )
      }
    }else{
      stop("Check argument data_names")
    }
    
    if( is.vector(truth_names) & !is.list(truth_names) ){
      if( length(truth_names) == 1 ){
        truth_names <- rep(truth_names, mt_len)
      }else{
        stopifnot( length(truth_names) == length(truth_names) )
      }
    }else{
      stop("Check argument truth_names")
    }
    
    if( is.data.frame(coverage_ranges) ){
      coverage_ranges <- rep( list(coverage_ranges), mt_len )
    }else{
      if( is.list(coverage_ranges) ){
        stopifnot( length(coverage_ranges) == length(master_tables) )
      }else{
        stop("Check argument coverage_ranges")
      }
    }
  }
  
  if( is.vector(what) & !is.list(what) ){
    if( length(what) == 1 ){
      what <- rep(what, mt_len)
    }else{
      stopifnot( length(what) == length(what) )
    }
  }else{
    stop("Check argument what")
  }
  
  
  
  dat_mts <- mapply(
    function(master_tables_l,
             method_names_l,
             output_method_names_l,
             data_names_l,
             truth_names_l,
             coverage_ranges_l,
             what_l,
             experiment_names_l){
      
      # master_tables_l <- master_tables[[1]]
      # method_names_l <- method_names[[1]]
      # output_method_names_l <- output_method_names[[1]]
      # data_names_l <- data_names[[1]]
      # truth_names_l <- truth_names[[1]]
      # coverage_ranges_l <- coverage_ranges[[1]]
      # what_l <- what[[1]]
      # experiment_names_l <- experiment_names[[1]]
      
      if( !any(what_l %in% c("snps_indels", "snps", "indels", "overall")) ){
        stop("`what` argument must be either \"snps_indels\", \"snps\", \"indels\", or \"overall\"")
      }
      
      range_indexes <- seq_len( nrow(coverage_ranges_l) )
      mt_thresholdI_methodJ <- lapply(range_indexes, function(range_index_i) {
        # range_index_i <- range_indexes[[1]]
        
        threshold_i <- coverage_ranges_l[range_index_i,]
        
        k <- master_tables_l[ ,paste0(data_names_l, "_coverage") ]
        k <- k >= threshold_i$s & k <= threshold_i$e
        mt_thresholdI <- master_tables_l[k,]
        
        if( what_l %in% c("snps_indels", "snps") ){
          snp_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            # method_names_i <- method_names_l[[1]]
            
            k <- paste0("is_indel_", method_names_i)
            k <- which(mt_thresholdI[,k] == 0)
            k <- mt_thresholdI[k,]
            res <- calc_accuracy_measures(k, method_names_i, truth_names_l)
            
            in_truth <- paste0("in_", truth_names_l)
            nTrueVar <- sum( k[,in_truth] == 1 )
            c(res, nTrueVariants=nTrueVar)
          })
          snp_accur_thresholdI <- do.call(rbind, snp_accur_thresholdI)
          snp_accur_thresholdI <- data.frame(snp_accur_thresholdI)
          
          threshold_i <- paste( as.numeric(threshold_i), collapse="--")
          snp_accur_thresholdI <- cbind(snp_accur_thresholdI,
                                        variant="snps",
                                        method=method_names_l,
                                        coverage_ranges=threshold_i)
        }
        if( what_l %in% c("snps_indels", "indels") ){
          indel_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            k <- paste0("is_indel_", method_names_i)
            k <- which(mt_thresholdI[,k] == 1)
            k <- mt_thresholdI[k,]
            res <- calc_accuracy_measures(k, method_names_i, truth_names_l)
            
            in_truth <- paste0("in_", truth_names_l)
            nTrueVar <- sum( k[,in_truth] == 1 )
            c(res, nTrueVariants=nTrueVar)
          })
          indel_accur_thresholdI <- do.call(rbind, indel_accur_thresholdI)
          indel_accur_thresholdI <- data.frame(indel_accur_thresholdI)
          
          indel_accur_thresholdI <- cbind(indel_accur_thresholdI,
                                          variant="indels",
                                          method=method_names_l,
                                          coverage_ranges=threshold_i)
        }
        if( what_l == "overall" ){
          overall_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            k <- mt_thresholdI
            res <- calc_accuracy_measures(k, method_names_i, truth_names_l)
            
            in_truth <- paste0("in_", truth_names_l)
            nTrueVar <- sum( k[,in_truth] == 1 )
            c(res, nTrueVariants=nTrueVar)
          })
          overall_accur_thresholdI <- do.call(rbind, overall_accur_thresholdI)
          overall_accur_thresholdI <- data.frame(overall_accur_thresholdI)
          
          overall_accur_thresholdI <- cbind(overall_accur_thresholdI,
                                            variant="overall",
                                            method=method_names_l,
                                            coverage_ranges=threshold_i)
        }
        
        accur_thresholdI <- switch(
          what_l, 
          "snps_indels"={
            rbind(snp_accur_thresholdI, indel_accur_thresholdI)
          },
          "snps"={
            snp_accur_thresholdI
          },
          "indels"={
            indel_accur_thresholdI
          },
          "overall"={
            overall_accur_thresholdI
          }
        )
      })
      
      dat <- do.call(rbind, mt_thresholdI_methodJ) 
      names(dat) [names(dat) == "sensitivity"] <- "recall"
      dat$variant <- factor(dat$variant)
      dat$method <- factor(dat$method, levels=method_names_l)
      
      coverage_ranges_l <- coverage_ranges_l [ order(coverage_ranges_l$s) , ]
      range_levels <- apply(coverage_ranges_l, 1, paste, collapse="--") 
      dat$coverage_ranges <- factor(dat$coverage_ranges,
                                        levels=range_levels,
                                        ordered=TRUE)
      if( !is.null(output_method_names_l) ){
        if( length(method_names_l) != length(output_method_names_l) ){
          stop("The lengths of `method_names` and `output_method_names` must be equal.")
        }
        stopifnot( identical(method_names_l, levels(dat$method)) )
        levels(dat$method) <- output_method_names_l
      }
      names(dat) [names(dat) == "coverage_ranges"] <- "Coverage Ranges"
      
      dat <- cbind(dat, experiment=experiment_names_l)
      
    },
    master_tables,
    method_names,
    output_method_names,
    data_names,
    truth_names,
    coverage_ranges,
    what,
    experiment_names,
    SIMPLIFY=FALSE
  )
  dat_mts <- do.call(rbind, dat_mts)
  
  dat_mts
}









#' For each N-cigar-read-count interval, calculate accuracy measures for 
#'   multiple master tables
#' 
#' Create a data.frame with accuracy measures for multiple master tables and 
#'   different intervals for N-cigar-read counts.
#'
#' @param ... Master tables.
#' @param experiment_names Vector of strings. Name of the experiments in the same 
#'   order of the master tables.
#' @param method_names Vector or list of strings. Names of the methods to compare.
#'   If all experiments compare the same methods, `method_names` can be a single
#'   vector. Otherwise, `method_names` must be a list, in which each element 
#'   specifies the names of the methods to compare for each experiment.
#' @param output_method_names NULL (default), or a vector or a list of strings.
#'   Names of the methods to output. 
#' @param data_names Vector or list of strings.
#' @param truth_names Vector or list of strings.
#' @param start_n_cigar_read_percent_intervals vector of doubles. The start of
#'   each interval of N-cigar-read counts.
#' @param end_n_cigar_read_percent_intervals vector of doubles. The end of
#'   each interval of N-cigar-read counts.
#' @param what Vector or list of strings.
#'
#' @return A data.frame.
#' @export
calculate_precision_recall_for_n_cigar_read_count_intervarls <- function(
  ...,
  experiment_names,
  method_names,
  output_method_names=NULL,
  data_names,
  truth_names,
  start_n_cigar_read_percent_intervals=c(0, 0.05, 0.9),
  end_n_cigar_read_percent_intervals=c(0.05, 0.9, 1),
  what
){
  
  
  # master_tables <- list(dat_jurkat_cover10to20, dat_wtc11_cover10to20)
  # experiment_names <- c("jurkat", "wtc11")
  # method_names <- c("dv", "dv_s", "dv_s_fc", "gatk_s")
  # output_method_names = c("DV", "SNCR+DV", "SNCR+FC+DV", "SNCR+GATK")
  # data_names <- "isoSeq"
  # truth_names <- c("jurkat_dna_merged", "allen")
  # start_n_cigar_read_percent_intervals <- c(0, 0.05, 0.9)
  # end_n_cigar_read_percent_intervals <- c(0.05, 0.9, 1)
  # what <- "snps_indels"
  
  
  n_cigar_read_percent_intervals <- matrix(
    c(start_n_cigar_read_percent_intervals,
      end_n_cigar_read_percent_intervals),
    ncol=2
  )
  
  
  master_tables <- list(...)
  
  stopifnot( length(experiment_names) == length(master_tables) )
  
  if( length(master_tables) == 1 ){
    stopifnot( is.vector(method_names) & !is.list(method_names) )
    
    if( !is.null(output_method_names) ){
      stopifnot( is.vector(output_method_names) & !is.list(output_method_names) )
    }
    
    stopifnot( is.vector(data_names) & !is.list(data_names))
    stopifnot( length(data_names)==1 )
    
    stopifnot( is.vector(truth_names) & !is.list(truth_names) )
    stopifnot( length(truth_names)==1 )
    
    # stopifnot( is.vector(n_cigar_read_percent_intervals) & !is.list(n_cigar_read_percent_intervals) )
    
    stopifnot( is.vector(what) & !is.list(what) )
    stopifnot( length(what)==1 )
    
    method_names <- list(method_names)
    output_method_names <- list(output_method_names)
    n_cigar_read_percent_intervals <- list(n_cigar_read_percent_intervals)
    
    mt_len <- 1
  }else{
    mt_len <- length(master_tables)
    
    if( is.null(output_method_names) ){
      output_method_names <- rep( list(output_method_names), mt_len )
    }else{
      if( is.vector(output_method_names) & !is.list(output_method_names) ){
        stopifnot( length(output_method_names) == length(method_names) )
        output_method_names <- rep( list(output_method_names), mt_len )
      }else{
        if( is.list(output_method_names) ){
          stopifnot( length(output_method_names) == length(master_tables) )
        }else{
          stop("Check argument output_method_names")
        }
      }
    }
    
    if( is.vector(method_names) & !is.list(method_names) ){
      method_names <- rep( list(method_names), mt_len )
    }else{
      if( is.list(method_names) ){
        stopifnot( length(method_names) == length(master_tables) )
      }else{
        stop("Check argument method_names")
      }
    }
    
    if( is.vector(data_names) & !is.list(data_names) ){
      if( length(data_names) == 1 ){
        data_names <- rep(data_names, mt_len)
      }else{
        stopifnot( length(data_names) == length(data_names) )
      }
    }else{
      stop("Check argument data_names")
    }
    
    if( is.vector(truth_names) & !is.list(truth_names) ){
      if( length(truth_names) == 1 ){
        truth_names <- rep(truth_names, mt_len)
      }else{
        stopifnot( length(truth_names) == length(truth_names) )
      }
    }else{
      stop("Check argument truth_names")
    }
    
    # if( is.vector(n_cigar_read_percent_intervals) & !is.list(n_cigar_read_percent_intervals) ){
    #   n_cigar_read_percent_intervals <- rep( list(n_cigar_read_percent_intervals), mt_len )
    # }else{
    #   if( is.list(n_cigar_read_percent_intervals) ){
    #     stopifnot( length(n_cigar_read_percent_intervals) == length(master_tables) )
    #   }else{
    #     stop("Check argument n_cigar_read_percent_intervals")
    #   }
    # }
    n_cigar_read_percent_intervals <- rep(
      list(n_cigar_read_percent_intervals),
      mt_len
    )
  }
  
  
  ### any check for argument `what`?
  # if( is.vector(what) & !is.list(what) ){
  #   if( length(what) == 1 ){
  #     what <- rep(what, mt_len)
  #   }else{
  #     stopifnot( length(what) == length(what) )
  #   }
  # }else{
  #   stop("Check argument what")
  # }
  
  
  
  dat_mts <- mapply(
    function(master_tables_l,
             method_names_l,
             output_method_names_l,
             data_names_l,
             truth_names_l,
             n_cigar_read_percent_intervals_l,
             what_l,
             experiment_names_l){
      
      # master_tables_l <- master_tables[[2]]
      # method_names_l <- method_names[[2]]
      # output_method_names_l <- output_method_names[[2]]
      # data_names_l <- data_names[[2]]
      # truth_names_l <- truth_names[[2]]
      # n_cigar_read_percent_intervals_l <- n_cigar_read_percent_intervals[[2]]
      # what_l <- what[[2]]
      # experiment_names_l <- experiment_names[[2]]
      
      
      
      
      if( !any(what_l %in% c("snps_indels", "snps", "indels", "overall")) ){
        stop("`what` argument must be either \"snps_indels\", \"snps\", \"indels\", or \"overall\"")
      }
      
      mt_intervalI_methodJ <- apply(n_cigar_read_percent_intervals_l, 1, function(interval_i) {
        # interval_i <- n_cigar_read_percent_intervals_l[1,]
        
        k <- master_tables_l[ ,paste0("percent_n_cigar_reads") ]
        mt_intervalI <- master_tables_l[ k>=interval_i[1] & k<interval_i[2], ]
        
        if( what_l %in% c("snps_indels", "snps") ){
          snp_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            # method_names_i <- method_names_l[[1]]
            
            k <- paste0("is_indel_", method_names_i)
            k <- which( mt_intervalI[,k]==0 )
            k <- mt_intervalI[k,]
            calc_accuracy_measures(k, method_names_i, truth_names_l)
          })
          snp_accur_thresholdI <- do.call(rbind, snp_accur_thresholdI)
          snp_accur_thresholdI <- data.frame(snp_accur_thresholdI)
          interval_i <- paste0("[", interval_i[1], "-", interval_i[2], ")")
          snp_accur_thresholdI <- cbind(snp_accur_thresholdI,
                                        variant="snps",
                                        method=method_names_l,
                                        n_cigar_read_percent_intervals=interval_i)
        }
        if( what_l %in% c("snps_indels", "indels") ){
          indel_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            
            
            k <- paste0("is_indel_", method_names_i)
            k <- which( mt_intervalI[,k]==1 )
            k <- mt_intervalI[k,]
            calc_accuracy_measures(k, method_names_i, truth_names_l)
          })
          indel_accur_thresholdI <- do.call(rbind, indel_accur_thresholdI)
          indel_accur_thresholdI <- data.frame(indel_accur_thresholdI)
          indel_accur_thresholdI <- cbind(indel_accur_thresholdI,
                                          variant="indels",
                                          method=method_names_l,
                                          n_cigar_read_percent_intervals=interval_i)
        }
        if( what_l == "overall" ){
          overall_accur_thresholdI <- lapply(method_names_l, function(method_names_i){
            k <- mt_intervalI
            calc_accuracy_measures(k, method_names_i, truth_names_l)
          })
          overall_accur_thresholdI <- do.call(rbind, overall_accur_thresholdI)
          overall_accur_thresholdI <- data.frame(overall_accur_thresholdI)
          overall_accur_thresholdI <- cbind(overall_accur_thresholdI,
                                            variant="overall",
                                            method=method_names_l,
                                            n_cigar_read_percent_intervals=interval_i)
        }
        
        accur_thresholdI <- switch(
          what_l, 
          "snps_indels"={
            rbind(snp_accur_thresholdI, indel_accur_thresholdI)
          },
          "snps"={
            snp_accur_thresholdI
          },
          "indels"={
            indel_accur_thresholdI
          },
          "overall"={
            overall_accur_thresholdI
          }
        )
      })
      
      dat <- do.call(rbind, mt_intervalI_methodJ) 
      names(dat) [names(dat) == "sensitivity"] <- "recall"
      dat$variant <- factor(dat$variant)
      dat$method <- factor(dat$method)
      
      
      k <- apply(n_cigar_read_percent_intervals_l, 1, function(x){
        paste0()
      })
      dat$n_cigar_read_percent_intervals <- factor(dat$n_cigar_read_percent_intervals,
                                                   levels=sort(unique(dat$n_cigar_read_percent_intervals)),
                                                   ordered=TRUE)
      if( !is.null(output_method_names_l) ){
        if( length(method_names_l) != length(output_method_names_l) ){
          stop("The lengths of `method_names` and `output_method_names` must be equal.")
        }
        stopifnot( identical(method_names_l, levels(dat$method)) )
        levels(dat$method) <- output_method_names_l
      }
      names(dat) [names(dat) == "n_cigar_read_percent_intervals"] <- "% N-cigar reads"
      
      dat <- cbind(dat, experiment=experiment_names_l)
      
    },
    master_tables,
    method_names,
    output_method_names,
    data_names,
    truth_names,
    n_cigar_read_percent_intervals,
    what,
    experiment_names,
    SIMPLIFY=FALSE
  )
  dat_mts <- do.call(rbind, dat_mts)
  
  dat_mts
  
  # p <- ggplot(dat_mts, aes(.data$recall, .data$precision,
  #                          fill=.data$method, colour=.data$method)) +
  #   facet_grid(experiment~.)    ####### <<<<<--------=== not using .data$
  # if(what == "snps_indels"){
  #   p <- p +
  #     geom_point(aes(shape=.data$variant, size=.data$`coverage >= n`), alpha=.5) +
  #     geom_path(aes(linetype=.data$variant))
  # }else{
  #   p <- p +
  #     geom_point(aes(size=.data$`coverage >= n`), alpha=.5) +
  #     geom_path()
  # }
  # p <- p +
  #   coord_fixed(ratio=1) +
  #   theme(text = element_text(size = 20))
  # 
  # p
}












#' Add a INFO tag from a VCF file to a master table
#' 
#' If a variant in the VCF file doesn'r contain the specified tag, the function
#' return -1 for that variant.
#' 
#' @param input_table A data.frame. The input master table.
#' @param col_name A 1-length string. the name of the column to add. If NULL (default),
#'   `col_name` is equal to `tag`.
#' @param tag A 1-length string. The name of the tag to add. It must be exactly how
#'   the tag is written in the VCF file (case sensitive).
#' @param vcf_file A 1-length string. The path to the VCF file from which the tag
#'   is extracted.
#'
#' @return A data.frame.
#' 
#' @importFrom vcfR read.vcfR
#' @importFrom dplyr left_join
#' 
#' @export
add_info_tag_from_vcf <- function(input_table, col_name=NULL, tag, vcf_file){
  if(is.null(col_name)){
    col_name <- tag
  }
  
  vcf <- read.vcfR(vcf_file)
  
  k <- gettextf(".*%s=(.+?);.*", tag)
  k <- sub( k, "\\1", 
            paste0(vcf@fix[,8], ";") )
  k <- as.numeric(k)
  k[is.na(k)] <- -1
  tag_values <- data.frame(chrm=vcf@fix[,1],
                           pos=as.integer(vcf@fix[,2]),
                           tag=k)
  names(tag_values)[3] <- col_name
  
  left_join(input_table, tag_values)
}







#' Add a FORMAT tag from a VCF file to a master table
#' 
#' The function adds to `input_table` the new column <tagName>_<methodName>,
#'   where <tagName> is `tag` in lower case and <methodName> is `method_name`.
#' 
#' @param input_table A data.frame. The input master table.
#' @param method_name A 1-length string. The name of the method.
#' @param tag A 1-length string. The name of the tag to add. It must be exactly
#'   how the tag is written in the VCF file (case sensitive).
#' @param vcf_file A 1-length string. The path to the VCF file from which the tag
#'   is extracted.
#'
#' @return A data.frame.
#' 
#' @importFrom vcfR read.vcfR
#' @importFrom dplyr left_join
#' 
#' @export
add_format_tag_from_vcf <- function(input_table, method_name, tag, vcf_file){
  
  vcf <- read.vcfR(vcf_file)
  
  cmds <- gettextf("bcftools query -f '%%CHROM %%POS[ %%%s]\n' %s",
                   tag, vcf_file)
  x <- system(cmds, intern=TRUE)
  x <- strsplit(x, " ")
  x <- do.call(rbind, x)
  stopifnot( all(vcf@fix[,1:2] == x[,-3]) )
  x <- data.frame(x)
  col_name <- paste( tolower(tag), method_name, sep="_")
  names(x) <- c("chrm", "pos", col_name)
  x$pos <- as.integer(x$pos)
  res <- left_join(input_table, x)
  stopifnot( all(input_table[,1:2] == res[,1:2]) )
  
  res
}








#' Add some statistic of a tag about reads of a BAM file
#'
#' @param input_table A data.frame. The input master table.
#' @param col_name A 1-length string. The name of the new column to add.
#' @param galn A GAlignment object. The alingnment from where the tag is 
#'   extracted.
#' @param tag A 1-length string. The name of tag that is wanted to calculate
#'   the statistic `stat`. The name must be exactly the same of one column
#'   name from `mcols(galn)`.
#' @param stat A function used to calculate the desired statistic.
#'
#' @return A data.frame.
#' 
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom GenomicAlignments findOverlaps
#' @importFrom S4Vectors mcols
#' 
#' @export
add_some_read_statistic_from_bam_metacolumn <- function(input_table, col_name, galn, tag, stat){
  dat_pos <- GRanges(input_table$chrm, IRanges(input_table$pos, width=1))
  ovl <- findOverlaps(dat_pos, galn)
  output_stat <- tapply( subjectHits(ovl), queryHits(ovl), function(i){
    stat( mcols(galn) [,tag] [i] )
  })
  input_table[,col_name] <- NA
  k <- as.integer( names(output_stat) )
  input_table[k ,col_name] <- unname(output_stat)
  
  input_table
}





#' Add column of density of variants around each variant called by a method
#'
#' @param input_table A data.frame. The input master table.
#' @param window_size A 1-length integer. The size of the window where variants
#'   are used to calculate the density of variants around a each variant called
#'   by the specified method.
#' @param used_methods A vector of strings. The names of the methods to be used.
#'
#' @return A data.frame.
#' 
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom IRanges resize
#' @importFrom IRanges findOverlaps
#' @importFrom S4Vectors queryHits
#' @importFrom S4Vectors subjectHits
#' @importFrom snakecase to_lower_camel_case
#' 
#' @export
add_variant_density_of_a_method <- function(input_table, window_size, used_methods){
  # input_table <- dat
  # window_size <- 101
  # used_methods <- "dv_s_fc"
  # used_methods <- c("dv_s_fc", "gatk_s")
  
  
  
  # method_calls <- paste0("in_", method)
  # in_method <- input_table[,method_calls] == 1
  # in_method_range <- GRanges( input_table$chrm[in_method], 
  #                             IRanges(input_table$pos[in_method], width=1) )
  methods_calls <- paste0("in_", used_methods)
  k <- input_table[,methods_calls, drop=FALSE]
  
  k <- rowSums(k)
  in_methods <- k > 0
  in_methods_range <- GRanges( input_table$chrm[in_methods],
                               IRanges(input_table$pos[in_methods], width=1) )
  
  
  
  
  all_variants <- GRanges( input_table$chrm,
                           IRanges(input_table$pos, width=1) )
  all_windows <- resize(all_variants, window_size, "center")
  
  ovl <- findOverlaps(in_methods_range, all_windows)
  variant_density <- tapply(queryHits(ovl), subjectHits(ovl), length)
  
  variant_density_column <- paste(
    c( "variantDensity", to_lower_camel_case(used_methods) ),
    collapse="_"
  )
  input_table[,variant_density_column] <- 0
  k <- as.numeric( names(variant_density) )
  input_table[k, variant_density_column] <- variant_density
  
  input_table
}






#' Standardize genotypes
#' 
#' This function rewrite genotype like:
#' * 0|1 to 0/1;
#' * 0/2, 0/3, ..., to 0/1;
#' * 2/2, 3/3, ..., to 1/1;
#' * 1/3, 2/3, ..., to 1/2
#'
#' @param gt A vector of strings. The input genotypes.
#' 
#' @importFrom stringr str_split
#'
#' @return A vector of strings.
#' @export
standardize_genotype <- function(gt){
  gt_sd <- sub("\\|", "/", gt)
  gt_sd_split <- str_split(gt_sd, "/", simplify=TRUE)
  # heterozigous
  het <- apply(gt_sd_split=="0", 1, sum)
  het <- het==1
  gt_sd[het] <- "0/1"
  gt_sd_split[het, 2] <- "1"
  # homozigous alternative
  homAlt <- gt_sd_split[,1] != "0" & gt_sd_split[,1]==gt_sd_split[,2]
  gt_sd[homAlt] <- "1/1"
  gt_sd_split[homAlt, 1] <- "1"
  gt_sd_split[homAlt, 2] <- "1"
  # heterozygous alternative
  hetRef <- gt_sd=="0/0"
  hetAlt <- !hetRef & !het & !homAlt
  gt_sd[hetAlt] <- "1/2"
  gt_sd
}




#' get the genotype and the variant type of the calls of a primary and 
#' secundary method
#' 
#' Create a new data.frame from a input master table. The rows correspond to
#'   the same variants in `input_table` and in the same order.
#'
#' @param input_table A data.frame. The input master table.
#' @param gt_first A 1-length string. The name of the column that stores the
#'   genotypes called by a method. This method is called the first method.
#' @param gt_second A 1-length string. If the first method does not call some 
#'   variants, extract their genotype from column `gt_second`.
#' @param vt_first A 1-length string. The name of the column that stores the
#'   variant type called by a method. This method is called the first method.
#' @param vt_second A 1-length string. If the first method does not call some 
#'   variants, extract their variant type from column `vt_second`.
#'
#' @return A data.frame.
#' 
#' @export
gt_vt_method <- function(input_table, gt_first, gt_second, vt_first, vt_second){
  
  ### genotype (gt)
  gt <- input_table[,gt_first]
  k <- is.na(gt) | gt=="0/0"
  gt[k] <- input_table[,gt_second] [k]
  
  ### gt -- standardized 
  gt_sd <- standardize_genotype(gt)
  
  ### variant type (vt)
  vt <- input_table[,vt_first]
  k <- is.na(vt)
  vt[k] <- input_table[,vt_second] [k]
  
  data.frame(gt, gt_sd, vt)
}











#' Get indels in homopolymers
#' 
#' Create a new data.frame that stores deletions of inside homopolymers, and
#' insertions of a same nucleotide type that happen inside homopolymers of
#' that same nucleotide type. Homopolymer length equal to 1 means
#' non-homopolymers.
#'
#' @param input_table A data.frame. The input master table.
#' @param first_method_name A 1-length string of the name of a method. First,
#'   the variant information that is taken is related to this method. If a
#'   variant is not called by the method, its information is taken relating
#'   to the method specifiec in `second_method_name`.
#' @param second_method_name A 1-length string. The name of a method.
#' @param vcf_first A 1-length string. The path of the VCF file of the first
#'   method.
#' @param vcf_second A 1-length string. The path of the VCF file of the
#'   second method.
#' @param method_dataset_name A 1-length string. Name of the dataset used to
#'   call variants by the methods to be compared.
#' @param homopolymers A CompressedIRangesList object. It should store all 
#'   homopolymers, it's nucleotive types and lengths, of the genome used as
#'   the reference to call the variants. It is gerated by the function
#'   `sarlacc::homopolymerFinder`.
#' @param ref_fasta_seqs A DNAStringSet object. The sequences of the genome
#'   used as the reference to call the variants. It's names must be in the
#'   form like "chr1", "chr2", ..., "chrX", "chrY".
#' @param min_isoseq_coverage min iso-seq read coverage to filter.
#' @param genotyped_alt One of the strings "find" or "same". If "find", the
#'   function finds the correct alternative allele based on the genotype. This
#'   option must be chosen if there are multiple values for column ALT, which
#'   is the case of VCF files output by DeepVariant and Clair3. Since GATK's
#'   VCF files keep only the genotyped alternative allele in column ALT, this
#'   argument should be "same".
#'
#' @return A data.frame.
#' 
#' @importFrom vcfR read.vcfR
#' @importFrom dplyr left_join
#' @importFrom Biostrings readDNAStringSet
#' @importFrom IRanges IRanges
#' @importFrom XVector subseq compact
#' @importFrom dplyr rename
#' 
#' @export
method_homopolymer_indels <- function(input_table, first_method_name, second_method_name,
                                      vcf_first, vcf_second, method_dataset_name, homopolymers,
                                      ref_fasta_seqs, min_isoseq_coverage, genotyped_alt){
  
  stopifnot( genotyped_alt %in% c("find", "same") )
  k <- grep("dv|deepvariant|c3|clair3", second_method_name, ignore.case=TRUE)
  if( identical(k,1L) ){
    if(genotyped_alt!="find"){
      message("It looks like the second method is DeepVariant or Clair3, but genotyped_alt is not 'find'.")
      invisible(readline(prompt="Press [enter] to continue"))
    }
  }else{
    k <- grep("gatk", second_method_name, ignore.case=TRUE)
    if( identical(k,1L) ){
      if(genotyped_alt!="same"){
        message("It looks like the second method is GATK, but genotyped_alt is not 'same'.")
        invisible(readline(prompt="Press [enter] to continue"))
      }
    }
  }
  
  ### genotype (gt)
  gt_first <- paste0("gt_", first_method_name)
  gt_second <- paste0("gt_", second_method_name)
  gt <- input_table[,gt_first]
  het <- is.na(gt) | gt=="0/0"
  gt[het] <- input_table[,gt_second] [het]
  
  ### gt -- standardized 
  gt_sd <- standardize_genotype(gt)
  
  ### variant type (vt)
  vt_first <- paste0("variantType_", first_method_name)
  vt_second <- paste0("variantType_", second_method_name)
  vt <- input_table[,vt_first]
  vt[het] <- input_table[,vt_second] [het]
  
  ### add ref/alt alleles from `vcf_first` to `input_table`
  vcf_fst <- read.vcfR(vcf_first)
  k <- vcf_fst@fix[ ,c("CHROM", "POS", "REF", "ALT") ]
  k <- data.frame(k)
  k <- rename(k, "chrm"="CHROM", "pos"="POS", "ref_fst"="REF", "alt_fst"="ALT")
  k$pos <- as.integer(k$pos)
  input_table <- left_join(input_table, k)
  
  ### add ref/alt alleles from `vcf_second` to `input_table`
  vcf_snd <- read.vcfR(vcf_second)
  k <- vcf_snd@fix[ ,c("CHROM", "POS", "REF", "ALT") ]
  k <- data.frame(k)
  k <- rename(k, "chrm"="CHROM", "pos"="POS", "ref_snd"="REF", "alt_snd"="ALT")
  k$pos <- as.integer(k$pos)
  input_table <- left_join(input_table, k)
  
  ### ref/alt alleles
  ref_alt_alleles <- input_table[, c("ref_fst", "alt_fst")]
  ref_alt_alleles[het,] <- input_table[het, c("ref_snd", "alt_snd")]
  names(ref_alt_alleles) <- c("ref_allele", "alt_allele")
  
  ### make a data frame and filter
  k <- paste0( "in_",  c(first_method_name, second_method_name) )
  k <- c( k, paste0(second_method_name, "_classification") )
  coverage_dataset_column <- paste0(method_dataset_name, "_coverage")
  k <- c("chrm", "pos", "homopolymer_length_indel", coverage_dataset_column, k)
  res <- cbind( input_table[,k], gt, gt_sd, vt, ref_alt_alleles )
  k <- (res$gt_sd %in% c("0/1", "1/1")) & (res$vt %in% c("insertion", "deletion"))
  res <- res[k,]
  
  ### filter by iso-seq coverage
  res <- res[ res[,coverage_dataset_column] >= min_isoseq_coverage, ]
  
  ### genotyped aternative allele
  if(genotyped_alt=="find"){
    k <- sub("^[0-9]+/", "", res$gt_sd)
    k <- as.integer(k)
    alt_split <- strsplit(res$alt_allele, ",")
    alt_genotyped <- mapply( function(a, ki){
      a[ki]
    }, alt_split, k)
    res <- cbind(res, alt_genotyped)
  }else{
    res$alt_genotyped <- res$alt_allele
  }
  
  ### there might be a mix between insertions and deletions. to make it simple,
  ### removed all those cases
  res <- res[ !(res$vt=="deletion" & nchar(res$alt_genotyped)>1), ]
  res <- res[ !(res$vt=="insertion" & nchar(res$ref_allele)>1), ]
  
  ### deleted nts must be the same
  res$ref_nts <- sub(".", "", res$ref_allele)
  res$alt_nts <- sub(".", "", res$alt_genotyped)
  
  k <- strsplit(res$ref_nts, NULL)
  k <- sapply(k, function(x){
    unique(x)
  })
  k <- lengths(k)
  stopifnot( all(k[res$vt=="insertion"]==0) )
  res <- res[ !(res$vt=="deletion" & k!=1), ]
  
  ### inserted nts must be the same
  k <- strsplit(res$alt_nts, NULL)
  k <- sapply(k, function(x){
    unique(x)
  })
  k <- lengths(k)
  stopifnot( all(k[res$vt=="deletion"]==0) )
  res <- res[ !(res$vt=="insertion" & k!=1), ]
  
  ### nts of indels and homopolymers (from genome of reference) must be the same
  # add nt type of homopolymers
  add_homopolymer_nt_when_indels <- add_homopolymer_length_when_indels
  res <- add_homopolymer_nt_when_indels(res, homopolymers, ouput_what="nts")
  # add nt type of non-homopolymers
  res_split <- split(res, res$chrm)
  ref_fasta_seqs <- ref_fasta_seqs[names(res_split)]
  res_split <- lapply( seq_along(res_split), function(i){
    r <- res_split[[i]]
    s <- ref_fasta_seqs[i]
    is_not_hom <- is.na(r$homopolymer_nt_indel)
    k_pos <- r$pos[is_not_hom] +1
    s <- rep(s, length(k_pos))
    nts <- compact(
      subseq(
        s,
        k_pos,
        k_pos
      )
    )
    r$homopolymer_nt_indel[is_not_hom] <- unname( as.vector(nts) )
    r
  })
  res <- do.call(rbind, res_split)
  
  ### filter out insertions that the nts of the alternative allele are not the
  ### same of the nts of the homopolymer in the reference fasta
  k <- substring(res$alt_nts, 1, 1)
  k <- res$vt=="insertion" & res$homopolymer_nt_indel!=k
  res <- res[!k,]
  
  res
}











#' Make plot to compare accuracy per homopolymer length
#' 
#' The function uses the output from function `method_homopolymer_indels`
#'   to make a plot to compare how the accuracy vary according to the
#'   homopolymer length or not within homopolymers. Those accuracy measures
#'   can be either rates of TPs, FNs and FPs, or the precision, recall
#'   and F1-score.
#'
#' @param input_hom_table A data.frame. The output of the function 
#'   `method_homopolymer_indels`.
#' @param variant_type A 1-length string. The variant type. Possivle
#'   values are: "snp", "deletion", or "insertion".
#' @param method_name A 1-length string. The name of the method to analyse.
#' @param truth_name A 1-length string. The name of the ground-truth.
#' @param hom_length_intervals A vector of integers. Must be the same
#'   length of `interval_names`. The inferior limit for each interval for
#'   homopolymer length.
#' @param interval_names A vector of strings. Must be the same length of
#'   `hom_length_intervals`. The name of each interval of homopolymer 
#'   length.
#' @param to_calculate A 1-length string. Possible values are: "rates" or
#'   "pre_rec_f1".
#'
#' @return A list, in which the first element, named `p`, is a ggplot
#'   object, and the second element, named `interval_counts`, is a named
#'   vector of integers that stores the counts of variants in each 
#'   interval specified by the x-axis of the plot in `p`.
#' 
#' @importFrom tibble rownames_to_column
#' @importFrom dplyr rename
#' @importFrom dplyr filter
#' @importFrom dplyr mutate
#' @importFrom dplyr group_by
#' @importFrom dplyr left_join
#' @importFrom dplyr summarise
#' @importFrom tidyr gather
#' @import ggplot2
#' @importFrom rlang .data
#' 
#' @export
make_homopolymer_plot <- function(input_hom_table, variant_type,
                                  method_name, truth_name, hom_length_intervals,
                                  interval_names, to_calculate){
  
  k <- to_calculate %in% c("rates", "pre_rec_f1")
  if(!k){
    stop("'to_calculate' argument must be either 'rates' or 'pre_rec_f1'.")
  }
  
  input_hom_table <- input_hom_table[input_hom_table$vt==variant_type,]
  
  k <- findInterval(input_hom_table$homopolymer_length_indel,
                    hom_length_intervals)
  stopifnot( !any(k==0) )
  k <- interval_names[k]
  k <- factor(k, levels=interval_names, ordered=TRUE)
  input_hom_table$homopolymer_length_intervals <- k
  
  method_class_name <- paste0(method_name, "_classification")
  k <- input_hom_table[,method_class_name] %in% c("FN", "TP", "FP")
  input_hom_table <- input_hom_table[k,]
  
  if(to_calculate=="rates"){
    k <- rename(input_hom_table, "Classification"=all_of(method_class_name))
    k <- mutate(k, Classification=droplevels(.data$Classification))
    k <- group_by(k, .data$homopolymer_length_intervals, .data$Classification)
    class_counts <- summarise(k, count=n())
    k <- group_by(class_counts, .data$homopolymer_length_intervals)
    total_counts <- summarise(k, total_count=sum(.data$count))
    k <- left_join(class_counts, total_counts)
    class_counts <- mutate(k, percent=.data$count/.data$total_count)
  }else{
    input_hom_table_split <- split(input_hom_table,
                                   input_hom_table$homopolymer_length_intervals)
    k <- lapply(input_hom_table_split, function(x){
      y <- calc_accuracy_measures(x, method_name, truth_name)
      c(y, total_count=nrow(x))
    })
    k <- do.call(rbind, k)
    k <- data.frame(k)
    k <- rownames_to_column(k, "homopolymer_length_intervals")
    k$homopolymer_length_intervals <- factor(k$homopolymer_length_intervals,
                                             levels=interval_names, ordered=TRUE)
    class_counts <- gather(k, "Classification", "percent", .data$precision:.data$f1Score,
                           factor_key=TRUE)
  }
  
  k <- split(class_counts$total_count, class_counts$homopolymer_length_intervals)
  interval_counts <- sapply(k, unique)
  stopifnot( is.atomic(interval_counts) )
  dat_text <- data.frame(
    label=paste0("n=", interval_counts),
    x=names(interval_counts),
    y=1.05*max(class_counts$percent),
    Classification=NA
  )
  
  p <- ggplot(class_counts, aes(x=.data$homopolymer_length_intervals, y=.data$percent,
                                group=.data$Classification, colour=.data$Classification)) +
    geom_point() +
    geom_line() +
    xlab("Variant is in a homopolymer of length n") +
    ylab("Proportion of each classification") +
    # ggtitle("Classifications from SNCR+FC+DeepVariant") +
    theme(text = element_text(size=18)) +
    # ylim(0,1) +
    geom_text( data=dat_text, mapping= aes(x=x, y=y, label=label) ) +
    NULL
  p
}







#' Organize the data used to make plots about homopolymer analysis
#' 
#' The user may want to use this function several times to pull information about
#'   different combinations between `variant_type` and `method_name` from
#'   `input_hom_table`. In the future, the function should make the job automatically
#'   using loops.
#'
#' @param input_hom_table A data.frame generated by function `method_homopolymer_indels`.
#' @param variant_type A 1-length string. Possible values are "insertion" or "deletion".
#' @param method_name A 1-length string. The name of the method from which is desired
#'   to extract information.
#' @param truth_name A 1-length string. The name of the ground-truth.
#' @param hom_length_intervals A vector of integers. The minimum for each interval of
#'   homopolymer length. Each interval `i` ranges from `hom_length_intervals[i]` to
#'   `hom_length_intervals[i+1]`, except the last interval which upper limit is `Inf`.
#' @param interval_names A vector of characters with the same length of
#'   `hom_length_intervals`. The name for each interval of homopolymer length.
#' @param to_calculate A 1-length string. Possible values are "rates" or "pre_rec_f1".
#'   If "rates", the functin calculates the rates of TPs, FNs and FPs. If "pre_rec_f1",
#'   it calculates the precision, the recall and the F1-score.
#' @param output_method_name A 1-length string. The label of the method specified in
#'   `method_name` to be output.
#'
#' @return A 2-length list (`class_counts` and `dat_text`).
#' 
#' @importFrom tibble rownames_to_column
#' @importFrom tidyr gather
#' @importFrom rlang .data
#' 
#' @export
make_homopolymer_table_to_plot <- function(input_hom_table, variant_type,
                                           method_name, truth_name,
                                           hom_length_intervals, interval_names,
                                           to_calculate, output_method_name){
  
  k <- to_calculate %in% c("rates", "pre_rec_f1")
  if(!k){
    stop("'to_calculate' argument must be either 'rates' or 'pre_rec_f1'.")
  }
  
  input_hom_table <- input_hom_table[input_hom_table$vt==variant_type,]
  
  k <- findInterval(input_hom_table$homopolymer_length_indel,
                    hom_length_intervals)
  stopifnot( !any(k==0) )
  k <- interval_names[k]
  k <- factor(k, levels=interval_names, ordered=TRUE)
  input_hom_table$homopolymer_length_intervals <- k
  
  method_class_name <- paste0(method_name, "_classification")
  k <- input_hom_table[,method_class_name] %in% c("FN", "TP", "FP")
  input_hom_table <- input_hom_table[k,]
  
  input_hom_table_split <- split(input_hom_table,
                                 input_hom_table$homopolymer_length_intervals)
  k <- lapply(input_hom_table_split, function(x){
    y <- calc_accuracy_measures(x, method_name, truth_name)
    
    # count covered true variants
    k <- paste0(method_name, "_classification")
    k <- sum( x[,k] %in% c("TP", "FN") )
    c(y, total_count=k)
  })
  k <- do.call(rbind, k)
  k <- data.frame(k)
  k <- rownames_to_column(k, "homopolymer_length_intervals")
  k$homopolymer_length_intervals <- factor(k$homopolymer_length_intervals,
                                           levels=interval_names, ordered=TRUE)
  class_counts <- gather(k, "Measures", "percent",
                         .data$precision:.data$f1Score, factor_key=TRUE)
  
  k <- split(class_counts$total_count, class_counts$homopolymer_length_intervals)
  interval_counts <- sapply(k, unique)
  stopifnot( is.atomic(interval_counts) )
  dat_text <- data.frame(
    label=paste0("n=", interval_counts),
    x=names(interval_counts),
    y=1.05*max(class_counts$percent),
    Measures=NA
  )
  
  class_counts$method <- dat_text$method <- output_method_name
  class_counts$variant_type <- dat_text$variant_type <- variant_type
  
  res <- list(class_counts=class_counts, dat_text=dat_text)
  res
}







#' Generate data for splice-junction analysis using multiple master tables
#' 
#' Generate data to plot variant performance of sites near to and far from splice junctions
#'   comparisons using multiple master table as facets.
#'
#' @param ... Data.frames. Each data.frame is a master table.
#' @param experiment_names A vector of strings with length equal to the number of master
#'   tables input in `...`. Names of the datasets for each data table.
#' @param truth_names A vector of strings. Names of the ground truth in each data table
#'   especified in `...`.
#' @param method_dataset_name A vector of strings. Name of the datasets used to call
#'   variants, by the methods to be compared, in each input master table.
#' @param method_names A vector of strings. The name of the methods to be compared defined in
#'   the input master tables. All master table must have the same `method_names`.
#' @param output_method_names A vector of strings. The output name of the methods to be
#'   compared defined in the input master tables. The output name of the methods will be
#'   the same for all master table.
#' @param variant_type A 1-length string. Possible values are "snp" or "indel".
#' @param min_isoseq_coverage A 1-length integer. The threshod value for the minimum
#'   Iso-Seq read coverage.
#'
#' @return A list of two elements (`acc_sj` and `n_test`) to be used to draw a chart to
#'  analyse the variant-calling performance of sites near to and far from splice junctions.
#'  `acc_sj` stores the calculated performance measures, and `n_test` stores the number of
#'  observed variants in each case.
#' 
#' @importFrom dplyr filter
#' @importFrom dplyr mutate
#' @importFrom dplyr recode
#' @importFrom rlang .data
#' 
#' @export
splice_junction_analysis_table <- function(..., experiment_names, truth_names, method_dataset_name,
                                           method_names, output_method_names, variant_type,
                                           min_isoseq_coverage) {
  
  data_tables <- list(...)
  
  if( !all( sapply(data_tables, class) == "data.frame" ) ){
    stop("All objects in `...` must be data.frames. Make sure it's a master table.")
  }
  
  if( !(variant_type %in% c("snp", "indel")) ){
    stop("`variant_type` argument must be either 'snp' or 'indel'.")
  }else{
    variant_type <- ifelse(variant_type=="snp", 0, 1)
  }
  
  acc_n_experiments <- mapply(function(data_tables_i, experiment_names_i, truth_names_i, method_dataset_name_i){
    ### filter master table by iso-seq read coverage
    dataset_coverage_column <- paste0(method_dataset_name_i, "_coverage")
    k <- data_tables_i[,dataset_coverage_column] >= min_isoseq_coverage
    data_tables_i <- data_tables_i[k,]
    
    ### take variants near and far from splice junctions
    ### if near, many reads must contain the splice junction (at least 50% of them)
    ### if far, no one read that contain a splice junction
    ### if we do not consider n-cigar reads, percent_ss may be higher than 1.
    ###   but this is ok, because we remove variants that overlap n-cigar reads
    k <- data_tables_i$ss_highest_num / data_tables_i[,dataset_coverage_column]
    data_tables_i <- mutate(data_tables_i, "percent_ss" = k)
    data_tables_i <- filter(data_tables_i, .data$percent_ss>=.5 | .data$is_near_ss==0)
    
    ### remove variants that overlap with any intronic region
    dataset_ncrNum_column <- paste0(method_dataset_name_i, "_ncr_num")
    k <- data_tables_i[,dataset_ncrNum_column] == 0
    data_tables_i <- data_tables_i[k,]
    
    ### filter out variant that are not in the ground truth and not called by any method
    in_wtc11_truth <- paste0("in_", truth_names_i)
    in_wtc11_methods <- paste0("in_", method_names)
    k <- data_tables_i[ ,c(in_wtc11_methods, in_wtc11_truth) ]
    k <- apply(k, 1, function(x) all(x==0))
    data_tables_i <- data_tables_i[!k,]
    dim(data_tables_i)
    
    ### get the distance of the most frequent variant
    data_tables_i <- mutate(data_tables_i, ss_dist_of_the_most_freq={
      mapply(function(num, dis){
        if( any(is.na(dis)) ){
          stopifnot( length(dis)==1 )
          NA
        }else{
          dis[ which.max(num) ]
        }
      }, .data$ss_num, .data$ss_dist)
    })
    
    ### get the is_acceptor_site of the most frequent variant
    data_tables_i <- mutate(data_tables_i, is_acceptor_site_of_the_most_freq={
      mapply(function(num, acc){
        if( any(is.na(acc)) ){
          stopifnot( length(acc)==1 )
          NA
        }else{
          acc[ which.max(num) ]
        }
      }, .data$ss_num, .data$is_acceptor_site)
    })
    
    ### for each method to compare, calculate performance measures of
    ### variant calling from sites near to and far from splice junctions
    data_tables_i_ori <- data_tables_i
    env <- environment()
    env[["n_methods"]] <- NULL
    acc_methods <- mapply(function(method_names_i, output_method_names_i){
      k <- paste0("is_indel_", method_names_i)
      k <- data_tables_i_ori[,k] == variant_type
      k <- which(k)
      data_tables_i <- data_tables_i_ori[k,]
      
      # number of true variants covered by iso-seq near to and far from splice junctions
      k <- paste0("in_", truth_names_i)
      k <- split( data_tables_i[,k], data_tables_i$is_near_ss == 1 )
      k <- sapply(k, sum)
      env[["n_methods"]] <- c( env[["n_methods"]], paste0("n=", k) )
      
      k <- split(data_tables_i, data_tables_i$is_near_ss)
      k <- sapply(k, calc_accuracy_measures, method_name=method_names_i, truth_name=truth_names_i)
      k <- as.table(k)
      acc <- as.data.frame(k)
      names(acc) <- c("Measures", "is_near", "Score")
      cbind(acc, Method=output_method_names_i)
    }, method_names, output_method_names, SIMPLIFY=FALSE)
    acc_methods <- do.call(rbind, acc_methods)
    
    acc_methods$is_near <- recode(acc_methods$is_near, "0"="No", "1"="Yes")
    acc_methods$Measures <- recode(acc_methods$Measures, "precision"="Precision",
                                   "sensitivity"="Recall", "f1Score"="F1-score")
    acc_methods$Method <- factor(acc_methods$Method, levels=output_method_names, ordered=TRUE)
    acc_methods$experiment <- experiment_names_i
    
    list(acc_methods=acc_methods, n_methods=n_methods)
  }, data_tables, experiment_names, truth_names, method_dataset_name, SIMPLIFY=FALSE)
  
  ### table of performace measures
  k <- lapply(acc_n_experiments, function(acc_n_experiments_i){
    acc_n_experiments_i$acc_methods
  })
  k <- unname(k)
  acc_sj <- do.call(rbind, k)
  rownames(acc_sj) <- NULL
  
  ### table of text annotation. usefull to inclute text into future charts to create
  k <- lapply(acc_n_experiments, function(acc_n_experiments_i){
    acc_n_experiments_i$n_methods
  })
  k <- unname(k)
  k <- do.call(c, k)
  n_text <- data.frame(
    label=k,
    experiment=rep(
      experiment_names,
      rep(length(method_names)*2,
          length(experiment_names))
    ),
    Method=gl(n=length(method_names),
              k=2,
              length=length(method_names)*2*length(experiment_names),
              labels=output_method_names,
              ordered=TRUE
    ),
    x=rep(
      c("No", "Yes"),
      length(method_names)*length(experiment_names)
    ),
    y=1.10*max(acc_sj$Score, na.rm=TRUE),
    Measures=NA
  )
  
  list(acc_sj=acc_sj, n_text=n_text)
}
vladimirsouza/lrRNA-seq_benchmark documentation built on March 25, 2023, 9:29 p.m.