R/processBadPIDs.R

Defines functions processBadPIDs saveToDisk.processBadPIDs shortReadQ_forced_append genSummary_processBadPIDs computeMetrics.processBadPIDs print.processBadPIDs

Documented in processBadPIDs shortReadQ_forced_append

#' Removes sequences shorter than a given cutoff
#' @inheritParams applyOperation
#' @export

processBadPIDs <- function(all_results, config)
{
  op_number <- config$current_op_number
  op_args <- config$operation_list[[op_number]]
  op_full_name <- paste(op_number, op_args$name, sep = '_')

  op_dir <- file.path(config$output_dir, config$base_for_names, op_full_name)
  dir.create(op_dir, showWarnings = FALSE, recursive = TRUE)
  
  data_source_indx <- grep(op_args$data_source, names(all_results))
  stopifnot(length(data_source_indx) == 1)
  stopifnot(all(sort(names(all_results[[data_source_indx]]$seq_dat)) == c("fwd", "rev")))
  seq_dat_fwd <- all_results[[data_source_indx]]$seq_dat$fwd
  seq_dat_rev <- all_results[[data_source_indx]]$seq_dat$rev
  stopifnot(all(seq_dat_fwd@id == seq_dat_rev@id))

  per_read_metrics <- data.frame(read_name = as.character(seq_dat_fwd@id),
                                 stringsAsFactors = F)
  per_read_metrics$pid <- gsub("^.*_PID:" , "", per_read_metrics$read_name)
  per_read_metrics$clean_pid <- gsub("_" , "", per_read_metrics$pid)

### group on pids and compare to consensus cutoff
  bin_metrics <- data.frame(pid = names(table(per_read_metrics$clean_pid)),
                            raw_size = as.numeric(table(per_read_metrics$clean_pid)),
                            stringsAsFactors = F)
  pid_len <- nchar(bin_metrics$pid[1])
  cc <- get_consensus_cutoff(pid_len, max(bin_metrics$raw_size),
                             get_cutoff_models(), phi = 1/100)
  bin_metrics$big_enough <- bin_metrics$raw_size > 2*cc

### find potential parents
  bin_metrics$dist_to_big <- -1
  big_enough_pids <- bin_metrics$pid[bin_metrics$big_enough]
  bin_metrics$parent_pid <- ""
  bin_metrics$parent_size <- -1
  bin_metrics$parentage_conflict <- -1
  for (i in 1:nrow(bin_metrics)){
    if (!bin_metrics$big_enough[i]){
      dist_to_bigs <- stringdist(bin_metrics$pid[i], big_enough_pids, 'hamming', nthread=config$ncpu)
      bin_metrics$dist_to_big[i] <- min(dist_to_bigs)
      #if (min(dist_to_bigs) <= 2){
      if (min(dist_to_bigs) < Inf){
        can_pids <- big_enough_pids[which(dist_to_bigs == min(dist_to_bigs))]
        can_pids_size <- bin_metrics$raw_size[match(can_pids, bin_metrics$pid)]
        biggest_can_id <- which(can_pids_size == max(can_pids_size))
        bin_metrics[i,'parent_pid'] <- can_pids[biggest_can_id[1]]
        bin_metrics[i,'parent_size'] <- can_pids_size[biggest_can_id[1]]
        if (length(biggest_can_id)>1){
          bin_metrics$parentage_conflict[i] <- length(biggest_can_id)
        }
      }
    }
  }

  # TODO: make these real parameters
  seq_err <- 1/100
  no_seq_err <- 99/100
  min_bin_size <- 3
  max_chimeric_content <- 0.3
  offspring_prob_cutoff <- sum(bin_metrics$big_enough)/(4^pid_len)

  bin_metrics$off_spring_prob <- NA_real_
### compute prob that content from parent
  for (i in 1:nrow(bin_metrics)){
    if (!bin_metrics$big_enough[i]){
      #if (bin_metrics$parent_pid[i] == ""){
      if (bin_metrics$dist_to_big[i] > 2){
        off_spring_prob <- 0
      } else {
        success_prob <- ((seq_err/3)^bin_metrics$dist_to_big[i])*(no_seq_err^(pid_len - bin_metrics$dist_to_big[i]))
        off_spring_prob <- 1-pbinom(trunc(bin_metrics$raw_size[i]*max_chimeric_content), round(bin_metrics$parent_size[i]/no_seq_err,0), success_prob)
      }
      bin_metrics$off_spring_prob[i] <- off_spring_prob 
    }
  }
  bin_metrics$is_offspring <- bin_metrics$off_spring_prob > offspring_prob_cutoff
  bin_metrics$true_bin <- bin_metrics$big_enough
  for (i in 1:nrow(bin_metrics)){
    if (!bin_metrics$big_enough[i]){
      if (!bin_metrics$is_offspring[i] & bin_metrics$raw_size[i] > min_bin_size){
        bin_metrics$true_bin[i] <- TRUE
      }
    }
  }
  bin_metrics[is.na(bin_metrics$off_spring_prob),'off_spring_prob'] <- 0

  per_read_metrics <- merge(per_read_metrics, bin_metrics[,c('pid', 'off_spring_prob', 'true_bin', 'raw_size')], 
                            by.x = 'clean_pid', by.y = 'pid', all.x=T)
  stopifnot(sum(is.na(per_read_metrics$off_spring_prob)) == 0)
  per_read_metrics <- per_read_metrics[match(as.character(seq_dat_fwd@id), per_read_metrics$read_name),]
  stopifnot(all(per_read_metrics$read_name == as.character(seq_dat_fwd@id)))

  trim_steps <- list(step1 = list(name = 'off_spring_prob',
                                  threshold = offspring_prob_cutoff,
                                  comparator = `<=`,
                                  breaks = c(-Inf, offspring_prob_cutoff*c(0.1, 1, 10), Inf)
                                  ),
                     step2 = list(name = 'raw_size',
                                  threshold = min_bin_size,
                                  comparator = `>`,
                                  breaks = c(Inf, min_bin_size + c(1,0,-1), -Inf)
                                  )
                    )

  result <- list(trim_steps = trim_steps,
                 metrics = list(per_read_metrics = per_read_metrics,
                                bin_metrics = bin_metrics,
                                consensus_cutoff = cc,
                                offspring_prob_cutoff = offspring_prob_cutoff))
  
  kept_dat <- list('fwd' = seq_dat_fwd[per_read_metrics$true_bin],
                   'rev' = seq_dat_rev[per_read_metrics$true_bin])
  trim_dat <- list('fwd' = seq_dat_fwd[!per_read_metrics$true_bin],
                   'rev' = seq_dat_rev[!per_read_metrics$true_bin])
  
  class(result) <- 'processBadPIDs'
  if (!op_args$cache){stop('must cache on processBadPIDs')}

  result$seq_dat <- kept_dat
  result$trim_dat <- trim_dat
  result$input_dat <- list(fwd = seq_dat_fwd,
                           rev = seq_dat_rev)

  result$config <- list(op_number = op_number,
                        op_args = op_args,
                        op_full_name = op_full_name,
                        op_dir = op_dir)
  return(result)
}

saveToDisk.processBadPIDs <- function(result, config, seq_dat)
{
  kept <- result$seq_dat
  trimmed <- result$trim_dat

  if (length(kept[['fwd']]) > 0)
  {
    tmp_name <- file.path(result$config$op_dir, 
      paste(config$base_for_names, '_fwd_kept_', result$config$op_args$name, '.fastq', sep = ''))
    writeFastq(kept[['fwd']], tmp_name, compress=F)
  }
  if (length(kept[['rev']]) > 0)
  {
    tmp_name <- file.path(result$config$op_dir, 
      paste(config$base_for_names, '_rev_kept_', result$config$op_args$name, '.fastq', sep = ''))
    writeFastq(kept[['rev']], tmp_name, compress=F)
  }
  if (length(trimmed[['fwd']]) > 0)
  {
    tmp_name <- file.path(result$config$op_dir, 
      paste(config$base_for_names, '_fwd_trimmed_', result$config$op_args$name, '.fastq', sep = ''))
    writeFastq(trimmed[['fwd']], tmp_name, compress=F)
  }
  if (length(trimmed[['rev']]) > 0)
  {
    tmp_name <- file.path(result$config$op_dir, 
      paste(config$base_for_names, '_rev_trimmed_', result$config$op_args$name, '.fastq', sep = ''))
    writeFastq(trimmed[['rev']], tmp_name, compress=F)
  }
  return(result)
}

#' Forces ShortReadQ objects to append into a single ShortReadQ object
#' @param x list of ShortReadQ objects
#' @export

shortReadQ_forced_append <- function(x)
{
  stopifnot(class(x) == 'list')
  if (length(x) == 1){
    return(x[[1]])
  } else {
    sread <- DNAStringSet(unlist(lapply(x, {function(y) as.character(y@sread)})))
    ids <- BStringSet(unlist(lapply(x, {function(y) as.character(y@id)})))
    quals <- FastqQuality(unlist(lapply(x, {function(y) as.character(y@quality@quality)})))
    return (ShortReadQ(sread = sread, id = ids, qual = quals))
  }
}

genSummary_processBadPIDs <- function(result)
{
  # started work on allowing this operation to run on merged reads - but
  # it stalled
  if (all(sort(names(result$seq_dat)) == c("fwd", "rev"))){
    class(result) <- 'processBadPIDs_fwd'
    seq_dat_fwd <- shortReadQ_forced_append(list(result$seq_dat$fwd, result$trim_dat$fwd))
    return(genSummary_case4(result, NULL, seq_dat_fwd))
#    summary_tab_fwd <- genSummary_case4(result, NULL, seq_dat_fwd)
#    class(result) <- 'processBadPIDs_rev'
#    seq_dat_rev <- shortReadQ_forced_append(list(result$seq_dat$rev, result$trim_dat$rev))
#    summary_tab_rev <- genSummary_case4(result, NULL, seq_dat_rev)
#    return(rbind(summary_tab_fwd, summary_tab_rev))
  } else {
    seq_dat_all <- shortReadQ_forced_append(list(result$seq_dat, result$trim_dat))
    genSummary_case4(result, NULL, seq_dat)
  }
}

computeMetrics.processBadPIDs <- function(result, config, seq_dat)
{
  return(result)
}

print.processBadPIDs <- function(result, config)
{
  cat('\n-------------------')
  cat('\nOperation: processBadPIDs')
  cat('\n-------------------')
  cat('\nKept Sequences:\n')
  print(result$summary[,c('parameter', 'k_seqs', 'k_mean_length', 'k_mean_qual')])
  cat('\n-------------------')
  cat('\nTrimmed Sequences:\n')
  print(result$summary[,c('parameter', 't_seqs', 't_mean_length', 't_mean_qual')])
  invisible(result)
}


#  result <- operation_function(all_results, config)
#  result <- saveToDisk(result, config)
#  result <- genReport(result, config)
#  result <- genSummary(result, config)
#  result <- print(result, config)
HIVDiversity/MotifBinner2 documentation built on May 6, 2019, 6:44 p.m.