R/helpers.R

Defines functions make_clean_unique_names make_clean_names splitFastqPairs splitFastq concatFastq sampleFastq trim_marker_panel

#' @export
#' @importFrom dplyr "%>%" mutate group_by add_count ungroup pull if_else
#' @importFrom stringr str_c
make_clean_unique_names <- function(x) {
  tibble(x = x) %>%
    mutate(x = make_clean_names(x)) %>%
    group_by(x) %>%
    add_count() %>%
    mutate(suffix = if_else(n>1, str_c('-', seq_along(x)), '')) %>%
    ungroup() %>%
    mutate(x = str_c(x, suffix)) %>%
    pull(x)
}

#' @export
#' @importFrom dplyr "%>%"
#' @importFrom stringr str_remove str_replace str_replace_all
make_clean_names <- function(x) {
  str_remove(x, '[^A-z0-9_\\-]+$') %>%
    str_remove('^[^A-z0-9_\\-]+') %>%
    str_replace_all('[^A-z0-9_\\-]+', '_') %>%
    str_replace('^$', 'x')
}

#' @export
#' @importFrom ShortRead writeFastq yield FastqStreamer
#' @importFrom dplyr tibble bind_rows
#' @importFrom rlang is_string is_scalar_integerish
#' @importFrom future future plan multisession tweak value
splitFastqPairs <- function(fq1, fq2, out_dir, prefix = 'reads', suffix = '.fastq.gz', nreads=5e5){

  stopifnot(is_string(fq1), file.exists(fq1),
            is_string(fq2), file.exists(fq2),
            is_string(out_dir), dir.exists(out_dir),
            is_string(prefix), is_string(suffix),
            is_scalar_integerish(nreads), nreads > 0)


  f_fq1fns <- future({
    HaplotypReportR::splitFastq(fqfn = fq1,
                                out_dir = out_dir,
                                prefix = prefix,
                                suffix = str_c('R1', suffix),
                                nreads = nreads)})

  f_fq2fns <- future({
    HaplotypReportR::splitFastq(fqfn = fq2,
                                out_dir = out_dir,
                                prefix = prefix,
                                suffix = str_c('R2', suffix),
                                nreads = nreads)})

  fq1s <- value(f_fq1fns)
  fq2s <- value(f_fq2fns)


  if (length(fq1s) != length(fq2s)) {
    warning("Fastq files were of different lengths")
  }

  n <- min(length(fq1s), length(fq2s))

  tibble(i = seq_len(n),
         fq1 = fq1s[seq_len(n)],
         fq2 = fq2s[seq_len(n)])
}

#' @export
#' @importFrom ShortRead writeFastq yield FastqStreamer
#' @importFrom future future plan multisession value
splitFastq <- function(fqfn, out_dir, prefix, suffix, nreads){

  fs <- FastqStreamer(fqfn, n = nreads)
  i <- 1L
  out_fns <- character()
  f_write <- NULL

  while(length(sr <- yield(fs)) > 0){

    if (!is.null(f_write)) {
      if (value(f_write) < 1) {
        warning('writeFastq possibly failed')
      }
    }

    if (length(sr) == 0)
      break

    fn <- file.path(out_dir, str_c(prefix, '_', i, '_', suffix))
    f_write <- future(writeFastq(sr, fn))

    out_fns <- c(out_fns, fn)

    i <- i + 1L
  }
  close(fs)

  if (!is.null(f_write)) {
    if (value(f_write) < 1) {
      warning('writeFastq possibly failed')
    }
  }

  return(out_fns)
}

#' @export
#' @importFrom ShortRead writeFastq yield FastqStreamer
#' @importFrom dplyr tibble bind_rows
#' @importFrom rlang is_string is_scalar_integerish
concatFastq <- function(in_fqs, out_fq){

  stopifnot(is.character(in_fqs),
            all(file.exists(in_fqs)),
            is_string(out_fq))

  mode <- 'w'
  for (fq in in_fqs) {

    fs <- FastqStreamer(fq)

    while(length(sr <- yield(fs)) > 0) {
      writeFastq(sr, out_fq, mode = mode)
      mode <- 'a'
    }

    close(fs)
  }

  invisible(out_fq)
}

#' @export
#' @importFrom ShortRead writeFastq yield FastqSampler
#' @importFrom rlang is_string is_scalar_integerish
sampleFastq <- function(in_fq, nreads, pattern = '.fastq.gz$', suffix = '_sub.fastq.gz', seed = 0L){

  stopifnot(is_string(in_fq), file.exists(in_fq),
            is_scalar_integerish(nreads))

  out_fq = str_c(str_remove(in_fq, pattern), suffix)
  if(file.exists(out_fq))
    file.remove(out_fq)

  set.seed(0L)
  fs <- FastqSampler(in_fq, n = nreads)
  sr <- yield(fs)
  writeFastq(sr, out_fq)
  close(fs)

  return(out_fq)
}

#' @importFrom dplyr inner_join "%>%"
#' @importFrom stringr str_extract str_count
#' @importFrom Biostrings DNAString DNAStringSet pairwiseAlignment alignedSubject subseq
trim_marker_panel <- function(trimmed_ref_seq, marker_panel) {

  trimmed_panel <-
    names(trimmed_ref_seq) %>%
    map_df(function(marker) {

      panel_seqs <-
        marker_panel %>%
        filter(MarkerID == marker) %>%
        with(magrittr::set_names(DNAStringSet(Sequence), Haplotype))

      aln <- pairwiseAlignment(panel_seqs, trimmed_ref_seq[marker], type="global")
      subject <- alignedSubject(aln) %>% as.character()
      gaps_left <- str_extract(subject, '^-*') %>% str_count('-')
      gaps_right <- str_extract(subject, '-*$') %>% str_count('-')

      trimmed_panel <-
        panel_seqs %>%
        subseq(., gaps_left + 1, width(.) - gaps_right) %>%
        unique()

      tibble(MarkerID = marker,
             Haplotype = names(trimmed_panel),
             Sequence = as.character(trimmed_panel))
    })
}

#' @importFrom dplyr inner_join "%>%" tibble
#' @importFrom Biostrings DNAString DNAStringSet pairwiseAlignment mismatchSummary
get_panel_snps <- function(trimmed_ref_seq, marker_panel) {

  marker_ref <-
    tibble(MarkerID = names(trimmed_ref_seq),
           ReferenceSequence = as.character(trimmed_ref_seq))

  inner_join(marker_ref, marker_panel, 'MarkerID') %>%
    select(MarkerID, ReferenceSequence, Sequence) %>%
    group_by(MarkerID, ReferenceSequence) %>%
    summarise(Sequence = list(Sequence)) %>%
    ungroup() %>%
    mutate(snps = map2(ReferenceSequence, Sequence, function(ref, seq) {
      aln <- pairwiseAlignment(DNAStringSet(seq), DNAString(ref), type="global")
      mismatchSummary(aln)$subject %>% select(Pos = SubjectPosition, Ref = Subject, Alt = Pattern)
    })) %>%
    unnest(snps) %>%
    select(MarkerID, Pos, Ref, Alt) %>%
    mutate_if(is.factor, as.character) %>%
    distinct() %>%
    arrange_all()
}

format_int <- function(x, min_width = 1L) {
  stopifnot(rlang::is_integerish(x))
  width <- max(min_width, 1 + floor(log10(max(x))))
  stringr::str_pad(x, pad = "0",side = 'left', width = width)
}
bahlolab/HaplotypReportR documentation built on Dec. 2, 2019, 7:36 p.m.