R/inspect_read_quality.R

Defines functions summarise_read_quality get_quality_trim_pos plot_quality_profile .plot_quality_profile

#' @importFrom dplyr filter "%>%" summarise group_by mutate ungroup bind_rows left_join
summarise_read_quality <- function(fwd_fqs, rev_fqs, num=50000) {

  stopifnot(all(file.exists(fwd_fqs)),
            all(file.exists(rev_fqs)))

  smry1 <-
    bind_rows(
      ShortRead::qa(fwd_fqs, n=num)[["perCycle"]]$quality %>%
        mutate_if(is.factor, as.character) %>%
        mutate(Direction = 'forward'),
      ShortRead::qa(rev_fqs, n=num)[["perCycle"]]$quality %>%
        mutate_if(is.factor, as.character) %>%
        mutate(Direction = 'reverse')) %>%
    group_by(Direction, Cycle, Score) %>%
    summarise(Count = sum(Count, na.rm = T)) %>%
    group_by(Direction, Cycle) %>%
    mutate(Prop = Count / sum(Count, na.rm = T)) %>%
    ungroup()

  smry2 <-
    smry1 %>%
    group_by(Direction, Cycle) %>%
    summarise(Total = sum(Count, na.rm = T),
              MeanScore = sum(Score * Count, na.rm = T) / Total) %>%
    ungroup() %>%
    left_join(smry1 %>%
                filter(Cycle == 1) %>%
                group_by(Direction) %>%
                summarise(nReads=sum(Count)) %>%
                ungroup(),
              'Direction') %>%
    mutate(PropReadLen = Total / nReads)

  list(denisty_by_pos_and_qual = smry1, summary_by_pos = smry2)
}


#' @importFrom dplyr arrange desc filter "%>%" summarise group_by mutate ungroup bind_rows left_join slice rename
#' @importFrom tidyr nest unnest spread
get_quality_trim_pos <- function(denisty_by_pos_and_qual,
                                 summary_by_pos,
                                 trim_target_qual = 35,
                                 trim_target_qual_percent = 75,
                                 trim_target_len_percent = 95) {

  len_trim <-
    filter(summary_by_pos, PropReadLen > (trim_target_len_percent / 100)) %>%
    group_by(Direction) %>%
    summarise(pos = max(Cycle)) %>%
    ungroup() %>%
    select(direction=Direction, pos) %>%
    mutate(type = 'len')

  qual_trim <-
    denisty_by_pos_and_qual %>%
    mutate(above_target = Score >= trim_target_qual) %>%
    group_by(Direction, Cycle, above_target) %>%
    summarise(Prop = sum(Prop, na.rm = T)) %>%
    group_by(Direction) %>%
    nest() %>%
    mutate(data = map(data, function(data){
      data %>%
        dplyr::rename(pos = Cycle) %>%
        group_by(pos) %>%
        summarise(prob_above = {
          total_above <-
            filter(data, Cycle <= pos[1], above_target) %>%
            pull(Prop) %>% sum(na.rm = T)
          total_below <-
            filter(data, Cycle <= pos[1], !above_target) %>%
            pull(Prop) %>% sum(na.rm = T)
          total_above / (total_below + total_above)
        }) %>%
      filter(prob_above > (trim_target_qual_percent/100)) %>%
      arrange(desc(pos)) %>%
      slice(1)
    })) %>%
    unnest(data) %>%
    select(direction=Direction, pos) %>%
    mutate(type = 'qual')

  bind_rows(len_trim, qual_trim) %>%
    group_by(direction) %>%
    arrange(pos) %>%
    ungroup() %>%
    spread(direction, pos) %>%
    rename(trim_fwd = forward,
           trim_rev = reverse) %>%
    split.data.frame(.$type) %>%
    map(~ select(., -type))
}

#' @export
plot_quality_profile <- function(reads_fwd, reads_rev, title) {

  quality_smry <- summarise_read_quality(fwd_fqs = reads_fwd, rev_fqs = reads_rev)
  .plot_quality_profile(quality_smry$denisty_by_pos_and_qual,
                        quality_smry$summary_by_pos,
                        title)
}

#' @importFrom ggplot2 ggplot geom_tile geom_line ylab scale_fill_continuous facet_wrap theme element_blank
#' @importFrom cowplot plot_grid ggdraw draw_label
#' @importFrom dplyr "%>%"
.plot_quality_profile <- function(denisty_by_pos_and_qual, summary_by_pos, title) {

  stopifnot(is.data.frame(denisty_by_pos_and_qual),
            is.data.frame(summary_by_pos),
            is_string(title))
  p1 <-
    denisty_by_pos_and_qual %>%
    ggplot() +
    geom_tile(aes(Cycle, Score, fill = Prop)) +
    geom_line(data = summary_by_pos, aes(Cycle, MeanScore), col = 'red') +
    ylab('Quality Score') +
    scale_fill_continuous(high = "gray15", low = "gray75", limits = c(0,1)) +
    facet_wrap(~Direction, nrow=1) +
    theme(axis.text.x = element_blank(), axis.title.x = element_blank(), axis.ticks.x = element_blank())

  p2 <-
    summary_by_pos %>%
    ggplot() +
    geom_tile(aes(y='PropReads', x = Cycle, fill = PropReadLen)) +
    scale_fill_continuous(high = "gray15", low = "gray75", limits = c(0,1)) +
    facet_wrap(~Direction, nrow=1) +
    theme(axis.title.y = element_blank(), axis.ticks.y = element_blank(), legend.position = 'none',
          strip.background = element_blank(), strip.text = element_blank())


  plot_grid(ggdraw() + draw_label(paste0('Quality profile - ', title)),
            p1, p2, ncol = 1, rel_heights = c(0.05, 0.80, 0.15), align = 'v', axis = 'lr')
}
bahlolab/HaplotypReportR documentation built on Dec. 2, 2019, 7:36 p.m.