In BushmanLab/ltrparser: LTR Parser

knitr::opts_chunk$set(echo = FALSE, warnings = FALSE, message = FALSE, warning = FALSE, results = "asis")
options(knitr.table.format = "latex") 

inputFastq <- try_element(params, "inputFastq")
configFile <- try_element(params, "configFile")
expectedLTR <- try_element(params, "expectedLTR")
expectedLinker <- try_element(params, "expectedLinker")
seqsDf <- try_element(params, "seqsDf")
summaryStats <- try_element(params, "summaryStats")
candidates <- try_element(params, "candidates")
candidateSummary <- try_element(params, "candidateSummary")
segmentedSeqsDf <- try_element(params, "segmentedSeqsDf")
filteredSeqsSummary <- try_element(params, "filteredSeqsSummary")
mappingLengthDistribution <- try_element(params, "mappingLengthDistribution")
description <- try_element(params, "description")

config <- read_config_file(configFile)

if(0) {
  cat(names(params))
  knitr::knit_exit()
}

Overview

This report contains the results of attempted primer & LTR identification in the following FASTQ file:

r inputFastq

r description

Analysis of common primer-LTR pairs

if(!is.null(candidateSummary)) {
  cat(paste0("  \nA total of ", nrow(candidateSummary) - 1, " potential primer-LTR pairs were identified."))
  cat("  \nThis table shows how the reads captured by each primer-LTR pair compare to each other and to the raw FASTQ file.")

  if("LANL" %in% colnames(candidateSummary)) {
    cat(
      candidateSummary %>%
        filter(id > 0) %>%
        select(id, primer, LTR, expectedLTR, LANL) %>%
        kableExtra::kable(longtable = TRUE) %>%
        kableExtra::kable_styling(font_size = 7, bootstrap_options = c("striped", "condensed"),
                                  position = "left")
    )
    table <- candidateSummary %>%
      select(-primer, -LTR, -expectedLTR, -LANL) %>%
      gather("Variable", "Value", -"id") %>%
      spread(id, Value) %>%
      dplyr::rename(Total="0")
  } else {
    cat(
      candidateSummary %>%
        filter(id > 0) %>%
        select(id, primer, LTR, expectedLTR) %>%
        kableExtra::kable(longtable = TRUE) %>%
        kableExtra::kable_styling(font_size = 7, bootstrap_options = c("striped", "condensed"),
                                  position = "left")
    )
    table <- candidateSummary %>%
      select(-primer, -LTR, -expectedLTR) %>%
      gather("Variable", "Value", -"id") %>%
      spread(id, Value) %>%
      dplyr::rename(Total="0")
  }

  # Set each cell to show counts and proportions of total reads
  colsToModify <- colnames(table)[-c(1,2)]
  for(colName in colsToModify) {
    table[,colName] <- paste0(table[,colName], " (", round(table[,colName] / table$Total * 100, 1), "%)")
  }
  cat(
    table %>%
      kableExtra::kable(longtable = TRUE) %>%
      kableExtra::kable_styling(font_size = 10, bootstrap_options = c("striped", "condensed"),
                                position = "left")
  )
} else {
  cat("  \nNo common primer-LTR pairs were identified.")
  summaryStats %>%
    gather("Variable", "Value") %>%
    kableExtra::kable(col.names = NULL)
}

\newpage

Visualization of raw reads

dnaplotr::plotDNA(sort(seqsDf %>% sample_n(min(1000, n()), weight = count) %>% pull(seq)))

primerBit <- seqsDf %>%
  mutate(primerBit=substr(seq, 1, config[["Primer_Length"]])) %>%
  group_by(primerBit) %>%
  summarize(count=sum(count)) %>%
  arrange(-count) %>%
  ungroup() %>% as.data.frame() %>%
  head(1) %>%
  pull(primerBit)

commonPrimerSeqsDf <- seqsDf %>%
  filter(substr(seq,1,config[["Primer_Length"]])==primerBit)

dnaplotr::plotDNA(sort(commonPrimerSeqsDf %>% sample_n(min(1000, n()), weight = count) %>% pull(seq)))

\newpage

Visualization of filtered / processed reads

Reads are sectioned off into sections separated by white (nucleotide color legend is below the plot).

Primer /// LTR /// CA /// (unmatched) /// human-matching /// linker-matching /// (unmatched)

if(!is.null(candidateSummary)) {

  seqsToDisplay <- segmentedSeqsDf %>%
    sample_n(min(1000, n()), replace = FALSE) %>%
    mutate(maxPrimerLength=max(nchar(primer)),
           maxLTRlength=max(nchar(LTR)),
           maxRemainderU5Length=max(nchar(remainder_u5)),
           maxMatchLength=max(nchar(matchSeq)),
           maxLinkerLength=max(nchar(linkerSeq)),
           maxRemainderU3Length=max(nchar(remainder_u3))) %>%
    rowwise() %>%
    mutate(displaySeq=create_display_seq(primer, LTRsegment, remainder_u5, matchSeq, linkerSeq, remainder_u3,
                                         maxPrimerLength, maxLTRlength, maxRemainderU5Length,
                                         maxMatchLength, maxLinkerLength, maxRemainderU3Length)) %>%
    arrange(matchSeq, linkerSeq, remainder_u3) %>%
    ungroup()
  dnaplotr::plotDNA(seqsToDisplay %>% pull(displaySeq))
  #dnaplotr::plotDNA(revComp(expectedLinker))
  dnaplotr::plotDNA(seqsToDisplay %>% arrange(seq) %>% pull(seq))

  charRange <- candidateSummary %>%
    filter(LTR != "") %>%
    mutate(charLength=nchar(primer) + nchar(LTR)) %>%
    pull(charLength)

  maxX <- max(charRange) + 5
  minX <- min(charRange) - 5

  clipData <- segmentedSeqsDf %>%
    filter(LTR %in% candidateSummary$LTR) %>%
    filter(initialSoftClip <= maxX & initialSoftClip >= minX) %>%
    #filter(originalSoftClip <= maxX & originalSoftClip >= minX) %>%
    group_by(LTR, initialSoftClip) %>%
    #group_by(LTR, originalSoftClip) %>%
    summarize(count=n()) %>%
    group_by(LTR) %>%
    left_join(candidateSummary %>% select(LTR, id), by="LTR") %>%
    mutate(prop=count/sum(count))
  clipData$id <- factor(clipData$id, levels = sort(unique(clipData$id), decreasing = TRUE))

  if(nrow(clipData) == 0) knitr::knit_exit()

} else {
  dnaplotr::plotDNA(seqsDf %>%
                    sample_n(min(1000, sum(count)), replace = TRUE, weight = count) %>%
                    arrange(seq) %>%
                    pull(seq))

  knitr::knit_exit()
}

\newpage

#ggplot(clipData, aes(y=prop, x=originalSoftClip, fill=LTR)) +
ggplot(clipData, aes(y=prop, x=initialSoftClip, fill=LTR)) +
  geom_bar(stat = "identity") +
  labs(title="Distribution of clipping points across reads",
       x="Position on read",
       y="Number of reads") +
  scale_x_continuous(breaks=seq(minX, maxX)) +
  theme(
    legend.position = "none"
  ) +
  facet_grid(id~.)

orderedLTR <- clipData %>%
  #group_by(LTR, originalSoftClip) %>%
  group_by(LTR, initialSoftClip) %>%
  summarize(count=n()) %>%
  group_by(LTR) %>%
  top_n(1, count) %>%
  rowwise() %>%
  mutate(displayLTR=substr(LTR, minX - config[["Primer_Length"]], max(nchar(LTR)))) %>%
  select(LTR, displayLTR) %>%
  left_join(candidateSummary %>% select(LTR, id), by="LTR") %>%
  arrange(id) %>%
  distinct()

# posChar <- paste0("1    6   10",
#                   paste0("   ",seq(15,ceiling(maxX/5)*5, 5), collapse = ""))
# orderedLTRdf <- data.frame(x=cell_spec(c(posChar, orderedLTR, posChar), monospace = TRUE, format = "latex"))

dnaplotr::plotDNA(orderedLTR$displayLTR,
                  xStart = minX)

knitr::knit_exit()

\newpage

Summary Statistics for raw FASTQ file

summaryStats %>%
  gather("Variable", "Value") %>%
  kableExtra::kable(col.names = NULL)

Explanation of terms

Total: Number of reads

Distinct: Number of distinct sequences

Max: Number of copies of most abundant sequence

TotalLinker: Number of reads containing R1 linker sequence

DisctinctLinker: Number of distinct sequences containing R1 linker sequence

MaxLinker: Number of copies of most abundant sequence containing R1 linker sequence

TotalHuman: Number of reads mapping to human genome

DisctinctHuman: Number of distinct sequences mapping to human genome

MaxHuman: Number of copies of most abundant sequence mapping to human genome

\newpage

Key metrics

plotStatInRange <- function(stat, name, low, high, min, max, log = FALSE) {
  stat = round(stat, 4)
  idealMidpoint <- if_else(log, 10^((log10(low) + log10(high))/2), (low + high)/2)
  plot <- data.frame(stat=stat, low=low, high=high) %>%
    ggplot() +
    geom_segment(aes(x=min, xend=min, y=-0.1, yend=0.1)) +
    geom_segment(aes(x=max, xend=max, y=-0.1, yend=0.1)) +
    geom_segment(aes(x=min, xend=max, y=0, yend=0)) +
    geom_segment(aes(x=low, xend=high, y=0, yend=0, size=5), color="green") +
    geom_point(aes(x=stat, y=0)) +
    geom_text(aes(label=stat, x=stat, y=1.5)) +
    geom_text(aes(label="Ideal range", x=idealMidpoint, y=-1.5)) +
    xlim(min, max) + ylim(-2,2) +
    xlab("") + ylab("") +
    ggtitle(name) +
    theme(
      legend.position = "none",
      panel.background = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.title.y = element_text(angle=0)
    )
  if(log) plot <- plot + scale_x_log10(labels = scales::trans_format("log10", scales::math_format(10^.x)))
  return(plot)
}

print(plotStatInRange(summaryStats$Total, "Total Reads", 5e4, 1e8, 1, 1e10, log = TRUE))
cat("  \n")
print(plotStatInRange(summaryStats$Max / summaryStats$Total, "Max Read Proportion", 0.01, 0.2, 0, 1))

\newpage

if(any(seqsDf$human)) {
  ggplot(mappingLengthDistribution, aes(x=matchLength, y=count)) +
    geom_histogram(stat="identity", binwidth = 5) +
    labs(title="Distribution of sequence lengths mapping to human genome",
         x="Length of sequence match",
         y="Number of reads")
}

\newpage

Visualization of raw FASTQ reads

Random sample of r min(1000, sum(seqsDf$count)) reads, arranged by nucleotide sequence:

dnaplotr::plotDNA(seqsDf %>%
                    sample_n(min(1000, sum(count)), replace = TRUE, weight = count) %>%
                    arrange(seq) %>%
                    pull(seq))

Random sample of r min(1000, nrow(seqsDf)) distinct reads (i.e. after removing duplicates), arranged by nucleotide sequence:

dnaplotr::plotDNA(seqsDf %>%
                    sample_n(min(1000, n()), replace = FALSE) %>%
                    arrange(seq) %>%
                    pull(seq))

\newpage

Visualization of raw FASTQ reads with R1 linker sequences

if(any(seqsDf$hasLinker) & any(!seqsDf$hasLinker)) {
  cat(paste0("  \nRandom sample of ", min(1000, sum(seqsDf %>% filter(hasLinker) %>% pull(count))), " reads containing R1 linker sequence, arranged by nucleotide sequence:  \n"))
  dnaplotr::plotDNA(seqsDf %>%
                      filter(hasLinker) %>%
                      sample_n(min(1000, sum(count)), replace = TRUE, weight = count) %>%
                      arrange(seq) %>%
                      pull(seq))
    cat(paste0("  \nRandom sample of ", min(1000, sum(seqsDf %>% filter(!hasLinker) %>% pull(count))), " reads that do NOT contain R1 linker sequence, arranged by nucleotide sequence:  \n"))
  dnaplotr::plotDNA(seqsDf %>%
                      filter(!hasLinker) %>%
                      sample_n(min(1000, sum(count)), replace = TRUE, weight = count) %>%
                      arrange(seq) %>%
                      pull(seq))
} else {
  cat("No reads contained R1 linker sequences (or no R1 linker sequence was provided).")
}

\newpage

if(is.null(candidateSummary)) {
  cat(paste0("No primer LTR pairs appeared in more than ", config[["minProportion"]], " of total reads."))
  knitr::knit_exit()
}

Analysis of common primer-LTR pairs

A total of r nrow(candidates) potential primer-LTR pairs were identified.

This table shows how the reads captured by each primer-LTR pair compare to each other and to the raw FASTQ file.

candidateSummary %>%
  filter(id > 0) %>%
  select(primer, LTR, expectedLTR, id) %>%
  kableExtra::kable(longtable = TRUE) %>%
  kableExtra::kable_styling(font_size = 7, bootstrap_options = c("striped", "condensed"))

candidateSummary %>%
  select(-primer, -LTR, -expectedLTR) %>%
  gather("Variable", "Value", -"id") %>%
  spread(id, Value) %>%
  rename(Total="0")

expectedLTRproportion <-
  (candidateSummary %>% filter(expectedLTR=="TRUE") %>% summarize(Total=sum(Total)) %>% pull(Total)) /
  (summaryStats$Total)

print(plotStatInRange(expectedLTRproportion, "% reads with expected LTR", 0.4, 0.9, 0, 1))

\newpage

Visualization of FASTQ reads, filtered for common primer & LTR

Random sample of r min(1000, sum(segmentedSeqsDf$count)) reads, filtered for common primer & LTR and with primer & LTR sequences trimmed, arranged by nucleotide sequence:

dnaplotr::plotDNA(
  segmentedSeqsDf %>%
    sample_n(min(1000, sum(count)), replace = TRUE, weight = count) %>%
    mutate(maxPrimerLength=max(nchar(primer)),
           maxLTRlength=max(nchar(LTR)),
           maxMatchLength=max(matchLength),
           maxRemainderLength=max(nchar(remainder))) %>%
    rowwise() %>%
    mutate(displaySeq=create_display_seq(primer, LTR, matchSeq, remainder,
                                         maxPrimerLength, maxLTRlength,
                                         maxMatchLength, maxRemainderLength)) %>%
    arrange(matchLength, displaySeq) %>%
    pull(displaySeq))

Random sample of r min(1000, nrow(seqsDf)) distinct reads (i.e. after removing duplicates), filtered for common rimer & LTR and with primer & LTR sequences trimmed, arranged by nucleotide sequence:

dnaplotr::plotDNA(
  segmentedSeqsDf %>%
    sample_n(min(1000, n()), replace = FALSE) %>%
    mutate(maxPrimerLength=max(nchar(primer)),
           maxLTRlength=max(nchar(LTR)),
           maxMatchLength=max(matchLength),
           maxRemainderLength=max(nchar(remainder))) %>%
    rowwise() %>%
    mutate(displaySeq=create_display_seq(primer, LTR, matchSeq, remainder,
                                         maxPrimerLength, maxLTRlength,
                                         maxMatchLength, maxRemainderLength)) %>%
    arrange(matchLength, displaySeq) %>%
    pull(displaySeq))

if(!config[["Map_To_LANL"]]) {
  knitr::knit_exit()
}

Analysis of FASTQ reads mapped to LTR regions of HIV genomes in the Los Alamos National Laboratory database

r nrow(candidates %>% filter(LANL)) candidates have an LTR sequence that mapped to an LTR region of at least one genome in the Los Alamos National Laboratories HIV database.

BushmanLab/ltrparser documentation built on Jan. 27, 2021, 9:02 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

BushmanLab/ltrparser
LTR Parser

In BushmanLab/ltrparser: LTR Parser

Overview

Analysis of common primer-LTR pairs

Visualization of raw reads

Visualization of filtered / processed reads

Summary Statistics for raw FASTQ file

Explanation of terms

Key metrics

Visualization of raw FASTQ reads

Visualization of raw FASTQ reads with R1 linker sequences

Analysis of common primer-LTR pairs

Visualization of FASTQ reads, filtered for common primer & LTR

Analysis of FASTQ reads mapped to LTR regions of HIV genomes in the Los Alamos National Laboratory database

R Package Documentation

Browse R Packages

We want your feedback!

BushmanLab/ltrparser LTR Parser

In BushmanLab/ltrparser: LTR Parser

Overview

Analysis of common primer-LTR pairs

Visualization of raw reads

Visualization of filtered / processed reads

Summary Statistics for raw FASTQ file

Explanation of terms

Key metrics

Visualization of raw FASTQ reads

Visualization of raw FASTQ reads with R1 linker sequences

Analysis of common primer-LTR pairs

Visualization of FASTQ reads, filtered for common primer & LTR

Analysis of FASTQ reads mapped to LTR regions of HIV genomes in the Los Alamos National Laboratory database

R Package Documentation

Browse R Packages

We want your feedback!

BushmanLab/ltrparser
LTR Parser