knitr::opts_chunk$set(echo = FALSE, warnings = FALSE, message = FALSE, warning = FALSE, results = "asis") options(knitr.table.format = "latex") inputFastq <- try_element(params, "inputFastq") configFile <- try_element(params, "configFile") expectedLTR <- try_element(params, "expectedLTR") expectedLinker <- try_element(params, "expectedLinker") seqsDf <- try_element(params, "seqsDf") summaryStats <- try_element(params, "summaryStats") candidates <- try_element(params, "candidates") candidateSummary <- try_element(params, "candidateSummary") segmentedSeqsDf <- try_element(params, "segmentedSeqsDf") filteredSeqsSummary <- try_element(params, "filteredSeqsSummary") mappingLengthDistribution <- try_element(params, "mappingLengthDistribution") description <- try_element(params, "description") config <- read_config_file(configFile)
if(0) { cat(names(params)) knitr::knit_exit() }
This report contains the results of attempted primer & LTR identification in the following FASTQ file:
r inputFastq
r description
if(!is.null(candidateSummary)) { cat(paste0(" \nA total of ", nrow(candidateSummary) - 1, " potential primer-LTR pairs were identified.")) cat(" \nThis table shows how the reads captured by each primer-LTR pair compare to each other and to the raw FASTQ file.") if("LANL" %in% colnames(candidateSummary)) { cat( candidateSummary %>% filter(id > 0) %>% select(id, primer, LTR, expectedLTR, LANL) %>% kableExtra::kable(longtable = TRUE) %>% kableExtra::kable_styling(font_size = 7, bootstrap_options = c("striped", "condensed"), position = "left") ) table <- candidateSummary %>% select(-primer, -LTR, -expectedLTR, -LANL) %>% gather("Variable", "Value", -"id") %>% spread(id, Value) %>% dplyr::rename(Total="0") } else { cat( candidateSummary %>% filter(id > 0) %>% select(id, primer, LTR, expectedLTR) %>% kableExtra::kable(longtable = TRUE) %>% kableExtra::kable_styling(font_size = 7, bootstrap_options = c("striped", "condensed"), position = "left") ) table <- candidateSummary %>% select(-primer, -LTR, -expectedLTR) %>% gather("Variable", "Value", -"id") %>% spread(id, Value) %>% dplyr::rename(Total="0") } # Set each cell to show counts and proportions of total reads colsToModify <- colnames(table)[-c(1,2)] for(colName in colsToModify) { table[,colName] <- paste0(table[,colName], " (", round(table[,colName] / table$Total * 100, 1), "%)") } cat( table %>% kableExtra::kable(longtable = TRUE) %>% kableExtra::kable_styling(font_size = 10, bootstrap_options = c("striped", "condensed"), position = "left") ) } else { cat(" \nNo common primer-LTR pairs were identified.") summaryStats %>% gather("Variable", "Value") %>% kableExtra::kable(col.names = NULL) }
\newpage
dnaplotr::plotDNA(sort(seqsDf %>% sample_n(min(1000, n()), weight = count) %>% pull(seq))) primerBit <- seqsDf %>% mutate(primerBit=substr(seq, 1, config[["Primer_Length"]])) %>% group_by(primerBit) %>% summarize(count=sum(count)) %>% arrange(-count) %>% ungroup() %>% as.data.frame() %>% head(1) %>% pull(primerBit) commonPrimerSeqsDf <- seqsDf %>% filter(substr(seq,1,config[["Primer_Length"]])==primerBit) dnaplotr::plotDNA(sort(commonPrimerSeqsDf %>% sample_n(min(1000, n()), weight = count) %>% pull(seq)))
\newpage
Reads are sectioned off into sections separated by white (nucleotide color legend is below the plot).
Primer /// LTR /// CA /// (unmatched) /// human-matching /// linker-matching /// (unmatched)
if(!is.null(candidateSummary)) { seqsToDisplay <- segmentedSeqsDf %>% sample_n(min(1000, n()), replace = FALSE) %>% mutate(maxPrimerLength=max(nchar(primer)), maxLTRlength=max(nchar(LTR)), maxRemainderU5Length=max(nchar(remainder_u5)), maxMatchLength=max(nchar(matchSeq)), maxLinkerLength=max(nchar(linkerSeq)), maxRemainderU3Length=max(nchar(remainder_u3))) %>% rowwise() %>% mutate(displaySeq=create_display_seq(primer, LTRsegment, remainder_u5, matchSeq, linkerSeq, remainder_u3, maxPrimerLength, maxLTRlength, maxRemainderU5Length, maxMatchLength, maxLinkerLength, maxRemainderU3Length)) %>% arrange(matchSeq, linkerSeq, remainder_u3) %>% ungroup() dnaplotr::plotDNA(seqsToDisplay %>% pull(displaySeq)) #dnaplotr::plotDNA(revComp(expectedLinker)) dnaplotr::plotDNA(seqsToDisplay %>% arrange(seq) %>% pull(seq)) charRange <- candidateSummary %>% filter(LTR != "") %>% mutate(charLength=nchar(primer) + nchar(LTR)) %>% pull(charLength) maxX <- max(charRange) + 5 minX <- min(charRange) - 5 clipData <- segmentedSeqsDf %>% filter(LTR %in% candidateSummary$LTR) %>% filter(initialSoftClip <= maxX & initialSoftClip >= minX) %>% #filter(originalSoftClip <= maxX & originalSoftClip >= minX) %>% group_by(LTR, initialSoftClip) %>% #group_by(LTR, originalSoftClip) %>% summarize(count=n()) %>% group_by(LTR) %>% left_join(candidateSummary %>% select(LTR, id), by="LTR") %>% mutate(prop=count/sum(count)) clipData$id <- factor(clipData$id, levels = sort(unique(clipData$id), decreasing = TRUE)) if(nrow(clipData) == 0) knitr::knit_exit() } else { dnaplotr::plotDNA(seqsDf %>% sample_n(min(1000, sum(count)), replace = TRUE, weight = count) %>% arrange(seq) %>% pull(seq)) knitr::knit_exit() }
\newpage
#ggplot(clipData, aes(y=prop, x=originalSoftClip, fill=LTR)) + ggplot(clipData, aes(y=prop, x=initialSoftClip, fill=LTR)) + geom_bar(stat = "identity") + labs(title="Distribution of clipping points across reads", x="Position on read", y="Number of reads") + scale_x_continuous(breaks=seq(minX, maxX)) + theme( legend.position = "none" ) + facet_grid(id~.) orderedLTR <- clipData %>% #group_by(LTR, originalSoftClip) %>% group_by(LTR, initialSoftClip) %>% summarize(count=n()) %>% group_by(LTR) %>% top_n(1, count) %>% rowwise() %>% mutate(displayLTR=substr(LTR, minX - config[["Primer_Length"]], max(nchar(LTR)))) %>% select(LTR, displayLTR) %>% left_join(candidateSummary %>% select(LTR, id), by="LTR") %>% arrange(id) %>% distinct() # posChar <- paste0("1 6 10", # paste0(" ",seq(15,ceiling(maxX/5)*5, 5), collapse = "")) # orderedLTRdf <- data.frame(x=cell_spec(c(posChar, orderedLTR, posChar), monospace = TRUE, format = "latex")) dnaplotr::plotDNA(orderedLTR$displayLTR, xStart = minX)
knitr::knit_exit()
\newpage
summaryStats %>% gather("Variable", "Value") %>% kableExtra::kable(col.names = NULL)
Total: Number of reads
Distinct: Number of distinct sequences
Max: Number of copies of most abundant sequence
TotalLinker: Number of reads containing R1 linker sequence
DisctinctLinker: Number of distinct sequences containing R1 linker sequence
MaxLinker: Number of copies of most abundant sequence containing R1 linker sequence
TotalHuman: Number of reads mapping to human genome
DisctinctHuman: Number of distinct sequences mapping to human genome
MaxHuman: Number of copies of most abundant sequence mapping to human genome
\newpage
plotStatInRange <- function(stat, name, low, high, min, max, log = FALSE) { stat = round(stat, 4) idealMidpoint <- if_else(log, 10^((log10(low) + log10(high))/2), (low + high)/2) plot <- data.frame(stat=stat, low=low, high=high) %>% ggplot() + geom_segment(aes(x=min, xend=min, y=-0.1, yend=0.1)) + geom_segment(aes(x=max, xend=max, y=-0.1, yend=0.1)) + geom_segment(aes(x=min, xend=max, y=0, yend=0)) + geom_segment(aes(x=low, xend=high, y=0, yend=0, size=5), color="green") + geom_point(aes(x=stat, y=0)) + geom_text(aes(label=stat, x=stat, y=1.5)) + geom_text(aes(label="Ideal range", x=idealMidpoint, y=-1.5)) + xlim(min, max) + ylim(-2,2) + xlab("") + ylab("") + ggtitle(name) + theme( legend.position = "none", panel.background = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.y = element_text(angle=0) ) if(log) plot <- plot + scale_x_log10(labels = scales::trans_format("log10", scales::math_format(10^.x))) return(plot) } print(plotStatInRange(summaryStats$Total, "Total Reads", 5e4, 1e8, 1, 1e10, log = TRUE)) cat(" \n") print(plotStatInRange(summaryStats$Max / summaryStats$Total, "Max Read Proportion", 0.01, 0.2, 0, 1))
\newpage
if(any(seqsDf$human)) { ggplot(mappingLengthDistribution, aes(x=matchLength, y=count)) + geom_histogram(stat="identity", binwidth = 5) + labs(title="Distribution of sequence lengths mapping to human genome", x="Length of sequence match", y="Number of reads") }
\newpage
Random sample of r min(1000, sum(seqsDf$count)) reads, arranged by nucleotide sequence:
dnaplotr::plotDNA(seqsDf %>% sample_n(min(1000, sum(count)), replace = TRUE, weight = count) %>% arrange(seq) %>% pull(seq))
Random sample of r min(1000, nrow(seqsDf)) distinct reads (i.e. after removing duplicates), arranged by nucleotide sequence:
dnaplotr::plotDNA(seqsDf %>% sample_n(min(1000, n()), replace = FALSE) %>% arrange(seq) %>% pull(seq))
\newpage
if(any(seqsDf$hasLinker) & any(!seqsDf$hasLinker)) { cat(paste0(" \nRandom sample of ", min(1000, sum(seqsDf %>% filter(hasLinker) %>% pull(count))), " reads containing R1 linker sequence, arranged by nucleotide sequence: \n")) dnaplotr::plotDNA(seqsDf %>% filter(hasLinker) %>% sample_n(min(1000, sum(count)), replace = TRUE, weight = count) %>% arrange(seq) %>% pull(seq)) cat(paste0(" \nRandom sample of ", min(1000, sum(seqsDf %>% filter(!hasLinker) %>% pull(count))), " reads that do NOT contain R1 linker sequence, arranged by nucleotide sequence: \n")) dnaplotr::plotDNA(seqsDf %>% filter(!hasLinker) %>% sample_n(min(1000, sum(count)), replace = TRUE, weight = count) %>% arrange(seq) %>% pull(seq)) } else { cat("No reads contained R1 linker sequences (or no R1 linker sequence was provided).") }
\newpage
if(is.null(candidateSummary)) { cat(paste0("No primer LTR pairs appeared in more than ", config[["minProportion"]], " of total reads.")) knitr::knit_exit() }
A total of r nrow(candidates) potential primer-LTR pairs were identified.
This table shows how the reads captured by each primer-LTR pair compare to each other and to the raw FASTQ file.
candidateSummary %>% filter(id > 0) %>% select(primer, LTR, expectedLTR, id) %>% kableExtra::kable(longtable = TRUE) %>% kableExtra::kable_styling(font_size = 7, bootstrap_options = c("striped", "condensed")) candidateSummary %>% select(-primer, -LTR, -expectedLTR) %>% gather("Variable", "Value", -"id") %>% spread(id, Value) %>% rename(Total="0")
expectedLTRproportion <- (candidateSummary %>% filter(expectedLTR=="TRUE") %>% summarize(Total=sum(Total)) %>% pull(Total)) / (summaryStats$Total) print(plotStatInRange(expectedLTRproportion, "% reads with expected LTR", 0.4, 0.9, 0, 1))
\newpage
Random sample of r min(1000, sum(segmentedSeqsDf$count)) reads, filtered for common primer & LTR and with primer & LTR sequences trimmed, arranged by nucleotide sequence:
dnaplotr::plotDNA( segmentedSeqsDf %>% sample_n(min(1000, sum(count)), replace = TRUE, weight = count) %>% mutate(maxPrimerLength=max(nchar(primer)), maxLTRlength=max(nchar(LTR)), maxMatchLength=max(matchLength), maxRemainderLength=max(nchar(remainder))) %>% rowwise() %>% mutate(displaySeq=create_display_seq(primer, LTR, matchSeq, remainder, maxPrimerLength, maxLTRlength, maxMatchLength, maxRemainderLength)) %>% arrange(matchLength, displaySeq) %>% pull(displaySeq))
Random sample of r min(1000, nrow(seqsDf)) distinct reads (i.e. after removing duplicates), filtered for common rimer & LTR and with primer & LTR sequences trimmed, arranged by nucleotide sequence:
dnaplotr::plotDNA( segmentedSeqsDf %>% sample_n(min(1000, n()), replace = FALSE) %>% mutate(maxPrimerLength=max(nchar(primer)), maxLTRlength=max(nchar(LTR)), maxMatchLength=max(matchLength), maxRemainderLength=max(nchar(remainder))) %>% rowwise() %>% mutate(displaySeq=create_display_seq(primer, LTR, matchSeq, remainder, maxPrimerLength, maxLTRlength, maxMatchLength, maxRemainderLength)) %>% arrange(matchLength, displaySeq) %>% pull(displaySeq))
if(!config[["Map_To_LANL"]]) { knitr::knit_exit() }
r nrow(candidates %>% filter(LANL)) candidates have an LTR sequence that mapped to an LTR region of at least one genome in the Los Alamos National Laboratories HIV database.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.