#' Correct uneven gaps in an assembled genome
#'
#' Uneven gap sizes can form during genome assembly as different
#' assembling, polishing, and gap closing tools pile on top of each
#' other. This makes submission of a genome assembly problematic,
#' because varying gap sizes are ambiguous, and ideally we want unknown
#' gaps to have a standardised size. \cr\cr
#' This function takes a genome assembly and corrects gap sizes given
#' a user-define threshold. It is assumed that any gap larger than the
#' threshold is an unknown gap.
#'
#' @param genomeSS DNAStringSet: the genome assembly as a 'DNAStringSet'
#' object from the \code{Biostrings} package.
#'
#' @param threshold Integer: the minimum string of 'N's to consider a gap.
#'
#' @param correct Integer: the standardised number of 'N's for gaps of unknown size.
#' @returns Returns the genome assembly back as a DNAStringSet object with all
#' gaps meeting the threshold size standardized to the corrected size.
#' Sequences with corrected gaps are labelled with '_correct_gaps'.
#'
#' @export
assembly_correct_gaps <- function(genomeSS, threshold, correct){
require(Biostrings); require(data.table); require(doParallel); require(tidyverse)
# Quick checks
if(!'DNAStringSet' %in% class(genomeSS)){
stop('Argument `genomeSS` must be a "DNAStringSet" class. See ?assembly_correct_gaps.')
}
if(class(threshold)!='integer' | threshold<1){
stop('Argument `treshold` must be an "integer" class with a values >0. See ?assembly_correct_gaps.')
}
if(class(correct)!='integer' | correct<1){
stop('Argument `correct` must be an "integer" class with a values >0. See ?assembly_correct_gaps.')
}
# Iterate through each ith genomic sequence
resultsList <- list()
num_seqs <- length(genomeSS)
for(i in 1:num_seqs){
cat(i, '/', num_seqs, '\n')
require(Biostrings); require(data.table); require(tidyverse)
# Convert the sequence into a long-format data table
seq_i_tab <- genomeSS[i] %>%
as.matrix() %>%
t() %>%
as.data.table() %>%
setnames(., new='DNA')
# Add in an index and test whether the bases are Ns
seq_i_tab[, INDEX:=1:.N]
seq_i_tab[, IS.N:=DNA=='N']
# Subset and sort indexes with Ns. Gap IDs are currently unknown.
n_i_tab <- seq_i_tab[IS.N==TRUE] %>%
setorder(., INDEX) %>%
.[, GAP:=0L]
# Counter of unique gap IDs.
uniq_gap_count <- 1L
# Iterate through each jth index where there is an N.
# Test whether the jth index is exactly j+1 the previous index.
# If yes, then they are consecutive and part of the same gap.
# If no, then they are different gaps, and the counter goes up
for(j in 1:nrow(n_i_tab)){
if(j == 1){
n_i_tab$GAP[j] <- uniq_gap_count
} else if(j > 1){
index_j <- n_i_tab$INDEX[j]
index_j_m1 <- n_i_tab$INDEX[j-1]
index_consec <- index_j - 1 == (index_j_m1)
if(index_consec==TRUE){
n_i_tab$GAP[j] <- uniq_gap_count
} else if(index_consec==FALSE){
uniq_gap_count <- uniq_gap_count + 1
n_i_tab$GAP[j] <- uniq_gap_count
}
}
}
# Summarise the size of unique gaps
size_i_tab <- n_i_tab[, .(SIZE=length(INDEX)), by=GAP] %>%
.[, KEEP:=SIZE>=threshold]
keep_gaps_i <- size_i_tab[KEEP==TRUE]$GAP
# Return original sequence, or sequence with standardised gaps
if(nrow(size_i_tab)==0){
result_i <- genomeSS[i]
} else {
# The N string to replace corrected gaps
n_string <- paste(rep('N',correct), collapse='')
# Starting site for each gap
keep_i_tab <- n_i_tab[GAP %in% keep_gaps_i] %>%
copy %>%
.[, START:=INDEX==min(INDEX), by=GAP]
# Add in the info for the gaps to keep
seq_i_tab <- left_join(
seq_i_tab, keep_i_tab,
by=c('DNA','INDEX','IS.N')
) %>%
as.data.table()
# Replace the string of the start site for each gap
seq_i_tab[!is.na(GAP) & START==TRUE, DNA:=n_string]
# Keep only sites that are 'A', 'T', 'G', 'C', or the first
# site for each gap. Make sure the sites are in order then
# paste them back together and convert into a string set.
new_seq_i <- seq_i_tab[is.na(GAP) | (!is.na(GAP) & START==TRUE)] %>%
setorder(., INDEX) %>%
.[['DNA']] %>%
paste(., collapse='') %>%
DNAStringSet()
# Add in the sequence name, but note that it is corrected.
names(new_seq_i) <- paste0(names(genomeSS[i]), '_correct_gaps')
# Add to results
result_i <- new_seq_i
}
# Cleanup and return results
gc()
resultsList[[i]] <- result_i
# End ith iteration
}
do.call('c', resultsList)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.