#' Checks if provided object is of class DNAString
#'
#' @param x - R object
#'
#' @return True if object is DNAString, false otherwise
#' @export
#'
#' @examples
#' is_DNAstring(Biostrings::DNAString("AGCTAGCA"))
#' is_DNAstring("AGCTAGCA")
is_DNAstring <- function(x) {
return(isTRUE(class(x)[1] == 'DNAString'))
}
#' Checks if provided object is of class DNAStringSet
#'
#' @param x - R object
#'
#' @return True if object is DNAString, false otherwise
#' @export
#'
#' @examples
#' is_DNAstringSet(Biostrings::DNAStringSet(c("AGCTAGCA","CACGATC","CGACGAC")))
#' is_DNAstringSet("AGCTAGCA")
is_DNAstringSet <- function(x) {
return(isTRUE(class(x)[1]=='DNAStringSet'))
}
#' Calculates %GC content for a single DNAString object
#'
#' @param x DNAString object
#'
#' @return %GC content of provided sequence
#' @export
#'
#' @examples
#' calc_GC_single(Biostrings::DNAString("CGAGCACACATCACAC"))
calc_GC_single <- function(x) {
#check if Biostrings package is loaded
sequence_length_min = 4 # set the minimum sequence length requried for %GC calculation
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop("Biostrings package needed for this function to work. Please install it.",
call. = FALSE)
}
#check if parameters are provided
if (missing(x)) {
stop("The sequence object (argument x) is missing",
call. = FALSE)
}
#check if input is of DNAString class
assertthat::assert_that(is_DNAstring(x))
#get length of provided DNAString object (sequence)
seq_length = length(x)
#calculate %GC if sequence length is > sequence_length_min (default = 4)
if(seq_length>=sequence_length_min) {
GC <- sum(Biostrings::alphabetFrequency(x)[c(2,3)])/seq_length*100
}
else{
#if sequence is too short - return NA
GC <- NA
}
return(GC)
}
#' Split a DNAString object in smaller chunks
#'
#' @param x DNAString object
#' @param window size of a sequence chunk
#' @param overlap size of the overlap between chunks (have to be < window, default = 0)
#'
#' @return DNAStringSet object
#' @export
#'
#' @examples
#' split_in_windows(Biostrings::DNAString("ACGTAGCATGCTAGCTACGTAGCTA"),4,1)
split_in_windows <- function(x,window,overlap = 0) {
#check if Biostrings package is loaded
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop("Biostrings package needed for this function to work. Please install it.",
call. = FALSE)
}
#check if parameters are provided
if (missing(x)) {
stop("The sequence object (argument x) is missing",
call. = FALSE)
}
if (missing(window)) {
stop("Window argument is missing",
call. = FALSE)
}
#check if DNAString object is provided as input
assertthat::assert_that(is_DNAstring(x))
#make sure that window and overlap parameters are numeric
assertthat::assert_that(is.numeric(window))
assertthat::assert_that(is.numeric(overlap))
#assure that window size is less (or equal) than the length of sequence
assertthat::assert_that(window<=length(x))
#assure that window size is a positive integer
assertthat::assert_that(window>0)
#make sure that overlap is not less than 0
assertthat::assert_that(overlap>=0)
#assure that overlap is not larger (or equal to) than window
assertthat::assert_that(overlap<window)
#get provided sequence length
seq_length = length(x)
#initialize vector for return values
return_vector <- character(0)
#initialize start position for the first chunk
start=1
#initialize end position for the first chunk
end=window
#chunk number iterator
chunk_no = 1
#while the end fo the chunk is not after the sequence end, keep processing
while (end<=seq_length) {
#create temporary chunk, using subseq function from Biostrings
temp_seq<-as.character(Biostrings::subseq(x,start,end))
#store created chunk in the return vector
return_vector[chunk_no]<-temp_seq
#increase iterators
chunk_no=chunk_no+1
start=start+window-overlap
end=end+window-overlap
}
#if after the while loop start position is still within the sequence, create the last chunk
if(start<seq_length) {
temp_seq<-as.character((Biostrings::subseq(x,start,seq_length)))
return_vector[chunk_no]<-temp_seq
}
#create the DNAStringSet object, which will be returned by function
return_set<-Biostrings::DNAStringSet(return_vector)
return(return_set)
}
#' Wrapper for calc_GC_single
#'
#' @param x DNAString or DNAStringSet object
#'
#' @return %GC for single sequence or vector of %GC if DNAStringSet was provided as input
#' @export
#'
#' @examples
#' calc_GC(Biostrings::DNAString("ACAGCAGTCA"))
#' calc_GC(Biostrings::DNAStringSet(c("ACAGCAGTCA","ACGTAGCATCGA","CGATCGACA")))
calc_GC <- function(x) {
#check if Biostrings package is loaded
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop("Biostrings package needed for this function to work. Please install it.",
call. = FALSE)
}
#check if parameters are provided
if (missing(x)) {
stop("The sequence object (argument x) is missing",
call. = FALSE)
}
#make sure that DNAString of DNAStringSet is provided as an input
assertthat::assert_that(is_DNAstring(x) || is_DNAstringSet(x))
#if DNAString is an input simply call calc_GC_single
if(is_DNAstring(x)) {
GC = calc_GC_single((x))
}
#if an input is a DNAStringSet object, calculate GC for each element fro mthe set
else if (is_DNAstringSet(x)) {
#get the number of elements in DNAStringSet
no_elements_in_set = length(x)
#initialize numeric vector for return values
GC <- numeric(0)
#calculate %GC for each element in set
for (z in seq(1,no_elements_in_set)) {
GC_temp <- calc_GC_single(x[[z]])
GC<-c(GC,GC_temp)
}
}
return(GC)
}
#' Split provided DNAString in chunks and calculate %GC for each chunk
#'
#' @param x DNAString object
#' @param window size of a sequence chunk
#' @param overlap size of the overlap between chunks (have to be < window, default =0)
#'
#' @return vector of calculated %GC content for each chunk generated
#' @export
#'
#' @examples
#' calc_windowed_GC(Biostrings::DNAString("CAGTCAGTCGATC"),10,2)
calc_windowed_GC <- function(x,window,overlap = 0) {
#check if Biostrings package is loaded
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop("Biostrings package needed for this function to work. Please install it.",
call. = FALSE)
}
#check if parameters are provided
if (missing(x)) {
stop("The sequence object (argument x) is missing",
call. = FALSE)
}
if (missing(window)) {
stop("Window argument is missing",
call. = FALSE)
}
#make sure that DNAString object is provided as an input
assertthat::assert_that(is_DNAstring(x))
#split provided input in chunks
splitted_seq <- split_in_windows(x,window,overlap)
#calculate %GC for created chunks
GC <- calc_GC(splitted_seq)
return(GC)
}
#' Title Get kmer which is most frequent in provided sequence
#'
#' @param x DNAString object
#' @param k kmer length
#' @param force_large_kmer logical, should calcualtion of large kmers be possible (but shorter than 15)
#'
#' @return named numeric, with count of given kmer and its sequence as name
#' @export
#'
#' @examples
#' get_max_kmer_single(Biostrings::DNAString("CAGCTGCACACATCGTACA"),3)
get_max_kmer_single <- function(x,k,force_large_kmer = FALSE) {
#check if Biostrings package is loaded
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop("Biostrings package needed for this function to work. Please install it.",
call. = FALSE)
}
#check if parameters are provided
if (missing(x)) {
stop("The sequence object (argument x) is missing",
call. = FALSE)
}
if (missing(k)) {
stop("Kmer size(k) argument is missing",
call. = FALSE)
}
#make sure that DNAString object is provided as an input
assertthat::assert_that(is_DNAstring(x))
#make sure that kmer size k is a numeric variable
assertthat::assert_that(is.numeric(k))
#make sure that kmer size k is a positive number
assertthat::assert_that(k>0)
#make sure that kmer size k is not larger than sequence length
assertthat::assert_that(k<length(x))
#if kmer size k is larger than 10 - stop calculation unless force_large_kmer is TRUE
if (k>10 && !isTRUE(force_large_kmer)) {
stop("Calculation of kmers larger than 10 is computationally intensive and may take a long time.
If you need that, specify force_large_kmer=TRUE (use with caution!)",
call. = FALSE)
}
#if mer size is larger than 15 - stop calculation without any option of forcing calculation
if (k>15) {
stop("Calculation of kmers larger than 15 is extremally computationally intensive and it is discouraged to use.",
call. = FALSE)
}
#calculate counts for each possible kmer of size k
kmersCount <- Biostrings::oligonucleotideFrequency(x,k)
#get most frequent kmer
max_kmer_count <- max(kmersCount)
#get sequence of the most frequent kmer
max_kmer_seq <- names(which.max(kmersCount))
#create return variable
names(max_kmer_count) <- max_kmer_seq
return(max_kmer_count)
}
#' Calculates maximum kmer for DNA sequence object (DNAString or DNAStringSet)
#'
#' @param x DNAString or DNAStringSet object
#' @param mer kmer size
#'
#' @return named vector of max kmer counts with kmer sequences as names
#' @export
#'
#' @examples
#' get_max_kmer(Biostrings::DNAString("CAGCGATCGTACACGATC"),4)
#' get_max_kmer(Biostrings::DNAStringSet(c("CAGCGATCGTACACGATC","CAGCGTAGCTAG","CAGCATCACACACAC")),3)
get_max_kmer <- function(x,k) {
#check if Biostrings package is loaded
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop("Biostrings package needed for this function to work. Please install it.",
call. = FALSE)
}
#check if parameters are provided
if (missing(x)) {
stop("The sequence object (argument x) is missing",
call. = FALSE)
}
if (missing(k)) {
stop("Kmer size(k) argument is missing",
call. = FALSE)
}
#make sure an input is provided as DNAString or DNAStringSet object
assertthat::assert_that(is_DNAstring(x) || is_DNAstringSet(x))
#make sure that kmer size k is a numeric variable
assertthat::assert_that(is.numeric(k))
#make sure k is a positive number
assertthat::assert_that(k>0)
#if DNAString object is provided as an input - simply call get_max_kmer_single
if(is_DNAstring(x)) {
max_kmer = get_max_kmer_single(x,k)
}
#for DNAStringSet object - calculate get_max_kmer_single for each sequence in the set
else if (is_DNAstringSet(x)) {
no_elements_in_set = length(x)
max_kmer <- numeric(0)
for (z in seq(1,no_elements_in_set)) {
#if a sequence is longer than kmer size k, get max kmer
if (length(x[[z]])>k) {
max_kmer_temp <- get_max_kmer_single(x[[z]],k)
}
else {
#if kmer size k is larger than sequence length - return NA
max_kmer_temp <- NA
}
max_kmer<-c(max_kmer,max_kmer_temp)
}
}
return(max_kmer)
}
#' Plot %GC distribution across sequence in fragments of given size
#'
#' @param x DNAString object
#' @param window size of the sequence chunk
#' @param overlap size of the overlap between chunks
#'
#' @return ggplot2 geom_smooth plot with %GC distribution across sequence
#' @export
#'
#' @examples
#' plot_GC_distribution(Biostrings::DNAString("CAGCTAGCTAGCTAGGCAGCTAGCTAGCTAGCATGCTAGCTAGCTAGCTACGTACGTAGCTACACTAGCTAGCTAGCTAGCTAATAATTATGGCGCGC"),6,2)
plot_GC_distribution <- function(x,window,overlap) {
#check if Biostrings package is loaded
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop("Biostrings package needed for this function to work. Please install it.",
call. = FALSE)
}
#make sure an input is provided as DNAString object
assertthat::assert_that(is_DNAstring(x))
#make sure that input sequence will be divided in proper number of chunks (at least 20)
assertthat::assert_that(length(x)/(window-overlap)>20)
#calculate step size - how sliding window is moving
step=window-overlap
#calculate start position for each chunk based on calculated step size
positions<-seq(1,length(x)-window+step+1,step)
#calculate %GC for chunks
GC <- calc_windowed_GC(x,window,overlap)
#name each calculated value with the start position of chunk
GC_frame <- data.frame(position=positions,GC=GC)
#create plot using geom_smooth from ggplot2
plot <- ggplot2::ggplot(GC_frame,ggplot2::aes(x=position,y=GC)) + ggplot2::geom_smooth(span=0.1) + ggplot2::scale_y_continuous() + ggplot2::ggtitle("%GC content distribution accross sequence")
return(plot)
}
#' Plots a distribution of given sequence fragment across the whole sequence
#'
#' @param x DNAString object
#' @param kmer - string
#'
#' @return plot with distribution of given kmer across sequence
#' @export
#'
#' @examples
#' plot_kmer_distribution(Biostrings::DNAString("CAGCTAGCTAGCTAGGCAGCTAGCTAGCTAGCATGCTAGCTAGCTAGCTACGTACGTAGCTACACTAGCTAGCTAGCTAGCTAATAATTATGGCGCGC"),"TAC")
plot_kmer_distribution <- function(x,kmer) {
#check if Biostrings package is loaded
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop("Biostrings package needed for this function to work. Please install it.",
call. = FALSE)
}
#define limits for linearange geom
min_plot_value = -0.1
max_plot_value = 0.1
#make sure an input is provided as DNAString object
assertthat::assert_that(is_DNAstring(x))
#make sure that provided kmer sequence is a character variable
assertthat::assert_that(is.character(kmer))
#make sure that kmer is not an empty string
assertthat::assert_that(nchar(kmer)>0)
#make sure that kmer variable contain only DNA nucleotide letters
if (grepl("[^AGCT]",kmer)) {
stop("Wrong character in the kmer sequence provided. Please use only {A,G,C,T} letters.",
call. = FALSE)
}
#find all occurences of kmer in sequence x using matchPattern from Biostrings
kmer_matches<-Biostrings::matchPattern(kmer,x)
if (length(kmer_matches)>0) {
#if any occurence was found, get position of each occurence using start function from Biostrings
kmer_starts=Biostrings::start(kmer_matches)
#create data frame with values
positions_df<-data.frame(position=kmer_starts,value_min=rep(min_plot_value,length(kmer_starts)),value_max=rep(max_plot_value,length(kmer_starts)))
#plot distribution of given kmer using geom_linerange from ggplot2
plot <- ggplot2::ggplot(positions_df,ggplot2::aes(x=position)) + ggplot2::geom_linerange(ggplot2::aes(ymin=value_min,ymax=value_max)) + ggplot2::scale_y_continuous(limits = c(-2,2)) + ggplot2::ggtitle(paste(kmer," occurences in sequence (",length(kmer_starts)," times)"))
return(plot)
}
else {
#if pattern was not found throw error
stop("Can't find provided pattern in the sequence")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.