#' Find most conserved nucleotide in each position and its frequency
#'
#' This function grab the nucleotide for all of sequences at index position
#' to create a XString sequence for a collection of this nucleotide. Then
#' the function reformats the Xstring into a string clean it up.
#'
#' @param seqSet A set of aligned sequence read from fasta file
#' @param index A numeric index for generating Position String at that position.
#'
#' @return A String containing nucleotides at index position for sequences.
#'
#' @examples
#' # Example 1: Use a sequence set with short sequence from data,
#' # create a string containing nucleotides at index 1 for this sequence set.
#'
#' data(testSeqShort)
#' testSeq <- testSeqShort
#' position1Vec <- createPosVec(testSeq, 1)
#' position1Vec # "ATT"
#'
#' # Example 2: Use a raw data sequence set, create a string containing
#' # nucleotides at index 10 for these raw data sequences.
#' findpath <- system.file("extdata", "sampleSeq3.txt", package = "conservedPos")
#' rawSeq <- Biostrings::readBStringSet(findpath,"fasta")
#' position10Vec <- createPosVec(rawSeq, 10)
#' position10Vec <- "TTTCACTACCAA"
#' @export
createPosVec <- function(seqSet, index){
position <- c()
# Loop over each sequences and grab the nucleotide at index postion.
for (i in 1: length(seqSet)){
if(length(seqSet[[i]]) >= index){
nul <- seqSet[[i]][index]
nul <- paste(nul, "")
position[i] <- nul
}
}
# Reformat position from vector to a string and remove NA and whitespace
position <- paste(position, collapse = "")
position <- gsub(" ", "", position, fixed = TRUE)
position <- gsub("NA", "", position, fixed = TRUE)
return (position)
}
#' This function finds the nucleotide that has highest frequency of
#' in input string.
#'
#' @param str A string containing character nucleotides
#'
#' @return Return a vector that has one or more nucleotides
#' with highest frequency
#'
#' @examples
#' # Example 1: find the the letter in string
#' # with highest occurrence.
#'
#' str <- "ATTTGGGGCC"
#' findConservedNul(str) # "G"
#'
#' # Example 2: find the highest frequency nucleotide from a
#' # string of nucleotides generated by createPosVec()
#'
#' data(testSeqShort)
#' testSeq <- testSeqShort
#' position1Vec <- createPosVec(testSeq, 1)
#' findConservedNul(position1Vec) # "T"
#'
#' @references
#' Antonis. (2018) How to find the most repeated word in a vector with R
#' \emph{Stack oveflow} \href{https://stackoverflow.com/questions/48632957/how-to-find-the-most-repeated-word-in-a-vector-with-r
#' }{Link}
#'
#' @export
findConservedNul <- function(str){
split <- strsplit(str, "")
nul <- names(which(table(split) == max(table(split))))
return(nul)
}
#' This function finds the highest frequency of nucleotide in input string.
#'
#' @param str A string containing character nucleotides
#'
#' @return Return a numeric value for highest frequency
#'
#' @examples
#' # Example 1: find the highest letter frequency in a string
#'
#' str <- "AADDDCCEEEE"
#' freq <- findConservityFromS(str)
#' freq # 0.363636...
#'
#' # Example 2: find the highest frequency of nucleotide from a string of
#' # nucleotide generated by createPosVec()
#' data(testSeqShort)
#' position1Vec <- createPosVec(testSeqShort, 1)
#' freq <- findConservityFromS(position1Vec)
#' freq # 0.66666...
#' @export
findConservityFromS <- function(str){
# Split the str into character vector and find max occurances
# of certain nucleotide.
split <- strsplit(str, "")
maxval <- max(table(split))
# Use the length of the str to calculate the highest frequency
# of the nulceotide.
length <- nchar(str)
return(maxval / length)
}
#' This function finds the max length of sequence for input sequence set
#'
#' @param seqSet seqSet A set of aligned sequence read from fasta file
#'
#' @return A numeric value of the max length of sequence for seqSet
#'
#' @examples
#' # Example 1: find the max length of a sequence set retrieved from raw data
#' findpath <- system.file("extdata", "sampleSeq3.txt", package = "conservedPos")
#' rawSeq <- Biostrings::readBStringSet(findpath,"fasta")
#' maxL <- findMaxLen(rawSeq)
#' maxL # 140
#'
#' # Example 2: find the max length of a sequence set retrieved from data
#' data(testSeqShort)
#' testSeq <- testSeqShort
#' maxL <- findMaxLen(testSeq)
#' maxL # 37
#'
#' @export
findMaxLen <- function(seqSet){
max <- 0
for (i in 1:length(seqSet)){
if (max < length(seqSet[[i]])){
max <- length(seqSet[[i]])
}
}
return(max)
}
#' This function calculates the conservity of all positions for input
#' sequences and generate a vector contains each position's conservity
#'
#' @param seqSet seqSet A set of aligned sequence read from fasta file
#'
#' @return a vector contains each position's highest frequency
#'
#' @examples
#' # Example 1 : Create conservity Table for testSeqShort.
#' data(testSeqShort)
#' testSeq <- testSeqShort
#' table <- conservityTable(testSeq)
#' # You will see a vector containing 37 numbers (max length), these numbers are
#' # the highest frequency for each position.
#' table
#'
#' # Example 2: Create conservity Table for another sequence set, a raw data
#' # sampleSeq3.txt. This should takes longer time to run.
#' findpath <- system.file("extdata", "sampleSeq3.txt", package = "conservedPos")
#' rawSeq <- Biostrings::readBStringSet(findpath,"fasta")
#' table <- conservityTable(rawSeq)
#' # Should have 140 numbers(max length of rawSeq), and
#' # each number is the the highest frequency for each position
#' table
#'
#' @export
conservityTable <- function(seqSet){
# Find max length of sequences in seqSet, it is used as the max range of
# position to compute the conservity. It also guarantees that each position
# to be calculated even the sequences have slightly length differences.
max <- findMaxLen(seqSet)
# From index 1 to max length, create a position string for each index
# and find the conservity from that index.
conserVec <- c()
for (i in 1:max){
newstr <- createPosVec(seqSet, i)
conser <- findConservityFromS(newstr)
conserVec[i] <- conser
}
return(conserVec)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.