R/PositionCal.R

Defines functions conservityTable findMaxLen findConservityFromS findConservedNul createPosVec

Documented in conservityTable createPosVec findConservedNul findConservityFromS findMaxLen

#' Find most conserved nucleotide in each position and its frequency
#'
#' This function grab the nucleotide for all of sequences at index position
#' to create a XString sequence for a collection of this nucleotide. Then
#' the function reformats the Xstring into a string clean it up.
#'
#' @param seqSet A set of aligned sequence read from fasta file
#' @param index A numeric index for generating Position String at that position.
#'
#' @return A String containing nucleotides at index position for sequences.
#'
#' @examples
#' # Example 1: Use a sequence set with short sequence from data,
#' # create a string containing nucleotides at index 1 for this sequence set.
#'
#' data(testSeqShort)
#' testSeq <- testSeqShort
#' position1Vec <- createPosVec(testSeq, 1)
#' position1Vec # "ATT"
#'
#' # Example 2: Use a raw data sequence set, create a string containing
#' # nucleotides at index 10 for these raw data sequences.
#' findpath <- system.file("extdata", "sampleSeq3.txt", package = "conservedPos")
#' rawSeq <- Biostrings::readBStringSet(findpath,"fasta")
#' position10Vec <- createPosVec(rawSeq, 10)
#' position10Vec <- "TTTCACTACCAA"
#' @export
createPosVec <- function(seqSet, index){
  position <- c()

  # Loop over each sequences and grab the nucleotide at index postion.
  for (i in 1: length(seqSet)){
    if(length(seqSet[[i]]) >= index){
      nul <- seqSet[[i]][index]
      nul <- paste(nul, "")
      position[i] <- nul
    }
  }

  # Reformat position from vector to a string and remove NA and whitespace
  position <- paste(position, collapse = "")
  position <- gsub(" ", "", position, fixed = TRUE)
  position <- gsub("NA", "", position, fixed = TRUE)
  return (position)
}

#' This function finds the nucleotide that has highest frequency of
#' in input string.
#'
#' @param str A string containing character nucleotides
#'
#' @return  Return a vector that has one or more nucleotides
#'          with highest frequency
#'
#' @examples
#' # Example 1: find the the letter in string
#' # with highest occurrence.
#'
#' str <- "ATTTGGGGCC"
#' findConservedNul(str) # "G"
#'
#' # Example 2: find the highest frequency nucleotide from a
#' # string of nucleotides generated by createPosVec()
#'
#' data(testSeqShort)
#' testSeq <- testSeqShort
#' position1Vec <- createPosVec(testSeq, 1)
#' findConservedNul(position1Vec) # "T"
#'
#' @references
#' Antonis. (2018) How to find the most repeated word in a vector with R
#' \emph{Stack oveflow}  \href{https://stackoverflow.com/questions/48632957/how-to-find-the-most-repeated-word-in-a-vector-with-r
#' }{Link}
#'
#' @export
findConservedNul <- function(str){
  split <- strsplit(str, "")
  nul <- names(which(table(split) == max(table(split))))
  return(nul)

}

#' This function finds the highest frequency of nucleotide in input string.
#'
#' @param str A string containing character nucleotides
#'
#' @return  Return a numeric value for highest frequency
#'
#' @examples
#' # Example 1: find the highest letter frequency in a string
#'
#' str <- "AADDDCCEEEE"
#' freq <- findConservityFromS(str)
#' freq # 0.363636...
#'
#' # Example 2: find the highest frequency of nucleotide from a string of
#' # nucleotide generated by createPosVec()
#' data(testSeqShort)
#' position1Vec <- createPosVec(testSeqShort, 1)
#' freq <- findConservityFromS(position1Vec)
#' freq # 0.66666...
#' @export
findConservityFromS <- function(str){

  # Split the str into character vector and find max occurances
  # of certain nucleotide.
  split <- strsplit(str, "")
  maxval <- max(table(split))

  # Use the length of the str to calculate the highest frequency
  # of the nulceotide.
  length <- nchar(str)
  return(maxval / length)
}

#' This function finds the max length of sequence for input sequence set
#'
#' @param seqSet seqSet A set of aligned sequence read from fasta file
#'
#' @return A numeric value of the max length of sequence for seqSet
#'
#' @examples
#' # Example 1: find the max length of a sequence set retrieved from raw data
#' findpath <- system.file("extdata", "sampleSeq3.txt", package = "conservedPos")
#' rawSeq <- Biostrings::readBStringSet(findpath,"fasta")
#' maxL <- findMaxLen(rawSeq)
#' maxL # 140
#'
#' # Example 2: find the max length of a sequence set retrieved from data
#' data(testSeqShort)
#' testSeq <- testSeqShort
#' maxL <- findMaxLen(testSeq)
#' maxL # 37
#'
#' @export

findMaxLen <- function(seqSet){
  max <- 0
  for (i in 1:length(seqSet)){
    if (max < length(seqSet[[i]])){
      max <- length(seqSet[[i]])
    }
  }
  return(max)
}

#' This function calculates the conservity of all positions for input
#' sequences and generate a vector contains each position's conservity
#'
#' @param seqSet seqSet A set of aligned sequence read from fasta file
#'
#' @return a vector contains each position's highest frequency
#'
#' @examples
#' # Example 1 : Create conservity Table for testSeqShort.
#' data(testSeqShort)
#' testSeq <- testSeqShort
#' table  <- conservityTable(testSeq)
#' # You will see a vector containing 37 numbers (max length), these numbers are
#' # the highest frequency for each position.
#' table
#'
#' # Example 2: Create conservity Table for another sequence set, a raw data
#' # sampleSeq3.txt. This should takes longer time to run.
#' findpath <- system.file("extdata", "sampleSeq3.txt", package = "conservedPos")
#' rawSeq <- Biostrings::readBStringSet(findpath,"fasta")
#' table <- conservityTable(rawSeq)
#' # Should have 140 numbers(max length of rawSeq), and
#' # each number is the the highest frequency for each position
#' table
#'
#' @export
conservityTable <- function(seqSet){
  # Find max length of sequences in seqSet, it is used as the max range of
  # position to compute the conservity. It also guarantees that each position
  # to be calculated even the sequences have slightly length differences.
  max <- findMaxLen(seqSet)

  # From index 1 to max length, create a position string for each index
  # and find the conservity from that index.
  conserVec <- c()
  for (i in 1:max){
    newstr <- createPosVec(seqSet, i)
    conser <- findConservityFromS(newstr)
    conserVec[i] <- conser
  }
  return(conserVec)
}
hezijin/conservedPos documentation built on Jan. 1, 2021, 3:18 a.m.