# R/misc_helper_functions.R

#' Build the upper-/lower case representation of an LNA-oligo from its modification
#'
#' Combines a sequence with its per-position modification string. The input is
#' converted to upper case, every "U" in the sequence is written as "T", and
#' each position whose modification is "D" is written in lower case. All other
#' positions stay in upper case.
#'
#' @param line.in a character vector of length 2 where the first element is the sequence 
#'  (e.g. ACGTGTTT) and the second element the modification (e.g. LLLDDDLL).
#' @return A single character string (e.g. "ACGtgtTT" for the example below).
#' @keywords LNA representation
#' @export
#' @examples
#'  seqmod2uplow(c("ACGTGTTT","LLLDDDLL"))
#' @seealso \code{\link{seqmod2plus}}
seqmod2uplow <- function(line.in) {
  # Upper-case both elements so lower-case input is treated the same way.
  parts <- toupper(line.in)

  # One element per position.
  is_dna <- strsplit(parts[2], "")[[1]] == "D"
  bases <- strsplit(parts[1], "")[[1]]

  # U is written as T in the output.
  bases[bases == "U"] <- "T"

  # Positions flagged "D" go to lower case.
  bases[is_dna] <- tolower(bases[is_dna])

  paste0(bases, collapse = "")
}

#' Build the plus (+) representation of an LNA-oligo
#'
#' Combines a sequence with its per-position modification string. The input is
#' converted to upper case, every "U" in the sequence is written as "T", and
#' each position whose modification is not "D" is prefixed with "+".
#'
#' @param line.in a character vector of length 2 where the first element is the sequence 
#'  (e.g. ACGTGTTT) and the second element the modification (e.g. LLLDDDLL).
#' @return A single character string (e.g. "+A+C+GTGT+T+T" for the example below).
#' @keywords LNA representation
#' @export
#' @examples
#'  seqmod2plus(c("ACGTGTTT","LLLDDDLL"))
#' @seealso \code{\link{seqmod2uplow}}
seqmod2plus <- function(line.in) {
  # Upper-case both elements so lower-case input is treated the same way.
  parts <- toupper(line.in)

  # One element per position; everything that is not "D" gets the "+" marker.
  needs_plus <- strsplit(parts[2], "")[[1]] != "D"
  bases <- strsplit(parts[1], "")[[1]]

  # U is written as T in the output.
  bases[bases == "U"] <- "T"

  bases[needs_plus] <- paste0("+", bases[needs_plus])

  paste0(bases, collapse = "")
}


#' Construct the upper-/lower case representation of LNA-oligos by flank-input 
#'
#' This function uses input on flank lengths to construct a upper-/lower case representation
#' of a gapmer: the first \code{leftflank} and the last \code{rightflank} characters are
#' written in upper case and everything in between in lower case.
#'
#' When the two flanks together are longer than the sequence there is no gap, and
#' the sequence is returned entirely in upper case. Every input character appears
#' exactly once in the output, so the result always has the same length as the input.
#'
#' @param o character string (the sequence) to be converted to gapmer format. A
#'  character vector is processed element-wise.
#' @param leftflank size of left LNA-flank.
#' @param rightflank size of right LNA flank.
#' @return A character vector of the same length as \code{o}, carrying the names of \code{o}.
#' @export
#' @examples
#'  sapply(c("ACGTGTTT","TCCGGAAT"),gapmerize)
#'  gapmerize("ACGT")   # flanks cover the whole oligo: "ACGT"
#' @seealso \code{\link{seqmod2uplow}}
gapmerize <- function(o, leftflank = 3, rightflank = 3) {
  n <- nchar(o)
  # Last position of the gap; smaller than leftflank + 1 when there is no gap.
  gap_end <- n - rightflank

  left <- substr(o, 1, leftflank)
  middle <- substr(o, leftflank + 1, gap_end)
  # Start the right flank after whichever ends later, the gap or the left
  # flank, so overlapping flanks never emit the same base twice.
  right <- substr(o, pmax(gap_end, leftflank) + 1, n)

  gapmer <- paste0(toupper(left), tolower(middle), toupper(right))
  names(gapmer) <- names(o)
  gapmer
}


#' Another version of seqmod2uplow
#'
#' Vectorised over oligos: for every pair of sequence and modification string the
#' sequence is converted to upper case and the positions whose modification is
#' "D" are then written in lower case. The modification strings are compared
#' as given (an upper-case "D" is required).
#'
#' @param seqs character vector of sequences.
#' @param mods modification strings, one per element of \code{seqs}; each must have
#'  the same number of characters as the corresponding sequence.
#' @return A character vector with one upper-/lower case string per input
#'  sequence; \code{character(0)} for empty input. An error is raised when a
#'  sequence and its modification string differ in length.
#' @export
seqdesign2camel <- function(seqs, mods) {
  seqs <- toupper(seqs)
  # Preallocate the result so empty input yields character(0).
  camelcases <- character(length(seqs))
  # seq_along() iterates zero times on empty input, unlike 1:length().
  for (i in seq_along(seqs)) {
    oligo <- seqs[[i]]
    design <- mods[[i]]
    if (nchar(oligo) != nchar(design)) {
      stop(paste(oligo, "and", design, "not of equal length, at", i))
    }
    dna <- strsplit(design, "")[[1]] == "D"
    oligo.split <- strsplit(oligo, "")[[1]]
    oligo.split[dna] <- tolower(oligo.split[dna])
    camelcases[i] <- paste(oligo.split, collapse = "")
  }
  camelcases
}


#' Build the sequence representation of LNA-oligos used in the Pythia database
#'
#' Merges sequence, modification, and backbone of a LNA-oligo into one string of
#' the form used in the Pythia database: one \code{<modification><base>:<backbone>}
#' unit per position, with the units joined by ";".
#'
#' @param line.in a character vector of length 3 where the first element is the sequence 
#'  (e.g. ACGTGTTT) and the second element the modification (e.g. LLLDDDLL), and the 
#'  final element is the backbone ("SSOSSOSH"). Notice all elements have same length.
#'  The backbone elements are between nucleotides and always have H in the end.
#' @return A single character string such as
#'  "oxya:S;oxymc:S;dnag:O".
#' @keywords LNA representation
#' @export
#' @examples
#'  a <- seqmod2pythia( c("acgtacgtee", "LLDDLDLDDD", "SSOSSOOSSH"))
#' @seealso \code{\link{pythia2seqmod}}
seqmod2pythia <- function(line.in) {
  ## one element per position: sequence in lower case, the other two in upper case
  bases <- strsplit(tolower(line.in[1]), "")[[1]]
  sugars <- strsplit(toupper(line.in[2]), "")[[1]]
  links <- strsplit(toupper(line.in[3]), "")[[1]]

  ## e is written as mc
  bases <- gsub("e", "mc", bases)

  ## translate the modification letters into pythia terminology
  pythia.terms <- c(D = "dna", L = "oxy", M = "moe")
  for (letter in names(pythia.terms)) {
    sugars <- gsub(letter, pythia.terms[[letter]], sugars)
  }

  ## <modification><base>:<backbone> per position, joined with ";"
  units <- paste(paste0(sugars, bases), links, sep = ":")
  joined <- paste(units, collapse = ";")

  ## replace oxyc with oxymc (since it is always that version with LNA)
  gsub("oxyc", "oxymc", joined)
}

#' Split up the sequence representation of LNA-oligos used in the Pythia database
#'
#' Takes the ";"-separated representation used in the Pythia database apart and
#' returns sequence, modification, and backbone of the LNA-oligo as three strings.
#'
#' @param line.in a character vector of length 1 with something like
#' oxya:S;oxymc:S;dnag:O;dnat:S;oxya:S;dnac:O;oxyg:O;dnat:S;dnamc:S;dnamc:H.
#' @return A character vector of length 3: the sequence (lower case), the
#'  modification (upper case) and the backbone (upper case).
#' @keywords LNA representation
#' @export
#' @examples
#'  a <- pythia2seqmod("oxya:S;oxymc:S;dnag:O;dnat:S;oxya:S;dnac:O;oxyg:O;dnat:S;dnamc:S;dnamc:H")
#' @seealso \code{\link{seqmod2pythia}}
pythia2seqmod <- function(line.in) {
  ## lower-case the input for robustness, then cut it into one unit per nucleotide
  units <- strsplit(tolower(line.in), ";")[[1]]
  fields <- strsplit(units, ":")

  ## the part after the ":" is the backbone
  backbone <- unlist(lapply(fields, function(f) f[2]))

  ## the part before the ":" holds modification and base back to back
  modbase <- unlist(lapply(fields, function(f) f[1]))
  modbase <- gsub("oxymc", "oxyc", modbase) #LNAs do not write E but C

  ## characters 4 onwards are the base, with mc written back as e
  bases <- gsub("mc", "e", substr(modbase, 4, 10))

  ## characters 1-3 are the modification, translated to its one-letter code
  mod.letters <- sapply(substr(modbase, 1, 3), function(term) {
    switch(term, oxy = "L", dna = "D", moe = "M")
  })

  ## return everything in the right case
  c(paste(bases, collapse = ""),
    toupper(paste(mod.letters, collapse = "")),
    toupper(paste(backbone, collapse = "")))
}

#' Filter LNA-oligos and construct the upper-/lower case representation based on modification-input
#'
#' This function filters LNA-oligos based on sequence, sugar modification and backbone and returns
#' the upper-/lower case representation. Only oligos suitable for the oligo-predictors are accepted.
#' All nucleobases other than A, C, G, T, U and E, sugar modifications other than L and D and backbones other 
#' than S (phosphorothioate) are excluded and result in NA. 
#' In the returned string U is written as T and E as C; positions with modification D are in lower case.
#'
#' @param line.in a character vector of length 3 where the first element is the sequence 
#'  (e.g. ACGTGTTT), the second element the modification (e.g. LLLDDDLL) and the third
#'  is the backbone (e.g. SSSSSSSS). An optional fourth element may be supplied;
#'  when present it must consist only of the letter X, otherwise the result is NA
#'  (this check is used to filter out stereo-defined oligos).
#' @return The upper-/lower case representation as a single character string, or
#'  \code{NA} for input that does not pass the filters.
#' @keywords LNA representation
#' @export
#' @examples
#'  seqmod2uplow.strict(c("ACGTGTTT","LLLDDDLL","SSSSSSSS"))
#' @seealso \code{\link{seqmod2uplow}}  \code{\link{seqmod2plus}}
#' 
seqmod2uplow.strict <- function(line.in) {
  line.in <- toupper(line.in)

  # Check whether line is compliant
  line.OK <- all(
    grepl("^[ACGTEU]+$", line.in[1]),  # only nucleobases A, C, G, T, U and E
    grepl("^[LD]+$", line.in[2]),      # only sugar modifications L and D
    grepl("^[S]+$", line.in[3])        # only backbone S
  )

  # The fourth element is optional. Checking it unconditionally made every
  # documented length-3 input non-compliant, because grepl() on the missing
  # (NA) element is FALSE.
  if (line.OK && length(line.in) >= 4) {
    line.OK <- grepl("^[X]+$", line.in[4])  # filter out all stereo defined
  }

  # Non-compliant lines are reported as NA.
  if (!line.OK) {
    return(NA)
  }

  # Determine upper-lower case notation for compliant lines.
  tf.dna <- strsplit(line.in[2], "")[[1]] == "D"
  ol.in <- strsplit(line.in[1], "")[[1]]
  ol.in[ol.in == "U"] <- "T"
  ol.in[ol.in == "E"] <- "C"
  ol.in[tf.dna] <- tolower(ol.in[tf.dna])
  paste(ol.in, collapse = "")
}
# Santaris/seqtools documentation built on May 9, 2019, 12:44 p.m.