R/folding.r

## Functions and wrappers to fold nucleotide sequences

#' Folds a vector of sequences using a system call to Vienna RNAfold.
#' 
#' This function is a wrapper for RNAfold from the ViennaRNA package. It performs thermodynamic folding of the input and 
#' estimates the minimum free energy (mfe) structure.
#' The ViennaRNA package needs to be installed on the system and present on the PATH
#' 
#' @param seqs a character vector og DNAStringSet object with the sequences to fold
#' @export
#' @return a data frame with columns sec_structure (dot-bracket format) and mfe (kcal/mol)
#' @seealso \code{\link{RNAduplex}}
RNAfold = function(seqs){
  
  if(file.exists('/etc/profile.d/zz_modules.sh')){
    cmmnd <- '/pstore/data/ricc/RNAfoldR.sh'
  }else{
    cmmnd <-"RNAfold --noPS"
  }
  
  output <- system(cmmnd, input=c(seqs),intern = T)
  output <- output[grepl('[0-9]', output)]
  struc <- substr(output, 1, nchar(seqs))
  energy <- substr(output, nchar(seqs)+1, nchar(output))
  energy <- suppressWarnings(as.double(gsub(' |[)]|[()]', '', energy)))
  data.frame(sec_structure= struc,
             sec_energy = energy)
  #note that the only way to keep track of the returned value is that the order
  #is the same as the input sequences
}

#' Estimates the duplex structure of a sequence (oligo) pairing with a copy of itself.
#' 
#' This function is a wrapper for RNAduplex from the ViennaRNA package. It
#' performs thermodynamic duplex folding of the input estimates the minimum free
#' energy (mfe) structure. The ViennaRNA package needs to be installed on the
#' system and present on the PATH
#'
#' @param seqs a character vector or DNAStringSet object with the sequences to
#'   fold
#' @export
#' @importFrom Biostrings reverseComplement
#' @return a data frame with columns duplex_structure (dot-bracket format),
#'   duplex_pos and duplex_energy (kcal/mol)
#' @seealso \code{\link{RNAfold}}
RNAduplex = function(seqs, targetseqs=NULL){
  
  if(is.null(targetseqs)){ 
    targetseqs <- seqs
  }
  
  if(file.exists('/etc/profile.d/zz_modules.sh')){
    cmmnd <- '/pstore/data/ricc/RNAduplexR.sh'
  }else{
    cmmnd <-"RNAduplex"
  }
  
  input0 <- NA
  input0[seq(1,by=2,length=length(seqs))] <- seqs
  input0[seq(2,by=2,length=length(seqs))] <- targetseqs
  tmp2 <- system(cmmnd,input = input0, intern = TRUE)
  tmp2 <- strsplit(tmp2, "\\s+", perl = T)
  
  # duplex_pos and duplex_energy
  data.frame(
    duplex_structure = sapply(tmp2, function(x) x[1]),
    duplex_pos =  gsub(',','-',sapply(
      tmp2, function(x){
        paste0(x[2:4], collapse = ' ')} )) ,
    duplex_energy =
      as.numeric(sapply(
        tmp2, function(x){
          gsub("\\(|\\)", "", tail(x, n=1) ) 
        })
      )
  )
  #note that the only way to keep track of the returned value is that the order
  #is the same as the input sequences
}

#' Estimate nucleotide resolution transcript accessibilty for a transcript
#'
#' This function is the R version of RNAplfold from the ViennaRNA package (Tafer et al., 2008). It 
#' evaluates accessibility (probability of being unpaired) for a transcript. The VienneRNA
#' package needs to be installed (version 2.0.7 from http://www.tbi.univie.ac.at/~ronny/RNA/vrna2_source.html).
#'
#' @param seq.char a character vector with the nucleotide sequence
#' @param L.in allow only pairs (i,j) with j-i <= L.in
#' @param W.in average pair probabilities over windows of size W.in.
#' @param u.in compute the mean probability that regions of length 1 to u.in are unpaired.
#' @keywords transcript accessibility
#' @export
RNAplfoldR <- function(seq.char, L.in=40, W.in=80, u.in=16) {
  
  prefix <- paste0(sample(LETTERS[1:28],size = 10,replace = TRUE), 
                   collapse = '')
  
  
  seq.char <- as.character(seq.char)
  if (nchar(seq.char)<3) {
    stop("Your sequence need to be a character vector more than 3nt long.")
  }
  
  ## check that the ViennaRNA command RNAplfold can be run
  ## if there is a version difference, warn
  if(file.exists('/etc/profile.d/zz_modules.sh')){
    
    system(paste("/pstore/data/ricc/RNAplfoldR.sh -L", L.in, 
                 "-W", W.in,"-u", u.in,"-i ", prefix), 
           input = seq.char, 
           intern = TRUE)
    
  }else{
    cmmnd <- "RNAplfold -V"
    cmmnd2 <- paste("RNAplfold -L",L.in,"-W",W.in,"-u",u.in)
    
    
    a <- tryCatch(
      {suppressMessages(system(cmmnd, intern=T,ignore.stderr = TRUE))},
      error=function(cond) {return(NA)},
      warning=function(cond) {return(NA)} #,
      # finally={}
    )    
    if (is.na(a)) {
      stop("RNAplfold is missing, please install")
    }
    
    
    ## set up the RNAplfold command and pipe the sequence data to it
    system(paste0(cmmnd2,' --id-prefix ', prefix), 
           input=seq.char, intern=TRUE)
  }
  
  ## read the plfold_lunp file outputted by RNAplfold and format it
  acc.tx <- read.delim(paste0(prefix,'_0001_lunp'), as.is=T, skip=2, header=F, row.names=1)
  acc.tx <- acc.tx[,colSums(is.na(acc.tx))!=nrow(acc.tx)]
  colnames(acc.tx) <- 1:ncol(acc.tx)
  
  file.remove(paste0(prefix,'_0001_lunp'))
  file.remove(paste0(prefix,'_0001_dp.ps'))
  
  return(acc.tx)
}


#' Find minimum free energy hybridisations of a long (target) and a short (query) RNA
#'
#' This function is the R version of RNAhybrid by Rehmsmeier et al., RNA, 10:1507-1517, 2004. 
#' The hybridisation is performed in a kind of domain mode, ie. the short sequence is hybridised
#' to the best fitting parts of the long one. The command line tool RNAhybrid
#' needs to be installed (version 2.1 from http://bibiserv.techfak.uni-bielefeld.de/rnahybrid/dl_pre-page.html).
#' There is no I/O activity so this function can be easily parallelized.
#'
#' @param seq.target a character vector with the long nucleotide sequence
#' @param seq.query a character vector with the short nucleotide sequence
#' @param b maximal number of hits to show. Hits with increasing minimum free energy 
#'     (reminder: larger energies are worse) are shown untill there are no more hits. 
#'      Hits may only overlap at dangling bases (5' or 3' unpaired end of target).
#' @keywords rnahybrid mimumum free energy hybridization
#' @export
#' @examples
#' a <- RNAhybridR("AACCTTGG", "ACTG")
RNAhybridR <- function(seq.target, seq.query, b=1) {
  if (nchar(seq.target)<250000) {
    if(file.exists('/etc/profile.d/zz_modules.sh')){
      cmmnd <- paste0('source /etc/profile.d/zz_modules.sh && ml RNAhybrid && ',
        paste("RNAhybrid -b",b,"-c -s 3utr_worm",seq.target,seq.query))
    }else{
      cmmnd <- paste("RNAhybrid -b",b,"-c -s 3utr_worm",seq.target,seq.query)  
    }
    tst <- system(cmmnd, intern = T)
    tst <- strsplit(tst,":")[[1]]
    mfe <- as.numeric(tst[5])	#kcal/mol
  } else {
    mfe <- NA
  }
  return(mfe)
}
Santaris/seqtools documentation built on May 9, 2019, 12:44 p.m.