R/ANF_RNA.R

Defines functions ANF_RNA

Documented in ANF_RNA

#'  Accumulated riboNucleotide Frequency (ANF_RNA)
#'
#' This function replaces each ribonucleotide with a vector of length four.
#' The first three elements encode the ribonucleotide and
#' the fourth holds the frequency of that ribonucleotide from the beginning of the sequence up to and including its own position.
#' 'A' will be replaced with c(1, 1, 1, freq), 'C' with c(0, 1, 0, freq), 'G' with c(1, 0, 0, freq), and 'U' with c(0, 0, 1, freq).
#'
#' @note This function is provided for sequences with the same lengths.
#' Users can use the 'txt' option in outFormat for sequences with different lengths.
#' Warning: If outFormat is set to 'mat' for sequences with different lengths, the function stops with an error.
#' Also, when the output format is 'txt', label information is not written to the text file.
#' Note that the 'txt' output is usable for machine learning purposes only if all sequences have
#' the same length.
#'
#' @references Chen, W., Tran, H., Liang, Z. et al. Identification and analysis of the N6-methyladenosine in the Saccharomyces cerevisiae transcriptome. Sci Rep 5, 13859 (2015).
#'
#' @param seqs is a FASTA file containing ribonucleotide sequences. The sequences start
#' with '>'. Also, seqs could be a string vector. Each element of the vector is a ribonucleotide sequence.
#'
#'
#' @param outFormat (output format) can take two values: 'mat'(matrix) and 'txt'. The default value is 'mat'.
#'
#' @param outputFileDist shows the path and name of the 'txt' output file.
#'
#'
#' @param label is an optional parameter. It is a vector whose length is equivalent to the number of sequences. It shows the class of
#' each entry (i.e., sequence).
#'
#' @return The output depends on the outFormat parameter which can be either 'mat' or 'txt'. If outFormat is 'mat', the function returns a feature
#' matrix for sequences with the same length such that the number of columns is (sequence length)*(4)
#' and the number of rows is equal to the number of sequences.
#' If the outFormat is 'txt', the output is written to a tab-delimited file.
#'
#'
#' @export
#'
#' @examples
#'
#' fileLNC<-system.file("extdata/Carica_papaya101RNA.txt",package="ftrCOOL")
#' mat<-ANF_RNA(seqs = fileLNC,outFormat="mat")
#'

ANF_RNA<-function(seqs,outFormat="mat",outputFileDist="",label=c()){

  # Reject an unsupported output format before any input is read or checked.
  if(length(outFormat)!=1||!(outFormat %in% c("mat","txt"))){
    stop("ERROR: outFormat should be 'mat' or 'txt' ")
  }

  # Input is either the path of a FASTA file or a vector of sequences.
  # fa.read() and alphabetCheck() are package helpers defined elsewhere;
  # alphabetCheck() is used here as returning list(sequences, labels).
  if(length(seqs)==1&&file.exists(seqs)){
    seqs<-fa.read(seqs,alphabet="rna")
    seqs_Lab<-alphabetCheck(seqs,alphabet = "rna",label)

    seqs<-seqs_Lab[[1]]
    label<-seqs_Lab[[2]]
  }
  else if(is.vector(seqs)){
    # vapply keeps (or creates) element names exactly like sapply did,
    # but always returns a character vector, even for empty input.
    seqs<-vapply(seqs,toupper,character(1))

    seqs_Lab<-alphabetCheck(seqs,alphabet = "rna",label)

    seqs<-seqs_Lab[[1]]
    label<-seqs_Lab[[2]]
  }
  else {
    stop("ERROR: Input sequence is not in the correct format. It should be a FASTA file or a string vector.")
  }

  # Three-element code of every ribonucleotide, one row per letter.
  nucCodes<-rbind("A"=c(1,1,1),"C"=c(0,1,0),"G"=c(1,0,0),"U"=c(0,0,1))

  # Encode one sequence as a numeric vector of length 4*nchar(x):
  # for every position, the three-element code of its ribonucleotide followed
  # by the accumulated frequency of that ribonucleotide, i.e.
  # (occurrences of the letter in positions 1..i) / i.
  encodeSeq<-function(x){
    chars<-unlist(strsplit(x,""))
    n<-length(chars)
    if(n==0){
      # An empty sequence has no positions to encode.
      return(numeric(0))
    }
    if(!all(chars %in% rownames(nucCodes))){
      # Letters other than A, C, G, U have no frequency bookkeeping; fail
      # loudly instead of silently producing a truncated or misaligned vector.
      stop("ERROR: Sequences should contain only the ribonucleotides 'A', 'C', 'G' and 'U'.")
    }
    freq<-numeric(n)
    for(nuc in rownames(nucCodes)){
      pos<-which(chars==nuc)
      # The k-th occurrence of a letter found at position p has frequency k/p.
      freq[pos]<-seq_along(pos)/pos
    }
    # Build a 4 x n matrix (code rows on top, frequency row last) and flatten
    # it column by column, yielding (code, freq) for each position in order.
    as.vector(rbind(t(nucCodes[chars,,drop=FALSE]),freq))
  }

  numSeqs<-length(seqs)

  if(outFormat=="mat"){

    lenSeqs<-vapply(seqs,nchar,integer(1),USE.NAMES = FALSE)

    if(length(unique(lenSeqs))>1){
      stop("ERROR: All sequences should have the same length in 'mat' mode. For sequences with different lengths, please use 'txt' for outFormat parameter")
    }

    if(numSeqs==0){
      # Nothing to encode: return an empty matrix instead of failing in t().
      return(matrix(numeric(0),nrow = 0,ncol = 0))
    }

    # One column per sequence (named after the sequence); transposed below so
    # that each row is a sequence and each column a feature.
    featureMatrix<-vapply(seqs,encodeSeq,numeric(4*lenSeqs[1]))

    return(t(featureMatrix))

  }

  # outFormat == "txt": one tab-delimited line per sequence, appended to
  # outputFileDist (an empty path makes write() print to the console).
  nameSeq<-names(seqs)
  outLines<-vapply(seq_len(numSeqs),function(i){
    paste(c(nameSeq[i],encodeSeq(seqs[[i]])),collapse = "\t")
  },character(1))

  if(numSeqs>0){
    write(outLines,outputFileDist,append = TRUE)
  }

  invisible(NULL)
}

Try the ftrCOOL package in your browser

Any scripts or data that you put into this service are public.

ftrCOOL documentation built on Nov. 30, 2021, 1:07 a.m.