#' eMIRNA Function for calculating Features from provided filtered sequences.
#'
#' \code{eMIRNA.Features} Returns a data.frame with calculated Features from each
#' of the provided filtered sequences. FASTA file must be previously filtered by
#' length and by Secondary Folding Structure, prior to be subjected to the
#' eMIRNA.Features module.
#'
#' @param file Path to FASTA file previously filtered by length and by Secondary
#' Folding Structure.
#' \code{file}.
#'
#' @param prefix Desired name for Features .csv file output.
#'
#' @param rescale Boolean for performing rescaling of calculated Features.
#' If \code{rescale = TRUE}, features will be rescaled. If \code{rescale = FALSE},
#' features will remain not rescaled.
#'
#' @examples
#' eMIRNA.Features("~/eMIRNA/FilterStructure_Results/Candidates_filter_nloop.fa",
#' "FASTA")
#'
#'@import stringr
#'@import Biobase
#'@import scales
#' @export
eMIRNA.Features <- function(file, prefix, rescale=TRUE){
setwd("~/")
Dir0 <- "eMIRNA"
dir.create(file.path(Dir0), showWarnings=FALSE)
Dir <- "Feature_Results"
setwd("~/eMIRNA/")
dir.create(file.path(Dir), showWarnings = FALSE)
workdir <- "~/eMIRNA/Feature_Results/"
setwd(workdir)
message("Calculating Features")
if (substr(file, nchar(file)-16+1, nchar(file)) == '_filter_nloop.fa'){
command1 <- paste0("RNAfold --MEA -d2 -p --noPS -i ", file)
RNAfold1 <- system(command1, intern=TRUE)
###########################################
############ Sequence Features ############
###########################################
#Triplet estimation from SVM-Triplet (Triplets)
command2 <- paste0("RNAfold --noPS -i ", file, " > RNAfold_pred.txt")
system(command2)
command2.1 <- "1_check_query_content.pl RNAfold_pred.txt RNAfold_pred_checked.txt"
system(command2.1)
command2.2 <- "2_get_stemloop.pl RNAfold_pred_checked.txt RNAfold_pred_stemloop.txt 5"
system(command2.2)
command2.3 <- "3_step_triplet_coding_for_queries.pl RNAfold_pred_stemloop.txt Triplets.txt"
system(command2.3)
Triplets1 <- "Triplets.txt"
Triplets1 <- unlist(lapply(Triplets1, readLines))
Triplets.split1 <- strsplit(Triplets1, "\\s")
Triplets1 <- as.numeric(as.character(unlist(Triplets.split1)))
Triplets.along <- seq_along(Triplets1)
Triplets <- split(Triplets1, ceiling(Triplets.along/32))
Triplets <- do.call(rbind, Triplets)
colnames(Triplets) <- c("1","2","3","4","5","6","7","8","9","10","11","12","13",
"14","15","16","17","18","19","20","21","22","23","24","25",
"26","27","28","29","30","31","32")
T1 <- as.vector(Triplets[,1])
T2 <- as.vector(Triplets[,2])
T3 <- as.vector(Triplets[,3])
T4 <- as.vector(Triplets[,4])
T5 <- as.vector(Triplets[,5])
T6 <- as.vector(Triplets[,6])
T7 <- as.vector(Triplets[,7])
T8 <- as.vector(Triplets[,8])
T9 <- as.vector(Triplets[,9])
T10 <- as.vector(Triplets[,10])
T11 <- as.vector(Triplets[,11])
T12 <- as.vector(Triplets[,12])
T13 <- as.vector(Triplets[,13])
T14 <- as.vector(Triplets[,14])
T15 <- as.vector(Triplets[,15])
T16 <- as.vector(Triplets[,16])
T17 <- as.vector(Triplets[,17])
T18 <- as.vector(Triplets[,18])
T19 <- as.vector(Triplets[,19])
T20 <- as.vector(Triplets[,20])
T21 <- as.vector(Triplets[,21])
T22 <- as.vector(Triplets[,22])
T23 <- as.vector(Triplets[,23])
T24 <- as.vector(Triplets[,24])
T25 <- as.vector(Triplets[,25])
T26 <- as.vector(Triplets[,26])
T27 <- as.vector(Triplets[,27])
T28 <- as.vector(Triplets[,28])
T29 <- as.vector(Triplets[,29])
T30 <- as.vector(Triplets[,30])
T31 <- as.vector(Triplets[,31])
T32 <- as.vector(Triplets[,32])
StemF <- "RNAfold_pred_stemloop.txt"
StemF <- unlist(lapply(StemF, readLines))
StemF.ID <- grep(">", StemF, value=TRUE)
StemF.ID <- gsub(">", "", StemF.ID)
StemF.ID <- strsplit(StemF.ID, "_")
StemF.ID <- lapply(StemF.ID, "[", -2)
StemF.ID.index <- as.numeric(unlist(lapply(StemF.ID, "[", -2)))
#ID
n <- length(RNAfold1)
ID <- RNAfold1[seq(1,n,7)]
ID <- gsub('>', '', ID)
ID <- ID[StemF.ID.index]
#Sequence
Sequence <- RNAfold1[seq(2,n,7)]
Sequence <- Sequence[StemF.ID.index]
#Length of Sequences (Length)
Length <- nchar(Sequence)
#G+C ratio (GC)
nG <- str_count(Sequence, "G")
nC <- str_count(Sequence, "C")
GC <- (nG + nC) / Length
#G/C ratio (G.Cr)
G.Cr <- nG / nC
#A+U/G+C ratio (AU.GCr)
nA <- str_count(Sequence, "A")
nU <- str_count(Sequence, "U")
AU.GCr <- (nA + nU) / (nG + nC)
#Base proportions Ratios (Ar, Ur, Gr, Cr)
Ar <- nA / Length
Ur <- nU / Length
Gr <- nG / Length
Cr <- nC / Length
#Dinucleotide Ratios (DNr)
AAr <- str_count(Sequence, "AA")/ Length
GGr <- str_count(Sequence, "GG")/ Length
CCr <- str_count(Sequence, "CC")/ Length
UUr <- str_count(Sequence, "UU")/ Length
AGr <- str_count(Sequence, "AG")/ Length
ACr <- str_count(Sequence, "AC")/ Length
AUr <- str_count(Sequence, "AU")/ Length
GAr <- str_count(Sequence, "GA")/ Length
GCr <- str_count(Sequence, "GC")/ Length
GUr <- str_count(Sequence, "GU")/ Length
CAr <- str_count(Sequence, "CA")/ Length
CGr <- str_count(Sequence, "CG")/ Length
CUr <- str_count(Sequence, "CU")/ Length
UAr <- str_count(Sequence, "UA")/ Length
UGr <- str_count(Sequence, "UG")/ Length
UCr <- str_count(Sequence, "UC")/ Length
###################################################
########### Secondary Structure Features ##########
###################################################
#RNAfold Secondary Structure (SecondaryStrc)
SecondaryStrc1 <- RNAfold1[seq(3,n,7)]
SecondaryStrc1 <- SecondaryStrc1[StemF.ID.index]
SecondaryStrc <- gsub( " .*$", "", SecondaryStrc1)
#Pairing Probabilities Structure (PairProbStrc)
PairProbStrc1 <- RNAfold1[seq(4,n,7)]
PairProbStrc1 <- PairProbStrc1[StemF.ID.index]
PairProbStrc <- gsub( " .*$", "", PairProbStrc1)
#RNAfold centroid structure (CentroidStrc)
CentroidStrc1 <- RNAfold1[seq(5,n,7)]
CentroidStrc1 <- CentroidStrc1[StemF.ID.index]
CentroidStrc <- gsub( " .*$", "", CentroidStrc1)
#Maximum Expected Accuracy Structure (MEAStrc)
MEAStrc1 <- RNAfold1[seq(6,n,7)]
MEAStrc1 <- MEAStrc1[StemF.ID.index]
MEAStrc <- gsub( " .*$", "", MEAStrc1)
#Hairpin length (Hl)
nl <- length(StemF)
Hl1 <- StemF[seq(3,nl,3)]
Hl2 <- strsplit(Hl1, "\\((?=(\\.+\\)))", perl = TRUE)
Hl1 <- lapply(Hl2, "[", -1)
Hl1 <- lapply(Hl1, function(x) gsub(").*", "", as.character(x)))
Hl3 <- lapply(Hl2, nchar)
nseq <- length(StemF.ID.index)
Hl3.2 <- as.list(rep(1, nseq))
Hl3 <- mapply('+', Hl3.2, Hl3, SIMPLIFY=FALSE)
Hl1 <- lapply(Hl1, nchar)
Hl <- as.vector(unlist(Hl1))
#5' and 3' Stem length (Steml5, Steml3)
Hl.list <- as.list(Hl)
list0 <- as.list(rep(0, nseq))
Steml1 <- mapply('c', list0, Hl.list, SIMPLIFY=FALSE)
Steml <- mapply('-', Hl3, Steml1, SIMPLIFY=FALSE)
Steml5 <- sapply(Steml, "[", 1)
Steml3 <- sapply(Steml, "[", 2)
Steml3 <- Steml3 - 1
#Number of basepairs in Secondary Structure (BP)
BP <- str_count(SecondaryStrc, "\\(")
#Number of basepairs in 5' and 3' Stem (BP5, BP3)
BP1 <- unlist(lapply(Hl2, "[", 1))
BP5 <- str_count(BP1, "\\(") + 1
BP2 <- unlist(lapply(Hl2, "[", 2))
BP2 <- sub("^\\.*", "", BP2)
BP3 <- str_count(BP2, "\\)")
#Number of mismatches in 5' and 3' Stem (Mism5, Mism3)
Mism5 <- str_count(BP1, "\\.")
Mism3 <- str_count(BP2, "\\.")
#Number of bulges in 5' and 3' Stem (Bulge5, Bulge3)
Bulge1 <- strsplit(BP1, "(\\.)+.")
Bulge5 <- listLen(Bulge1) - 1
Bulge2 <- strsplit(BP2, "(\\.)+.")
Bulge3 <- listLen(Bulge2) - 1
#Number of bulges by type (BulgeN1, BulgeN2, BulgeN3, BulgeN4, BulgeN5)
BulgeN1.5 <- str_count(BP1, "\\(\\.\\(")
BulgeN1.3 <- str_count(BP2, "\\)\\.\\)")
BulgeN2.5 <- str_count(BP1, "\\(\\.\\.\\(")
BulgeN2.3 <- str_count(BP2, "\\)\\.\\.\\)")
BulgeN3.5 <- str_count(BP1, "\\(\\.\\.\\.\\(")
BulgeN3.3 <- str_count(BP2, "\\)\\.\\.\\.\\)")
BulgeN4.5 <- str_count(BP1, "\\(\\.\\.\\.\\.\\(")
BulgeN4.3 <- str_count(BP2, "\\)\\.\\.\\.\\.\\)")
BulgeN5.5 <- str_count(BP1, "\\(\\.\\.\\.\\.\\.\\(")
BulgeN5.3 <- str_count(BP2, "\\)\\.\\.\\.\\.\\.\\)")
#Number of A-U, C-G and G-U pairs in 1 loop sequences (AUp, GCp, GUp)
Sequence2 <- strsplit(Sequence, "")
SecondaryStrc2 <- strsplit(SecondaryStrc, "")
Matches5 <- sapply(SecondaryStrc2, function(x) which(x == "("))
Matches3 <- sapply(SecondaryStrc2, function(x) which(x == ")"))
if (class(Matches5) == "matrix"){
Extract5 <- lapply(seq_len(ncol(Matches5)), function(i) Matches5[,i])
Extract5 <- mapply('[', Sequence2, Extract5)
Extract5 <- lapply(seq_len(ncol(Extract5)), function(i) Extract5[,i])
Extract3 <- lapply(seq_len(ncol(Matches3)), function(i) Matches3[,i])
Extract3 <- mapply('[', Sequence2, Extract3)
Extract3 <- lapply(seq_len(ncol(Extract3)), function(i) Extract3[,i])
Extract3 <- lapply(Extract3, function(x) rev(x))
Pairs <- mapply('paste0', Extract5, Extract3, SIMPLIFY=FALSE)
CGc1 <- sapply(Pairs, function(x) sum(str_count(x, "CG")))
GCc1 <- sapply(Pairs, function(x) sum(str_count(x, "GC")))
GCp <- CGc1 + GCc1
AUc1 <- sapply(Pairs, function(x) sum(str_count(x, "AU")))
UAc1 <- sapply(Pairs, function(x) sum(str_count(x, "UA")))
AUp <- AUc1 + UAc1
GUc1 <- sapply(Pairs, function(x) sum(str_count(x, "GU")))
UGc1 <- sapply(Pairs, function(x) sum(str_count(x, "UG")))
GUp <- GUc1 + UGc1
} else {
Extract5 <- mapply('[', Sequence2, Matches5)
Extract3 <- mapply('[', Sequence2, Matches3)
Extract3 <- sapply(Extract3, function(x) rev(x))
Pairs <- mapply('paste0', Extract5, Extract3, SIMPLIFY=FALSE)
CGc1 <- sapply(Pairs, function(x) sum(str_count(x, "CG")))
GCc1 <- sapply(Pairs, function(x) sum(str_count(x, "GC")))
GCp <- CGc1 + GCc1
AUc1 <- sapply(Pairs, function(x) sum(str_count(x, "AU")))
UAc1 <- sapply(Pairs, function(x) sum(str_count(x, "UA")))
AUp <- AUc1 + UAc1
GUc1 <- sapply(Pairs, function(x) sum(str_count(x, "GU")))
UGc1 <- sapply(Pairs, function(x) sum(str_count(x, "UG")))
GUp <- GUc1 + UGc1
}
Nloop1 <- strsplit(SecondaryStrc, "\\((?=(\\.+\\)))", perl = TRUE)
Nloop <- listLen(Nloop1) - 1
##################################################
######### Secondary Structure Statistics #########
##################################################
#Minimum Free Energy (MFE)
MFE <- as.numeric(regmatches(SecondaryStrc1,
gregexpr("(?>-)*[[:digit:]]+\\.*[[:digit:]]*",
SecondaryStrc1, perl=TRUE)))
#Ensemble Free Energy (EFE)
EFE <- as.numeric(regmatches(PairProbStrc1,
gregexpr("(?>-)*[[:digit:]]+\\.*[[:digit:]]*",
PairProbStrc1, perl=TRUE)))
#Centroid Free Energy (CFE)
CFE1 <- sub(".*? (.+)", "\\1", CentroidStrc1)
CFE1 <- gsub("\\{", "", CFE1)
CFE1 <- gsub("\\}", "", CFE1)
CFE1 <- gsub(" ", "", CFE1)
CFE1 <- unlist(strsplit(CFE1, "d="))
n2 <- length(CFE1)
CFE <- as.numeric(CFE1[seq(1,n2,2)])
#Centroid Distance to Ensemble (CDE)
CDE <- as.numeric(CFE1[seq(2,n2,2)])
#Maximum Expected Accuracy Structure Free Energy (MEAFE)
MEAFE1 <- sub(".*? (.+)", "\\1", MEAStrc1)
MEAFE1 <- gsub("\\{", "", MEAFE1)
MEAFE1 <- gsub("\\}", "", MEAFE1)
MEAFE1 <- gsub(" ", "", MEAFE1)
MEAFE1 <- unlist(strsplit(MEAFE1, "MEA="))
MEAFE <- as.numeric(MEAFE1[seq(1,n2,2)])
#Maximum Expected Accurary (MEA)
MEA <- as.numeric(MEAFE1[seq(2,n2,2)])
#Base pair Propensity (BPP)
BPP <- (BP / Length)
#Frequency of the MFE in ensemble (EFreq)
EFreq1 <- RNAfold1[seq(7,n,7)]
EFreq1 <- EFreq1[StemF.ID.index]
EFreq1 <- unlist(strsplit(EFreq1, ";"))
EFreq2 <- EFreq1[seq(1,n2,2)]
EFreq <- round(as.numeric(regmatches(EFreq2,
gregexpr("(?>-)*[[:digit:]]+\\.*[[:digit:]]*",
EFreq2, perl=TRUE))), 4)
#Ensemble Diversity (ED)
ED1 <- EFreq1[seq(2,n2,2)]
ED <- as.numeric(regmatches(ED1,
gregexpr("(?>-)*[[:digit:]]+\\.*[[:digit:]]*",
ED1, perl=TRUE)))
#Adjusted MFE (MFEadj)
MFEadj <- (MFE / Length)
#Adjusted EFE (EFEadj)
EFEadj <- (EFE / Length)
#Adjusted base pair Distance (Dadj)
Dadj <- (CDE / Length)
#Adjusted Shannon Entropy (SEadj)
SE <- -((Ar*log2(Ar))+(Ur*log2(Ur))+(Gr*log2(Gr))+(Cr*log2(Cr)))
SEadj <- (SE /Length)
#Difference between MFE and EFE Adjusted (DiffMFE.EFE)
DiffMFE.EFE <- ((MFE - EFE) / Length)
#Ratio between Adjusted MFE and GC (MFEadj.GC)
MFEadj.GC <- (MFEadj / GC)
#Ratio between Adjusted MFE and base pairs (MFEadj.BP)
MFEadj.BP <- (MFEadj / BP)
table <- as.data.frame(cbind(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
T26, T27, T28, T29, T30, T31, T32, GC, EFreq, BPP, Dadj, SEadj,
DiffMFE.EFE, Length, AU.GCr,
Ar, Ur, Gr, Cr, AAr, GGr, CCr, UUr, AGr, ACr, AUr,
GAr, GCr, GUr, CAr, CGr, CUr, UAr, UGr, UCr, Hl, Steml5,
Steml3, BP, BP5, BP3, Mism5, Mism3, Bulge5, Bulge3, BulgeN1.5,
BulgeN1.3, BulgeN2.5, BulgeN2.3, BulgeN3.5, BulgeN3.3,
BulgeN4.5, BulgeN4.3, BulgeN5.5, BulgeN5.3, AUp, GCp, GUp, MFE,
MFEadj, EFE, EFEadj, CFE, CDE, MEAFE, MEA,
ED, MFEadj.GC, MFEadj.BP))
unlink("*.ps")
unlink("*.txt")
unlink("*.fa*")
table_1 <- table[, 1:32]
table_2 <- table[, 33:94]
if (rescale == TRUE){
table_2 <- as.data.frame(lapply(table_2, rescale))
table <- cbind(table_1, table_2)
rownames(table) <- ID
table[is.na(table)] <- 0
} else {
table <- cbind(table_1, table_2)
rownames(table) <- ID
table[is.na(table)] <- 0
}
final_table <- paste0(workdir, prefix, ".csv")
write.table(table, final_table, sep=",", quote=F, col.names=NA)
return(table)
} else {
stop("Input file is not a nloop filtered FASTA file.")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.