#' Run RNAsubopt in R
#' @description This function can run and capture the result of RNAsubopt (ViennaRNA Package) in R
#' by invoking the OS command.
#' The results can be formatted and used as the features of tool "catRAPID", "lncPro", etc.
#' ViennaRNA Package is required (\url{http://www.tbi.univie.ac.at/RNA/index.html}).
#'
#' @param seqs RNA sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of RNA/protein sequences.
#' RNA sequences will be converted into lower case letters.
#' Each sequence should be a vector of single characters.
#' @param structureRNA.num integer. Number of random samples of suboptimal structures. Default: \code{6}.
#' @param args.RNAsubopt string (in format such as "-N --pfScale 1.07") specifying additional arguments (except "-p" which is already determined by \emph{structureRNA.num}) for RNAsubopt. This is used when you want to control the behaviour of RNAsubopt. Arguments for RNAsubopt please refer to its manual. Default: \code{NULL}.
#' @param path.RNAsubopt string specifying the location of RNAsubopt program.
#' @param returnSum logical. If \code{FALSE}, the raw output of RNAsubopt (Dot-Bracket Notation of RNA structure) will be returned.
#' If \code{TRUE}, the results of RNAsubopt will be converted into numeric sequence:
#' In any RNA structure sequence (Dot-Bracket Notation), "." (Dot) will be replaced with 0;
#' "(" and ")" (Bracket) will be replaced with 1. And \emph{structureRNA.num} binary sequences will be added to obtain a
#' new numeric sequence.
#' @param verbose logical. Should the relevant information be printed during the calculation? (Only available on UNIX/Linux.)
#' @param parallel.cores an integer that indicates the number of cores for parallel computation. Default: \code{2}.
#' Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#'
#' @return This function returns a data frame that contains the raw outputs when \code{returnSum = FALSE}.
#' Or a list of numeric results of RNAsubopt when \code{returnSum = TRUE}.
#'
#' @details
#' This function depends on the program "RNAsubopt" of software "ViennaRNA".
#' (\url{http://www.tbi.univie.ac.at/RNA/index.html})
#'
#' Parameter \code{path.RNAsubopt} can be simply defined as \code{"RNAsubopt"} as
#' default when the OS is UNIX/Linux. However, for some OS, such as Windows, users
#' need to specify the \code{path.RNAsubopt} if the path of "RNAsubopt" haven't been
#' added in environment variables (e.g. \code{path.RNAsubopt = '"C:/Program Files/ViennaRNA/RNAsubopt.exe"'}).
#'
#' This function can print the related information when running on Linux OS,
#' such as:
#'
#' \code{"25 of 100, length: 695 nt"},
#'
#' which means around 100 sequences are assigned to this node, and the program is now
#' computing the 25th sequence. The length of this sequence is 695 nt.
#'
#' @section References:
#' Lorenz R, Bernhart SH, Honer zu Siederdissen C, \emph{et al}.
#' ViennaRNA Package 2.0.
#' Algorithms Mol. Biol. 2011; 6:26
#'
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom parallel parLapply
#' @importFrom seqinr s2c
#'
#' @seealso \code{\link{runPredator}}
#' @examples
#'
#' \donttest{
#' data(demoPositiveSeq)
#' seqsRNA <- demoPositiveSeq$RNA.positive
#'
#' path.RNAsubopt <- "RNAsubopt" # you need to use your own paths, for example:
#' # path.RNAsubopt <- '"C:/Program Files (x86)/ViennaRNA Package/RNAsubopt.exe"' # in Windows OS
#'
#' RNAsubopt_1 <- runRNAsubopt(seqs = seqsRNA, path.RNAsubopt = path.RNAsubopt,
#' returnSum = FALSE, verbose = TRUE,
#' parallel.cores = 2)
#'
#' RNAsubopt_2 <- runRNAsubopt(seqs = seqsRNA, args.RNAsubopt = "--pfScale 1.07",
#' structureRNA.num = 6,
#' path.RNAsubopt = path.RNAsubopt,
#' returnSum = TRUE, parallel.cores = 2)
#' }
#' @export
runRNAsubopt <- function(seqs, structureRNA.num = 6, args.RNAsubopt = NULL, path.RNAsubopt = "RNAsubopt",
returnSum = FALSE, verbose = FALSE, parallel.cores = 2, cl = NULL) {
message("+ Initializing... ", Sys.time())
close_cl <- FALSE
if (is.null(cl)) {
message("- Creating cores... ")
parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
if (verbose) {
cl <- parallel::makeCluster(parallel.cores, outfile = "")
} else {
cl <- parallel::makeCluster(parallel.cores)
}
close_cl <- TRUE
}
message("\n", "+ Calculating structural information of RNA sequences... ", Sys.time())
seqValidate <- seqs[which(lengths(seqs) <= 4095)]
if (length(seqValidate) < length(seqs)) {
message(" * Due to the limitation of RNAsubopt,")
message(" sequences with length more than 4095 nt will be omitted.")
message(" ", length(seqs) - length(seqValidate), " of ", length(seqs), " sequences have been removed.")
}
seqs <- seqValidate
message("- Sequences Number: ", length(seqs))
index <- 1
seqs <- parallel::parLapply(cl, seqs, Internal.checkRNA)
seq.string <- parallel::parSapply(cl, seqs, seqinr::getSequence, TRUE)
info <- paste(ceiling(length(seqs) / parallel.cores), ",", sep = "")
parallel::clusterExport(cl, varlist = c("index"), envir = environment())
RNAsubopt.seq <- parallel::parLapply(cl, seq.string, Internal.runRNAsubopt, info = info,
args.RNAsubopt = args.RNAsubopt,
structureRNA.num = structureRNA.num, verbose = verbose,
path.RNAsubopt = path.RNAsubopt, outputNum = returnSum)
if (close_cl) parallel::stopCluster(cl)
message("\n", "+ Completed. ", Sys.time())
if (!returnSum) RNAsubopt.seq <- data.frame(RNAsubopt.seq, check.names = FALSE)
RNAsubopt.seq
}
#' Run Predator in R
#' @description This function can run and capture the result of Predator in R
#' by invoking the OS command.
#' The results can be formatted and used as the features of tool "lncPro".
#' Predator is required (\url{https://bioweb.pasteur.fr/packages/pack@predator@2.1.2}).
#' NOTE: Predator does not support 64-bit MS Windows OS. Linux OS is recommended).
#'
#' @param seqs RNA sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of RNA/protein sequences.
#' RNA sequences will be converted into lower case letters.
#' Each sequence should be a vector of single characters. (Non-AA letters will be ignored.)
#' @param args.Predator string specifying additional arguments (except "-a" and "-b") for Predator. This is used when you want to control the behaviour of Predator. Arguments for Predator please refer to its manual. Default: \code{NULL}.
#' @param path.Predator string specifying the location of Predator program.
#' @param path.stride string specifying the location of file "stride.dat" required by program Predator.
#' @param workDir string specifying the directory for temporary files.
#' The temp files will be deleted automatically when
#' the calculation is completed. The directory does not exist will be created automatically.
#' @param verbose logical. Should the relevant information be printed during the calculation? (Only available on Linux OS.)
#' @param parallel.cores an integer that indicates the number of cores for parallel computation. Default: \code{2}.
#' Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#'
#' @return This function returns a data frame of the raw outputs of Predator.
#'
#' @details
#' This function depends on the program Predator (\url{https://bioweb.pasteur.fr/packages/pack@predator@2.1.2}).
#' Program Predator is only available on Linux and 32-bit Windows OS.
#'
#' This function can print the related information when the OS is Linux,
#' such as:
#'
#' \code{"25 of 100, length: 50 aa"},
#'
#' which means around 100 sequences are assigned to this node, and the program is
#' computing the 25th sequence. The length of this sequence is 50 aa.
#'
#' @section References:
#' Frishman D, Argos P.
#' Incorporation of non-local interactions in protein secondary structure prediction from the amino acid sequence.
#' Protein Eng. 1996; 9:133-42
#'
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom parallel parLapply
#' @importFrom utils data
#' @importFrom seqinr c2s
#' @importFrom seqinr s2c
#' @importFrom seqinr write.fasta
#'
#' @seealso \code{\link{runRNAsubopt}}
#' @examples
#'
#' \donttest{
#' data(demoPositiveSeq)
#' seqsPro <- demoPositiveSeq$Pro.positive
#'
#' path.Predator <- "/mnt/external_drive_1/hansy/predator/predator"
#' path.stride <- "/mnt/external_drive_1/hansy/predator/stride.dat"
#'
#' path.stride <- "/media/psf/AllFiles/Volumes/Work/Projects/ncPro/lnc-pro/predator/stride.dat"
#' path.Predator <- "/media/psf/AllFiles/Volumes/Work/Projects/ncPro/lnc-pro/predator/predator"
#'
#' Predator.res <- runPredator(seqs = seqsPro, path.Predator = path.Predator,
#' path.stride = path.stride, workDir = "tmp",
#' verbose = TRUE, parallel.cores = 2)
#' }
#' @export
runPredator <- function(seqs, args.Predator = NULL, path.Predator = "Predator/predator",
path.stride = "Predator/stride.dat", workDir = getwd(),
verbose = FALSE, parallel.cores = 2, cl = NULL) {
if (!file.exists(path.stride)) stop("The path of stride.dat is not correct! Please check parameter path.stride.")
message("+ Initializing... ", Sys.time())
close_cl <- FALSE
if (is.null(cl)) {
message("- Creating cores... ")
parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
if (verbose) {
cl <- parallel::makeCluster(parallel.cores, outfile = "")
} else {
cl <- parallel::makeCluster(parallel.cores)
}
close_cl <- TRUE
}
message("\n", "+ Calculating structural information of protein sequences... ", Sys.time())
seqValidate <- seqs[which(lengths(seqs) >= 30)]
if (length(seqValidate) < length(seqs)) {
message(" * Due to the limitation of predator,")
message(" sequences with length less than 30 amino acids will be omitted.")
message(" ", length(seqs) - length(seqValidate), " of ", length(seqs), " sequences have been removed.")
}
seqs <- seqValidate
message("- Sequences Number: ", length(seqs))
index <- 1
info <- paste0(ceiling(length(seqs) / parallel.cores), ",")
parallel::clusterExport(cl, varlist = c("index", "Internal.randomName"), envir = environment())
if (!dir.exists(workDir)) {
dir.create(workDir, recursive = TRUE)
createDir <- TRUE
} else {
createDir <- FALSE
}
predator.seq <- parallel::parSapply(cl, seqs, Internal.runPredator, info = info,
args.Predator = args.Predator,
workDir = workDir, path.Predator = path.Predator,
path.stride = path.stride, as.string = TRUE,
outputRaw = TRUE, verbose = verbose)
if (close_cl) parallel::stopCluster(cl)
predator.seq <- data.frame(predator.seq, check.names = FALSE)
if (createDir & length(dir(workDir, all.files = TRUE, recursive = TRUE)) == 0) unlink(workDir, recursive = TRUE)
message("\n", "+ Completed. ", Sys.time())
predator.seq
}
#' Computation of the Secondary Structural Features of RNA or Protein Sequences
#' @description The function \code{computeStructure} computes the secondary structural features of RNA
#' or protein sequences. ViennaRNA package and Predator is required.
#' @param seqs sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of RNA/protein sequences.
#' RNA sequences will be converted into lower case letters, but
#' protein sequences will be converted into upper case letters, and non-AA letters will be ignored.
#' Each sequence should be a vector of single characters.
#' @param seqType a string that specifies the nature of the sequence: \code{"RNA"} or \code{"Pro"} (protein).
#' If the input is DNA sequence and \code{seqType = "RNA"}, the DNA sequence will be converted to RNA sequence automatically. Default: \code{"RNA"}.
#' @param args.RNAsubopt string (in format such as "-N -z -S 1.07") specifying additional arguments (except "-p" which is already determined by \emph{structureRNA.num}) for RNAsubopt. This is used when you want to control the behaviour of RNAsubopt. Arguments for RNAsubopt please refer to its manual. Default: \code{NULL}.
#' @param args.Predator string specifying additional arguments (except "-a" and "-b") for Predator. This is used when you want to control the behaviour of Predator. Arguments for Predator please refer to its manual. Default: \code{NULL}.
#' @param structureRNA.num integer. The number of random samples of suboptimal structures. Default: \code{6}.
#' @param structurePro strings specifying the secondary structural information that are extracted from protein sequences.
#' Ignored if \code{seqType = "RNA"}.
#' Options: \code{"ChouFasman"}, \code{"DeleageRoux"}, and \code{"Levitt"}. See details below.(Ref: [2-4])
#' Multiple elements can be selected at the same time.
#' @param Fourier.len positive integer specifying the Fourier series length that will be used as features.
#' The \code{Fourier.len} should be >= the length of the input sequence. Default: \code{10}.
#' @param workDir.Pro string specifying the directory for temporary files.
#' The temp files will be deleted automatically when
#' the calculation is completed.
#' @param as.list logical. The result will be returned as a list or data frame.
#' @param path.RNAsubopt string specifying the location of RNAsubopt program. (Ref: [5])
#' @param path.Predator string specifying the location of Predator program. (Ref: [6])
#' @param path.stride string specifying the location of file "stride.dat" required by program Predator.
#' @param verbose logical. Should the relevant information be printed during the calculation? (Only available on Linux.)
#' @param parallel.cores an integer that indicates the number of cores for parallel computation. Default: \code{2}.
#' Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#'
#' @return This function returns a data frame if \code{as.list = FALSE} or returns a list if \code{as.list = TRUE}.
#'
#' @details
#' The secondary structures of RNA and protein are computed by RNAsubopt and Predator, respectively.
#' And the protein secondary features are encoded using three amino acid scales:
#' \enumerate{
#' \item Chou & Fasman conformational parameter (Ref: [2])
#' \item Deleage & Roux conformational parameter (Ref: [3])
#' \item Levitt normalised frequency (Ref: [4])
#' }
#' The feature encoding strategy is based on lncPro (Ref: [7]).
#'
#' This function depends on the program "RNAsubopt" of software "ViennaRNA"
#' (\url{http://www.tbi.univie.ac.at/RNA/index.html}) and "Predator"
#' (\url{https://bioweb.pasteur.fr/packages/pack@predator@2.1.2}).
#'
#' Parameter \code{path.RNAsubopt} can be simply defined as \code{"RNAsubopt"} as
#' default when the OS is UNIX/Linux. However, for some OS, such as Windows, users may
#' need to specify the \code{path.RNAsubopt} if the path of "RNAsubopt" haven't been
#' added in environment variables (e.g. \code{path.RNAsubopt = '"C:/Program Files/ViennaRNA/RNAsubopt.exe"'}).
#'
#' Program "Predator" is only available on UNIX/Linux and 32-bit Windows OS.
#'
#' @section References:
#' [1] Han S, Yang X, Sun H, \emph{et al}.
#' LION: an integrated R package for effective prediction of ncRNA–protein interaction.
#' Briefings in Bioinformatics. 2022; 23(6):bbac420
#'
#' [2] Chou PY, Fasman GD.
#' Prediction of the secondary structure of proteins from their amino acid sequence.
#' Adv. Enzymol. Relat. Areas Mol. Biol. 1978; 47:45-148
#'
#' [3] Deleage G, Roux B.
#' An algorithm for protein secondary structure prediction based on class prediction.
#' Protein Eng. Des. Sel. 1987; 1:289-294
#'
#' [4] Levitt M.
#' Conformational preferences of amino acids in globular proteins.
#' Biochemistry 1978; 17:4277-85
#'
#' [5] Frishman D, Argos P.
#' Incorporation of non-local interactions in protein secondary structure prediction from the amino acid sequence.
#' Protein Eng. 1996; 9:133-42
#'
#' [6] Lorenz R, Bernhart SH, Honer zu Siederdissen C, \emph{et al}.
#' ViennaRNA Package 2.0.
#' Algorithms Mol. Biol. 2011; 6:26
#'
#' [7] Lu Q, Ren S, Lu M, \emph{et al}.
#' Computational prediction of associations between long non-coding RNAs and proteins.
#' BMC Genomics 2013; 14:651
#'
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom parallel parLapply
#' @importFrom seqinr getSequence
#' @importFrom seqinr write.fasta
#' @importFrom seqinr s2c
#' @importFrom seqinr a
#' @importFrom utils data
#'
#' @seealso \code{\link{runRNAsubopt}}, \code{\link{runPredator}}, \code{\link{featureStructure}}
#' @examples
#'
#' \donttest{
#' data(demoPositiveSeq)
#' seqsRNA <- demoPositiveSeq$RNA.positive
#' seqsPro <- demoPositiveSeq$Pro.positive
#'
#' # You need to use your own paths:
#'
#' path.Predator <- "/mnt/external_drive_1/hansy/predator/predator"
#' path.stride <- "/mnt/external_drive_1/hansy/predator/stride.dat"
#'
#' structureRNA <- computeStructure(seqsRNA, seqType = "RNA", structureRNA.num = 6,
#' Fourier.len = 10, as.list = FALSE,
#' path.RNAsubopt = "RNAsubopt", parallel.cores = 2)
#'
#' structurePro <- computeStructure(seqsPro, seqType = "Pro",
#' structurePro = c("ChouFasman", "DeleageRoux",
#' "Levitt"),
#' Fourier.len = 10, workDir.Pro = getwd(),
#' as.list = TRUE, path.Predator = path.Predator,
#' path.stride = path.stride, parallel.cores = 2)
#' }
#' @export
computeStructure <- function(seqs, seqType = c("RNA", "Pro"),
args.RNAsubopt = NULL, args.Predator = NULL,
structureRNA.num = 6,
structurePro = c("ChouFasman", "DeleageRoux", "Levitt"),
Fourier.len = 10, workDir.Pro = getwd(), as.list = TRUE,
path.RNAsubopt = "RNAsubopt", path.Predator = "Predator/predator",
path.stride = "Predator/stride.dat", verbose = FALSE,
parallel.cores = 2, cl = NULL) {
seqType <- match.arg(seqType)
structurePro <- match.arg(structurePro, several.ok = TRUE)
if (seqType == "Pro" & !file.exists(path.stride)) stop("The path of stride.dat is not correct! Please check parameter path.stride.")
if (min(lengths(seqs)) < Fourier.len) stop("The profile length of Fourier Series (Fourier.len) cannot be larger than the minimum length of the input sequences!")
close_cl <- FALSE
if (is.null(cl)) {
message("+ Initializing... ", Sys.time())
message("- Creating cores... ")
parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
if (verbose) {
cl <- parallel::makeCluster(parallel.cores, outfile = "")
} else {
cl <- parallel::makeCluster(parallel.cores)
}
close_cl <- TRUE
}
if (seqType == "RNA") {
message("\n", "+ Calculating structural information of RNA sequences... ", Sys.time())
seqValidate <- seqs[which(lengths(seqs) <= 4095)]
if (length(seqValidate) < length(seqs)) {
message(" * Due to the limitation of RNAsubopt,")
message(" sequences with length more than 4095 nt will be omitted.")
message(" ", length(seqs) - length(seqValidate), " of ", length(seqs), " sequences have been removed.")
}
seqs <- seqValidate
seqNames <- names(seqs)
if(is.null(seqNames)) {
seqNames <- sapply(seqs, attr, "name")
}
message("- Sequences Number: ", length(seqs))
index <- 1
seqs <- parallel::parLapply(cl, seqs, Internal.checkRNA)
seq.string <- parallel::parSapply(cl, seqs, seqinr::getSequence, TRUE)
info <- paste(ceiling(length(seqs) / parallel.cores), ",", sep = "")
parallel::clusterExport(cl, varlist = c("index"), envir = environment())
RNAsubopt.seq <- parallel::parLapply(cl, seq.string, Internal.runRNAsubopt,
info = info,
args.RNAsubopt = args.RNAsubopt,
structureRNA.num = structureRNA.num,
verbose = verbose,
path.RNAsubopt = path.RNAsubopt,
outputNum = TRUE)
out.seq <- parallel::parLapply(cl, RNAsubopt.seq, Internal.FourierSeries,
profile.length = Fourier.len)
if (close_cl) parallel::stopCluster(cl)
if (!as.list) {
tmp.df <- t(as.data.frame(out.seq))
# row.names(tmp.df) <- seqNames
# .rowNamesDF(tmp.df, make.names = T) <- seqNames
tmp.df <- as.data.frame(tmp.df)
row.names(tmp.df) <- make.names(seqNames, unique = TRUE)
names(tmp.df) <- paste0("RNAstructure", c(1:ncol(tmp.df)))
out.seq <- tmp.df
}
} else {
data(aaindex, package = "seqinr", envir = environment())
aaindex <- get("aaindex")
message("\n", "+ Calculating structural information of protein sequences...", Sys.time())
structurePro <- match.arg(structurePro, several.ok = TRUE)
seqValidate <- seqs[which(lengths(seqs) >= 30)]
if (length(seqValidate) < length(seqs)) {
message(" * Due to the limitation of predator,")
message(" sequences with length less than 30 amino acids will be omitted.")
message(" ", length(seqs) - length(seqValidate), " of ", length(seqs), " sequences have been removed.")
}
seqs <- seqValidate
seqNames <- names(seqs)
if(is.null(seqNames)) {
seqNames <- sapply(seqs, attr, "name")
}
message("- Sequences Number: ", length(seqs))
index <- 1
info <- paste0(ceiling(length(seqs) / parallel.cores), ",")
parallel::clusterExport(cl, varlist = c("index", "Internal.randomName",
"aaindex", "Internal.convertSeq",
"Internal.convertSeq_customize",
"Internal.FourierSeries"), envir = environment())
if (!dir.exists(workDir.Pro)) {
dir.create(workDir.Pro, recursive = TRUE)
createDir <- TRUE
} else {
createDir <- FALSE
}
out.seq <- parallel::parLapply(cl, seqs, Internal.convertPro_Struct,
info = info, args.Predator = args.Predator,
path.Predator = path.Predator,
path.stride = path.stride, workDir = workDir.Pro,
structurePro = structurePro, as.list = as.list,
Fourier.len = Fourier.len, verbose = verbose, aaindex)
if (close_cl) parallel::stopCluster(cl)
if (createDir & length(dir(workDir.Pro, all.files = TRUE, recursive = TRUE)) == 0) unlink(workDir.Pro, recursive = TRUE)
if (!as.list) {
out.seq <- as.data.frame(t(data.frame(out.seq, check.names = FALSE)))
# row.names(out.seq) <- seqNames
# .rowNamesDF(out.seq, make.names = T) <- seqNames
row.names(out.seq) = make.names(seqNames, unique = TRUE)
}
}
# message("\n", "+ Completed. ", Sys.time(), "\n")
out.seq
}
#' Extraction of the Secondary Structural Features of RNA and Protein Sequences
#' @description Basically a wrapper for \code{\link{computeStructure}} function.
#' This function can extract secondary structure features of RNA and protein sequences at the same time
#' and format the results as the dataset that can be used to build classifier.
#' ViennaRNA package and Predator is required.
#' @param seqRNA RNA sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of RNA sequences.
#' RNA sequences will be converted into lower case letters.
#' Each sequence should be a vector of single characters.
#' @param seqPro protein sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of protein sequences.
#' Protein sequences will be converted into upper case letters.
#' Each sequence should be a vector of single characters.
#' @param label optional. A string or a vector of strings or \code{NULL}.
#' Indicates the class of the samples such as
#' "Interact", "Non.Interact". Default: \code{NULL}.
#' @param parallel.cores an integer that indicates the number of cores for parallel computation.
#' Default: \code{2}. Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#' @param ... arguments (\code{structureRNA.num}, \code{structurePro}, \code{Fourier.len},
#' \code{workDir.Pro}, \code{path.RNAsubopt}, \code{path.Predator} and \code{path.stride}
#' passed to function \code{\link{computeStructure}}. See example below.
#'
#' @return This function returns a data frame.
#'
#' @details see \code{\link{computeStructure}}.
#' @section References:
#' [1] Han S, Yang X, Sun H, \emph{et al}.
#' LION: an integrated R package for effective prediction of ncRNA–protein interaction.
#' Briefings in Bioinformatics. 2022; 23(6):bbac420
#'
#' [2] Chou PY, Fasman GD.
#' Prediction of the secondary structure of proteins from their amino acid sequence.
#' Adv. Enzymol. Relat. Areas Mol. Biol. 1978; 47:45-148
#'
#' [3] Deleage G, Roux B.
#' An algorithm for protein secondary structure prediction based on class prediction.
#' Protein Eng. Des. Sel. 1987; 1:289-294
#'
#' [4] Levitt M.
#' Conformational preferences of amino acids in globular proteins.
#' Biochemistry 1978; 17:4277-85
#'
#' [5] Frishman D, Argos P.
#' Incorporation of non-local interactions in protein secondary structure prediction from the amino acid sequence.
#' Protein Eng. 1996; 9:133-42
#'
#' [6] Lorenz R, Bernhart SH, Honer zu Siederdissen C, \emph{et al}.
#' ViennaRNA Package 2.0.
#' Algorithms Mol. Biol. 2011; 6:26
#'
#' [7] Lu Q, Ren S, Lu M, \emph{et al}.
#' Computational prediction of associations between long non-coding RNAs and proteins.
#' BMC Genomics 2013; 14:651
#'
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom parallel parLapply
#' @importFrom seqinr getSequence
#' @importFrom seqinr write.fasta
#' @importFrom seqinr s2c
#' @importFrom seqinr a
#' @importFrom utils data
#'
#' @seealso \code{\link{runRNAsubopt}}, \code{\link{runPredator}}, \code{\link{computeStructure}}
#' @examples
#'
#' \donttest{
#' data(demoNegativeSeq)
#' seqsRNA <- demoNegativeSeq$RNA.negative
#' seqsPro <- demoNegativeSeq$Pro.negative
#'
#' # Use your own paths of the program Predator and file "stride.dat". For example:
#'
#' path.Predator <- "/mnt/external_drive_1/hansy/predator/predator"
#' path.stride <- "/mnt/external_drive_1/hansy/predator/stride.dat"
#'
#' # Pass "structureRNA.num", "structurePro", "Fourier.len", "workDir.Pro",
#' # "path.RNAsubopt", "path.Predator" and "path.stride" using "..." argument:
#'
#' dataset <- featureStructure(seqRNA = seqsRNA, seqPro = seqsPro, label = "Non.Interact",
#' parallel.cores = 2, structureRNA.num = 6,
#' structurePro = c("ChouFasman", "DeleageRoux", "Levitt"),
#' Fourier.len = 10, workDir.Pro = "tmpDir",
#' path.RNAsubopt = "RNAsubopt", path.Predator = path.Predator,
#' path.stride = path.stride)
#' }
#' @export
featureStructure <- function(seqRNA, seqPro, label = NULL, parallel.cores = 2, cl = NULL, ...) {
if (length(seqRNA) != length(seqPro)) stop("The number of RNA sequences should match the number of protein sequences!")
if (!is.null(label)) {
if (length(label) == 1) {
label <- rep(label, length(seqPro))
}
if (length(label) != length(seqPro)) stop("The length of label should be one or match the length of sequences!")
}
close_cl <- FALSE
if (is.null(cl)) {
message("+ Initializing... ", Sys.time())
message("- Creating cores... ")
parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
cl <- parallel::makeCluster(parallel.cores)
close_cl <- TRUE
}
message("\n+ Checking sequences for secondary structure computation... ")
RNA.velidate.idx <- which(lengths(seqRNA) <= 4095)
seqValidate <- seqRNA[RNA.velidate.idx]
if (length(seqValidate) < length(seqRNA)) {
message(" * Due to the limitation of RNAsubopt,")
message(" sequences with length more than 4095 nt will be omitted.")
message(" ", length(seqRNA) - length(seqValidate), " of ", length(seqRNA), " pairs have been removed.")
}
seqRNA <- seqValidate
seqPro <- seqPro[RNA.velidate.idx]
if (!is.null(label)) label <- label[RNA.velidate.idx]
Pro.velidate.idx <- which(lengths(seqPro) >= 30)
seqValidate <- seqPro[Pro.velidate.idx]
if (length(seqValidate) < length(seqPro)) {
message(" * Due to the limitation of predator,")
message(" sequences with length less than 30 amino acids will be omitted.")
message(" ", length(seqPro) - length(seqValidate), " of ", length(seqPro), " pairs have been removed.")
}
seqPro <- seqValidate
seqRNA <- seqRNA[Pro.velidate.idx]
if (!is.null(label)) label <- label[RNA.velidate.idx]
Structure.RNA <- computeStructure(seqs = seqRNA, seqType = "RNA", as.list = FALSE, cl = cl, ...)
Structure.Pro <- computeStructure(seqs = seqPro, seqType = "Pro", as.list = FALSE, cl = cl, ...)
if (close_cl) parallel::stopCluster(cl)
sequenceName <- paste(names(seqRNA), names(seqPro), sep = ".")
# features <- as.data.frame(cbind(Structure.RNA, Structure.Pro),
# row.names = sequenceName)
features <- cbind(Structure.RNA, Structure.Pro)
row.names(features) <- make.names(sequenceName, unique = TRUE)
if (!is.null(label)) features <- data.frame(label = label, features)
features
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.