ncProR: Predicting Long non-coding RNA-Protein Interaction

Documented in computeFreq computeMLC featureFreq

#' Computation of the most-like CDS region of an RNA sequence
#' @description This function compute the most-like CDS (MLC) region of one RNA.
#' Methods based on the longest open reading frame (ORF) and maximum subarray sum (MSS) are supported.
#' @param oneRNA one RNA loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}.
#' Or a list of one RNA sequence. The sequence should be a vector of single characters.
#' @param mode can be \code{"ORF"} (compute the longest open reading frame) and/or
#' \code{"MSS"} (compute subsequence having the maximum subarray sum of hexamer score).
#'
#' @return A data frame. The MLC region, length and coverage of the MLC will be returned.
#' MSS-based method is based on [1] and [2].
#' ORF extraction function is borrowed from our previous work [3].
#'
#' @section References:
#' [1] Yang C, Yang L, Zhou M, \emph{et al}.
#' LncADeep: an ab initio lncRNA identification and functional annotation tool based on deep learning.
#' Bioinformatics. 2018; 34(22):3825-3834.
#'
#' [2] Sun L, Luo H, Bu D, \emph{et al}.
#' Utilizing sequence intrinsic composition to classify protein-coding and long non-coding transcripts.
#' Nucleic acids research. 2013; 41(17):e166-e166.
#'
#' [3] Han S, Liang Y, Ma Q, \emph{et al}.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing sequence intrinsic composition, structural information and physicochemical property.
#' Briefings in bioinformatics. 2019; 20(6):2009-2027.
#'
#' @importFrom seqinr getSequence
#'
#' @examples
#'
#' # Use "read.fasta" function of package "seqinr" to read a FASTA file:
#'
#' seqRNA <- seqinr::read.fasta(file =
#' "http://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/nucleotide-sample.txt")
#'
#' # Compute the MLC region using both "ORF" and "MSS" methods:
#'
#' MLC_list <- lapply(seqRNA, computeMLC, mode = c("ORF", "MSS"))
#'
#' @export

computeMLC <- function(oneRNA, mode = c("ORF", "MSS")) {
        mode <- match.arg(mode, several.ok = TRUE)
        oneRNA <- Internal.checkRNA(oneRNA)
        info <- data.frame(stringsAsFactors = F)

        if ("ORF" %in% mode) {
                info_ORF <- Internal.findORF(oneRNA, max.only = TRUE)
                info <- rbind(info, info_ORF)
        }

        if ("MSS" %in% mode) {
                info_MSS <- Internal.calculateMLC(oneRNA, logscore = LncADeep_logScore)
                info <- rbind(info, info_MSS)
        }
        info
}

#' Computation of the K-Mer Frequencies of RNA or Protein Sequences
#' @description This function can calculate the \emph{k}-mer frequencies of RNA or protein sequences.
#' Three kinds of protein representations are available.
#'
#' @param seqs sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of RNA/protein sequences.
#' RNA sequences will be converted into lower case letters, but
#' protein sequences will be converted into upper case letters.
#' Each sequence should be a vector of single characters.
#' @param seqType a string that specifies the nature of the sequence: \code{"RNA"} or \code{"Pro"} (protein).
#' If the input is DNA sequence and \code{seqType = "RNA"}, the DNA sequence will be converted to RNA sequence automatically.
#' Default: \code{"RNA"}.
#' @param computePro a string that specifies the computation mode of protein sequences: \code{"RPISeq"}, \code{"DeNovo"},
#' or \code{"rpiCOOL"}. Ignored when \code{seqType = "RNA"}.
#' Three modes indicate three different amino acid residues classifications that corresponds to methods "RPISeq", "De Novo prediction",
#' and "rpiCOOL". See details below. Default: \code{"RPISeq"}.
#' @param k an integer that indicates the sliding window step. Default: \code{3}.
#' @param EDP logical. If \code{TRUE}, entropy density profile (EDP) will be computed. Default: \code{FALSE}.
#' @param normalize can be \code{"none"}, \code{"row"} or \code{"column"}. Indicate if the frequencies should be normalized.
#' If normalize, should the features be normalized by row (each sequence) or by column (each feature)?
#' See details below. Default: \code{"none"}.
#' @param normData is the normalization data generated by this function.
#' If the input dataset is training set, or normalize strategy is \code{"none"} or \code{"row"},
#' just leave \code{normData = NULL}. If users want to build test set and the normalize strategy is \code{"column"},
#' the normalization data of the corresponding training set generated by this function should be passed to this argument.
#' See examples.
#' @param parallel.cores an integer specifying the number of cores for parallel computation. Default: \code{2}.
#' Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#'
#' @return If \code{normalize = "none"} or \code{normalize = "row"}, this function will return a data frame.
#' Row names are sequences names, and column names are polymer names.
#'
#' If \code{normalize = "column"}, the function will return a list containing features (a data frame named "feature")
#' and normalization values (a list named "normData") for extracting features for test sets.
#'
#' @details Function \code{computeFreq} calculate the \emph{k}-mer frequencies of RNA/protein sequences. Three computation modes
#' of protein frequencies are:
#'
#' \code{RPISeq}:
#' \{A, G, V\}, \{I, L, F, P\}, \{Y, M, T, S\}, \{H, N, Q, W\}, \{R, K\}, \{D, E\}, \{C\}
#' (Ref: [3]);
#'
#' \code{DeNovo}:
#' \{D, E\}, \{H, R, K\}, \{C, G, N, Q, S, T, Y\}, \{A, F, I, L, M, P, V, W\}
#' (Ref: [4]).
#'
#' \code{rpiCOOL}:
#' \{A, E\}, \{I, L, F, M, V\}, \{N, D, T, S\}, \{G\}, \{P\}, \{R, K, Q, H\}, \{Y, W\}, \{C\}
#' (Ref: [5]).
#'
#' If \code{EDP = TRUE}, entropy density profile (EDP) will be computed with equation:
#' s_\emph{i} = -1/H * c_\emph{i} * log2(c_\emph{i}), H = -sum(c_\emph{j} * log2(c_\emph{j})).
#' c is the frequencies, \emph{i} and \emph{j} represents the indices of \emph{k}-mer frequencies. (Ref: [6])
#'
#' The function also provides two normalization strategies: by row (each sequence) or by column (each feature).
#' If by row, the dataset will be processed with equation (Ref: [2]):
#' d_\emph{i} = (f_\emph{i} - min\{f_1, f_2, ...\}) / max\{f_1, f_2, ...\}.
#' f_1, f_2, ..., f_\emph{i} are the original values of each row.
#'
#' If by column, the dataset will be processed with:
#' d_\emph{i} = (f_\emph{i} - min\{f_1, f_2, ...\}) / (max\{f_1, f_2, ...\} - min\{_f1, f_2, ...\}).
#'
#' In [2], normalization is computed by row (each sequence).
#'
#' @section References:
#' [1] Han S, Yang X, Sun H, \emph{et al}.
#' LION: an integrated R package for effective prediction of ncRNA–protein interaction.
#' Briefings in Bioinformatics. 2022; 23(6):bbac420
#'
#' [2] Shen J, Zhang J, Luo X, \emph{et al}.
#' Predicting protein-protein interactions based only on sequences information.
#' Proc. Natl. Acad. Sci. U. S. A. 2007; 104:4337-41
#'
#' [3] Muppirala UK, Honavar VG, Dobbs D.
#' Predicting RNA-protein interactions using only sequence information.
#' BMC Bioinformatics 2011; 12:489
#'
#' [4] Wang Y, Chen X, Liu Z-P, \emph{et al}.
#' De novo prediction of RNA-protein interactions from sequence information.
#' Mol. BioSyst. 2013; 9:133-142
#'
#' [5] Akbaripour-Elahabad M, Zahiri J, Rafeh R, \emph{et al}.
#' rpiCOOL: A tool for In Silico RNA-protein interaction detection using random forest.
#' J. Theor. Biol. 2016; 402:1-8
#'
#' [6] Yang C, Yang L, Zhou M, \emph{et al}.
#' LncADeep: an ab initio lncRNA identification and functional annotation tool based on deep learning.
#' Bioinformatics. 2018; 34(22):3825-3834.
#'
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom seqinr count
#' @seealso \code{\link{featureFreq}}
#' @examples
#'
#' # Use "read.fasta" function of package "seqinr" to read a FASTA file:
#'
#' seqs1 <- seqinr::read.fasta(file =
#' "http://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/nucleotide-sample.txt")
#' seqFreq1 <- computeFreq(seqs1, seqType = "RNA", k = 4, normalize = "row",
#'                         parallel.cores = 2)
#'
#' data(demoPositiveSeq)
#' seqs2 <- demoPositiveSeq$Pro.positive
#'
#' # Training set with normalization on column:
#'
#' seqFreq2 <- computeFreq(seqs2, seqType = "Pro", computePro = "RPISeq", k = 3,
#'                         normalize = "column", parallel.cores = 2)
#'
#' # If build a test set with normalization on column,
#' # "normData" of the corresponding training set (generated by this function) is required:
#'
#' seqFreq3 <- computeFreq(seqs2, seqType = "Pro", computePro = "RPISeq", k = 3,
#'                         normalize = "column", normData = seqFreq2$normData,
#'                         parallel.cores = 2)
#'
#' # If no normalization used:
#'
#' seqFreq4 <- computeFreq(seqs2, seqType = "Pro", computePro = "DeNovo", k = 3,
#'                         normalize = "none", parallel.cores = 2)
#'
#' @export

computeFreq <- function(seqs, seqType = c("RNA", "Pro"),
                        computePro = c("RPISeq", "DeNovo", "rpiCOOL"), k = 3, EDP = FALSE,
                        normalize = c("none", "row", "column"), normData = NULL,
                        parallel.cores = 2, cl = NULL) {

        seqType <- match.arg(seqType)
        computePro <- match.arg(computePro)
        normalize <- match.arg(normalize)

        flagComputeNorm <- FALSE
        if (normalize == "column" & is.null(normData)) flagComputeNorm <- TRUE

        close_cl <- FALSE
        if (is.null(cl)) {
                message("+ Initializing...  ", Sys.time())
                message("- Creating cores...  ")
                parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
                cl <- parallel::makeCluster(parallel.cores)
                close_cl <- TRUE
        }
        parallel::clusterExport(cl, c("Internal.checkRNA", "Internal.computeEDP"), envir = environment())

        if (seqType == "RNA") {
                message("\n", "+ Calculating RNA sequences...", Sys.time(), "\n", "- The value of k is ", k, " (Normalize: ", normalize, ").")
        } else {
                message("\n", "+ Calculating protein sequences...", Sys.time(), "\n",  "- The value of k is ", k, " (Normalize: ", normalize, ", Mode: ", computePro, ").")
        }

        features <- parallel::parSapply(cl, seqs, Internal.computeFreq, seqType = seqType,
                                        computePro = computePro, k = k, EDP = EDP)
        if (close_cl) parallel::stopCluster(cl)

        features <- t(features)

        if (normalize == "none") {
                out_features <- as.data.frame(features)
        } else if (normalize == "column") {
                if (flagComputeNorm) {
                        normMin <- apply(features, 2, min)
                        normMax <- apply(features, 2, max)
                } else {
                        normMin <- normData$normMin
                        normMax <- normData$normMax
                }

                freq.out <- apply(features, 1, function(x) {
                        out <- (x - normMin) / (normMax - normMin)
                })
                features <- as.data.frame(t(freq.out))
                features <- Internal.checkNa(features)
                out_features <- list(feature = features,
                                     normData = list(seqType = seqType,
                                                     normMin = normMin, normMax = normMax))
        } else {
                normMin <- apply(features, 1, min)
                normMax <- apply(features, 1, max)

                freq.out <- apply(features, 2, function(x) {
                        out <- (x - normMin) / normMax
                })
                out_features <- as.data.frame(freq.out)
                features <- Internal.checkNa(features)
                }

        # message("\n", "+ Completed.  ", Sys.time(), "\n")
        out_features
}

#' Extraction of the \emph{K}-Mer Features of RNA and Protein Sequences
#' @description Basically a wrapper for \code{\link{computeFreq}} function.
#' This function can calculate the \emph{k}-mer frequencies of RNA and protein sequences at the same time
#' and format the results as the dataset that can be used to build classifier.
#'
#' @param seqRNA RNA sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of RNA sequences.
#' RNA sequences will be converted into lower case letters.
#' Each sequence should be a vector of single characters.
#' @param seqPro protein sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of protein sequences.
#' Protein sequences will be converted into upper case letters.
#' Each sequence should be a vector of single characters.
#' @param label optional. A string or a vector of strings or \code{NULL}.
#' Indicates the class of the samples such as
#' "Interact", "Non.Interact". Default: \code{NULL}.
#' @param featureMode a string that can be \code{"concatenate"} or \code{"combine"}.
#' If \code{"concatenate"}, the \emph{k}-mer features of RNA and proteins will be simply concatenated.
#' If \code{"combine"}, the returned dataset will be formed by combining the \emph{k}-mer features of RNA and proteins.
#' See details below. Default: \code{"concatenate"}.
#' @param computePro a string that specifies the computation mode of protein sequence: \code{"RPISeq"}, \code{"DeNovo"},
#' or \code{"rpiCOOL"}. Ignored when \code{seqType = "RNA"}.
#' Three modes indicate three different amino acid residues classifications that corresponds to the methods "RPISeq", "De Novo prediction",
#' and "rpiCOOL". See details below. Default: \code{"RPISeq"}.
#' @param k.Pro an integer that indicates the sliding window step of RNA sequences. Default: \code{4}.
#' @param k.RNA an integer that indicates the sliding window step of protein sequences. Default: \code{3}.
#' @param EDP logical. If \code{TRUE}, entropy density profile (EDP) will be computed. Default: \code{FALSE}.
#' @param normalize can be \code{"none"}, \code{"row"} or \code{"column"}. Indicate if the frequencies should be normalized.
#' If normalize, should the features be normalized by row (each sequence) or by column (each feature)?
#' See details below. Default: \code{"none"}.
#' @param normData is the normalization data generated by this function.
#' If the input dataset is training set, or normalize strategy is \code{"none"} or \code{"row"},
#' just leave \code{normData = NULL}. If users want to build test set and the normalize strategy is \code{"column"},
#' the normalization data of the corresponding training set generated by this function should be passed to this argument.
#' See examples.
#' @param parallel.cores an integer specifying the number of cores for parallel computation. Default: \code{2}.
#' Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#'
#' @return If \code{normalize = "none"} or \code{normalize = "row"}, this function will return a data frame.
#' Row names are sequences names, and column names are polymer names.
#' The names of RNA and protein sequences are separated with ".",
#' i.e. row names format: "\emph{RNASequenceName}.\emph{proteinSequenceName}" (e.g. "YDL227C.YOR198C").
#' If \code{featureMode = "combine"}, the polymers of RNA and protein sequences are also separated with ".",
#' i.e. column format: "\emph{RNAPolymerName}.\emph{proteinPolymerName}" (e.g. "aa.CCA").
#'
#' If \code{normalize = "column"}, the function will return a list containing features (a data frame named "feature") and normalization
#' values (a list named "normData") for extracting features for test sets.
#'
#' @details
#' see \code{\link{computeFreq}}.
#'
#' @section References:
#' [1] Han S, Yang X, Sun H, \emph{et al}.
#' LION: an integrated R package for effective prediction of ncRNA–protein interaction.
#' Briefings in Bioinformatics. 2022; 23(6):bbac420
#'
#' [2] Shen J, Zhang J, Luo X, \emph{et al}.
#' Predicting protein-protein interactions based only on sequences information.
#' Proc. Natl. Acad. Sci. U. S. A. 2007; 104:4337-41
#'
#' [3] Muppirala UK, Honavar VG, Dobbs D.
#' Predicting RNA-protein interactions using only sequence information.
#' BMC Bioinformatics 2011; 12:489
#'
#' [4] Wang Y, Chen X, Liu Z-P, \emph{et al}.
#' De novo prediction of RNA-protein interactions from sequence information.
#' Mol. BioSyst. 2013; 9:133-142
#'
#' [5] Akbaripour-Elahabad M, Zahiri J, Rafeh R, \emph{et al}.
#' rpiCOOL: A tool for In Silico RNA-protein interaction detection using random forest.
#' J. Theor. Biol. 2016; 402:1-8
#'
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom seqinr count
#' @seealso \code{\link{computeFreq}}
#' @examples
#' data(demoPositiveSeq)
#' seqsRNA <- demoPositiveSeq$RNA.positive
#' seqsPro <- demoPositiveSeq$Pro.positive
#'
#' dataset1 <- featureFreq(seqRNA = seqsRNA, seqPro = seqsPro, label = "Interact",
#'                         featureMode = "comb", computePro = "DeNovo", k.Pro = 3,
#'                         k.RNA = 2, normalize = "row", parallel.cores = 2)
#'
#' # Training set with normalization on column:
#'
#' dataset2 <- featureFreq(seqRNA = seqsRNA, seqPro = seqsPro, featureMode = "conc",
#'                         computePro = "rpiCOOL", k.Pro = 3, k.RNA = 4,
#'                         normalize = "column", parallel.cores = 2)
#'
#' # If build a test set with normalization on column,
#' # "normData" of the corresponding training set (generated by this function) is required:
#'
#' dataset3 <- featureFreq(seqRNA = seqsRNA, seqPro = seqsPro, featureMode = "conc",
#'                         computePro = "rpiCOOL", k.Pro = 3, k.RNA = 4,
#'                         normalize = "column", normData = dataset2$normData,
#'                         parallel.cores = 2)
#'
#' @export

featureFreq <- function(seqRNA, seqPro, label = NULL, featureMode = c("concatenate", "combine"),
                        computePro = c("RPISeq", "DeNovo", "rpiCOOL"), k.Pro = 3, k.RNA = 4, EDP = FALSE,
                        normalize = c("none", "row", "column"), normData = NULL,
                        parallel.cores = 2, cl = NULL) {

        if (length(seqRNA) != length(seqPro)) stop("The number of RNA sequences should match the number of protein sequences!")
        if (!is.null(label)) {
                if (length(label) == 1) {
                        label <- rep(label, length(seqPro))
                }
                if (length(label) != length(seqPro)) stop("The length of label should be one or match the length of sequences!")
        }

        featureMode <- match.arg(featureMode)
        computePro <- match.arg(computePro)
        normalize <- match.arg(normalize)

        close_cl <- FALSE
        if (is.null(cl)) {
                message("+ Initializing...  ", Sys.time())
                message("- Creating cores...  ")
                parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
                cl <- parallel::makeCluster(parallel.cores)
                close_cl <- TRUE
        }

        if (normalize == "column" & !is.null(normData)) {
                featureRNA <- computeFreq(seqs = seqRNA, seqType = "RNA", k = k.RNA, EDP = EDP,
                                          normalize = normalize, normData = normData$normData_RNA,
                                          cl = cl)
                featurePro <- computeFreq(seqs = seqPro, seqType = "Pro", k = k.Pro, EDP = EDP,
                                          normalize = normalize, computePro = computePro,
                                          normData = normData$normData_Pro,
                                          cl = cl)
        } else {
                featureRNA <- computeFreq(seqs = seqRNA, seqType = "RNA", k = k.RNA, EDP = EDP,
                                          normalize = normalize, normData = NULL,
                                          cl = cl)
                featurePro <- computeFreq(seqs = seqPro, seqType = "Pro", k = k.Pro, EDP = EDP,
                                          normalize = normalize, computePro = computePro,
                                          normData = NULL, cl = cl)
        }

        if (close_cl) parallel::stopCluster(cl)

        if (normalize == "column") {
                normData <- list(normData_RNA = featureRNA$normData, normData_Pro = featurePro$normData)
                featureRNA <- featureRNA$feature
                featurePro <- featurePro$feature
        }

        sequenceName <- paste(names(seqRNA), names(seqPro), sep = ".")

        if (featureMode == "combine") {
                featureName <- sapply(names(featureRNA), function(nameRNA) {
                        names <- paste(nameRNA, names(featurePro), sep = ".")
                })
                featureName <- as.character(featureName)
                featureValue <- mapply(Internal.combineMotifs, oneRNA = as.data.frame(t(featureRNA)),
                                       onePro = as.data.frame(t(featurePro)))
                features <- as.data.frame(t(featureValue), row.names = sequenceName)
                names(features) <- featureName
        } else {
                features <- cbind(featureRNA, featurePro)
                row.names(features) <- make.names(sequenceName, unique = TRUE)
        }

        if (!is.null(label)) features <- data.frame(label = label, features)

        if (normalize == "column") {
                features <- list(feature = features, normData = normData)
        }
        features
}