#' Predict RNA-Protein Interaction Using lncPro Method
#' @description This function can predict lncRNA/RNA-protein interactions using lncPro method. Model retraining and feature extraction are also supported.
#' Programs "RNAsubopt" from software "ViennaRNA Package" and "Predator" is required. Please also note that
#' "Predator" is only available on UNIX/Linux and 32-bit Windows OS.
#'
#' @param seqRNA RNA sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}.
#' Or a list of RNA/protein sequences.
#' RNA sequences will be converted into lower case letters.
#' @param seqPro protein sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}.
#' Or a list of protein sequences.
#' Protein sequences will be converted into upper case letters.
#' Each sequence should be a vector of single characters.
#' @param mode a string. Set \code{"prediction"} to predict ncRNA-protein pairs and return prediction results;
#' set \code{"retrain"} to build a new random forest model using the input data;
#' set \code{"feature"} to return a data frame contains the extracted features.
#' Users can use the extracted features generated by \code{mode = "feature"} to train classifiers
#' with other machine learning algorithms. Default: \code{"prediction"}.
#' @param args.RNAsubopt,args.Predator string (in format such as "-N --pfScale 1.07") specifying additional arguments for "RNAsubopt" (except "-p") and "Predator" (except "-a" and "-b"). This is used when you want to control their behaviours. Arguments for "RNAsubopt" and "Predator" please refer to their manual. Default: \code{NULL}.
#' @param path.RNAsubopt,path.Predator a string specifying the location of "RNAsubopt" and "Predator" program.
#' @param path.stride a string specifying the location of file "stride.dat" required by program Predator.
#' @param workDir.Pro a string specifying the directory for temporary files used for process protein sequences.
#' The temp files will be deleted automatically when
#' the calculation is completed. If the directory does not exist, it will be created automatically.
#' @param prediction (only when \code{mode = "prediction"}) set \code{"original"} to use original lncPro algorithm,
#' or set \code{"retrained"} to call retrained model.
#' The retrained model is constructed with the same features as the original version, but random classifier is employed to
#' build the classifier.
#' @param retrained.model (only when \code{mode = "prediction"} and \code{prediction = "retrained"})
#' use the default model or a new retrained model to predict ncRNA-protein pairs?
#' If \code{NULL}, default machine learning model will be used. Or pass the model generated by this function
#' with parameter \code{"mode = retrain"}. Default: \code{NULL}. See examples below.
#' @param label a string or a vector of strings or \code{NULL}.
#' Optional when \code{mode = "prediction"} or \code{mode = "feature"}: used to give labels or notes to the output result.
#' Required when \code{mode = "retrain"}: must be a vector of strings that corresponds to input sequences.
#' Each string indicates the class of each input pair. Default: \code{NULL}.
#' @param positive.class (only when \code{mode = "retrain"}) \code{NULL} or a string used to indicate
#' which class is the positive class, Should be one
#' of the classes in \code{label} or leave \code{positive.class = NULL}.
#' In the latter case, the first class in \code{label} will be used
#' as the positive class. Default: \code{NULL}.
#' @param folds.num (only when \code{mode = "retrain"}) an integer indicates the number of folds for cross validation.
#' Default: \code{10} for 10-fold cross validation.
#' @param ntree integer, number of trees to grow. See \code{\link[randomForest]{randomForest}}.
#' Default: \code{3000}.
#' @param mtry.ratios (only when \code{mode = "retrain"}) used to indicate the ratios of \code{mtry} when tuning the random forest classifier.
#' \code{mtry} = ratio of mtry * number of features Default: \code{c(0.1, 0.2, 0.4, 0.6, 0.8)}.
#' @param seed (only when \code{mode = "retrain"}) an integer indicates the random seed for data splitting.
#' @param parallel.cores an integer that indicates the number of cores for parallel computation.
#' Default: \code{2}. Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#' @param ... (only when \code{mode = "retrain"}) other parameters (except \code{ntree} and \code{mtry}) passed to \code{\link[randomForest]{randomForest}} function.
#'
#' @return
#' If \code{mode = "prediction"}, this function returns a data frame that contains the predicted results.
#'
#' If \code{mode = "retrain"}, this function returns a random forest classifier.
#'
#' If \code{mode = "feature"}, this function returns a data frame that contains the extracted features.
#'
#' @details
#' The method is proposed by lncPro. This function, \code{runlncPro}, has
#' improved and fixed the original code.
#'
#' \code{runlncPro} depends on the program "RNAsubopt" of software "ViennaRNA"
#' (\url{http://www.tbi.univie.ac.at/RNA/index.html}) and "Predator"
#' (\url{https://bioweb.pasteur.fr/packages/pack@predator@2.1.2}).
#'
#' Parameter \code{path.RNAsubopt} can be simply defined as \code{"RNAsubopt"} as
#' default when the OS is UNIX/Linux. However, for some OS, such as Windows, users may
#' need to specify the \code{path.RNAsubopt} if the path of "RNAsubopt" haven't been
#' added in environment variables (e.g. \code{path.RNAsubopt = '"C:/Program Files/ViennaRNA/RNAsubopt.exe"'}).
#'
#' Program "Predator" is only available on UNIX/Linux and 32-bit Windows OS.
#'
#' @section References:
#' Lu Q, Ren S, Lu M, \emph{et al}.
#' Computational prediction of associations between long non-coding RNAs and proteins.
#' BMC Genomics 2013; 14:651
#'
#' @importFrom caret createFolds
#' @importFrom caret confusionMatrix
#' @importFrom randomForest randomForest
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parLapply
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom parallel mcmapply
#' @importFrom seqinr a
#' @importFrom seqinr s2c
#' @importFrom seqinr getSequence
#' @importFrom seqinr write.fasta
#' @importFrom utils data
#' @importFrom stats predict
#'
#' @examples
#'
#' \donttest{
#'
#' # Following codes only show how to use this function
#' # and cannot reflect the genuine performance of tools or classifiers.
#'
#' data(demoPositiveSeq)
#' seqRNA <- demoPositiveSeq$RNA.positive
#' seqPro <- demoPositiveSeq$Pro.positive
#'
#' # Predicting ncRNA-protein pairs (you need to use your own paths):
#'
#' path.RNAsubopt <- "RNAsubopt"
#' path.Predator <- "/mnt/external_drive_1/hansy/predator/predator"
#' path.stride <- "/mnt/external_drive_1/hansy/predator/stride.dat"
#' workDir.Pro <- "tmp"
#'
#' Res_lncPro_1 <- run_lncPro(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' path.RNAsubopt = path.RNAsubopt, path.Predator = path.Predator,
#' path.stride = path.stride, workDir.Pro = workDir.Pro,
#' prediction = "original", label = "lncPro_original",
#' parallel.cores = 10) # using original algorithm
#'
#' Res_lncPro_2 <- run_lncPro(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' path.RNAsubopt = path.RNAsubopt, path.Predator = path.Predator,
#' path.stride = path.stride, workDir.Pro = workDir.Pro,
#' prediction = "retrained", retrained.model = NULL,
#' label = "lncPro_retrained",
#' parallel.cores = 10) # using default rebuilt model
#'
#' # Train a new model:
#'
#' # Argument "label" which indicates the class of each input pair is required here.
#' # "label" should correspond to the classes of "seqRNA" and "seqPro".
#' # "positive.class" should be one of the classes in argument "label" or can be set as "NULL".
#' # In the latter case, the first label in "label" will be used as the positive class.
#' # Parameters of random forest, such as "replace", "nodesize", can be passed using "..." argument.
#'
#' lncPro_model = run_lncPro(seqRNA = seqRNA, seqPro = seqPro, mode = "retrain",
#' path.RNAsubopt = path.RNAsubopt, path.Predator = path.Predator,
#' path.stride = path.stride, workDir.Pro = workDir.Pro,
#' label = rep(c("Interact", "Non.Interact"), each = 10),
#' positive.class = NULL, folds.num = 10,
#' ntree = 100, seed = 1, parallel.cores = 2, replace = FALSE)
#'
#' # Predicting using new built model by setting "retrained.model = lncPro_model":
#'
#' Res_lncPro_3 <- run_lncPro(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' path.RNAsubopt = path.RNAsubopt, path.Predator = path.Predator,
#' path.stride = path.stride, workDir.Pro = workDir.Pro,
#' prediction = "retrained", retrained.model = lncPro_model,
#' label = rep(c("Interact", "Non.Interact"), each = 10),
#' parallel.cores = 10)
#'
#' # Only extracting features:
#'
#' lncPro_feature_df <- run_lncPro(seqRNA = seqRNA, seqPro = seqPro,
#' mode = "feature", path.RNAsubopt = path.RNAsubopt,
#' path.Predator = path.Predator, path.stride = path.stride,
#' workDir.Pro = workDir.Pro, label = "Interact",
#' parallel.cores = 10)
#'
#' # Extracted features can be used to build classifiers using other machine learning
#' # algorithms, which provides users with more flexibility.
#'
#' }
#'
#' @export
run_lncPro <- function(seqRNA, seqPro, mode = c("prediction", "retrain", "feature"),
args.RNAsubopt = NULL, args.Predator = NULL,
path.RNAsubopt = "RNAsubopt", path.Predator = "Predator/predator",
path.stride = "Predator/stride.dat", workDir.Pro = getwd(),
prediction = c("original", "retrained"), retrained.model = NULL,
label = NULL, positive.class = NULL, folds.num = 10,
ntree = 3000, mtry.ratios = c(0.1, 0.2, 0.4, 0.6, 0.8), seed = 1,
parallel.cores = 2, cl = NULL, ...) {
mode <- match.arg(mode, choices = c("prediction", "retrain", "feature"))
prediction <- match.arg(prediction)
if (!file.exists(path.stride)) stop("The path of stride.dat is not correct! Please check parameter path.stride.")
if (length(seqRNA) != length(seqPro)) stop("The number of RNA sequences should match the number of protein sequences!")
if (!is.null(label)) {
if (length(label) == 1) {
label <- rep(label, length(seqPro))
}
if (length(label) != length(seqPro)) stop("The length of label should be one or match the length of sequences!")
}
if (mode == "retrain") {
if (length(label) != length(seqRNA) | length(unique(label)) != 2) {
stop("label is required and should correspond to input sequences!")
}
if (!is.null(positive.class)) {
if (!positive.class %in% label) stop("positive.class should be NULL or one of the classes in label.")
}
}
message("+ Initializing... ", Sys.time())
message("- Checking sequences... ")
check_idx <- which(lengths(seqRNA) <= 4095)
if (length(check_idx) < length(seqRNA)) {
message(" * Due to the limitation of RNAsubopt,")
message(" sequences with length more than 4095 nt will be omitted.")
message(" ", length(seqRNA) - length(check_idx), " of ", length(seqRNA), " pairs have been removed.")
seqRNA <- seqRNA[check_idx]
seqPro <- seqPro[check_idx]
if (!is.null(label)) label <- label[check_idx]
}
check_idx <- which(lengths(seqPro) >= 30)
if (length(check_idx) < length(seqPro)) {
message(" * Due to the limitation of predator,")
message(" sequences with length less than 30 amino acids will be omitted.")
message(" ", length(seqPro) - length(check_idx), " of ", length(seqPro), " pairs have been removed.")
seqRNA <- seqRNA[check_idx]
seqPro <- seqPro[check_idx]
if (!is.null(label)) label <- label[check_idx]
}
close_cl <- FALSE
if (is.null(cl)) {
message("- Creating cores... ")
parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
cl <- parallel::makeCluster(parallel.cores)
close_cl <- TRUE
}
# parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
# cl <- parallel::makeCluster(parallel.cores)
seqRNA <- parallel::parLapply(cl, seqRNA, Internal.checkRNA)
data(aaindex, package = "seqinr", envir = environment())
aaindex <- get("aaindex")
if (mode == "prediction" & prediction == "original") {
message("\n", "+ Extracting features from RNA sequences... ", Sys.time())
lncProFeatures.RNA <- Internal.lncPro_extractFeatures(cl = cl, seqs = seqRNA, seqType = "RNA",
args.RNAsubopt = args.RNAsubopt,
path.RNAsubopt = path.RNAsubopt,
aaindex = aaindex)
message("\n", "+ Extracting features from protein sequences... ", Sys.time())
lncProFeatures.Pro <- Internal.lncPro_extractFeatures(cl = cl, seqs = seqPro, seqType = "Pro",
args.Predator = args.Predator,
path.Predator = path.Predator,
path.stride = path.stride,
workDir.Pro = workDir.Pro, aaindex = aaindex)
if (close_cl) parallel::stopCluster(cl)
lncProFeatures.RNA.df <- data.frame(lncProFeatures.RNA, stringsAsFactors = FALSE)
lncProFeatures.Pro.df <- data.frame(lncProFeatures.Pro, stringsAsFactors = FALSE)
} else {
featureSetPhysChem <- featurePhysChem(seqRNA = seqRNA, seqPro = seqPro,
cl = cl, Fourier.len = 10, physchemRNA = c("hydrogenBonding", "vanderWaal"),
physchemPro = c("polarity.Grantham", "polarity.Zimmerman",
"hphob.KyteDoolittle", "hphob.BullBreese"))
featureSetStruct <- featureStructure(seqRNA = seqRNA, seqPro = seqPro,
cl = cl, structureRNA.num = 6, structurePro = "ChouFasman",
Fourier.len = 10, workDir.Pro = workDir.Pro,
args.RNAsubopt = args.RNAsubopt,
args.Predator = args.Predator,
path.RNAsubopt = path.RNAsubopt,
path.Predator = path.Predator,
path.stride = path.stride)
if (close_cl) parallel::stopCluster(cl)
featureSet <- cbind(featureSetPhysChem, featureSetStruct)
}
if (length(names(seqRNA)) != 0) names.seqRNA <- names(seqRNA) else names.seqRNA <- "RNA.noName"
if (length(names(seqPro)) != 0) names.seqPro <- names(seqPro) else names.seqPro <- "Protein.noName"
if (mode == "prediction") {
if (prediction == "original") {
message("\n", "+ Predicting pairs using lncPro... ", Sys.time())
message("- Original algorithm selected. ", Sys.time())
scores <- t(parallel::mcmapply(Internal.lncPro_finalScore, lncProFeatures.RNA.df,
lncProFeatures.Pro.df, mc.cores = parallel.cores))
Prediction <- ifelse(scores[,1] > 50, "Interact", "Non.Interact")
if (is.null(label)) {
outRes <- data.frame(RNA_Name = names.seqRNA, Pro_Name = names.seqPro,
lncPro_original_pred = Prediction,
lncPro_original_prob = scores[,1]/100,
stringsAsFactors = FALSE)
} else {
outRes <- data.frame(RNA_Name = names.seqRNA, Pro_Name = names.seqPro,
label = label, lncPro_original_pred = Prediction,
lncPro_original_prob = scores[,1]/100,
stringsAsFactors = FALSE)
}
} else {
message("\n", "+ Predicting pairs using lncPro... ", Sys.time())
if (is.null(retrained.model)) {
message("- Default retrained model selected. ", Sys.time())
data(mod_lncPro, package = "LION", envir = environment())
mod_lncPro <- get("mod_lncPro")
} else {
message("- Using model provided by user. ", Sys.time())
mod_lncPro <- retrained.model
}
res <- stats::predict(mod_lncPro, featureSet, type = "response")
prob <- stats::predict(mod_lncPro, featureSet, type = "prob")
if (is.null(label)) {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro,
lncPro_retrain_pred = as.character(res),
lncPro_retrain_prob = prob[,1])
outRes <- data.frame(outRes, stringsAsFactors = F)
} else {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro, label = label,
lncPro_retrain_pred = as.character(res),
lncPro_retrain_prob = prob[,1])
outRes <- data.frame(outRes, stringsAsFactors = F)
}
}
message("\n", "+ Completed. ", Sys.time())
return(outRes)
} else if (mode == "feature") {
if (!is.null(label)) featureSet <- cbind(label = label, featureSet)
message("\n", "+ Completed. ", Sys.time())
return(featureSet)
} else {
featureSet <- cbind(label = label, featureSet)
message("\n", "+ Retraining model... ", Sys.time())
retrained.model <- Internal.randomForest_tune(datasets = list(featureSet), label.col = 1,
positive.class = positive.class, folds.num = folds.num,
ntree = ntree,
mtry.ratios = mtry.ratios,
seed = seed, return.model = TRUE,
parallel.cores = parallel.cores, ...)
message("\n", "+ Completed. ", Sys.time())
return(retrained.model)
}
}
#' Predict RNA-Protein Interaction Using RPISeq Method
#' @description This function can predict lncRNA/RNA-protein interactions using RPISeq method.
#' Both the web-based original version and retrained model are available. Network is required to use
#' the original version. Model retraining and feature extraction are also supported.
#'
#' @param seqRNA RNA sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}.
#' Or a list of RNA/protein sequences.
#' RNA sequences will be converted into lower case letters.
#' @param seqPro protein sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}.
#' Or a list of protein sequences.
#' Protein sequences will be converted into upper case letters.
#' Each sequence should be a vector of single characters.
#'
#' @param mode a string. Set \code{"prediction"} to predict ncRNA-protein pairs and return prediction results;
#' set \code{"retrain"} to build a new random forest model using the input data;
#' set \code{"feature"} to return a data frame contains the extracted features.
#' Users can use the extracted features generated by \code{mode = "feature"} to train classifiers
#' with other machine learning algorithms. Default: \code{"prediction"}.
#' @param prediction (only when \code{mode = "prediction"}) set \code{"web"} to use original web-based RPISeq algorithm (network is required),
#' or set \code{"retrained"} to call retrained model.
#' @param retrained.model (only when \code{mode = "prediction"} and \code{prediction = "retrained"})
#' use the default model or a new retrained model to predict ncRNA-protein pairs?
#' If \code{NULL}, default machine learning model will be used. Or pass the model generated by this function
#' with parameter \code{"mode = retrain"}. Default: \code{NULL}. See examples below.
#' @param label a string or a vector of strings or \code{NULL}.
#' Optional when \code{mode = "prediction"} or \code{mode = "feature"}: used to give labels or notes to the output result.
#' Required when \code{mode = "retrain"}: must be a vector of strings that corresponds to input sequences.
#' Each string indicates the class of each input pair. Default: \code{NULL}.
#' @param positive.class (only when \code{mode = "retrain"}) \code{NULL} or a string used to indicate
#' which class is the positive class, Should be one
#' of the classes in \code{label} or leave \code{positive.class = NULL}.
#' In the latter case, the first class in \code{label} will be used
#' as the positive class. Default: \code{NULL}.
#' @param folds.num (only when \code{mode = "retrain"}) an integer indicates the number of folds for cross validation.
#' Default: \code{10} for 10-fold cross validation.
#' @param ntree integer, number of trees to grow. See \code{\link[randomForest]{randomForest}}.
#' Default: \code{3000}.
#' @param mtry.ratios (only when \code{mode = "retrain"}) used to indicate the ratios of \code{mtry} when tuning the random forest classifier.
#' \code{mtry} = ratio of mtry * number of features Default: \code{c(0.1, 0.2, 0.4, 0.6, 0.8)}.
#' @param seed (only when \code{mode = "retrain"}) an integer indicates the random seed for data splitting.
#' @param parallel.cores an integer that indicates the number of cores for parallel computation.
#' Default: \code{2}. Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#' @param ... (only when \code{mode = "retrain"}) other parameters (except \code{ntree} and \code{mtry}) passed to \code{\link[randomForest]{randomForest}} function.
#'
#' @return
#' If \code{mode = "prediction"}, this function returns a data frame that contains the predicted results.
#'
#' If \code{mode = "retrain"}, this function returns a random forest classifier.
#'
#' If \code{mode = "feature"}, this function returns a data frame that contains the extracted features.
#'
#' @section References:
#' Muppirala UK, Honavar VG, Dobbs D.
#' Predicting RNA-protein interactions using only sequence information.
#' BMC Bioinformatics 2011; 12:489
#'
#' @importFrom caret createFolds
#' @importFrom caret confusionMatrix
#' @importFrom randomForest randomForest
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom RCurl postForm
#' @importFrom stats predict
#'
#' @examples
#'
#' # Following codes only show how to use this function
#' # and cannot reflect the genuine performance of tools or classifiers.
#'
#' data(demoPositiveSeq)
#' seqRNA <- demoPositiveSeq$RNA.positive
#' seqPro <- demoPositiveSeq$Pro.positive
#'
#' # Predicting ncRNA-protein pairs:
#'
#' Res_RPISeq_1 <- run_RPISeq(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' label = "Interact", prediction = "web",
#' parallel.cores = 2) # using web server
#'
#' Res_RPISeq_2 <- run_RPISeq(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' prediction = "retrained", retrained.model = NULL,
#' parallel.cores = 2) # using default rebuilt model
#'
#' # Train a new model:
#'
#' # Argument "label" which indicates the class of each input pair is required here.
#' # "label" should correspond to the classes of "seqRNA" and "seqPro".
#' # "positive.class" should be one of the classes in argument "label" or can be set as "NULL".
#' # In the latter case, the first label in "label" will be used as the positive class.
#' # Parameters of random forest, such as "nodesize", can be passed using "..." argument.
#'
#' RPI_model <- run_RPISeq(seqRNA = seqRNA, seqPro = seqPro, mode = "retrain",
#' label = rep(c("Interact", "Non.Interact"), each = 10),
#' positive.class = "Interact", folds.num = 5,
#' ntree = 300, seed = 1, parallel.cores = 2, nodesize = 2)
#'
#' # Predicting using new built model by setting "retrained.model = RPI_model":
#'
#' Res_RPISeq_3 <- run_RPISeq(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' prediction = "retrained", retrained.model = RPI_model,
#' label = rep(c("Interact", "Non.Interact"), each = 10),
#' parallel.cores = 2)
#'
#' # Only extracting features:
#'
#' RPISeq_feature_df <- run_RPISeq(seqRNA = seqRNA, seqPro = seqPro, mode = "feature",
#' label = "Interact", parallel.cores = 2)
#'
#' # Extracted features can be used to build classifiers using other machine learning
#' # algorithms, which provides users with more flexibility.
#'
#' @export
run_RPISeq <- function(seqRNA, seqPro, mode = c("prediction", "retrain", "feature"),
prediction = c("web", "retrained"), retrained.model = NULL,
label = NULL, positive.class = NULL, folds.num = 10,
ntree = 3000, mtry.ratios = c(0.1, 0.2, 0.4, 0.6, 0.8), seed = 1,
parallel.cores = 2, cl = NULL, ...) {
mode <- match.arg(mode, choices = c("prediction", "retrain", "feature"))
prediction <- match.arg(prediction)
if (length(seqRNA) != length(seqPro)) stop("The number of RNA sequences should match the number of protein sequences!")
if (!is.null(label)) {
if (length(label) == 1) {
label <- rep(label, length(seqPro))
}
if (length(label) != length(seqPro)) stop("The length of label should be one or match the length of sequences!")
}
if (mode == "retrain") {
if (length(label) != length(seqRNA) | length(unique(label)) != 2) {
stop("label is required and should correspond to input sequences!")
}
if (!is.null(positive.class)) {
if (!positive.class %in% label) stop("positive.class should be NULL or one of the classes in label.")
}
}
message("+ Initializing... ", Sys.time())
# message("- Creating cores... ")
#
# parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
# cl <- parallel::makeCluster(parallel.cores)
close_cl <- FALSE
if (is.null(cl)) {
message("- Creating cores... ")
parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
cl <- parallel::makeCluster(parallel.cores)
close_cl <- TRUE
}
seqRNA <- parallel::parLapply(cl, seqRNA, Internal.checkRNA)
if (length(names(seqRNA)) != 0) names.seqRNA <- names(seqRNA) else names.seqRNA <- "RNA.noName"
if (length(names(seqPro)) != 0) names.seqPro <- names(seqPro) else names.seqPro <- "Protein.noName"
if (mode %in% c("retrain", "feature") | prediction == ("retrained")) {
featureSet <- featureFreq(seqRNA = seqRNA, seqPro = seqPro,
featureMode = "conc", computePro = "RPISeq", k.Pro = 3,
k.RNA = 4, normalize = "row", cl = cl)
}
if (mode == "prediction") {
message("\n", "+ Predicting pairs using RPISeq... ", Sys.time())
if (prediction == "web") {
message("- Original algorithm (web server-based) selected.")
ProSeq <- sapply(seqPro, seqinr::getSequence, as.string = T)
RNASeq <- sapply(seqRNA, seqinr::getSequence, as.string = T)
parallel::clusterExport(cl, c("ProSeq", "RNASeq",
"Internal.get_RPISeqWeb_res"), envir = environment())
RPISeq <- parallel::parSapply(cl, 1:length(ProSeq), function(pairNum) {
RPISeq_res <- Internal.get_RPISeqWeb_res(ProSeq[[pairNum]], RNASeq[[pairNum]])
RPISeq_res
})
RPISeq <- t(RPISeq)
RPISeq <- data.frame(RPISeq, stringsAsFactors = F)
RFres <- ifelse(RPISeq$RF_prob > 0.5, "Interact", "Non.Interact")
SVMres <- ifelse(RPISeq$SVM_prob > 0.5, "Interact", "Non.Interact")
if (is.null(label)) {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro,
RPISeq_Web_RF_pred = RFres, RPISeq_Web_SVM_pred = SVMres,
RPISeq_Web_RF_prob = RPISeq$RF_prob,
RPISeq_Web_SVM_prob = RPISeq$SVM_prob)
outRes <- data.frame(outRes, stringsAsFactors = F)
} else {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro, label = label,
RPISeq_Web_RF_pred = RFres, RPISeq_Web_SVM_pred = SVMres,
RPISeq_Web_RF_prob = RPISeq$RF_prob,
RPISeq_Web_SVM_prob = RPISeq$SVM_prob)
outRes <- data.frame(outRes, stringsAsFactors = F)
}
} else {
if (is.null(retrained.model)) {
message("- Default retrained model selected.")
data(mod_RPISeq, package = "LION", envir = environment())
mod_RPISeq <- get("mod_RPISeq")
} else {
message("- Using model provided by user. ", Sys.time())
mod_RPISeq <- retrained.model
}
res <- stats::predict(mod_RPISeq, featureSet, type = "response")
prob <- stats::predict(mod_RPISeq, featureSet, type = "prob")
if (is.null(label)) {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro,
RPISeq_retrain_pred = as.character(res),
RPISeq_retrain_prob = prob[,1])
outRes <- data.frame(outRes, stringsAsFactors = F)
} else {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro, label = label,
RPISeq_retrain_pred = as.character(res), RPISeq_retrain_prob = prob[,1])
outRes <- data.frame(outRes, stringsAsFactors = F)
}
}
row.names(outRes) <- NULL
if (close_cl) parallel::stopCluster(cl)
message("\n", "+ Completed. ", Sys.time())
return(outRes)
} else if (mode == "feature") {
if (close_cl) parallel::stopCluster(cl)
if (!is.null(label)) featureSet <- cbind(label = label, featureSet)
message("\n", "+ Completed. ", Sys.time())
return(featureSet)
} else {
if (close_cl) parallel::stopCluster(cl)
message("\n", "+ Retraining model... ", Sys.time())
featureSet <- cbind(label = label, featureSet)
retrained.model <- Internal.randomForest_tune(datasets = list(featureSet), label.col = 1,
positive.class = positive.class, folds.num = folds.num,
ntree = ntree,
mtry.ratios = mtry.ratios,
seed = seed, return.model = TRUE,
parallel.cores = parallel.cores, ...)
message("\n", "+ Completed. ", Sys.time())
return(retrained.model)
}
}
#' Predict RNA-Protein Interaction Using rpiCOOL's Features
#' @description This function can predict lncRNA/RNA-protein interactions using rebuilt model trained with rpiCOOL's feature set.
#' Model retraining and feature extraction are also supported.
#' The codes of this function slightly differ from rpiCOOL's script.
#'
#' @param seqRNA RNA sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of RNA/protein sequences.
#' RNA sequences will be converted into lower case letters.
#' @param seqPro protein sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of protein sequences.
#' Protein sequences will be converted into upper case letters.
#' Each sequence should be a vector of single characters.
#'
#' @param mode a string. Set \code{"prediction"} to predict ncRNA-protein pairs and return prediction results;
#' set \code{"retrain"} to build a new random forest model using the input data;
#' set \code{"feature"} to return a data frame contains the extracted features.
#' Users can use the extracted features generated by \code{mode = "feature"} to train classifiers
#' with other machine learning algorithms. Default: \code{"prediction"}.
#' @param retrained.model (only when \code{mode = "prediction"})
#' use the default model or a new retrained model to predict ncRNA-protein pairs?
#' If \code{NULL}, default machine learning model will be used. Or pass the model generated by this function
#' with parameter \code{"mode = retrain"}. Default: \code{NULL}. See examples below.
#' @param label a string or a vector of strings or \code{NULL}.
#' Optional when \code{mode = "prediction"} or \code{mode = "feature"}: used to give labels or notes to the output result.
#' Required when \code{mode = "retrain"}: must be a vector of strings that corresponds to input sequences.
#' Each string indicates the class of each input pair. Default: \code{NULL}.
#' @param positive.class (only when \code{mode = "retrain"}) \code{NULL} or a string used to indicate
#' which class is the positive class, Should be one
#' of the classes in \code{label} or leave \code{positive.class = NULL}.
#' In the latter case, the first class in \code{label} will be used
#' as the positive class. Default: \code{NULL}.
#' @param folds.num (only when \code{mode = "retrain"}) an integer indicates the number of folds for cross validation.
#' Default: \code{10} for 10-fold cross validation.
#' @param ntree integer, number of trees to grow. See \code{\link[randomForest]{randomForest}}.
#' Default: \code{3000}.
#' @param mtry.ratios (only when \code{mode = "retrain"}) used to indicate the ratios of \code{mtry} when tuning the random forest classifier.
#' \code{mtry} = ratio of mtry * number of features Default: \code{c(0.1, 0.2, 0.4, 0.6, 0.8)}.
#' @param seed (only when \code{mode = "retrain"}) an integer indicates the random seed for data splitting.
#' @param parallel.cores an integer that indicates the number of cores for parallel computation.
#' Default: \code{2}. Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#' @param ... (only when \code{mode = "retrain"}) other parameters (except \code{ntree} and \code{mtry}) passed to \code{\link[randomForest]{randomForest}} function.
#'
#' @return
#' If \code{mode = "prediction"}, this function returns a data frame that contains the predicted results.
#'
#' If \code{mode = "retrain"}, this function returns a random forest classifier.
#'
#' If \code{mode = "feature"}, this function returns a data frame that contains the extracted features.
#'
#' @section References:
#' Akbaripour-Elahabad M, Zahiri J, Rafeh R, \emph{et al}.
#' rpiCOOL: A tool for In Silico RNA-protein interaction detection using random forest.
#' J. Theor. Biol. 2016; 402:1-8
#'
#' @importFrom caret createFolds
#' @importFrom caret confusionMatrix
#' @importFrom randomForest randomForest
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parLapply
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom stats predict
#'
#' @examples
#'
#' # Following codes only show how to use this function
#' # and cannot reflect the genuine performance of tools or classifiers.
#'
#' data(demoPositiveSeq)
#' seqRNA <- demoPositiveSeq$RNA.positive
#' seqPro <- demoPositiveSeq$Pro.positive
#'
#' # Predicting ncRNA-protein pairs:
#'
#' Res_rpiCOOL_1 <- run_rpiCOOL(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' retrained.model = NULL, label = "rpiCOOL_res",
#' parallel.cores = 2) # using default rebuilt model
#'
#' # Train a new model:
#'
#' # Argument "label" which indicates the class of each input pair is required here.
#' # "label" should correspond to the classes of "seqRNA" and "seqPro".
#' # "positive.class" should be one of the classes in argument "label" or can be set as "NULL".
#' # In the latter case, the first label in "label" will be used as the positive class.
#' # Parameters of random forest, such as "replace", can be passed using "..." argument.
#'
#' rpiCOOL_model = run_rpiCOOL(seqRNA = seqRNA, seqPro = seqPro, mode = "retrain",
#' label = rep(c("Interact", "Non.Interact"), each = 10),
#' positive.class = NULL, folds.num = 5, ntree = 300,
#' seed = 1, parallel.cores = 2, replace = FALSE)
#'
#' # Predicting using new built model by setting "retrained.model = rpiCOOL_model":
#'
#' Res_rpiCOOL_2 <- run_rpiCOOL(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' retrained.model = rpiCOOL_model, label = NULL,
#' parallel.cores = 2)
#'
#' # Only extracting features:
#'
#' rpiCOOL_feature_df <- run_rpiCOOL(seqRNA = seqRNA, seqPro = seqPro, mode = "feature",
#' label = "feature", parallel.cores = 2)
#'
#' # Extracted features can be used to build classifiers using other machine learning
#' # algorithms, which provides users with more flexibility.
#'
#' @export
run_rpiCOOL <- function(seqRNA, seqPro, mode = c("prediction", "retrain", "feature"),
retrained.model = NULL, label = NULL, positive.class = NULL,
folds.num = 10, ntree = 3000, mtry.ratios = c(0.1, 0.2, 0.4, 0.6, 0.8),
seed = 1, parallel.cores = 2, cl = NULL, ...) {
mode <- match.arg(mode, choices = c("prediction", "retrain", "feature"))
if (length(seqRNA) != length(seqPro)) stop("The number of RNA sequences should match the number of protein sequences!")
if (!is.null(label)) {
if (length(label) == 1) {
label <- rep(label, length(seqPro))
}
if (length(label) != length(seqPro)) stop("The length of label should be one or match the length of sequences!")
}
if (mode == "retrain") {
if (length(label) != length(seqRNA) | length(unique(label)) != 2) {
stop("label is required and should correspond to input sequences!")
}
if (!is.null(positive.class)) {
if (!positive.class %in% label) stop("positive.class should be NULL or one of the classes in label.")
}
}
message("+ Initializing... ", Sys.time())
# message("- Creating cores... ")
#
# parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
# cl <- parallel::makeCluster(parallel.cores)
close_cl <- FALSE
if (is.null(cl)) {
message("- Creating cores... ")
parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
cl <- parallel::makeCluster(parallel.cores)
close_cl <- TRUE
}
seqRNA <- parallel::parLapply(cl, seqRNA, Internal.checkRNA)
if (length(names(seqRNA)) != 0) names.seqRNA <- names(seqRNA) else names.seqRNA <- "RNA.noName"
if (length(names(seqPro)) != 0) names.seqPro <- names(seqPro) else names.seqPro <- "Protein.noName"
# message("- Extracting sequence-based features... ", Sys.time())
featureSetFreq <- featureFreq(seqRNA = seqRNA, seqPro = seqPro,
featureMode = "conc", computePro = "rpiCOOL", k.Pro = 3,
k.RNA = 4, normalize = "none", cl = cl)
# message("- Extracting motif-based features... ", Sys.time())
featureSetMotif <- featureMotifs(seqRNA = seqRNA, seqPro = seqPro, featureMode = "comb",
motifRNA = "rpiCOOL", motifPro = "rpiCOOL",
cl = cl)
if (close_cl) parallel::stopCluster(cl)
featureSet <- cbind(featureSetFreq, featureSetMotif)
if(mode == "prediction") {
message("\n", "+ Predicting pairs using rpiCOOL (retrained model)... ", Sys.time())
if (is.null(retrained.model)) {
message("- Default retrained model selected. ")
data(mod_rpiCOOL, package = "LION", envir = environment())
mod_rpiCOOL <- get("mod_rpiCOOL")
} else {
message("- Using model provided by user.")
mod_rpiCOOL <- retrained.model
}
res <- stats::predict(mod_rpiCOOL, featureSet, type = "response")
prob <- stats::predict(mod_rpiCOOL, featureSet, type = "prob")
if (is.null(label)) {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro,
rpiCOOL_retrain_pred = as.character(res),
rpiCOOL_retrain_prob = prob[,1])
outRes <- data.frame(outRes, stringsAsFactors = F, row.names = NULL)
} else {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro, label = label,
rpiCOOL_retrain_pred = as.character(res), rpiCOOL_retrain_prob = prob[,1])
outRes <- data.frame(outRes, stringsAsFactors = F, row.names = NULL)
}
message("\n", "+ Completed. ", Sys.time())
return(outRes)
} else if (mode == "feature") {
if (!is.null(label)) featureSet <- cbind(label = label, featureSet)
message("\n", "+ Completed. ", Sys.time())
return(featureSet)
} else {
featureSet <- cbind(label = label, featureSet)
message("\n", "+ Retraining model... ", Sys.time())
retrained.model <- Internal.randomForest_tune(datasets = list(featureSet), label.col = 1,
positive.class = positive.class, folds.num = folds.num,
ntree = ntree,
mtry.ratios = mtry.ratios,
seed = seed, return.model = TRUE,
parallel.cores = parallel.cores, ...)
message("\n", "+ Completed. ", Sys.time())
return(retrained.model)
}
}
#' Predict RNA-Protein Interaction Using LncADeep's Features
#' @description This function can predict lncRNA/RNA-protein interactions using rebuilt model trained with LncADeep's feature set.
#' Model retraining and feature extraction are also supported.
#' LncADeep selects 110 features to build its classifier.
#' Here, the 110 top features are determined by averaging feature scores of 33 evaluation results provided by
#' LncADeep. LncADeep's original model is trained using deep neural network (DNN). Considering that DNN architecture is hard
#' to perform parameter tuning, we rebuild the model using the same machine algorithm (random forest) as the other methods.
#' Users can build DNN model with the features generated by this function.
#'
#' @param seqRNA RNA sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of RNA/protein sequences.
#' RNA sequences will be converted into lower case letters.
#' @param seqPro protein sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of protein sequences.
#' Protein sequences will be converted into upper case letters.
#' Each sequence should be a vector of single characters.
#'
#' @param mode a string. Set \code{"prediction"} to predict ncRNA-protein pairs and return prediction results;
#' set \code{"retrain"} to build a new random forest model using the input data;
#' set \code{"feature"} to return a data frame contains the extracted features.
#' Users can use the extracted features generated by \code{mode = "feature"} to train classifiers
#' with other machine learning algorithms. Default: \code{"prediction"}.
#' @param retrained.model (only when \code{mode = "prediction"})
#' use the default model or a new retrained model to predict ncRNA-protein pairs?
#' If \code{NULL}, default machine learning model will be used. Or pass the model generated by this function
#' with parameter \code{"mode = retrain"}. Default: \code{NULL}. See examples below.
#' @param label a string or a vector of strings or \code{NULL}.
#' Optional when \code{mode = "prediction"} or \code{mode = "feature"}: used to give labels or notes to the output result.
#' Required when \code{mode = "retrain"}: must be a vector of strings that corresponds to input sequences.
#' Each string indicates the class of each input pair. Default: \code{NULL}.
#' @param positive.class (only when \code{mode = "retrain"}) \code{NULL} or a string used to indicate
#' which class is the positive class, Should be one
#' of the classes in \code{label} or leave \code{positive.class = NULL}.
#' In the latter case, the first class in \code{label} will be used
#' as the positive class. Default: \code{NULL}.
#' @param folds.num (only when \code{mode = "retrain"}) an integer indicates the number of folds for cross validation.
#' Default: \code{10} for 10-fold cross validation.
#' @param ntree integer, number of trees to grow. See \code{\link[randomForest]{randomForest}}.
#' Default: \code{3000}.
#' @param mtry.ratios (only when \code{mode = "retrain"}) used to indicate the ratios of \code{mtry} when tuning the random forest classifier.
#' \code{mtry} = ratio of mtry * number of features Default: \code{c(0.1, 0.2, 0.4, 0.6, 0.8)}.
#' @param seed (only when \code{mode = "retrain"}) an integer indicates the random seed for data splitting.
#' @param parallel.cores an integer that indicates the number of cores for parallel computation.
#' Default: \code{2}. Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#' @param ... (only when \code{mode = "retrain"}) other parameters (except \code{ntree} and \code{mtry}) passed to \code{\link[randomForest]{randomForest}} function.
#'
#' @return
#' If \code{mode = "prediction"}, this function returns a data frame that contains the predicted results.
#'
#' If \code{mode = "retrain"}, this function returns a random forest classifier.
#'
#' If \code{mode = "feature"}, this function returns a data frame that contains the extracted features.
#'
#' @section References:
#' Yang C, Yang L, Zhou M, \emph{et al}.
#' LncADeep: an ab initio lncRNA identification and functional annotation tool based on deep learning.
#' Bioinformatics. 2018; 34(22):3825-3834.
#'
#' @importFrom caret createFolds
#' @importFrom caret confusionMatrix
#' @importFrom randomForest randomForest
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parLapply
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom stats predict
#'
#' @examples
#'
#' # Following codes only show how to use this function
#' # and cannot reflect the genuine performance of tools or classifiers.
#'
#' data(demoPositiveSeq)
#' seqRNA <- demoPositiveSeq$RNA.positive
#' seqPro <- demoPositiveSeq$Pro.positive
#'
#' # Predicting ncRNA-protein pairs:
#'
#' Res_LncADeep_1 <- run_LncADeep(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' retrained.model = NULL, label = "LncADeep_res",
#' parallel.cores = 2) # using default rebuilt model
#'
#' # Train a new model:
#'
#' # Argument "label" which indicates the class of each input pair is required here.
#' # "label" should correspond to the classes of "seqRNA" and "seqPro".
#' # "positive.class" should be one of the classes in argument "label" or can be set as "NULL".
#' # In the latter case, the first label in "label" will be used as the positive class.
#' # Parameters of random forest, such as "nodesize", can be passed using "..." argument.
#'
#' LncADeep_model <- run_LncADeep(seqRNA = seqRNA, seqPro = seqPro, mode = "retrain",
#' label = rep(c("Interact", "Non.Interact"), each = 10),
#' positive.class = NULL, folds.num = 5, ntree = 100,
#' seed = 1, parallel.cores = 2, nodesize = 2)
#'
#' # Predicting using new built model by setting "retrained.model = LncADeep_model":
#'
#' Res_LncADeep_2 <- run_LncADeep(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' retrained.model = LncADeep_model, label = NULL,
#' parallel.cores = 2)
#'
#' # Only extracting features:
#'
#' LncADeep_feature_df <- run_LncADeep(seqRNA = seqRNA, seqPro = seqPro, mode = "feature",
#' label = "feature", parallel.cores = 2)
#'
#' # Extracted features can be used to build classifiers using other machine learning
#' # algorithms, which provides users with more flexibility.
#'
#' @export
run_LncADeep <- function(seqRNA, seqPro, mode = c("prediction", "retrain", "feature"),
retrained.model = NULL, label = NULL, positive.class = NULL,
folds.num = 10, ntree = 3000, mtry.ratios = c(0.1, 0.2, 0.4, 0.6, 0.8),
seed = 1, parallel.cores = 2, cl = NULL, ...) {
mode <- match.arg(mode, choices = c("prediction", "retrain", "feature"))
if (length(seqRNA) != length(seqPro)) stop("The number of RNA sequences should match the number of protein sequences!")
if (!is.null(label)) {
if (length(label) == 1) {
label <- rep(label, length(seqPro))
}
if (length(label) != length(seqPro)) stop("The length of label should be one or match the length of sequences!")
}
if (mode == "retrain") {
if (length(label) != length(seqRNA) | length(unique(label)) != 2) {
stop("label is required and should correspond to input sequences!")
}
if (!is.null(positive.class)) {
if (!positive.class %in% label) stop("positive.class should be NULL or one of the classes in label.")
}
}
message("+ Initializing... ", Sys.time())
# message("- Creating cores... ")
#
# parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
# cl <- parallel::makeCluster(parallel.cores)
close_cl <- FALSE
if (is.null(cl)) {
message("- Creating cores... ")
parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
cl <- parallel::makeCluster(parallel.cores)
close_cl <- TRUE
}
seqRNA <- parallel::parLapply(cl, seqRNA, Internal.checkRNA)
if (length(names(seqRNA)) != 0) names.seqRNA <- names(seqRNA) else names.seqRNA <- "RNA.noName"
if (length(names(seqPro)) != 0) names.seqPro <- names(seqPro) else names.seqPro <- "Protein.noName"
# message("- Extracting sequence-based features... ", Sys.time())
featureSetFreq <- featureFreq(seqRNA = seqRNA, seqPro = seqPro,
featureMode = "conc", computePro = "RPISeq", k.Pro = 3,
k.RNA = 4, normalize = "none", cl = cl)
# message("- Calculating MLC coverage... ", Sys.time())
featureSetCoverage <- Internal.featureCoverage(seqRNA = seqRNA, cl = cl)
if (close_cl) parallel::stopCluster(cl)
featureSet <- cbind(featureSetFreq, featureSetCoverage[,2,drop = F])
selectedFeature_idx <- names(featureSet) %in% LncADeep_featureRank$Name
featureSet <- featureSet[,selectedFeature_idx]
if(mode == "prediction") {
message("\n", "+ Predicting pairs using LncADeep (retrained model)... ", Sys.time())
if (is.null(retrained.model)) {
message("- Default retrained model selected. ")
data(mod_LncADeep, package = "LION", envir = environment())
mod_LncADeep <- get("mod_LncADeep")
# names(featureSet)[names(featureSet) %in% "MLC_Coverage"] <- "MLC_coverage"
} else {
message("- Using model provided by user.")
mod_LncADeep <- retrained.model
}
res <- stats::predict(mod_LncADeep, featureSet, type = "response")
prob <- stats::predict(mod_LncADeep, featureSet, type = "prob")
if (is.null(label)) {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro,
LncADeep_retrain_pred = as.character(res),
LncADeep_retrain_prob = prob[,1])
outRes <- data.frame(outRes, stringsAsFactors = F, row.names = NULL)
} else {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro, label = label,
LncADeep_retrain_pred = as.character(res),
LncADeep_retrain_prob = prob[,1])
outRes <- data.frame(outRes, stringsAsFactors = F, row.names = NULL)
}
message("\n", "+ Completed. ", Sys.time())
return(outRes)
} else if (mode == "feature") {
if (!is.null(label)) featureSet <- cbind(label = label, featureSet)
message("\n", "+ Completed. ", Sys.time())
return(featureSet)
} else {
featureSet <- cbind(label = label, featureSet)
message("\n", "+ Retraining model... ", Sys.time())
retrained.model <- Internal.randomForest_tune(datasets = list(featureSet), label.col = 1,
positive.class = positive.class, folds.num = folds.num,
ntree = ntree,
mtry.ratios = mtry.ratios,
seed = seed, return.model = TRUE,
parallel.cores = parallel.cores, ...)
message("\n", "+ Completed. ", Sys.time())
return(retrained.model)
}
}
#' Predict RNA-Protein Interaction Using LION Method
#' @description This function can predict lncRNA/RNA-protein interactions using LION method. Model retraining and feature extraction are also supported.
#'
#' @param seqRNA RNA sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of RNA/protein sequences.
#' RNA sequences will be converted into lower case letters.
#' @param seqPro protein sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of protein sequences.
#' Protein sequences will be converted into upper case letters.
#' Each sequence should be a vector of single characters.
#'
#' @param mode a string. Set \code{"prediction"} to predict ncRNA-protein pairs and return prediction results;
#' set \code{"retrain"} to build a new random forest model using the input data;
#' set \code{"feature"} to return a data frame contains the extracted features.
#' Users can use the extracted features generated by \code{mode = "feature"} to train classifiers
#' with other machine learning algorithms. Default: \code{"prediction"}.
#' @param retrained.model (only when \code{mode = "prediction"})
#' use the default model or a new retrained model to predict ncRNA-protein pairs?
#' If \code{NULL}, default machine learning model will be used. Or pass the model generated by this function
#' with parameter \code{"mode = retrain"}. Default: \code{NULL}. See examples below.
#' @param label a string or a vector of strings or \code{NULL}.
#' Optional when \code{mode = "prediction"} or \code{mode = "feature"}: used to give labels or notes to the output result.
#' Required when \code{mode = "retrain"}: must be a vector of strings that corresponds to input sequences.
#' Each string indicates the class of each input pair. Default: \code{NULL}.
#' @param positive.class (only when \code{mode = "retrain"}) \code{NULL} or a string used to indicate
#' which class is the positive class, Should be one
#' of the classes in \code{label} or leave \code{positive.class = NULL}.
#' In the latter case, the first class in \code{label} will be used
#' as the positive class. Default: \code{NULL}.
#' @param folds.num (only when \code{mode = "retrain"}) an integer indicates the number of folds for cross validation.
#' Default: \code{10} for 10-fold cross validation.
#' @param ntree integer, number of trees to grow. See \code{\link[randomForest]{randomForest}}.
#' Default: \code{3000}.
#' @param mtry.ratios (only when \code{mode = "retrain"}) used to indicate the ratios of \code{mtry} when tuning the random forest classifier.
#' \code{mtry} = ratio of mtry * number of features Default: \code{c(0.1, 0.2, 0.4, 0.6, 0.8)}.
#' @param seed (only when \code{mode = "retrain"}) an integer indicates the random seed for data splitting.
#' @param parallel.cores an integer that indicates the number of cores for parallel computation.
#' Default: \code{2}. Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#' @param ... (only when \code{mode = "retrain"}) other parameters (except \code{ntree} and \code{mtry}) passed to \code{\link[randomForest]{randomForest}} function.
#'
#' @return
#' If \code{mode = "prediction"}, this function returns a data frame that contains the predicted results.
#'
#' If \code{mode = "retrain"}, this function returns a random forest classifier.
#'
#' If \code{mode = "feature"}, this function returns a data frame that contains the extracted features.
#'
#' @section References:
#' Han S, Yang X, Sun H, \emph{et al}.
#' LION: an integrated R package for effective prediction of ncRNA–protein interaction.
#' Briefings in Bioinformatics. 2022; 23(6):bbac420
#'
#' @importFrom caret createFolds
#' @importFrom caret confusionMatrix
#' @importFrom randomForest randomForest
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parLapply
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom stats predict
#'
#' @examples
#'
#' # Following codes only show how to use this function
#' # and cannot reflect the genuine performance of tools or classifiers.
#'
#' data(demoPositiveSeq)
#' seqRNA <- demoPositiveSeq$RNA.positive
#' seqPro <- demoPositiveSeq$Pro.positive
#'
#' # Predicting ncRNA-protein pairs:
#'
#' Res_LION_1 <- run_LION(seqRNA = seqRNA, seqPro = seqPro,
#' parallel.cores = 2) # using the default setting
#'
#' # the above command is equivalent to:
#' Res_LION_2 <- run_LION(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' retrained.model = NULL, label = NULL,
#' parallel.cores = 2)
#'
#' # Train a new model:
#'
#' # Argument "label" which indicates the class of each input pair is required here.
#' # "label" should correspond to the classes of "seqRNA" and "seqPro".
#' # "positive.class" should be one of the classes in argument "label" or can be set as "NULL".
#' # In the latter case, the first label in "label" will be used as the positive class.
#' # Parameters of random forest, such as "replace", can be passed using "..." argument.
#'
#' LION_model <- run_LION(seqRNA = seqRNA, seqPro = seqPro, mode = "retrain",
#' label = rep(c("Interact", "Non.Interact"), each = 10),
#' positive.class = NULL, folds.num = 5, ntree = 100,
#' seed = 1, parallel.cores = 2, replace = FALSE)
#'
#' # Predicting using new built model by setting "retrained.model = LION_model":
#'
#' Res_LION_2 <- run_LION(seqRNA = seqRNA, seqPro = seqPro, mode = "prediction",
#' retrained.model = LION_model,
#' label = rep(c("Interact", "Non.Interact"), each = 10),
#' parallel.cores = 2)
#'
#' # Only extracting features:
#'
#' LION_feature_df <- run_LION(seqRNA = seqRNA, seqPro = seqPro, mode = "feature",
#' label = "LION_feature", parallel.cores = 2)
#'
#' # Extracted features can be used to build classifiers using other machine learning
#' # algorithms, which provides users with more flexibility.
#'
#' @export
run_LION <- function(seqRNA, seqPro, mode = c("prediction", "retrain", "feature"),
retrained.model = NULL, label = NULL, positive.class = NULL,
folds.num = 10, ntree = 3000, mtry.ratios = c(0.1, 0.2, 0.4, 0.6, 0.8),
seed = 1, parallel.cores = 2, cl = NULL, ...) {
mode <- match.arg(mode, choices = c("prediction", "retrain", "feature"))
if (length(seqRNA) != length(seqPro)) stop("The number of RNA sequences should match the number of protein sequences!")
if (!is.null(label)) {
if (length(label) == 1) {
label <- rep(label, length(seqPro))
}
if (length(label) != length(seqPro)) stop("The length of label should be one or match the length of sequences!")
}
if (mode == "retrain") {
if (length(label) != length(seqRNA) | length(unique(label)) != 2) {
stop("label is required and should correspond to input sequences!")
}
if (!is.null(positive.class)) {
if (!positive.class %in% label) stop("positive.class should be NULL or one of the classes in label.")
}
}
message("+ Initializing... ", Sys.time())
# message("- Creating cores... ")
#
# parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
# cl <- parallel::makeCluster(parallel.cores)
close_cl <- FALSE
if (is.null(cl)) {
message("- Creating cores... ")
parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
cl <- parallel::makeCluster(parallel.cores)
close_cl <- TRUE
}
seqRNA <- parallel::parLapply(cl, seqRNA, Internal.checkRNA)
if (length(names(seqRNA)) != 0) names.seqRNA <- names(seqRNA) else names.seqRNA <- "RNA.noName"
if (length(names(seqPro)) != 0) names.seqPro <- names(seqPro) else names.seqPro <- "Protein.noName"
# message("- Extracting sequence-based features... ", Sys.time())
featureSetEDP <- featureFreq(seqRNA = seqRNA, seqPro = seqPro, EDP = TRUE,
featureMode = "conc", computePro = "rpiCOOL",
k.Pro = 3, k.RNA = 4, normalize = "none", cl = cl)
featureSetMLC <- Internal.featureCoverage(seqRNA = seqRNA, label = NULL, cl = cl)
featureSetSeq <- cbind(featureSetEDP, MLC_Coverage = featureSetMLC$MLC_Coverage)
# featureSetSeq <- featureSetSeq[,names(featureSetSeq) %in% LION_featureRank]
# message("- Extracting motif-based features... ", Sys.time())
featureSetMotif <- featureMotifs(seqRNA = seqRNA, seqPro = seqPro,
featureMode = "conc", cl = cl,
motifRNA = c("Slm2", "Fusip1", "PTB", "QKI",
"HuD", "HuR", "AU", "UG"),
motifPro = c("E", "H", "K", "R", "H_R",
"EE", "KK", "HR_RH", "RS_SR"))
# message("- Extracting physicochemical features... ", Sys.time())
featureSetPhysChem <- featurePhysChem(seqRNA = seqRNA, seqPro = seqPro,
# physchemRNA = c("hydrogenBonding", "vanderWaal"),
# physchemPro = c("bulkiness.Zimmerman", "isoelectricPoint.Zimmerman"),
cl = cl)
if (close_cl) parallel::stopCluster(cl)
featureSet <- cbind(featureSetSeq, featureSetMotif, featureSetPhysChem)
if(mode == "prediction") {
message("\n", "+ Predicting pairs using LION... ", Sys.time())
if (is.null(retrained.model)) {
message("- Default retrained model selected.")
data(mod_LION, package = "LION", envir = environment())
mod_LION <- get("mod_LION")
} else {
message("- Using model provided by user.")
mod_LION <- retrained.model
}
res <- stats::predict(mod_LION, featureSet, type = "response")
prob <- stats::predict(mod_LION, featureSet, type = "prob")
if (is.null(label)) {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro,
LION_pred = as.character(res), LION_prob = prob[,1])
outRes <- data.frame(outRes, stringsAsFactors = F, row.names = NULL)
} else {
outRes <- cbind(RNA_Name = names.seqRNA, Pro_Name = names.seqPro, label = label,
LION_pred = as.character(res), LION_prob = prob[,1])
outRes <- data.frame(outRes, stringsAsFactors = F, row.names = NULL)
}
message("\n", "+ Completed. ", Sys.time())
return(outRes)
} else if (mode == "feature") {
### TODO - Select feature
if (!is.null(label)) featureSet <- cbind(label = label, featureSet)
message("\n", "+ Completed. ", Sys.time())
return(featureSet)
} else {
### TODO - Select feature
featureSet <- cbind(label = label, featureSet)
message("\n", "+ Retraining model... ", Sys.time())
retrained.model <- Internal.randomForest_tune(datasets = list(featureSet), label.col = 1,
positive.class = positive.class, folds.num = folds.num,
ntree = ntree,
mtry.ratios = mtry.ratios,
seed = seed, return.model = TRUE,
parallel.cores = parallel.cores, ...)
message("\n", "+ Completed. ", Sys.time())
return(retrained.model)
}
}
#' Confident Prediction of RNA-Protein Interaction Using Multiple Methods Simultaneously
#' @description This function can predict lncRNA/RNA-protein interactions using all supported methods, which is useful to have a high-confident prediction.
#'
#' @param seqRNA RNA sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of RNA/protein sequences.
#' RNA sequences will be converted into lower case letters.
#' @param seqPro protein sequences loaded by function \code{\link[seqinr]{read.fasta}} from \code{\link[seqinr]{seqinr-package}}. Or a list of protein sequences.
#' Protein sequences will be converted into upper case letters.
#' Each sequence should be a vector of single characters.
#' @param label optional. A string or a vector of strings or \code{NULL}. Used to give labels or notes to the output result.
#' Default: \code{NULL}.
#' @param methods strings. Indicate the method(s) to be used for prediction.
#' Can be: \code{"RPISeq_web"}, \code{"RPISeq_retrain"}, \code{"lncPro_original"},
#' \code{"lncPro_retrain"}, \code{"rpiCOOL_retrain"}, \code{"LncADeep_retrain"} and \code{"LION"}.
#' @param RPISeq.mod,lncPro.mod,rpiCOOL.mod,LncADeep.mod,LION.mod use default retrained model (if \code{NULL}) or assign a new retrained model?
#' New retrained model can be generated with \code{\link{run_RPISeq}}, \code{\link{run_lncPro}}, \code{\link{run_rpiCOOL}}, \code{\link{run_LncADeep}} and \code{\link{run_LION}}.
#' @param parallel.cores an integer that indicates the number of cores for parallel computation.
#' Default: \code{2}. Set \code{parallel.cores = -1} to run with all the cores. \code{parallel.cores} should be == -1 or >= 1.
#' @param cl parallel cores to be passed to this function.
#'
#' @return A list containing the predicted results.
#'
#' @section References:
#' Han S, Yang X, Sun H, \emph{et al}.
#' LION: an integrated R package for effective prediction of ncRNA–protein interaction.
#' Briefings in Bioinformatics. 2022; 23(6):bbac420
#'
#' @importFrom caret createFolds
#' @importFrom caret confusionMatrix
#' @importFrom randomForest randomForest
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parLapply
#' @importFrom parallel parSapply
#' @importFrom parallel mcmapply
#' @importFrom parallel stopCluster
#' @importFrom parallel detectCores
#' @importFrom seqinr a
#' @importFrom seqinr s2c
#' @importFrom seqinr count
#' @importFrom seqinr write.fasta
#' @importFrom seqinr getSequence
#' @importFrom utils data
#' @importFrom stats predict
#' @importFrom RCurl postForm
#'
#' @examples
#'
#' # Following codes only show how to use this function
#' # and cannot reflect the genuine performance of tools or classifiers.
#'
#' data(demoPositiveSeq)
#' seqRNA <- demoPositiveSeq$RNA.positive
#' seqPro <- demoPositiveSeq$Pro.positive
#'
#' # Using methods RPISeq (retrained model) and rpiCOOL (retrained model):
#'
#' Res_confidence <- run_confidentPrediction(seqRNA = seqRNA, seqPro = seqPro,
#' methods = c("RPISeq_retrain",
#' "rpiCOOL_retrain", "LION"),
#' label = "Interact", # label is optional
#' parallel.cores = 2)
#' # Convert to data frame:
#'
#' Res_confidence_df <- do.call("cbind", Res_confidence)
#' Res_confidence_df <- Res_confidence_df[!duplicated(names(Res_confidence_df))]
#'
#' @export
run_confidentPrediction <- function(seqRNA, seqPro, label = NULL,
methods = c("RPISeq_web", "RPISeq_retrain",
"lncPro_original", "lncPro_retrain",
"rpiCOOL_retrain",
"LncADeep_retrain", "LION"),
RPISeq.mod = NULL, lncPro.mod = NULL,
rpiCOOL.mod = NULL, LncADeep.mod = NULL,
LION.mod = NULL,
parallel.cores = 2, cl = NULL) {
methods <- match.arg(methods, several.ok = TRUE)
if (length(seqRNA) != length(seqPro)) stop("The number of RNA sequences should match the number of protein sequences!")
if (!is.null(label)) {
if (length(label) == 1) {
label <- rep(label, length(seqPro))
}
if (length(label) != length(seqPro)) stop("The length of label should be one or match the length of sequences!")
}
close_cl <- FALSE
if (is.null(cl)) {
parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
cl <- parallel::makeCluster(parallel.cores)
close_cl <- TRUE
}
outRes <- list()
if ("lncPro_original" %in% methods) {
message("\n>>> ", "lncPro (original model) ", Sys.time())
res_lncPro <- run_lncPro(seqRNA = seqRNA, seqPro = seqPro,
mode = "prediction",
prediction = "original", label = label,
cl = cl)
outRes <- c(outRes, list(res_lncPro))
}
if ("lncPro_retrain" %in% methods) {
message("\n>>> ", "lncPro (retrained model) ", Sys.time())
res_lncPro <- run_lncPro(seqRNA = seqRNA, seqPro = seqPro,
mode = "prediction", retrained.model = lncPro.mod,
prediction = "retrained", label = label,
cl = cl)
outRes <- c(outRes, list(res_lncPro))
}
if ("RPISeq_web" %in% methods) {
message("\n>>> ", "RPISeq (original model, web server-based) ", Sys.time())
res_RPISeq <- run_RPISeq(seqRNA = seqRNA, seqPro = seqPro,
mode = "prediction",
prediction = "web", label = label,
cl = cl)
outRes <- c(outRes, list(res_RPISeq))
}
if ("RPISeq_retrain" %in% methods) {
message("\n>>> ", "RPISeq (retrained model) ", Sys.time())
res_RPISeq <- run_RPISeq(seqRNA = seqRNA, seqPro = seqPro,
mode = "prediction", retrained.model = RPISeq.mod,
prediction = "retrained", label = label,
cl = cl)
outRes <- c(outRes, list(res_RPISeq))
}
if ("rpiCOOL_retrain" %in% methods) {
message("\n>>> ", "rpiCOOL (retrained model) ", Sys.time())
res_rpiCOOL <- run_rpiCOOL(seqRNA = seqRNA, seqPro = seqPro,
mode = "prediction", label = label,
retrained.model = rpiCOOL.mod,
cl = cl)
outRes <- c(outRes, list(res_rpiCOOL))
}
if ("LncADeep_retrain" %in% methods) {
message("\n>>> ", "LncADeep (retrained model) ", Sys.time())
res_LncADeep <- run_LncADeep(seqRNA = seqRNA, seqPro = seqPro,
mode = "prediction", label = label,
retrained.model = LncADeep.mod,
cl = cl)
outRes <- c(outRes, list(res_LncADeep))
}
if ("LION" %in% methods) {
message("\n>>> ", "LION ", Sys.time())
res_LION <- run_LION(seqRNA = seqRNA, seqPro = seqPro,
mode = "prediction", label = label,
retrained.model = LION.mod,
cl = cl)
outRes <- c(outRes, list(res_LION))
}
if (close_cl) parallel::stopCluster(cl)
outRes
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.