R/LncFinder.R

Defines functions lnc_finder build_model extract_features make_frequencies read_SS run_RNAfold find_orfs svm_tune svm_cv compute_pI compute_EIIP compute_GC compute_FickettScore compute_kmer compute_hexamerScore compute_EucDistance compute_LogDistance make_referFreq secondary_seq LogDist.acguACGU LogDist.acguD LogDist.DNA get.freq find_ORF get.Seq_DNA get.Seq_acguD get.Seq_acguACGU get.Seq_UP get.svm.res fold.res get_EIIP EIIP.DFT Internal.hexamerScore Internal.LogDistance Internal.EucDistance Internal.FickettScore Internal.convertProb

Documented in build_model compute_EIIP compute_EucDistance compute_FickettScore compute_GC compute_hexamerScore compute_kmer compute_LogDistance compute_pI extract_features find_orfs lnc_finder make_frequencies make_referFreq read_SS run_RNAfold svm_cv svm_tune

################  Internal Functions  ################

Internal.convertProb <- function(value, base, param, prob, weight) {
        for(idx in 1:10) {
                if(value >= param[idx]) {
                        probRes <- prob[[base]][[idx]] * weight[[base]]
                        break
                }
        }
        probRes
}

Internal.FickettScore <- function(oneSeq, .positionProb, .positionWeight,
                                  .positionParam, .contentProb,
                                  .contentWeight, .contentParam) {
        countSeq <- seqinr::count(oneSeq, wordsize = 1, freq = TRUE)
        lenSeq <- length(oneSeq)
        phase1 <- oneSeq[seq(1, lenSeq, 3)]
        phase2 <- oneSeq[seq(2, lenSeq, 3)]
        phase3 <- oneSeq[seq(3, lenSeq, 3)]

        position.a <- max(sum(phase1 == "a"), sum(phase2 == "a"), sum(phase3 == "a")) / (min(sum(phase1 == "a"), sum(phase2 == "a"), sum(phase3 == "a")) + 1)
        position.c <- max(sum(phase1 == "c"), sum(phase2 == "c"), sum(phase3 == "c")) / (min(sum(phase1 == "c"), sum(phase2 == "c"), sum(phase3 == "c")) + 1)
        position.g <- max(sum(phase1 == "g"), sum(phase2 == "g"), sum(phase3 == "g")) / (min(sum(phase1 == "g"), sum(phase2 == "g"), sum(phase3 == "g")) + 1)
        position.t <- max(sum(phase1 == "t"), sum(phase2 == "t"), sum(phase3 == "t")) / (min(sum(phase1 == "t"), sum(phase2 == "t"), sum(phase3 == "t")) + 1)

        contentProb.a <- Internal.convertProb(value = countSeq["a"], base = "a", param = .contentParam, prob = .contentProb, weight = .contentWeight)
        contentProb.c <- Internal.convertProb(value = countSeq["c"], base = "c", param = .contentParam, prob = .contentProb, weight = .contentWeight)
        contentProb.g <- Internal.convertProb(value = countSeq["g"], base = "g", param = .contentParam, prob = .contentProb, weight = .contentWeight)
        contentProb.t <- Internal.convertProb(value = countSeq["t"], base = "t", param = .contentParam, prob = .contentProb, weight = .contentWeight)

        positionProb.a <- Internal.convertProb(value = position.a, base = "a", param = .positionParam, prob = .positionProb, weight = .positionWeight)
        positionProb.c <- Internal.convertProb(value = position.c, base = "c", param = .positionParam, prob = .positionProb, weight = .positionWeight)
        positionProb.g <- Internal.convertProb(value = position.g, base = "g", param = .positionParam, prob = .positionProb, weight = .positionWeight)
        positionProb.t <- Internal.convertProb(value = position.t, base = "t", param = .positionParam, prob = .positionProb, weight = .positionWeight)

        FickettScore <- sum(contentProb.a, contentProb.c, contentProb.g, contentProb.t,
                            positionProb.a, positionProb.c, positionProb.g, positionProb.t)
        FickettScore
}

Internal.EucDistance <- function(OneSeq, referFreq, k, step, alphabet){

        freq.val <- seqinr::count(OneSeq, wordsize = k, by = step, alphabet = alphabet, freq = TRUE)

        EucDist.LNC <- sqrt(sum((freq.val - referFreq$ref.lnc) ^ 2))
        EucDist.PCT <- sqrt(sum((freq.val - referFreq$ref.cds) ^ 2))
        EucDist.Ratio <- EucDist.LNC / EucDist.PCT

        distance <- c(EucDist.LNC = EucDist.LNC, EucDist.PCT = EucDist.PCT, EucDist.Ratio = EucDist.Ratio)

        distance
}

Internal.LogDistance <- function(OneSeq, referFreq, k, step, alphabet){

        count.num <- seqinr::count(OneSeq, wordsize = k, by = step, alphabet = alphabet, freq = FALSE)

        freq.val  <- count.num / sum(count.num)
        lnc.ratio <- freq.val[referFreq$ref.lnc != 0] / referFreq$ref.lnc[referFreq$ref.lnc != 0]
        pct.ratio <- freq.val[referFreq$ref.cds != 0] / referFreq$ref.cds[referFreq$ref.cds != 0]
        LogDist.LNC <- (sum(log(lnc.ratio[lnc.ratio != 0])) - length(which(lnc.ratio == 0)) + length(which(referFreq$ref.lnc == 0))) / sum(count.num)
        LogDist.PCT <- (sum(log(pct.ratio[pct.ratio != 0])) - length(which(pct.ratio == 0)) + length(which(referFreq$ref.cds == 0))) / sum(count.num)
        LogDist.Ratio <- LogDist.LNC / LogDist.PCT

        distance <- c(LogDist.LNC = LogDist.LNC, LogDist.PCT = LogDist.PCT, LogDist.Ratio = LogDist.Ratio)

        distance
}

Internal.hexamerScore <- function(OneSeq, referFreq, k, step, alphabet){

        count.num <- seqinr::count(OneSeq, wordsize = k, by = step, alphabet = alphabet, freq = FALSE)

        seq.sum  <- 0
        for(i in names(count.num)[count.num != 0]){
                if(referFreq$ref.cds[[i]] != 0 & referFreq$ref.cds[[i]] != 0) {
                        seq.sum <- seq.sum + (count.num[[i]] * log(referFreq$ref.cds[[i]] / referFreq$ref.cds[[i]]))
                } else {
                        if(referFreq$ref.cds[[i]] == 0) {
                                seq.sum <- seq.sum - count.num[[i]]
                        }
                        if(referFreq$ref.lnc[[i]] == 0) {
                                seq.sum <- seq.sum + count.num[[i]]
                        }
                }
        }
        hex.score <- seq.sum / sum(count.num)

        hex.score
}

EIIP.DFT <- function(OneSeq) {

        OneSeq <- seqinr::getSequence(OneSeq)
        OneSeq[!(OneSeq %in% "a" | OneSeq %in% "c" | OneSeq %in% "g" | OneSeq %in% "t")] <- 0
        OneSeq[OneSeq %in% "a"] <- 0.1260
        OneSeq[OneSeq %in% "c"] <- 0.1340
        OneSeq[OneSeq %in% "g"] <- 0.0806
        OneSeq[OneSeq %in% "t"] <- 0.1335

        numeric.seq  <- as.numeric(OneSeq)
        fourier.seq  <- stats::fft(numeric.seq)
        seq.spectrum <- (abs(fourier.seq)) ^ 2
        power.length <- length(seq.spectrum)
        k <- round(power.length / 3)
        DFT.max <- max(seq.spectrum[k-2], seq.spectrum[k-1], seq.spectrum[k],
                       seq.spectrum[k+1], seq.spectrum[k+2])
        total.power  <- sum(seq.spectrum) / power.length
        SNR.res <- c(Signal.Peak = DFT.max, SNR = (DFT.max / total.power))

        seq.spectrum <- seq.spectrum[-c(1, power.length)]
        ordered.pow  <- sort(seq.spectrum, decreasing = TRUE)
        pow.percent  <- ordered.pow[1:round(length(ordered.pow) * 0.1)]
        Quantile.res <- stats::quantile(pow.percent)[-4]
        names(Quantile.res) <- c("Signal.Min", "Signal.Q1", "Signal.Q2", "Signal.Max")
        EIIP.res <- c(SNR.res, Quantile.res)

        return(EIIP.res)
}

get_EIIP <- function(OneSeq, percent.length = 0.1, probs = seq(0, 1, 0.25)) {

        OneSeq[!(OneSeq %in% "a" | OneSeq %in% "c" | OneSeq %in% "g" | OneSeq %in% "t")] <- 0
        OneSeq[OneSeq %in% "a"] <- 0.1260
        OneSeq[OneSeq %in% "c"] <- 0.1340
        OneSeq[OneSeq %in% "g"] <- 0.0806
        OneSeq[OneSeq %in% "t"] <- 0.1335

        numeric.seq  <- as.numeric(OneSeq)
        fourier.seq  <- stats::fft(numeric.seq)
        seq.spectrum <- (abs(fourier.seq)) ^ 2
        power.length <- length(seq.spectrum)
        k <- round(power.length / 3)
        DFT.max <- max(seq.spectrum[k-2], seq.spectrum[k-1], seq.spectrum[k],
                       seq.spectrum[k+1], seq.spectrum[k+2])
        total.power  <- sum(seq.spectrum) / power.length
        SNR.res <- c(Signal.Peak = DFT.max, Average.Power = total.power, SNR = (DFT.max / total.power))

        seq.spectrum <- seq.spectrum[-c(1, power.length)]

        ordered.pow  <- sort(seq.spectrum, decreasing = TRUE)
        pow.percent  <- ordered.pow[1:round(length(ordered.pow) * percent.length)]
        Quantile.res <- stats::quantile(pow.percent, probs = probs)

        EIIP.res <- c(SNR.res, Quantile.res)

        return(EIIP.res)
}

fold.res <- function(x, dataset, positive.class, ...) {
        subset <- dataset[x, ]
        test.set  <- dataset[-x, ]

        svm.mod <- e1071::svm(label ~ ., data = subset, probability = TRUE, ...)
        print(svm.mod)

        res <- stats::predict(svm.mod, test.set, probability = TRUE)

        confusion.res <- caret::confusionMatrix(data.frame(res)[,1], test.set$label,
                                                positive = positive.class, mode = "everything")
        performance.res <- data.frame(Sensitivity = confusion.res$byClass[1],
                                      Specificity = confusion.res$byClass[2],
                                      Accuracy    = confusion.res$overall[1],
                                      F.Measure   = confusion.res$byClass[7],
                                      Kappa       = confusion.res$overall[2])
        performance.res
}

get.svm.res <- function(x, index, dataset = dataset){

        train.set <- dataset[x,c("label", index)]
        test.set  <- dataset[-x,]

        svm.mod   <- e1071::svm(label ~ ., data = train.set, scale = TRUE, probability = TRUE, kernel = "radial")
        results   <- stats::predict(svm.mod, test.set, probability = TRUE)
        conf.res  <- caret::confusionMatrix(data.frame(results)[,1], test.set$label,
                                            positive = "lnc", mode = "everything")
        perf.res  <- data.frame(Sensitivity = conf.res$byClass[1],
                                Specificity = conf.res$byClass[2],
                                Accuracy    = conf.res$overall[1],
                                F.Measure   = conf.res$byClass[7],
                                Kappa       = conf.res$overall[2])

        w.value <- t(svm.mod$coefs) %*% svm.mod$SV
        weights <- w.value * w.value
        weights <- data.frame(weight = t(weights))

        output  <- list(perf.res, weights)
        output
}

# svm.rfe <- function(dataset, variable.group, save.rank = FALSE, file.path = NULL){
#         message("+ Initialization.  ",  Sys.time())
#         message("")
#
#         if(is.null(file.path)) file.path <- getwd()
#
#         if(save.rank) setwd(file.path)
#         variable.group <- c(variable.group, 0)
#         variable.group <- unique(variable.group)
#         variable.group <- variable.group[which(variable.group < (ncol(dataset) - 1) & variable.group >= 0)]
#         variable.group <- sort(variable.group, decreasing = TRUE)
#         feature.num    <- ncol(dataset)
#
#         set.seed(1)
#         folds <- caret::createFolds(dataset$label, k = 10, returnTrain = TRUE)
#
#         new.fea <- c()
#
#         perf.all <- sapply(variable.group, function(variable.number){
#
#                 varindex <- if(variable.number == max(variable.group)) names(dataset)[-1] else new.fea
#                 message("+ RFE Processing... Variables: ", length(varindex))
#                 cl <- parallel::makeCluster(10)
#                 parallel::clusterExport(cl, varlist = c("dataset", "get.svm.res", "varindex"), envir = environment())
#                 message("- Training Model...  (",  Sys.time(), ")")
#                 svm.res <- parallel::parLapply(cl, folds, get.svm.res, index = varindex)
#                 parallel::stopCluster(cl)
#                 message("- Completed.         (",  Sys.time(), ")")
#                 message("- Metric:")
#
#                 perf    <- sapply(svm.res, rbind)
#                 perf    <- sapply(perf[1,], cbind)
#                 perf.convert <- apply(perf, 1, as.numeric)
#                 print(colMeans(perf.convert))
#
#                 weights <- svm.res[[1]][[2]]
#                 for(weight.index in 2:10) {
#                         weight  <- svm.res[[weight.index]][[2]]
#                         weights <- weights + weight
#                 }
#                 weights    <- (weights - min(weights)) / (max(weights) - min(weights)) * 100
#                 sort.index <- sort(weights$weight, decreasing = T, index.return = TRUE)$ix
#                 weights    <- weights[sort.index, , drop = F]
#                 if(nrow(weights) > 1 & save.rank) {
#                         save(weights, file = paste("ranking.", nrow(weights), ".RData", sep = ""))
#                 }
#                 if(variable.number != 0) {
#                         new.fea    <<- rownames(weights)[1:variable.number]
#                         message("- Remained Features:")
#                         cat(new.fea, fill = TRUE, labels = " ", sep = "  ")
#                         message("- Omitted Features:")
#                         cat(rownames(weights)[-(1:variable.number)], fill = TRUE, labels = " ", sep = "  ")
#                         message("- New Dataset Loaded. Variables: ", variable.number)
#                         message("- Remained Iterations: ", length(variable.group) - which(variable.group == variable.number))
#                         message("")
#                 }
#                 perf
#         })
#
#         ### Assign Column Names
#         var.colnames <- variable.group[which(variable.group != 0)]
#         var.colnames <- c(feature.num - 1, var.colnames)
#         colnames(perf.all) <- var.colnames
#
#         ### Assign Row Names
#         folds.names    <- sapply(c(1:10), function(x) paste("Fold", x, sep = ""))
#         rfe.rownames   <- c()
#         for(folds.index in folds.names){
#                 for (metric in c("Sensitivity", "Specificity", "Accuracy", "F.Measure", "Kappa")) {
#                         temp.name    <- paste(metric, folds.index, sep = ".")
#                         rfe.rownames <- c(rfe.rownames, temp.name)
#                 }
#         }
#         row.names(perf.all) <- rfe.rownames
#
#         ### Obtain the Average Performance
#         Ave <- apply(perf.all, 2, function(col){
#                 Ave.res <- data.frame(Sensitivity = mean(unlist(col[seq(1, 50, 5)])),
#                                       Specificity = mean(unlist(col[seq(2, 50, 5)])),
#                                       Accuracy    = mean(unlist(col[seq(3, 50, 5)])),
#                                       F.Measure   = mean(unlist(col[seq(4, 50, 5)])),
#                                       Kappa       = mean(unlist(col[seq(5, 50, 5)])))
#
#         })
#         Ave.res <- sapply(Ave, rbind)
#         Ave.res <- t(Ave.res)
#         Ave.res <- as.data.frame(apply(Ave.res, 2, unlist))
#         # Ave.res <- as.data.frame(Ave.res[nrow(Ave.res):1,])
#
#         perf.all.final <- list(raw.perf = perf.all, ave.perf = Ave.res)
#
#         message("")
#         message("+ RFE Process Completed. ", Sys.time())
#         perf.all.final
# }

get.Seq_UP <- function(OneSeq) {
        UP.seq <- seqinr::s2c(OneSeq[[2]])
        UP.seq[UP.seq %in% "(" | UP.seq %in% ")"] <- "P"
        UP.seq[UP.seq %in% "."] <- "U"
        UP.seq
}

get.Seq_acguACGU <- function(OneSeq) {
        ACGU.seq   <- seqinr::s2c(OneSeq[[1]])
        Stem.index <- seqinr::s2c(OneSeq[[2]]) %in% "(" | seqinr::s2c(OneSeq[[2]]) %in% ")"
        ACGU.seq[Stem.index & ACGU.seq %in% "a"] <- "A"
        ACGU.seq[Stem.index & ACGU.seq %in% "c"] <- "C"
        ACGU.seq[Stem.index & ACGU.seq %in% "g"] <- "G"
        ACGU.seq[Stem.index & ACGU.seq %in% "u"] <- "U"
        ACGU.seq
}

get.Seq_acguD <- function(OneSeq) {
        acguD.seq <- seqinr::s2c(OneSeq[[1]])
        Dot.index <- seqinr::s2c(OneSeq[[2]]) %in% "."
        acguD.seq[Dot.index] <- "D"
        acguD.seq
}

get.Seq_DNA <- function(OneSeq) {
        DNA.seq <- seqinr::s2c(OneSeq[[1]])
        DNA.seq[DNA.seq %in% "u"] <- "t"
        DNA.seq
}

find_ORF <- function(OneSeq, max.only = TRUE) {
        OneSeq <- unlist(seqinr::getSequence(OneSeq, as.string = TRUE))
        OneSeq <- gsub("\n", "", OneSeq)
        start_pos <- unlist(gregexpr("atg", OneSeq, ignore.case = TRUE))
        if(sum(start_pos) == -1) {
                if(max.only) {
                        ORF_info <- list(ORF.Max.Len = 0, ORF.Max.Cov = 0, ORF.Max.Seq = NA)
                } else {
                        ORF_info <- data.frame(ORF.Seq = NA, ORF.Start = 0, ORF.Stop = 0, ORF.Len = 0)
                }

        } else {
                stop_pos <- sapply(c("taa", "tag", "tga"), function(x){
                        pos <- unlist(gregexpr(x, OneSeq, ignore.case = TRUE))
                })
                stop_pos <- sort(unlist(stop_pos, use.names = FALSE))
                seq_length <- nchar(OneSeq)
                if(all(stop_pos == -1)) {
                        if(max.only) {
                                orf_start <- min(start_pos)
                                orf_stop  <- seq_length - ((seq_length - orf_start + 1) %% 3) - 2
                                max_len   <- orf_stop - orf_start + 3
                                max_cov   <- max_len / seq_length
                                orf_seq   <- substr(OneSeq, start = orf_start, stop = orf_stop + 2)
                                ORF_info  <- list(ORF.Max.Len = max_len, ORF.Max.Cov = max_cov, ORF.Max.Seq = orf_seq)
                        } else {
                                for(i in seq_along(start_pos)) {
                                        start.val  <- start_pos[i]
                                        stop.val   <- seq_length - ((seq_length - start.val + 1) %% 3) - 2
                                        length.val <- stop.val - start.val + 3
                                        cover.val  <- length.val / seq_length
                                        ORF.seq    <- substr(OneSeq, start = start.val, stop = stop.val + 2)
                                        output.tmp <- data.frame(ORF.Seq = ORF.seq, ORF.Start = start.val, ORF.Stop = stop.val,
                                                                 ORF.Len = length.val, ORF.Cov = cover.val, stringsAsFactors = FALSE)
                                        if(i == 1) {
                                                ORF_info <- output.tmp
                                        } else {
                                                ORF_info <- rbind(ORF_info, output.tmp)
                                        }

                                }
                        }

                } else {
                        orf_starts  <- c()
                        orf_stops   <- c()
                        orf_lengths <- c()
                        for(i in start_pos){
                                orf_flag <- 0
                                for(j in stop_pos){
                                        if(j < i) next
                                        diff_mod    <- (j - i) %% 3
                                        if(diff_mod != 0) next
                                        orf_starts  <- c(orf_starts, i)
                                        orf_stops   <- c(orf_stops, j)
                                        orf_lengths <- c(orf_lengths, j - i + 3)
                                        orf_flag    <- 1
                                        break
                                }
                                if(orf_flag == 0){
                                        orf_starts   <- c(orf_starts, i)
                                        orf_stop_tmp <- seq_length - ((seq_length - i + 1) %% 3) - 2
                                        orf_stops    <- c(orf_stops, orf_stop_tmp)
                                        orf_lengths  <- c(orf_lengths, orf_stop_tmp - i + 3)
                                }
                        }
                        if(max.only) {
                                max_len   <- max(orf_lengths)
                                max_cov   <- max_len / seq_length
                                max_index <- which(orf_lengths == max_len)
                                orf_start <- orf_starts[max_index]
                                orf_stop  <- orf_stops[max_index]
                                orf_seq   <- substr(OneSeq, start = orf_start, stop = orf_stop + 2)
                                ORF_info  <- list(ORF.Max.Len = max_len, ORF.Max.Cov = max_cov, ORF.Max.Seq = orf_seq)
                        } else {
                                for(i in seq_along(orf_starts)) {
                                        start.val  <- orf_starts[i]
                                        stop.val   <- orf_stops[i] + 2
                                        length.val <- orf_lengths[i]
                                        cover.val  <- length.val / seq_length
                                        ORF.seq    <- substr(OneSeq, start = start.val, stop = stop.val)
                                        output.tmp <- data.frame(ORF.Seq = ORF.seq, ORF.Start = start.val, ORF.Stop = stop.val,
                                                                 ORF.Len = length.val, ORF.Cov = cover.val, stringsAsFactors = FALSE)
                                        if(i == 1) {
                                                ORF_info <- output.tmp
                                        } else {
                                                ORF_info <- rbind(ORF_info, output.tmp)
                                        }

                                }
                        }
                }

        }
        ORF_info
}

get.freq <- function(seqs, alphabet, wordsize, step, freq, ignore.illegal = TRUE) {

        if(ignore.illegal) {
                for(i in 1:length(seqs)) {
                        if(!all(seqs[[i]] == "a" | seqs[[i]] == "c" | seqs[[i]] == "g" | seqs[[i]] == "t")) next
                        freq_tmp  <- seqinr::count(seqs[[i]], wordsize = wordsize,
                                                   by = step, freq = freq, alphabet = alphabet)
                        if(i == 1) freq.res <- freq_tmp else freq.res <- freq.res + freq_tmp
                        if(i %% 5000 == 0) message("  ", i, " sequences completed.")
                }
        } else {
                for(i in 1:length(seqs)) {
                        freq_tmp  <- seqinr::count(seqs[[i]], wordsize = wordsize,
                                                   by = step, freq = freq, alphabet = alphabet)
                        if(i == 1) freq.res <- freq_tmp else freq.res <- freq.res + freq_tmp
                        if(i %% 5000 == 0) message("  ", i, " sequences completed.")
                }
        }

        freq.out <- freq.res / sum(freq.res)
        freq.out
}

LogDist.DNA  <- function(OneSeq, hexamer.lnc, hexamer.cds, wordsize, step) {
        orf.info  <- find_ORF(OneSeq)

        count6  <- seqinr::count(seqinr::s2c(orf.info[[3]]), wordsize = wordsize,
                                 by = step, freq = FALSE)

        if(sum(count6) < 3) {
                count6  <- seqinr::count(OneSeq,  wordsize = wordsize,
                                         by = step, freq = FALSE)
        }

        freq6     <- count6 / sum(count6)
        lnc.ratio <- freq6[hexamer.lnc != 0] / hexamer.lnc[hexamer.lnc != 0]
        pct.ratio <- freq6[hexamer.cds != 0] / hexamer.cds[hexamer.cds != 0]
        lnc.dist  <- (sum(log(lnc.ratio[lnc.ratio != 0])) - length(which(lnc.ratio == 0)) + length(which(hexamer.lnc == 0))) / sum(count6)
        pct.dist  <- (sum(log(pct.ratio[pct.ratio != 0])) - length(which(pct.ratio == 0)) + length(which(hexamer.cds == 0))) / sum(count6)
        Ratio     <- lnc.dist / pct.dist
        distance  <- c(ORF.Max.Len = orf.info[[1]], ORF.Max.Cov = orf.info[[2]],
                       Seq.lnc.Dist = lnc.dist, Seq.pct.Dist = pct.dist, Seq.Dist.Ratio = Ratio)
}

LogDist.acguD  <- function(OneSeq, hexamer.lnc, hexamer.cds, wordsize, step) {
        count6    <- seqinr::count(OneSeq, wordsize = wordsize, freq = FALSE, by = step,
                                   alphabet = c("D", "a", "c", "g", "u"))
        freq6     <- count6 / sum(count6)
        lnc.ratio <- freq6[hexamer.lnc != 0] / hexamer.lnc[hexamer.lnc != 0]
        pct.ratio <- freq6[hexamer.cds != 0] / hexamer.cds[hexamer.cds != 0]
        lnc.dist  <- (sum(log(lnc.ratio[lnc.ratio != 0])) - length(which(lnc.ratio == 0)) + length(which(hexamer.lnc == 0))) / sum(count6)
        pct.dist  <- (sum(log(pct.ratio[pct.ratio != 0])) - length(which(pct.ratio == 0)) + length(which(hexamer.cds == 0))) / sum(count6)
        Ratio     <- lnc.dist / pct.dist
        distance  <- c(Dot_lnc.dist = lnc.dist, Dot_pct.dist = pct.dist, Dot_Dist.Ratio = Ratio)
}

LogDist.acguACGU  <- function(OneSeq, hexamer.lnc, hexamer.cds, wordsize, step) {
        count6    <- seqinr::count(OneSeq, wordsize = wordsize, freq = FALSE, by = step,
                                   alphabet = c("A", "C", "G", "U", "a", "c", "g", "u"))
        freq6     <- count6 / sum(count6)
        lnc.ratio <- freq6[hexamer.lnc != 0] / hexamer.lnc[hexamer.lnc != 0]
        pct.ratio <- freq6[hexamer.cds != 0] / hexamer.cds[hexamer.cds != 0]
        lnc.dist  <- (sum(log(lnc.ratio[lnc.ratio != 0])) - length(which(lnc.ratio == 0)) + length(which(hexamer.lnc == 0))) / sum(count6)
        pct.dist  <- (sum(log(pct.ratio[pct.ratio != 0])) - length(which(pct.ratio == 0)) + length(which(hexamer.cds == 0))) / sum(count6)
        Ratio     <- lnc.dist / pct.dist
        distance  <- c(SS.lnc.dist = lnc.dist, SS.pct.dist = pct.dist, SS.Dist.Ratio = Ratio)
}

secondary_seq <- function(OneSeq, info, RNAfold.path){
        seq.string <- unlist(seqinr::getSequence(OneSeq, TRUE))

        # assign("index", index + 1, inherits = TRUE)
        index <- get("index")
        showMessage <- paste(index, "of", info, "length:", nchar(seq.string), "nt", "\n")
        cat(showMessage)

        RNAfold.command <- paste(RNAfold.path, "--noPS")

        seq.ss <- system(RNAfold.command, intern = TRUE, input = seq.string)
        index <<- index + 1

        seq.ss[3] <- as.numeric(substr(seq.ss[2], nchar(seq.string) + 3, nchar(seq.ss[2]) - 1))
        seq.ss[2] <- substr(seq.ss[2], 1, nchar(seq.string))
        seq.ss
}

# CPAT.evaluate <- function(path) {
#
#         dataset <- read.table(file = path, sep ="\t", header = T)
#
#         set.seed(1)
#         folds <- caret::createFolds(dataset$label, k = 10, returnTrain = TRUE)
#
#         threshold <- sapply(folds, function(x) {
#                 subset <- dataset[x, ]
#                 test.set  <- dataset[-x, ]
#                 glm.mod <- glm(label ~ mRNA + ORF + Fickett + Hexamer, data = subset, family = binomial(link="logit"))
#                 res <- predict(glm.mod, test.set, type = "response")
#                 res <- data.frame(Prob = res, label = test.set$label)
#                 threshold <- pROC::coords(pROC::roc(res$label, res$Prob), x = "best", ret='threshold')[[1]]
#         })
#         best.cutoff <- round(sum(threshold) / 10, digits = 4)
#
#         performance <- sapply(folds, function(x) {
#                 subset <- dataset[x, ]
#                 test.set  <- dataset[-x, ]
#                 glm.mod <- glm(label ~ mRNA + ORF + Fickett + Hexamer, data = subset, family = binomial(link="logit"))
#                 res <- predict(glm.mod, test.set, type = "response")
#                 res <- data.frame(Prob = res, label = test.set$label)
#
#                 res$Pred <- ifelse(res$Prob >= best.cutoff, "Coding", "NonCoding")
#                 res$label <- ifelse(res$label == 1, "Coding", "NonCoding")
#                 confusion.res <- caret::confusionMatrix(res$Pred, res$label, mode = "everything", positive = "NonCoding")
#                 performance.res <- data.frame(Sensitivity = confusion.res$byClass[1],
#                                               Specificity = confusion.res$byClass[2],
#                                               Accuracy    = confusion.res$overall[1],
#                                               F.Measure   = confusion.res$byClass[7],
#                                               Kappa       = confusion.res$overall[2])
#         })
#
#         Ave.res <- apply(performance, 1, as.numeric)
#         Ave.res <- as.data.frame(t(Ave.res))
#         Ave.res <- rowMeans(Ave.res)
#         output  <- list(Best.Cutoff = best.cutoff, Performance = Ave.res)
# }

###### Reference frequencies ######

#' Make Frequencies File for Log.Dist, Euc.Dist, and hexamer score
#'
#' @description This function is used to calculate the frequencies of lncRNAs and CDs.
#' The Frequencies file can be used to calculate Logarithm-Distance (\code{\link{compute_LogDistance}}),
#' Euclidean-Distance (\code{\link{compute_EucDistance}}), and hexamer score (\code{\link{compute_hexamerScore}}).
#'
#' NOTE: If users need to make frequencies file to build
#' new LncFinder classifier using function \code{\link{extract_features}},
#' please refer to function \code{make_frequencies}.
#'
#' @param cds.seq Coding sequences (mRNA without UTRs). Can be a FASTA file loaded
#' by \code{\link[seqinr]{seqinr-package}}.
#'
#' @param lncRNA.seq Long non-coding RNA sequences. Can be a FASTA file loaded by
#' \code{\link[seqinr]{seqinr-package}}.
#'
#' @param k An integer that indicates the sliding window size. (Default: \code{6})
#'
#' @param step Integer defaulting to \code{1} for the window step.
#'
#' @param alphabet A vector of single characters that specify the different character
#' of the sequence. (Default: \code{alphabet = c("a", "c", "g", "t")})
#'
#' @param on.orf Logical. Incomplete CDs can lead to a false shift and a
#' inaccurate hexamer frequencies. When \code{on.orf = TRUE}, the frequencies
#' will be calculated on the longest ORF. This parameter is strongly recommended to
#' set as \code{TRUE} when mRNA is used as CDs. Only available when
#' \code{alphabet = c("a", "c", "g", "t")}. (Default: \code{TRUE})
#'
#' @param ignore.illegal Logical. If \code{TRUE}, the sequences with non-nucleotide
#' characters (nucleotide characters: "a", "c", "g", "t") will be ignored when
#' calculating the frequencies. Only available when \code{alphabet = c("a", "c", "g", "t")}.
#' (Default: \code{TRUE})
#'
#' @return Returns a list which consists the frequencies of protein-coding sequences
#' and non-coding sequences.
#'
#' @author HAN Siyu
#' @details This function is used to make frequencies file for the computation of
#' Logarithm-Distance (\code{\link{compute_LogDistance}}), Euclidean-Distance
#' (\code{\link{compute_EucDistance}}),
#' and hexamer score (\code{\link{compute_hexamerScore}}).
#'
#' In order to achieve high accuracy, mRNA should not be regarded as CDs and assigned
#' to parameter \code{cds.seq}. However, CDs of some species may be insufficient
#' for calculating frequencies. In that case, mRNAs can be regarded as CDs with parameter
#' \code{on.orf = TRUE}, and the frequencies will be calculated on ORF region.
#' If \code{on.orf = TRUE}, users can set \code{step = 3} to simulate the translation process.
#'
#' @section References:
#' Siyu Han, Yanchun Liang, Qin Ma, Yangyi Xu, Yu Zhang, Wei Du, Cankun Wang & Ying Li.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing
#' sequence intrinsic composition, structural information, and physicochemical property.
#' \emph{Briefings in Bioinformatics}, 2019, 20(6):2009-2027.
#'
#' @importFrom seqinr s2c
#' @importFrom seqinr count
#'
#' @seealso \code{\link{make_frequencies}},
#'          \code{\link{compute_LogDistance}},
#'          \code{\link{compute_EucDistance}},
#'          \code{\link{compute_hexamerScore}}.
#'
#' @examples
#' \dontrun{
#' Seqs <- seqinr::read.fasta(file =
#' "http://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/nucleotide-sample.txt")
#'
#' referFreq <- make_referFreq(cds.seq = Seqs, lncRNA.seq = Seqs, k = 6, step = 1,
#'                             alphabet = c("a", "c", "g", "t"), on.orf = TRUE,
#'                             ignore.illegal = TRUE)
#' }
#'
#' @export

make_referFreq <- function(cds.seq, lncRNA.seq, k = 6, step = 1, alphabet = c("a", "c", "g", "t"),
                           on.orf = TRUE, ignore.illegal = TRUE) {

        if(on.orf & !all(alphabet %in% c("a", "t", "g", "c"))) {
                stop("Error: The cds.seq has to be DNA sequences!")
        }

        if(!all(alphabet %in% c("a", "t", "g", "c"))) {
                ignore.illegal = FALSE
        }

        message("+ Calculating frequencies. ", Sys.time(), "\n")

        if(on.orf) {
                message("- Finding ORFs")
                orf.seq <- lapply(cds.seq, function(x) {
                        orf.info <- find_ORF(x)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- NA
                        orf
                })
                cds.seq <- orf.seq[!is.na(orf.seq)]
        }

        message("- Calculating frequencies.")

        message("\n", "  Non-coding sequences")
        lnc.freq <- get.freq(lncRNA.seq, alphabet = alphabet, wordsize = k,
                             step = step, freq = F, ignore.illegal = ignore.illegal)
        message("  Completed.", "\n")

        message("  Coding sequences")
        cds.freq <- get.freq(cds.seq, alphabet = alphabet, wordsize = k,
                             step = step, freq = F, ignore.illegal = ignore.illegal)
        message("  Completed.", "\n")

        message("+ Frequencies calculation completed. ", Sys.time())

        Internal.freq <- list(ref.lnc = lnc.freq, ref.cds = cds.freq)
        Internal.freq
}

###### Logarithm-Distance ######

#' Compute Logarithm Distance
#'
#' @description This function can compute Logarithm Distance proposed by method LncFinder
#' (Han et al. 2018). Logarithm Distance can be calculated on full sequence or the longest ORF
#' region. The step and \emph{k} of the sliding window can also be customized.
#'
#' @param Sequences A FASTA file loaded by function  \code{\link[seqinr]{read.fasta}} of
#' \code{\link[seqinr]{seqinr-package}}.
#'
#' @param label Optional. String. Indicate the label of the sequences such as
#' "NonCoding", "Coding".
#'
#' @param referFreq a list obtained from function \code{\link{make_referFreq}}.
#'
#' @param k An integer that indicates the sliding window size. (Default: \code{6})
#'
#' @param step Integer defaulting to \code{1} for the window step.
#'
#' @param alphabet A vector of single characters that specify the different character
#' of the sequence. (Default: \code{alphabet = c("a", "c", "g", "t")})
#'
#' @param on.ORF  Logical. If \code{TRUE}, Logarithm Distance will be calculated on
#' the longest ORF region. NOTE: If \code{TRUE}, the input has to be DNA sequences.
#' (Default: \code{FALSE})
#'
#' @param auto.full Logical. When \code{on.ORF = TRUE} but no ORF can be found,
#' if \code{auto.full = TRUE}, Logarithm Distance will be calculated on full sequences automatically;
#' if \code{auto.full} is \code{FALSE}, the sequences that have no ORF will be discarded. Ignored when \code{on.ORF = FALSE}.
#' (Default: \code{FALSE})
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}. Users can set as \code{-1} to run
#' this function with all cores.
#'
#' @return A dataframe.
#' @author HAN Siyu
#' @details This function can compute Logarithm Distance proposed by LncFinder (HAN et al. 2018).
#' In LncFinder, two schemes are provided to calculate Logarithm Distance:
#' 1) \code{step = 3} and \code{k = 6} on the longest ORF region;
#' 2) \code{step = 1} and \code{k = 6} on full sequence.
#' Method LncFinder uses scheme 1 to extract Logarithm Distance features.
#' Using this function \code{compute_EucDistance}, both \code{step}, \code{k},
#' and calculated region (full sequence or ORF)
#' can be customized to maximize its availability.
#'
#' @section References:
#' Siyu Han, Yanchun Liang, Qin Ma, Yangyi Xu, Yu Zhang, Wei Du, Cankun Wang & Ying Li.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing
#' sequence intrinsic composition, structural information, and physicochemical property.
#' \emph{Briefings in Bioinformatics}, 2019, 20(6):2009-2027.
#'
#' @importFrom seqinr s2c
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#'
#' @seealso \code{\link{make_referFreq}},
#'          \code{\link{compute_EucDistance}},
#'          \code{\link{compute_hexamerScore}}.
#'
#' @examples
#' \dontrun{
#' Seqs <- seqinr::read.fasta(file =
#' "http://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/nucleotide-sample.txt")
#'
#' referFreq <- make_referFreq(cds.seq = Seqs, lncRNA.seq = Seqs, k = 6, step = 3,
#'                             alphabet = c("a", "c", "g", "t"), on.orf = TRUE,
#'                             ignore.illegal = TRUE)
#'
#' data(demo_DNA.seq)
#' Sequences <- demo_DNA.seq
#'
#' LogDistance <- compute_LogDistance(Sequences, label = "NonCoding", referFreq = referFreq,
#'                                    k = 6, step = 3, alphabet = c("a", "c", "g", "t"),
#'                                    on.ORF = TRUE, auto.full = TRUE, parallel.cores = 2)
#' }
#'
#' @export

compute_LogDistance <- function(Sequences, label = NULL, referFreq,
                                k = 6, step = 1, alphabet = c("a", "c", "g", "t"),
                                on.ORF = FALSE, auto.full = FALSE, parallel.cores = 2) {

        if(on.ORF & !all(alphabet %in% c("a", "t", "g", "c"))) {
                stop("Error: Calculation of Logarithm Distance on ORF region can only be applied to DNA sequences!")
        }

        if (parallel.cores == 2) message("Users can try to set parallel.cores = -1 to use all cores!", "\n")
        message("Processing... ", Sys.time(), "\n")
        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        cl <- parallel::makeCluster(parallel.cores)
        parallel::clusterExport(cl, varlist = "find_ORF", envir = environment())

        if(on.ORF & !auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- NA
                        orf
                })
                Sequences <- Sequences[!is.na(Sequences)]
        }

        if(on.ORF & auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- x
                        orf
                })
        }

        message("Calculating Logarithm Distance...")
        seqFreq <- parallel::parSapply(cl, Sequences, Internal.LogDistance, k = k, step = step,
                                       alphabet = alphabet, referFreq = referFreq)
        parallel::stopCluster(cl)

        seqFreq.df <- data.frame(t(seqFreq))

        if(!is.null(label)) seqFreq.df <- cbind(label = label, seqFreq.df)
        message("\n", "Completed. ", Sys.time(), "\n")

        seqFreq.df
}

###### Euclidean-Distance ######

#' Compute Euclidean Distance
#'
#' @description This function can compute Euclidean Distance proposed by method LncFinder
#' (Han et al. 2018). Euclidean Distance can be calculated on full sequence or the longest ORF
#' region. The step and \emph{k} of the sliding window can also be customized.
#'
#' @param Sequences A FASTA file loaded by function  \code{\link[seqinr]{read.fasta}} of
#' \code{\link[seqinr]{seqinr-package}}.
#'
#' @param label Optional. String. Indicate the label of the sequences such as
#' "NonCoding", "Coding".
#'
#' @param referFreq a list obtained from function \code{\link{make_referFreq}}.
#'
#' @param k An integer that indicates the sliding window size. (Default: \code{6})
#'
#' @param step Integer defaulting to \code{1} for the window step.
#'
#' @param alphabet A vector of single characters that specify the different character
#' of the sequence. (Default: \code{alphabet = c("a", "c", "g", "t")})
#'
#' @param on.ORF  Logical. If \code{TRUE}, Euclidean Distance will be calculated on
#' the longest ORF region. NOTE: If \code{TRUE}, the input has to be DNA sequences.
#' (Default: \code{FALSE})
#'
#' @param auto.full Logical. When \code{on.ORF = TRUE} but no ORF can be found,
#' if \code{auto.full = TRUE}, Euclidean Distance will be calculated on full sequences automatically;
#' if \code{auto.full} is \code{FALSE}, the sequences that have no ORF will be discarded. Ignored when \code{on.ORF = FALSE}.
#' (Default: \code{FALSE})
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}. Users can set as \code{-1} to run
#' this function with all cores.
#'
#' @return A dataframe.
#' @author HAN Siyu
#' @details This function can compute Euclidean Distance proposed by LncFinder (HAN et al. 2018).
#' In LncFinder, two schemes are provided to calculate Euclidean Distance:
#' 1) \code{step = 3} and \code{k = 6} on the longest ORF region;
#' 2) \code{step = 1} and \code{k = 6} on full sequence.
#' Using this function \code{compute_EucDistance}, both \code{step}, \code{k},
#' and calculated region (full sequence or ORF)
#' can be customized to maximize its availability.
#'
#' @section References:
#' Siyu Han, Yanchun Liang, Qin Ma, Yangyi Xu, Yu Zhang, Wei Du, Cankun Wang & Ying Li.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing
#' sequence intrinsic composition, structural information, and physicochemical property.
#' \emph{Briefings in Bioinformatics}, 2019, 20(6):2009-2027.
#'
#' @importFrom seqinr s2c
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#'
#' @seealso \code{\link{make_referFreq}},
#'          \code{\link{compute_LogDistance}},
#'          \code{\link{compute_hexamerScore}}.
#'
#' @examples
#' \dontrun{
#' Seqs <- seqinr::read.fasta(file =
#' "http://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/nucleotide-sample.txt")
#'
#' referFreq <- make_referFreq(cds.seq = Seqs, lncRNA.seq = Seqs, k = 6, step = 3,
#'                             alphabet = c("a", "c", "g", "t"), on.orf = TRUE,
#'                             ignore.illegal = TRUE)
#'
#' data(demo_DNA.seq)
#' Sequences <- demo_DNA.seq
#'
#' EucDistance <- compute_EucDistance(Sequences, label = "NonCoding", referFreq = referFreq,
#'                                    k = 6, step = 3, alphabet = c("a", "c", "g", "t"),
#'                                    on.ORF = TRUE, auto.full = TRUE, parallel.cores = 2)
#' }
#'
#' @export

compute_EucDistance <- function(Sequences, label = NULL, referFreq,
                                k = 6, step = 1, alphabet = c("a", "c", "g", "t"),
                                on.ORF = FALSE, auto.full = FALSE, parallel.cores = 2) {

        if(on.ORF & !all(alphabet %in% c("a", "t", "g", "c"))) {
                stop("Error: Calculation of Euclidean Distance on ORF region can only be applied to DNA sequences!")
        }

        if (parallel.cores == 2) message("Users can try to set parallel.cores = -1 to use all cores!", "\n")
        message("Processing... ", Sys.time(), "\n")
        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        cl <- parallel::makeCluster(parallel.cores)
        parallel::clusterExport(cl, varlist = "find_ORF", envir = environment())

        if(on.ORF & !auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- NA
                        orf
                })
                Sequences <- Sequences[!is.na(Sequences)]
        }

        if(on.ORF & auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- x
                        orf
                })
        }

        message("Calculating Euclidean Distance...")
        seqFreq <- parallel::parSapply(cl, Sequences, Internal.EucDistance, k = k, step = step,
                                       alphabet = alphabet, referFreq = referFreq)
        parallel::stopCluster(cl)

        seqFreq.df <- data.frame(t(seqFreq))

        if(!is.null(label)) seqFreq.df <- cbind(label = label, seqFreq.df)
        message("\n", "Completed. ", Sys.time(), "\n")

        seqFreq.df
}

###### Hexamer Score ######

#' Compute Hexamer Score
#'
#' @description This function can compute hexamer score proposed by method CPAT
#' (Wang et al. 2013). Hexamer score can be calculated on full sequence or the longest ORF
#' region. The step and \emph{k} of the sliding window can also be customized.
#'
#' @param Sequences A FASTA file loaded by function  \code{\link[seqinr]{read.fasta}} of
#' \code{\link[seqinr]{seqinr-package}}.
#'
#' @param label Optional. String. Indicate the label of the sequences such as
#' "NonCoding", "Coding".
#'
#' @param referFreq A list obtained from function \code{\link{make_referFreq}}.
#'
#' @param k An integer that indicates the sliding window size. (Default: \code{6})
#'
#' @param step Integer defaulting to \code{1} for the window step.
#'
#' @param alphabet A vector of single characters that specify the different character
#' of the sequence. (Default: \code{alphabet = c("a", "c", "g", "t")})
#'
#' @param on.ORF Logical. If \code{TRUE}, hexamer score will be calculated on
#' the longest ORF region. NOTE: If \code{TRUE}, the input has to be DNA sequences.
#' (Default: \code{FALSE})
#'
#' @param auto.full Logical. When \code{on.ORF = TRUE} but no ORF can be found,
#' if \code{auto.full = TRUE},  hexamer score will be calculated on full sequences automatically;
#' if \code{auto.full} is \code{FALSE}, the sequences that have no ORF will be discarded. Ignored when \code{on.ORF = FALSE}.
#' (Default: \code{FALSE})
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}. Users can set as \code{-1} to run
#' this function with all cores.
#'
#' @return A dataframe.
#' @author HAN Siyu
#' @details This function can compute hexamer score proposed by CPAT (Wang et al. 2013).
#' In CPAT, hexamer score is calculated on the longest ORF region, and the step of the
#' sliding window is 3 (i.e. \code{step = 3}). Hexamer means six adjoining bases, thus
#' \code{k = 6}. But in function \code{compute_hexamerScore}, both \code{step}, \code{k},
#' and calculated region (full sequence or ORF)
#' can be customized to maximize its availability.
#'
#' @section References:
#' Liguo Wang, Hyun Jung Park, Surendra Dasari, Shengqin Wang, JeanPierre Kocher, & Wei Li.
#' CPAT: coding-potential assessment tool using an alignment-free logistic regression model.
#' \emph{Nucleic Acids Research}, 2013, 41(6):e74-e74.
#'
#' Siyu Han, Yanchun Liang, Qin Ma, Yangyi Xu, Yu Zhang, Wei Du, Cankun Wang & Ying Li.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing
#' sequence intrinsic composition, structural information, and physicochemical property.
#' \emph{Briefings in Bioinformatics}, 2019, 20(6):2009-2027.
#'
#' @importFrom seqinr s2c
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#'
#' @seealso \code{\link{make_referFreq}},
#'          \code{\link{compute_LogDistance}},
#'          \code{\link{compute_EucDistance}}.
#'
#' @examples
#' \dontrun{
#' Seqs <- seqinr::read.fasta(file =
#' "http://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/nucleotide-sample.txt")
#'
#' referFreq <- make_referFreq(cds.seq = Seqs, lncRNA.seq = Seqs, k = 6, step = 1,
#'                             alphabet = c("a", "c", "g", "t"), on.orf = TRUE,
#'                             ignore.illegal = TRUE)
#'
#' data(demo_DNA.seq)
#' Sequences <- demo_DNA.seq
#'
#' hexamerScore <- compute_hexamerScore(Sequences, label = "NonCoding", referFreq = referFreq,
#'                                      k = 6, step = 1, alphabet = c("a", "c", "g", "t"),
#'                                      on.ORF = TRUE, auto.full = TRUE, parallel.cores = 2)
#' }
#'
#' @export

compute_hexamerScore <- function(Sequences, label = NULL, referFreq,
                                 k = 6, step = 1, alphabet = c("a", "c", "g", "t"),
                                 on.ORF = FALSE, auto.full = FALSE, parallel.cores = 2) {

        if(on.ORF & !all(alphabet %in% c("a", "t", "g", "c"))) {
                stop("Error: Calculation of hexamer score on ORF region can only be applied to DNA sequences!")
        }

        if (parallel.cores == 2) message("Users can try to set parallel.cores = -1 to use all cores!", "\n")
        message("Processing... ", Sys.time(), "\n")
        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        cl <- parallel::makeCluster(parallel.cores)
        parallel::clusterExport(cl, varlist = "find_ORF", envir = environment())

        if(on.ORF & !auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- NA
                        orf
                })
                Sequences <- Sequences[!is.na(Sequences)]
        }

        if(on.ORF & auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- x
                        orf
                })
        }

        message("Calculating hexamer score...")
        seqFreq <- parallel::parSapply(cl, Sequences, Internal.hexamerScore, k = k, step = step,
                                       alphabet = alphabet, referFreq = referFreq)
        parallel::stopCluster(cl)

        seqFreq.df <- data.frame(Hexamer.Score = seqFreq)

        if(!is.null(label)) seqFreq.df <- cbind(label = label, seqFreq.df)
        message("\n", "Completed. ", Sys.time(), "\n")

        seqFreq.df
}

###### k-mer scheme ######

#' Compute \emph{k}-mer Features
#'
#' @description This function can calculate the \emph{k}-mer frequencies of the sequences.
#'
#' @param Sequences A FASTA file loaded by function  \code{\link[seqinr]{read.fasta}} of
#' \code{\link[seqinr]{seqinr-package}}.
#'
#' @param label Optional. String. Indicate the label of the sequences such as
#' "NonCoding", "Coding".
#'
#' @param k An integer that indicates the sliding window size. (Default: \code{1:5})
#'
#' @param step Integer defaulting to \code{1} for the window step.
#'
#' @param freq Logical. If TRUE, the frequencies of different patterns are returned
#' instead of counts. (Default: \code{TRUE})
#'
#' @param improved.mode Logical. If TRUE, the frequencies will be normalized using
#' the method proposed by PLEK (Li et al. 2014).
#' Ignored if \code{freq = FALSE}. (Default: \code{FALSE})
#'
#' @param alphabet A vector of single characters that specify the different character
#' of the sequence. (Default: \code{alphabet = c("a", "c", "g", "t")})
#'
#' @param on.ORF Logical. If \code{TRUE}, the \emph{k}-mer frequencies will be calculated on
#' the longest ORF region. NOTE: If \code{TRUE}, the sequences have to be DNA.
#' (Default: \code{FALSE})
#'
#' @param auto.full Logical. When \code{on.ORF = TRUE} but no ORF can be found,
#' if \code{auto.full = TRUE}, the \emph{k}-mer
#' frequencies will be calculated on the full sequence automatically;
#' if \code{auto.full} is \code{FALSE}, the sequences that have no ORF will be discarded.
#' Ignored when \code{on.ORF = FALSE}. (Default: \code{FALSE})
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}. Users can set as \code{-1} to run
#' this function with all cores.
#'
#' @return A dataframe.
#' @author HAN Siyu
#' @details
#' This function can extract \emph{k}-mer features. \code{k} and \code{step} can be customized.
#' The count (\code{freq = FALSE}) or frequencies (\code{freq = TRUE}) of different patterns can be returned.
#' If \code{freq = TRUE}, \code{improved.mode} is available. The improved mode is proposed by method PLEK.
#' (Ref: Li et al. 2014)
#'
#' @importFrom seqinr s2c
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#'
#' @examples
#' \dontrun{
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' kmer_res1 <- compute_kmer(Seqs, k = 1:5, step = 1, freq = TRUE, improved.mode = FALSE)
#'
#' kmer_res2 <- compute_kmer(Seqs, k = 1:5, step = 3, freq = TRUE,
#'                           improved.mode = TRUE, on.ORF = TRUE, auto.full = TRUE)
#' }
#'
#' @export

compute_kmer <- function(Sequences, label = NULL, k = 1:5, step = 1, freq = TRUE,
                         improved.mode = FALSE, alphabet = c("a", "c", "g", "t"),
                         on.ORF = FALSE, auto.full = FALSE, parallel.cores = 2) {

        if(on.ORF & !all(alphabet %in% c("a", "t", "g", "c"))) {
                stop("Error: Calculation of k-mer frequencies on ORF region can only be applied to DNA sequences!")
        }

        if (parallel.cores == 2) message("Users can try to set parallel.cores = -1 to use all cores!", "\n")
        message("Processing... ", Sys.time(), "\n")
        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        cl <- parallel::makeCluster(parallel.cores)
        parallel::clusterExport(cl, varlist = "find_ORF", envir = environment())

        if(on.ORF & !auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- NA
                        orf
                })
                Sequences <- Sequences[!is.na(Sequences)]
        }

        if(on.ORF & auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- x
                        orf
                })
        }

        for (i in k) {
                message("Calculating ", i, "-mer frequencies...")
                i.freq <- parallel::parSapply(cl, Sequences, seqinr::count, wordsize = i,
                                              by = step, freq = freq, alphabet = alphabet)
                if(improved.mode & freq) {
                        weight <- 1 / (length(alphabet) ^ (max(k) - i))
                        i.freq <- i.freq * weight
                }
                i.freq <- data.frame(t(i.freq))
                if(i == k[1]) {
                        k.freq <- i.freq
                } else {
                        k.freq <- cbind(k.freq, i.freq)
                }
        }
        parallel::stopCluster(cl)

        if(!is.null(label)) k.freq <- cbind(label = label, k.freq)
        message("\n", "Completed. ", Sys.time(), "\n")
        k.freq
}

###### Fickett Score ######

#' Compute Fickett TESTCODE Score
#'
#' @description This function can compute Fickett TESTCODE score of DNA sequences proposed by James W.Fickett
#' (Fickett JW. 1982). Fickett TESTCODE score can be calculated on full sequence or the longest ORF
#' region.
#'
#' @param Sequences A FASTA file loaded by function  \code{\link[seqinr]{read.fasta}} of
#' \code{\link[seqinr]{seqinr-package}}.
#'
#' @param label Optional. String. Indicate the label of the sequences such as
#' "NonCoding", "Coding".
#'
#' @param on.ORF Logical. If \code{TRUE}, Fickett TESTCODE score will be calculated on
#' the longest ORF region.
#'
#' @param auto.full Logical. When \code{on.ORF = TRUE} but no ORF can be found,
#' if \code{auto.full = TRUE},  Fickett TESTCODE score will be calculated on full sequences automatically;
#' if \code{auto.full} is \code{FALSE}, the sequences that have no ORF will be discarded.
#' Ignored when \code{on.ORF = FALSE}. (Default: \code{FALSE})
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}. Users can set as \code{-1} to run
#' this function with all cores.
#'
#' @return A dataframe.
#' @author HAN Siyu
#' @details This function can compute Fickett TESTCODE score proposed by James W.Fickett (Fickett JW. 1982).
#' Fickett TESTCODE score is selected as feature by method CPAT (Wang et al. 2013) and CPC2 (Kang et al. 2017).
#' In CPAT, Fickett TESTCODE score is calculated on the longest ORF region, but CPC2 calculates the score
#' on full sequence. This function \code{compute_FickettScore} improves the CPAT's code
#' and is capable of computing the score on the longest ORF region as well as full sequence.
#'
#' @section References:
#' James W.Fickett.
#' Recognition of protein coding regions in DNA sequences.
#' \emph{Nucleic Acids Research}, 1982, 10(17):5303-5318.
#'
#' Siyu Han, Yanchun Liang, Qin Ma, Yangyi Xu, Yu Zhang, Wei Du, Cankun Wang & Ying Li.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing
#' sequence intrinsic composition, structural information, and physicochemical property.
#' \emph{Briefings in Bioinformatics}, 2019, 20(6):2009-2027.
#'
#' Liguo Wang, Hyun Jung Park, Surendra Dasari, Shengqin Wang, JeanPierre Kocher & Wei Li.
#' CPAT: coding-potential assessment tool using an alignment-free logistic regression model.
#' \emph{Nucleic Acids Research}, 2013, 41(6):e74-e74.
#'
#' Yu-Jian Kang, De-Chang Yang, Lei Kong, Mei Hou, Yu-Qi Meng, Liping Wei & Ge Gao.
#' CPC2: a fast and accurate coding potential calculator based on sequence intrinsic features.
#' \emph{Nucleic Acids Research}, 2017, 45(W1):W12-W16.
#'
#' @importFrom seqinr count
#' @importFrom seqinr s2c
#' @importFrom seqinr getSequence
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#'
#' @examples
#' \dontrun{
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' FickettScore <- compute_FickettScore(Seqs, label = NULL, on.ORF = TRUE,
#'                                      auto.full = TRUE, parallel.cores = 2)
#' }
#' @export

compute_FickettScore <- function(Sequences, label = NULL, on.ORF = FALSE,
                                 auto.full = FALSE, parallel.cores = 2) {

        if (parallel.cores == 2) message("Users can try to set parallel.cores = -1 to use all cores!", "\n")
        message("Processing... ", Sys.time(), "\n")
        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        cl <- parallel::makeCluster(parallel.cores)
        parallel::clusterExport(cl, varlist = c("find_ORF", "Internal.convertProb"), envir = environment())

        if(on.ORF & !auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- NA
                        orf
                })
                Sequences <- Sequences[!is.na(Sequences)]
        }

        if(on.ORF & auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- x
                        orf
                })
        }

        positionProb = list(a = c(0.94,0.68,0.84,0.93,0.58,0.68,0.45,0.34,0.20,0.22),
                            c = c(0.80,0.70,0.70,0.81,0.66,0.48,0.51,0.33,0.30,0.23),
                            g = c(0.90,0.88,0.74,0.64,0.53,0.48,0.27,0.16,0.08,0.08),
                            t = c(0.97,0.97,0.91,0.68,0.69,0.44,0.54,0.20,0.09,0.09))
        positionWeight = c(a = 0.26, c = 0.18, g = 0.31, t = 0.33)
        positionParam = c(1.9, 1.8, 1.7, 1.6, 1.5 ,1.4, 1.3, 1.2, 1.1, 0)

        contentProb = list(a = c(0.28,0.49,0.44,0.55,0.62,0.49,0.67,0.65,0.81,0.21),
                           c = c(0.82,0.64,0.51,0.64,0.59,0.59,0.43,0.44,0.39,0.31),
                           g = c(0.40,0.54,0.47,0.64,0.64,0.73,0.41,0.41,0.33,0.29),
                           t = c(0.28,0.24,0.39,0.40,0.55,0.75,0.56,0.69,0.51,0.58))
        contentWeight = c(a = 0.11, c = 0.12, g = 0.15, t = 0.14)
        contentParam = c(0.33, 0.31, 0.29, 0.27, 0.25, 0.23, 0.21, 0.19, 0.17, 0)

        message("Calculating Fickett TESTCODE score...")
        FickettScore.res <- parallel::parSapply(cl, Sequences, Internal.FickettScore,
                                                .positionProb = positionProb, .positionWeight = positionWeight,
                                                .positionParam = positionParam, .contentProb = contentProb,
                                                .contentWeight = contentWeight, .contentParam = contentParam)
        parallel::stopCluster(cl)

        FickettScore.df <- data.frame(Fickett.Score = FickettScore.res)

        if(!is.null(label)) FickettScore.df <- cbind(label = label, FickettScore.df)
        message("\n", "Completed. ", Sys.time(), "\n")
        FickettScore.df
}

###### GC Content ######

#' Calculate GC content
#' @description This function can GC content of the input sequences.
#' @param Sequences A FASTA file loaded by function  \code{\link[seqinr]{read.fasta}} of
#' \code{\link[seqinr]{seqinr-package}}.
#'
#' @param label Optional. String. Indicate the label of the sequences such as
#' "NonCoding", "Coding".
#'
#' @param on.ORF Logical. If \code{TRUE}, GC content will be calculated on
#' the longest ORF region.
#'
#' @param auto.full Logical. When \code{on.ORF = TRUE} but no ORF can be found,
#' if \code{auto.full = TRUE},  GC content will be calculated on full sequences automatically;
#' if \code{auto.full} is \code{FALSE}, the sequences that have no ORF will be discarded.
#' Ignored when \code{on.ORF = FALSE}. (Default: \code{FALSE})
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}. Users can set as \code{-1} to run
#' this function with all cores.
#'
#' @return A dataframe.
#'
#' @author HAN Siyu
#'
#' @details This function can basically compute GC content of DNA sequences:
#' GC content = (nc + ng) / (na + nc + ng + nt).
#' The function will ignored the ambiguous bases.
#'
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#'
#' @seealso \code{\link[seqinr]{GC}} (package "\code{\link[seqinr]{seqinr-package}}")
#' @examples
#' \dontrun{
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' gcContent <- compute_GC(Seqs, label = "NonCoding",on.ORF = TRUE,
#'                         auto.full = TRUE, parallel.cores = 2)
#' }
#' @export
#'

compute_GC <- function(Sequences, label = NULL, on.ORF = FALSE,
                       auto.full = FALSE, parallel.cores = 2) {

        if (parallel.cores == 2) message("Users can try to set parallel.cores = -1 to use all cores!", "\n")
        message("Processing... ", Sys.time(), "\n")
        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        cl <- parallel::makeCluster(parallel.cores)
        parallel::clusterExport(cl, varlist = c("find_ORF"), envir = environment())

        if(on.ORF & !auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- NA
                        orf
                })
                Sequences <- Sequences[!is.na(Sequences)]
        }

        if(on.ORF & auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- x
                        orf
                })
        }

        message("Calculating GC content...")
        GC.content <- parallel::parSapply(cl, Sequences, function(x) {
                num.a <- sum(x == "a")
                num.c <- sum(x == "c")
                num.g <- sum(x == "g")
                num.t <- sum(x == "t")

                gc.content <- (num.g + num.c) / (num.a + num.c + num.g + num.t)
        })
        parallel::stopCluster(cl)

        GC.content  <- as.data.frame(GC.content, stringsAsFactors = FALSE)

        if(!is.null(label)) GC.content <- cbind(label = label, GC.content)

        message("\n", "Completed. ", Sys.time(), "\n")

        GC.content
}

###### EIIP Features ######

#' Extract the EIIP-derived features
#'
#' @description This function can extract EIIP-derived features proposed by Han et al (2018).
#'
#' @param Sequences A FASTA file loaded by function  \code{\link[seqinr]{read.fasta}} of
#' \code{\link[seqinr]{seqinr-package}}.
#' @param label Optional. String. Indicate the label of the sequences such as
#' "NonCoding", "Coding".
#' @param spectrum.percent Numeric specifying the percentage of the sorted power spectrum that be
#' used to calculate the quantile-based features. For example, if \code{spectrum.percent = 0.1},
#' the top 10\% percent of the sorted power spectrum will be used to compute the quantiles.
#' @param quantile.probs Numeric. The probabilities with values in [0,1].
#'
#' @return A dataframe including the EIIP-derived features.
#'
#' @author HAN Siyu
#'
#' @details The function \code{compute_EIIP} can extract EIIP (electron-ion interaction pseudo-potential) features including:
#' signal at 1/3 position (\code{Signal.Peak}), average power (\code{Average.Power}), signal to noise ratio (\code{SNR}),
#' and quantile-based features of one specified percentage of the sorted power spectrum
#' (e.g. \code{0\%}, \code{20\%}, \code{40\%}, \code{60\%}, \code{70\%},
#' \code{100\%} when \code{quantile.probs = seq(0, 1, 0.2)} and \code{spectrum.percent =} \code{0.1}).
#'
#' In method LncFinder, EIIP features includes \code{Signal.Peak}, \code{SNR}, 0\% (\code{Signal.Min}),
#' 25\% (\code{Singal.Q1}, 50\% \code{Signal.Q2}), and 75\% (\code{Signal.Max}) of the top 10\% sorted
#' power spectrum, i.e. \code{quantile.prob} \code{= seq(0, 1, 0.25)} and \code{spectrum.percent = 0.1}.
#'
#' @importFrom stats fft
#' @importFrom stats quantile
#'
#' @seealso \code{\link{extract_features}}
#'
#' @section References:
#' Siyu Han, Yanchun Liang, Qin Ma, Yangyi Xu, Yu Zhang, Wei Du, Cankun Wang & Ying Li.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing
#' sequence intrinsic composition, structural information, and physicochemical property.
#' \emph{Briefings in Bioinformatics}, 2019, 20(6):2009-2027.
#'
#' Lalović, Dragutin, and Veljko Veljković.
#' The global average DNA base composition of coding regions may be determined by the electron-ion interaction potential.
#' \emph{Biosystems}, 1990, 23(4):311-316.
#'
#' Achuthsankar S Nair & Sivarama Pillai Sreenadhan.
#' A coding measure scheme employing electron-ion interaction pseudopotential (EIIP).
#' \emph{Bioinformation}, 2006, 1(6):197-202.
#'
#' @examples
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' EIIP_res <- compute_EIIP(Seqs, label = "NonCoding", spectrum.percent = 0.25,
#'                          quantile.probs = seq(0, 1, 0.25))
#'
#' @export

compute_EIIP <- function(Sequences, label = NULL, spectrum.percent = 0.1,
                         quantile.probs = seq(0, 1, 0.25)) {
        seqs.EIIP <- sapply(Sequences, get_EIIP, percent.length = spectrum.percent, probs = quantile.probs)

        seqs.EIIP  <- as.data.frame(t(seqs.EIIP), stringsAsFactors = FALSE)

        if(!is.null(label)) seqs.EIIP <- cbind(label = label, seqs.EIIP)

        seqs.EIIP
}

###### pI Features ######

#' Compute Theoretical Isoelectric Point
#'
#' @description This function is basically a wrapper for function \code{\link[seqinr]{computePI}}.
#' This function translate DNA sequence into protein, and compute the theoretical isoelectric point
#' (pI) of this protein.
#'
#' @param Sequences A FASTA file loaded by function  \code{\link[seqinr]{read.fasta}} of
#' \code{\link[seqinr]{seqinr-package}}.
#'
#' @param label Optional. String. Indicate the label of the sequences such as
#' "NonCoding", "Coding".
#'
#' @param on.ORF Logical. If \code{TRUE}, pI will be calculated on
#' the longest ORF region. NOTE: If \code{TRUE}, the input has to be DNA sequences.
#' (Default: \code{FALSE})
#'
#' @param auto.full Logical. When \code{on.ORF = TRUE} but no ORF can be found,
#' if \code{auto.full = TRUE},  pI will be calculated on full sequences automatically;
#' if \code{auto.full} is \code{FALSE}, the sequences that have no ORF will be discarded.
#' Ignored when \code{on.ORF = FALSE}. (Default: \code{FALSE})
#'
#' @param ambiguous.base If \code{TRUE}, ambiguous bases are taken into account when
#' translating DNA sequences into proteins.
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}. Users can set as \code{-1} to run
#' this function with all cores.
#'
#' @return A dataframe.
#' @author HAN Siyu
#' @details This function can compute the pI of DNA sequences. Method CPC2 (Kang et al. 2017) uses this
#' feature to identify lncRNAs, and this feature is evaluated in the article LncFinder (Han et al. 2018).
#'
#' Using this function, the theoretical pI can be computed on full sequence or the longest ORF region.
#' In CPC2, pI is calculated on ORF region.
#'
#' @importFrom seqinr translate
#' @importFrom seqinr computePI
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#'
#' @examples
#' \dontrun{
#' data(demo_DNA.seq)
#' Sequences <- demo_DNA.seq
#'
#' pI_res <- compute_pI(Sequences, on.ORF = TRUE, auto.full = FALSE, ambiguous.base = FALSE)
#' }
#' @export

compute_pI <- function(Sequences, label = NULL, on.ORF = FALSE, auto.full = FALSE,
                       ambiguous.base = FALSE, parallel.cores = 2) {

        if (parallel.cores == 2) message("Users can try to set parallel.cores = -1 to use all cores!", "\n")
        message("Processing... ", Sys.time(), "\n")
        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        cl <- parallel::makeCluster(parallel.cores)
        parallel::clusterExport(cl, varlist = "find_ORF", envir = environment())

        if(on.ORF & !auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- NA
                        orf
                })
                Sequences <- Sequences[!is.na(Sequences)]
        }

        if(on.ORF & auto.full) {
                message("Calculating ORF region...")
                Sequences <- parallel::parLapply(cl, Sequences, function(x) {
                        orf.info <- find_ORF(x, max.only = TRUE)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- x
                        orf
                })
        }

        message("Calculating theoretical isoelectric point...")
        AA.seq  <- parallel::parLapply(cl, Sequences, seqinr::translate, ambiguous = ambiguous.base)
        pI.val  <- parallel::parSapply(cl, AA.seq,  seqinr::computePI)

        parallel::stopCluster(cl)

        pI      <- data.frame(pI = pI.val, stringsAsFactors = FALSE)

        if(!is.null(label)) pI <- cbind(label = label, pI)
        message("\n", "Completed. ", Sys.time(), "\n")

        pI
}

###### SVM k-fold CV ######

#' \emph{k}-fold Cross Validation for SVM
#' @description This function conduct \emph{k}-fold Cross Validation for SVM.
#'
#' @param dataset The dataset obtained from function \code{\link{extract_features}}.
#' Or datasets used to build the classifier.
#'
#' @param label.col integer specifying the column number of the label. (Default: \code{1})
#'
#' @param positive.class Character. Indicate the positive class of the dataset.
#' (Default: \code{NonCoding}) The value of this parameter should be identical to
#' one of the classes of the response vectors.
#'
#' @param folds.num Integer. Specify the number of folds for cross-validation.
#' (Default: \code{10})
#'
#' @param seed Integer. Used to set the seed for cross-validation. (Default: \code{1})
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}, users can set as \code{-1} to run
#' this function with all cores. If the number of \code{parallel.cores} is more
#' than the \code{folds.num} (number of the folds for cross-validation), the
#' number of \code{parallel.cores} will be set as \code{folds.num} automatically.
#'
#' @param ... additional parameters for function \code{\link[e1071]{svm}}.
#'
#' @return Returns the optimal parameters when \code{return.model = FALSE}.
#' Or returns the best model when \code{return.model = TRUE}.
#'
#' @author HAN Siyu
#'
#' @details During the model tuning, the performance of each combination of
#' parameters will output. Sensitivity, Specificity, Accuracy, F-Measure and Kappa
#' Value are used to evaluate the performances. The best gamma and cost (or best
#' model) are selected based on Accuracy.
#'
#' For the details of parameter gamma and cost, please refer to function
#' \code{\link[e1071]{svm}} of package "e1071".
#'
#' For the details of metrics, please refer to function
#' \code{\link[caret]{confusionMatrix}} of package "caret".
#'
#' @importFrom caret createFolds
#' @importFrom caret confusionMatrix
#' @importFrom e1071 svm
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @seealso \code{\link{extract_features}}, \code{\link{svm_tune}}.
#' @examples
#' \dontrun{
#' data(demo_dataset)
#' my_dataset <- demo_dataset
#'
#' cv_res <- svm_cv(my_dataset, folds.num = 4, seed = 1,
#'                  parallel.core = 2, cost = 3, kernel = "radial", gamma = 0.5)
#'
#' ### Users can set return.model = TRUE to return the best model.
#' }
#' @export

svm_cv <- function(dataset, label.col = 1, positive.class = NULL,
                   folds.num = 10, seed = 1, parallel.cores = 2, ...) {

        names(dataset)[[label.col]] <- "label"
        set.seed(seed)
        folds <- caret::createFolds(dataset$label, k = folds.num, returnTrain = TRUE)

        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        parallel.cores <- ifelse(parallel.cores > folds.num, folds.num, parallel.cores)

        if(parallel.cores == 2 & folds.num > parallel.cores) {
                message("Users can try to set parallel.cores = -1 to use all cores!", "\n")
        }

        cl <- parallel::makeCluster(parallel.cores, outfile = "")

        if(is.null(positive.class)) {
                positive.class <- as.character(dataset$label[[1]])
                parallel::clusterExport(cl, varlist = c("dataset", "positive.class"), envir = environment())

        } else {
                parallel::clusterExport(cl, varlist = c("dataset"), envir = environment())
        }
        list(...)

        perf.res <- parallel::parSapply(cl, folds, fold.res, dataset = dataset,
                                        positive.class = positive.class, ...)
        parallel::stopCluster(cl)
        Ave.res2     <- apply(perf.res, 1, as.numeric)
        Ave.res2     <- as.data.frame(t(Ave.res2))
        Ave.res2$Ave.Res <- rowMeans(Ave.res2)
        message("Average Result:")
        print(Ave.res2[ncol(Ave.res2)])
        Ave.res2
        # perf.res
}

###### svm_tune ######

#' Parameter Tuning of SVM
#' @description This function conduct the parameter tuning of SVM. Parameters
#' gamma and cost can be tuned using grid search.
#'
#' @param dataset The dataset obtained from function \code{\link{extract_features}}.
#' Or datasets used to build the classifier.
#'
#' @param label.col integer specifying the column number of the label. (Default: \code{1})
#'
#' @param positive.class Character. Indicate the positive class of the dataset.
#' (Default: \code{NonCoding}) The value of this parameter should be identical to
#' one of the classes of the response vectors.
#'
#' @param folds.num Integer. Specify the number of folds for cross-validation.
#' (Default: \code{10})
#'
#' @param seed Integer. Used to set the seed for cross-validation. (Default: \code{1})
#'
#' @param gamma.range The range of gamma. (Default: \code{2 ^ seq(-5, 0, 1)})
#'
#' @param cost.range The range of cost. (Default: \code{c(1, 4, 8, 16, 24, 32)})
#'
#' @param return.model Logical. If \code{TRUE}, the function will return the best
#' model trained on the full dataset. If \code{FALSE}, this function will return
#' the optimal parameters.
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}, users can set as \code{-1} to run
#' this function with all cores. If the number of \code{parallel.cores} is more
#' than the \code{folds.num} (number of the folds for cross-validation), the
#' number of \code{parallel.cores} will be set as \code{folds.num} automatically.
#'
#' @param ... Additional arguments for function \code{\link[e1071]{svm}}, except
#' \code{scale}, \code{probability}, \code{kernel}, \code{gamma} and \code{cost}.
#'
#' @return Returns the optimal parameters when \code{return.model = FALSE}.
#' Or returns the best model when \code{return.model = TRUE}.
#'
#' @author HAN Siyu
#'
#' @details During the model tuning, the performance of each combination of
#' parameters will output. Sensitivity, Specificity, Accuracy, F-Measure and Kappa
#' Value are used to evaluate the performances. The best gamma and cost (or best
#' model) are selected based on Accuracy.
#'
#' For the details of parameter gamma and cost, please refer to function
#' \code{\link[e1071]{svm}} of package "e1071".
#'
#' For the details of metrics, please refer to function
#' \code{\link[caret]{confusionMatrix}} of package "caret".
#'
#' @importFrom caret createFolds
#' @importFrom caret confusionMatrix
#' @importFrom e1071 svm
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @seealso \code{\link{extract_features}}, \code{\link{svm_cv}}.
#' @examples
#' \dontrun{
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' positive_data <- extract_features(Seqs[1:5], label = "NonCoding",
#'                                   SS.features = FALSE, format = "DNA",
#'                                   frequencies.file = "human",
#'                                   parallel.cores = 2)
#'
#' negative_data <- extract_features(Seqs[6:10], label = "Coding",
#'                                   SS.features = FALSE, format = "DNA",
#'                                   frequencies.file = "human",
#'                                   parallel.cores = 2)
#'
#' my_dataset <- rbind(positive_data, negative_data)
#'
#' ### Or use our data "demo_dataset"
#' data(demo_dataset)
#' my_dataset <- demo_dataset
#'
#' optimal_parameter <- svm_tune(my_dataset, positive.class = "NonCoding",
#'                               folds.num = 2, seed = 1,
#'                               gamma.range = (2 ^ seq(-5, 0, 2)),
#'                               cost.range = c(1, 8, 16),
#'                               return.model = FALSE, parallel.core = 2)
#'
#' ### Users can set return.model = TRUE to return the best model.
#' }
#' @export

svm_tune <- function(dataset, label.col = 1, positive.class = "NonCoding",
                     folds.num = 10, seed = 1,
                     gamma.range = (2 ^ seq(-5, 0, 1)),
                     cost.range = c(1, 4, 8, 16, 24, 32),
                     return.model = TRUE, parallel.cores = 2, ...){

        names(dataset)[[label.col]] <- "label"
        dataset$label <- as.factor(dataset$label)

        set.seed(seed)
        folds <- caret::createFolds(dataset$label, k = folds.num, returnTrain = TRUE)

        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        parallel.cores <- ifelse(parallel.cores > folds.num, folds.num, parallel.cores)

        if(parallel.cores == 2 & folds.num > parallel.cores) {
                message("Users can try to set parallel.cores = -1 to use all cores!", "\n")
        }

        cl <- parallel::makeCluster(parallel.cores)

        res <- c()
        message("+ SVM.tune processing.")
        for(g in gamma.range){
                for(C in cost.range){
                        # parallel::clusterExport(cl, varlist = c("g", "C", "dataset", "positive.class"),
                        #                         envir = environment())
                        message("- gamma = ", g, ", Cost = ", C)
                        perf.res    <- parallel::parSapply(cl, folds, function(x, gamma, cost,
                                                                               positive.label, ...) {
                                subset <- dataset[x, ]
                                test.set  <- dataset[-x, ]
                                svm.mod <- e1071::svm(label ~ ., data = subset, scale = TRUE, probability = TRUE,
                                                      kernel = "radial", gamma = gamma, cost = cost,
                                                      ... = ...)

                                res <- stats::predict(svm.mod, test.set, probability = TRUE)

                                confusion.res <- caret::confusionMatrix(data.frame(res)$res, test.set$label,
                                                                        positive = positive.label, mode = "everything")
                                performance.res <- data.frame(Sensitivity = confusion.res$byClass[1],
                                                              Specificity = confusion.res$byClass[2],
                                                              Accuracy    = confusion.res$overall[1],
                                                              F.Measure   = confusion.res$byClass[7],
                                                              Kappa       = confusion.res$overall[2])

                                performance.res
                        },
                        gamma = g, cost = C,
                        positive.label = positive.class, ... = ...)

                        Ave.res     <- apply(perf.res, 1, as.numeric)
                        Ave.res     <- as.data.frame(t(Ave.res))
                        Ave.res$Ave.Res <- rowMeans(Ave.res)
                        print(t(Ave.res[ncol(Ave.res)]))
                        res <- c(res, list(Ave.res))
                        names(res)[length(res)] <- paste("g = ", g, "; C = ", C, sep = "")
                }
        }
        parallel::stopCluster(cl)
        message("\n", "Best Result:")
        Accuracy <- sapply(res, function(x) x[3,ncol(Ave.res)])
        best.res <- res[Accuracy == max(Accuracy)][1]
        message("$ ", names(best.res))
        print(t(best.res[[1]][ncol(Ave.res)]))
        best.gamma <- as.numeric(strsplit(strsplit(names(best.res), "; ")[[1]][[1]], " = ")[[1]][[2]])
        best.cost  <- as.numeric(strsplit(strsplit(names(best.res), "; ")[[1]][[2]], " = ")[[1]][[2]])
        best.parameters <- c(best.gamma = best.gamma, best.cost = best.cost)
        if(return.model) {
                message("\n", "+ Training the model on the Full Dataset.")
                svm.mod  <- e1071::svm(label ~ ., data = dataset, scale = TRUE,
                                       probability = TRUE, kernel = "radial",
                                       gamma = best.gamma,
                                       cost  = best.cost)
                message("\n", "+ SVM.tune completed.")
                return(svm.mod)
        } else {
                message("\n", "+ SVM.tune completed.")
                return(list(Best.Parameters = best.parameters, Result = res))
        }
}

###### find_orfs ######

#' Find ORFs
#' @description This function can find all the ORFs in one sequence.
#'
#' @param OneSeq Is one sequence. Can be a FASTA file read by package "seqinr"
#' \code{\link[seqinr]{seqinr-package}} or just a string.
#'
#' @param reverse.strand Logical. Whether find ORF on the reverse strand. Default: \code{FALSE}
#'
#' @param max.only Logical. If \code{TRUE}, only the longest ORF will be returned.  Default: \code{TRUE}
#'
#' @return If \code{max.only = TRUE}, the function returns a list which consists the ORF region (\code{ORF.Max.Seq}),
#' length (\code{ORF.Max.Len}) and coverage (\code{ORF.Max.Cov}) of the longest ORF.
#' If \code{max.only = FALSE}, the function returns a dataframe which consists all the ORF sequences.
#' @author HAN Siyu
#' @details This function can extract ORFs of one sequence. It returns
#' ORF region, length and coverage of the longest ORF when \code{max.only = TRUE} or
#' ORF region, start position, end position, length and coverage of all the ORFs when
#' \code{max.only = FALSE}. Coverage is the the ratio
#' of the ORF to transcript length. If \code{reverse.strand = TRUE}, ORF will also be
#' found on reverse strand.
#' @importFrom seqinr getSequence
#' @importFrom seqinr comp
#' @importFrom seqinr s2c
#' @examples
#' ### For one sequence:
#' OneSeq <- c("cccatgcccagctagtaagcttagcc")
#' orf.info_1 <- find_orfs(OneSeq, reverse.strand = TRUE, max.only = FALSE)
#'
#' ### For a FASTA file contains several sequences:
#' \dontrun{
#' ### Use "read.fasta" function of package "seqinr" to read a FASTA file:
#' Seqs <- seqinr::read.fasta(file =
#' "http://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/nucleotide-sample.txt")
#' }
#'
#' ### Or just try to use our data "demo_DNA.seq"
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' ### Use apply function to find the longest ORF:
#' orf.info_2 <- sapply(Seqs, find_orfs, reverse.strand = FALSE, max.only = FALSE)
#' @export

find_orfs <- function(OneSeq, reverse.strand = FALSE, max.only = TRUE) {

        orf.info <- find_ORF(OneSeq, max.only = max.only)
        if(reverse.strand) {
                OneSeq <- unlist(seqinr::getSequence(OneSeq, as.string = TRUE))
                OneSeq <- gsub("\n", "", OneSeq)
                OneSeq <- seqinr::s2c(OneSeq)
                Seq.reverse <- rev(seqinr::comp(OneSeq))
                orf.reverse <- find_ORF(Seq.reverse, max.only = max.only)
                orf.info    <- list(ORF.Forward = orf.info,
                                    ORF.Reverse = orf.reverse)
        }
        orf.info
}

###### run_RNAfold ######

#' Obtain the Secondary Structure Sequences Using RNAfold
#' @description This function can compute secondary structure sequences. The tool
#' "RNAfold" of software "ViennaRNA" is required for this function.
#'
#' @param Sequences A FASTA file loaded by function  \code{\link[seqinr]{read.fasta}} of
#' \code{\link[seqinr]{seqinr-package}}.
#'
#' @param RNAfold.path String. Indicate the path of the program "RNAfold". By
#' default is \code{"RNAfold"} for UNIX/Linux system. (See details.)
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}, users can set as \code{-1} to run
#' this function with all cores.
#'
#' @return Returns data.frame. The first row is RNA sequence; the second row is
#' Dot-Bracket Notation of secondary structure sequences; the last row is minimum
#' free energy (MFE).
#' @author HAN Siyu
#' @details This function is used to compute secondary structure. The output of
#' this function can be used in function \code{\link{make_frequencies}},
#' \code{\link{extract_features}}, \code{\link{build_model}} and
#' \code{\link{lnc_finder}} when parameter \code{SS.features} is set as \code{TRUE}.
#'
#' This function depends on the program "RNAfold" of software "ViennaRNA".
#' (\url{http://www.tbi.univie.ac.at/RNA/index.html})
#'
#' Parameter \code{RNAfold.path} can be simply defined as \code{"RNAfold"} as
#' default when the OS is UNIX/Linux. However, for some OS, such as Windows, users
#' need to specify the \code{RNAfold.path} if the path of "RNAfold" haven't been
#' added in environment variables.
#'
#' This function can print the related information when the OS is UNIX/Linux,
#' such as:
#'
#' \code{"25 of 100, length: 695 nt"},
#'
#' which means around 100 sequences are assigned to this node and the program is
#' computing the 25th sequence. The length of this sequence is 695nt.
#'
#' If users have their own SS data, users can use function \code{\link{read_SS}} to load
#' them, instead of obtaining from RNAfold.
#'
#' @importFrom seqinr getSequence
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @seealso \code{\link{read_SS}}
#' @examples
#' \dontrun{
#' ### For a FASTA file contains several sequences,
#' ### Use "read.fasta" function of package "seqinr" to read a FASTA file:
#' Seqs <- read.fasta(file =
#' "http://www.ncbi.nlm.nih.gov/WebSub/html/help/sample_files/nucleotide-sample.txt")
#'
#' ### Or just try to use our data "demo_DNA.seq"
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' ### Windows:
#' RNAfold.path <- '"E:/Program Files/ViennaRNA/RNAfold.exe"'
#' SS.seq_1 <- run_RNAfold(Seqs[1:2], RNAfold.path = RNAfold.path, parallel.cores = 2)
#'
#' ### For UNIX/Linux, "RNAfold.path" can be just defined as "RNAfold" as default:
#' SS.seq_2 <- run_RNAfold(Seqs, RNAfold.path = "RNAfold", parallel.cores = 2)
#' }
#' @export

run_RNAfold <- function(Sequences, RNAfold.path = "RNAfold",
                        parallel.cores = 2){

        Seqs.validate <- Sequences[which(lengths(Sequences) < 30000)]

        if(length(Seqs.validate) < length(Sequences)){
                message("Due to the limitation of RNAfold,")
                message("Sequences with length more than 30000 nt will be omitted.")
                message(length(Sequences) - length(Seqs.validate), " sequences have been removed.", "\n")
                Sequences <- Seqs.validate
        }

        if (parallel.cores == 2) message("Users can try to set parallel.cores = -1 to use all cores!", "\n")

        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        cl <- parallel::makeCluster(parallel.cores, outfile = "")

        message("\n", "Sequences Number: ", length(Sequences), "\n")
        message("Processing...", "\n")
        index <- 1
        info <- paste(ceiling(length(Sequences) / parallel.cores), ",", sep = "")
        parallel::clusterExport(cl, varlist = c("info", "index"), envir = environment())
        sec.seq <- parallel::parSapply(cl, Sequences, secondary_seq, info = info,
                                       RNAfold.path = RNAfold.path)
        parallel::stopCluster(cl)
        sec.seq <- data.frame(sec.seq, stringsAsFactors = FALSE)
        message("\n", "Completed.", "\n")
        sec.seq
}

###### read_SS ######

#' Read Secondary Structure Information
#' @description This function can read secondary structure information from your
#' own file instead of obtaining from function \code{\link{run_RNAfold}}. This function
#' will be useful if users have had secondary structure sequences (Dot-Bracket Notation).
#'
#' @param oneFile.loc String. The location of your sequence file. This file should contains
#' one (and only one) RNA sequence and its secondary structure sequence in Dot-Bracket Notation.
#' This parameter needs to be defined only when \code{separateFile = FALSE}. See Details for more
#' information.
#'
#' @param seqRNA.loc String. The location of your RNA sequences file (FASTA format). If your
#' RNA sequences and secondary structure sequences are in two files, you need to define the
#' locations of two files respectively. And the files with multiple sequences are supported
#' for this option. This parameter needs to be defined only when \code{separateFile} is \code{TRUE}.
#' Location of secondary structure sequences file is also needed (parameter \code{seqSS.loc}).
#' See Details for more information.
#'
#' @param seqSS.loc String. The location of your secondary structure sequences file (FASTA format).
#'
#' @param separateFile Logical. Your RNA sequence(s) and secondary structure sequence(s) are in
#' separate files? If \code{separateFile = FALSE}, your file should have one (and only one) RNA
#' sequence and its secondary structure sequence. No limit when \code{separateFile = TRUE}.
#'
#' @param withMFE Logical. Whether MFE is provided at the end of secondary structure sequence.
#' If \code{withMFE = TRUE}, MFE will be extracted. The format should be in accordance with
#' the output format of RNAfold.
#'
#' @return A dataframe. The first row is RNA sequence, the second row is Dot-Bracket Notation of
#' secondary structure sequence, the third row is MFE (if MFE is provided).
#'
#' @author HAN Siyu
#'
#' @details When users want to predict sequences with secondary structure features, users may have
#' had their own secondary structure sequences. With this function, users can read SS information
#' from their files. Two kind of files are supported: RNA sequence and SS sequence in one file
#' \code{separateFile} is \code{FALSE} or in separate files \code{separateFile = TRUE}.
#'
#' \code{separateFile = FALSE} is used for secondary structure that obtained from some popular
#' programs, such as RNAfold. In this case, the output file only contains one RNA sequence and
#' its SS. Besides, this file only have two rows: RNA sequence and its SS sequences. Thus, this
#' option is more favorable when the file only have one sequence and the sequence are in accordance
#' with the output format of RNAfold.
#'
#' If users obtained the SS sequence from experiments, RNA sequence and SS sequence may be in two
#' files. In this case, users can select \code{separateFile = TRUE}. Two files should be in FASTA
#' format and one file can have multiple sequences. The sequences in two files should have the same
#' order. If your data are obtained from experiments or other sources, it is highly recommended
#' that users should build new model with this data, since the SS sequences of pre-built model are
#' obtained for RNAfold and may have many differences with experimental data.
#'
#' @importFrom seqinr read.fasta
#' @importFrom seqinr getSequence
#' @seealso \code{\link{run_RNAfold}}
#' @examples
#' \dontrun{
#' ### Load sequence data
#' data("demo_DNA.seq")
#' Seqs <- demo_DNA.seq[1:4]
#' ### Convert sequences from vector to string.
#' Seqs <- sapply(Seqs, seqinr::getSequence, as.string = TRUE)
#' ### Write a fasta file.
#' seqinr::write.fasta(Seqs, names = names(Seqs), file.out = "tmp.RNA.fa", as.string = TRUE)
#'
#' ### For Windows system: (Your path of RNAfold.)
#' RNAfold.path <- '"E:/Program Files/ViennaRNA/RNAfold.exe"'
#' ### Define the parameters of RNAfold. See documents of RNAfold for more information.
#' RNAfold.command <- paste(RNAfold.path, " --noPS -i tmp.RNA.fa -o output", sep = "")
#' ### Run RNAfold and output four result files.
#' system(RNAfold.command)
#'
#' ### Read secondary structure information for one file.
#' result_1 <- read_SS(oneFile.loc = "output_ENST00000510062.1.fold",
#'                     separateFile = FALSE, withMFE = TRUE)
#' ### Read secondary sturcture sequences for multiple files.
#' filePath <- dir(pattern = ".fold")
#' result_2 <- sapply(filePath, read_SS, separateFile = FALSE, withMFE = TRUE)
#' result_2 <- as.data.frame(result_2)
#' }
#'
#' @export
#'

read_SS <- function(oneFile.loc, seqRNA.loc, seqSS.loc, separateFile = TRUE, withMFE = TRUE) {
        if(separateFile) {
                RNA.seq <- seqinr::read.fasta(seqRNA.loc, as.string = TRUE)
                SS.seq  <- seqinr::read.fasta(seqSS.loc,  as.string = TRUE)
                if(!all(names(RNA.seq) == names(SS.seq))) stop("Cannot match the names of two files.")
                RNA.row <- sapply(RNA.seq, seqinr::getSequence, as.string = TRUE)
                SS.row  <- sapply(SS.seq,  seqinr::getSequence, as.string = TRUE)
                if(withMFE) {
                        seq.ss  <- mapply(function(RNA, SS) {
                                X1 <- RNA
                                X3 <- as.numeric(substr(SS, nchar(RNA) + 3, nchar(SS) -1))
                                X2 <- substr(SS, 1, nchar(RNA))
                                out <- c(X1, X2, X3)
                        }, RNA.row, SS.row)
                        out <- data.frame(seq.ss, stringsAsFactors = FALSE)
                } else {
                        RNA.Seq <- unlist(RNA.row)
                        SS.Seq  <- unlist(SS.row)
                        out <- data.frame(rbind(RNA.Seq, SS.Seq), stringsAsFactors = FALSE)
                }

        } else {
                readFile <- scan(oneFile.loc, what = character(), sep = "\n", quiet = TRUE)

                # if(length(readFile) > 3) message("If separateFile = FALSE, only one sequence will be read.")
                # if(substr(readFile[[1]], 1, 1) == ">") seqName <- substring(readFile[[1]], 2)

                if(withMFE) {
                        X1 <- readFile[[1]]
                        X3 <- as.numeric(substr(readFile[[2]], nchar(readFile[[1]]) + 3, nchar(readFile[[2]]) -1))
                        X2 <- substr(readFile[[2]], 1, nchar(readFile[[1]]))
                        out <- rbind(X1, X2, X3)
                        out <- data.frame(sequence = out, stringsAsFactors = FALSE)
                } else out <- data.frame(sequence = readFile, stringsAsFactors = FALSE)
        }
        out
}

###### make_frequencies ######

#' Make the frequencies file for new classifier construction
#' @description This function is used to calculate the frequencies of lncRNAs, CDs, and
#' secondary structure sequences. The frequencies file can be used to build the classifier
#' using function \code{\link{extract_features}}. Functions \code{make_frequencies} and
#' \code{extract_features} are useful when users are trying
#' to build their own model.
#'
#' NOTE: Function \code{make_frequencies} makes the frequencies file
#' for building the classifiers of LncFinder method. If users need to calculate Logarithm-Distance,
#' Euclidean-Distance, and hexamer score, the frequencies file need to be computed using function
#' \code{\link{make_referFreq}}.
#'
#' @param cds.seq Coding sequences (mRNA without UTRs). Can be a FASTA file loaded
#' by \code{\link[seqinr]{seqinr-package}} or secondary structure
#' sequences (Dot-Bracket Notation) obtained form function \code{\link{run_RNAfold}}.
#' CDs are used to calculate hexamer frequencies of nucleotide sequences,thus
#' secondary structure is not needed. Parameter \code{cds.format} should be
#' \code{"SS"} when input is secondary structure sequences. (See details for
#' more information.)
#'
#' @param mRNA.seq mRNA sequences with Dot-Bracket Notation. The secondary
#' structure sequences can be obtained from function \code{\link{run_RNAfold}}.
#' mRNA sequences are used to calculate the frequencies of acgu-ACGU and a acguD
#' (see details), thus, mRNA sequences are required only when \code{SS.features = TRUE}.
#'
#' @param lncRNA.seq Long non-coding RNA sequences. Can be a FASTA file loaded by
#' \code{\link[seqinr]{seqinr-package}} or secondary structure
#' sequences (Dot-Bracket Notation) obtained from function \code{\link{run_RNAfold}}.
#' If \code{SS.features = TRUE}, \code{lncRNA.seq} must be RNA sequences with
#' secondary structure sequences and parameter \code{lnc.format} should be defined
#' as \code{"SS"}.
#'
#' @param SS.features Logical. If \code{SS.features = TRUE}, frequencies of secondary
#' structure will also be calculated and the model can be built with secondary
#' structure features. In this case, \code{mRNA.seq} and \code{lncRNA.seq} should
#' be secondary structure sequences.
#'
#' @param cds.format String. Define the format of the sequences of \code{cds.seq}.
#' Can be \code{"DNA"} or \code{"SS"}. \code{"DNA"} for DNA sequences and \code{"SS"}
#' for secondary structure sequences.
#'
#' @param lnc.format String. Define the format of lncRNAs (\code{lncRNA.seq}).
#' Can be \code{"DNA"} or \code{"SS"}. \code{"DNA"} for DNA sequences and \code{"SS"}
#' for secondary structure sequences. This parameter must be defined as \code{"SS"}
#' when \code{SS.features = TURE}.
#'
#' @param check.cds Logical. Incomplete CDs can lead to a false shift and a
#' inaccurate hexamer frequencies. When \code{check.cds = TRUE}, hexamer frequencies
#' will be calculated on the longest ORF. This parameter is strongly recommended to
#' set as \code{TRUE} when mRNA is used as CDs.
#'
#' @param ignore.illegal Logical. If \code{TRUE}, the sequences with non-nucleotide
#' characters (nucleotide characters: "a", "c", "g", "t") will be ignored when
#' calculating hexamer frequencies.
#'
#' @return Returns a list which consists the frequencies of protein-coding sequences
#' and non-coding sequences.
#' @author HAN Siyu
#'
#' @details This function is used to make frequencies file for LncFinder method. This file is needed
#' when users are trying to build their own model.
#'
#' In order to achieve high accuracy, mRNA should not be regarded as CDs and assigned
#' to parameter \code{cds.seq}. However, CDs of some species may be insufficient
#' for calculating frequencies, and mRNAs can be regarded as CDs with parameter
#' \code{check.cds = TRUE}. In this case, hexamer frequencies will be calculated
#' on ORF region.
#'
#' Considering that it is time consuming to obtain secondary structure sequences,
#' users can only provide nucleotide sequences and build a model without secondary
#' structure features (\code{SS.features = } \code{FALSE}). If users want to build a model
#' with secondary structure features, parameter \code{SS.features} should be set
#' as \code{TRUE}. At the same time, the format of the sequences of \code{mRNA.seq}
#' and \code{lnc.seq} should be secondary structure sequences (Dot-Bracket Notation).
#' Secondary structure sequences can be obtained by function \code{\link{run_RNAfold}}.
#'
#' Please note that:
#'
#' SS.features can improve the performance when the species of unevaluated sequences
#' is identical to the species of the sequences that used to build the model.
#'
#' However, if users are trying to predict sequences with the model trained on
#' other species, SS.features may lead to low accuracy.
#'
#' The frequencies file consists three groups: Hexamer Frequencies; acgu-ACGU
#' Frequencies and acguD Frequencies.
#'
#' Hexamer Frequencies are calculated on the original nucleotide sequences by
#' employing \emph{k}-mer scheme (\emph{k} = 6), and the sliding window will slide
#' 3 nt each step.
#'
#' For any secondary structure sequences (Dot-Bracket Notation), if one position
#' is a dot, the corresponding nucleotide of the RNA sequence will be replaced
#' with character "D". acguD Frequencies are the \emph{k}-mer frequencies
#' (\emph{k} = 4) calculated on this new sequences.
#'
#' Similarly, for any secondary structure sequences (Dot-Bracket Notation), if
#' one position is "(" or ")", the corresponding nucleotide of the RNA sequence
#' will be replaced with upper case ("A", "C", "G", "U").
#'
#' A brief example,
#'
#' DNA Sequence:\code{          5'-   t  a  c  a  g  t  t  a  t  g   -3'}
#'
#' RNA Sequence:\code{          5'-   u  a  c  a  g  u  u  a  u  g   -3'}
#'
#' Dot-Bracket Sequence:\code{     5'-   .  .  .  .  (  (  (  (  (  (   -3'}
#'
#' acguD Sequence:\code{         \{     D, D, D, D, g, u, u, a, u, g   \}}
#'
#' acgu-ACGU Sequence:\code{    \{     u, a, c, a, G, U, U, A, U, G   \}}
#'
#' @importFrom seqinr s2c
#' @importFrom seqinr count
#'
#' @section References:
#' Siyu Han, Yanchun Liang, Qin Ma, Yangyi Xu, Yu Zhang, Wei Du, Cankun Wang & Ying Li.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing
#' sequence intrinsic composition, structural information, and physicochemical property.
#' \emph{Briefings in Bioinformatics}, 2019, 20(6):2009-2027.
#'
#' @seealso \code{\link{run_RNAfold}}, \code{\link{read_SS}},
#'          \code{\link{build_model}}, \code{\link{extract_features}},
#'          \code{\link{make_referFreq}}.
#' @examples
#' ### Only for examples:
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' \dontrun{
#' ### Obtain the secondary structure sequences (Windows OS):
#' RNAfold.path <- '"E:/Program Files/ViennaRNA/RNAfold.exe"'
#' SS.seq <- run_RNAfold(Seqs, RNAfold.path = RNAfold.path, parallel.cores = 2)
#'
#' ### Make frequencies file with secondary strucutre features,
#' my_file_1 <- make_frequencies(cds.seq = SS.seq, mRNA.seq = SS.seq,
#'                               lncRNA.seq = SS.seq, SS.features = TRUE,
#'                               cds.format = "SS", lnc.format = "SS",
#'                               check.cds = TRUE, ignore.illegal = FALSE)
#' }
#'
#' ### Make frequencies file without secondary strucutre features,
#' my_file_2 <- make_frequencies(cds.seq = Seqs, lncRNA.seq = Seqs,
#'                               SS.features = FALSE, cds.format = "DNA",
#'                               lnc.format = "DNA", check.cds = TRUE,
#'                               ignore.illegal = FALSE)
#'
#' ### The input of cds.seq and lncRNA.seq can also be secondary structure
#' ### sequences when SS.features = FALSE, such as,
#' data(demp_SS.seq)
#' SS.seq <- demo_SS.seq
#' my_file_3 <- make_frequencies(cds.seq = SS.seq, lncRNA.seq = Seqs,
#'                               SS.features = FALSE, cds.format = "SS",
#'                               lnc.format = "DNA", check.cds = TRUE,
#'                               ignore.illegal = FALSE)
#' @export

make_frequencies <- function(cds.seq, mRNA.seq, lncRNA.seq, SS.features = FALSE,
                             cds.format = "DNA", lnc.format = "DNA", check.cds = TRUE,
                             ignore.illegal = TRUE) {

        if(SS.features & lnc.format != "SS") stop("Error: Secondary structure file is required for the selected mode.")

        message("+ Check the format of sequences. ", Sys.time(), "\n")

        if(lnc.format == "DNA") {
                lnc.DNA <- lncRNA.seq
        } else if(lnc.format == "SS") {
                lnc.DNA <- sapply(lncRNA.seq, get.Seq_DNA)
        } else stop("- Error: Wrong format name of lncRNA.")

        if(cds.format == "DNA") {
                cds.DNA <- cds.seq
        } else if(cds.format == "SS") {
                cds.DNA <- sapply(cds.seq, get.Seq_DNA)
        } else stop("- Error: Wrong format name of CDs.")


        if(SS.features) {
                message("+ Calculating frequencies of Secondary structure:")
                message("- Processing sequences.")
                lnc.acguD <- sapply(lncRNA.seq, get.Seq_acguD)
                pct.acguD <- sapply(mRNA.seq,   get.Seq_acguD)
                lnc.SS    <- sapply(lncRNA.seq, get.Seq_acguACGU)
                pct.SS    <- sapply(mRNA.seq,   get.Seq_acguACGU)

                message("- Calculating frequencies of acguD k = 4", "\n")

                message("  Non-coding sequences")
                lnc.acguD.freq <- get.freq(lnc.acguD, alphabet = c("a", "c", "g", "u", "D"),
                                           wordsize = 4, step = 1, freq = T, ignore.illegal = FALSE)
                message("  Completed.", "\n")

                message("  Coding sequences")
                pct.acguD.freq <- get.freq(pct.acguD, alphabet = c("a", "c", "g", "u", "D"),
                                           wordsize = 4, step = 1, freq = T, ignore.illegal = FALSE)
                message("  Completed.", "\n")

                message("- Calculating frequencies of acguACGU k = 3", "\n")

                message("  Non-coding sequences")
                lnc.acguACGU.freq <- get.freq(lnc.SS, alphabet = c("A", "C", "G", "U", "a", "c", "g", "u"),
                                              wordsize = 3, step = 1, freq = T, ignore.illegal = FALSE)
                message("  Completed.", "\n")

                message("  Coding sequences")
                pct.acguACGU.freq <- get.freq(pct.SS, alphabet = c("A", "C", "G", "U", "a", "c", "g", "u"),
                                              wordsize = 3, step = 1, freq = T, ignore.illegal = FALSE)
                message("  Completed.", "\n")
        }

        message("+ Calculating frequencies of DNA k = 6")
        if(check.cds) {
                message("- Processing CDs sequences.")
                orf.seq <- lapply(cds.DNA, function(x) {
                        orf.info <- find_ORF(x)
                        if(orf.info[[1]] >= 12) orf <- seqinr::s2c(orf.info[[3]]) else orf <- NA
                        orf
                })
                cds.DNA <- orf.seq[!is.na(orf.seq)]
                message("- Calculating frequencies.")
        }

        message("\n", "  Non-coding sequences")
        lnc.DNA.freq <- get.freq(lnc.DNA, alphabet = c("a", "c", "g", "t"), wordsize = 6,
                                 step = 3, freq = F, ignore.illegal = ignore.illegal)
        message("  Completed.", "\n")

        message("  Coding sequences")
        cds.DNA.freq <- get.freq(cds.DNA, alphabet = c("a", "c", "g", "t"), wordsize = 6,
                                 step = 3, freq = F, ignore.illegal = ignore.illegal)
        message("  Completed.", "\n")

        message("+ Frequencies calculation completed. ", Sys.time())

        if(SS.features) {
                Internal.freq <- list(DNA.lnc      = lnc.DNA.freq,
                                      DNA.cds      = cds.DNA.freq,
                                      acguD.lnc    = lnc.acguD.freq,
                                      acguD.pct    = pct.acguD.freq,
                                      acguACGU.lnc = lnc.acguACGU.freq,
                                      acguACGU.pct = pct.acguACGU.freq)
        } else {
                Internal.freq <- list(DNA.lnc = lnc.DNA.freq,
                                      DNA.cds = cds.DNA.freq)
        }
        Internal.freq
}

###### extract_features ######

#' Extract the Features
#' @description This function can construct the dataset. This function is only used
#' to extract the features, please use function \code{\link{build_model}} to build
#' new models.
#'
#' @param Sequences mRNA sequences or long non-coding sequences. Can be a FASTA
#' file loaded by \code{\link[seqinr]{seqinr-package}} or
#' secondary structure sequences (Dot-Bracket Notation) obtained from function
#' \code{\link{run_RNAfold}}. If \code{Sequences} are secondary structure
#' sequences file, parameter \code{format} should be defined as \code{"SS"}.
#'
#' @param label Optional. String. Indicate the label of the sequences such as
#' "NonCoding", "Coding".
#'
#' @param SS.features Logical. If \code{SS.features = TRUE}, secondary structure
#' features will be extracted. In this case, \code{Sequences} should be secondary
#' structure sequences (Dot-Bracket Notation) obtained from function
#' \code{\link{run_RNAfold}} and parameter \code{format} should be set as \code{"SS"}.
#'
#' @param format String. Can be \code{"DNA"} or \code{"SS"}. Define the format of
#' \code{Sequences}. \code{"DNA"} for DNA sequences and \code{"SS"} for secondary
#' structure sequences. This parameter must be set as \code{"SS"} when
#' \code{SS.features = TURE}.
#'
#' @param frequencies.file String or a list obtained from function
#' \code{\link{make_frequencies}}. Input species name \code{"human"}, \code{"mouse"}
#' or \code{"wheat"} to use pre-build frequencies files. Or assign a users' own
#' frequencies file (See function \code{\link{make_frequencies}}).
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}. Users can set as \code{-1} to run
#' this function with all cores.
#'
#' @return Returns a data.frame. 11 features when \code{SS.features} is \code{FALSE},
#' and 19 features when \code{SS.features} is \code{TRUE}.
#'
#' @author HAN Siyu
#' @details This function extracts the features and constructs the dataset.
#'
#' Considering that it is time consuming to obtain secondary structure sequences,
#' users can build the model only with features of sequence and EIIP
#' (\code{SS.features = FALSE}). When \code{SS.features = TRUE}, \code{Sequences}
#' should be secondary structure sequences (Dot-Bracket Notation) obtained from
#' function \code{\link{run_RNAfold}} and parameter \code{format} should be set
#' as \code{"SS"}.
#'
#' Please note that:
#'
#' Secondary structure features (\code{SS.features}) can improve the performance
#' when the species of unevaluated sequences is identical to the species of the
#' sequences that used to build the model.
#'
#' However, if users are trying to predict sequences with the model trained on
#' other species, \code{SS.features} as \code{TRUE} may lead to low accuracy.
#'
#' @section Features:
#' 1. Features based on sequence:
#'
#'    The length and coverage of the longest ORF (\code{ORF.Max.Len} and
#'    \code{ORF.Max.Cov});
#'
#'    Log-Distance.lncRNA (\code{Seq.lnc.Dist});
#'
#'    Log-Distance.protein-coding transcripts (\code{Seq.pct.Dist});
#'
#'    Distance-Ratio.sequence (\code{Seq.Dist.Ratio}).
#'
#'
#' 2. Features based on EIIP (electron-ion interaction pseudopotential) value:
#'
#'    Signal at 1/3 position (\code{Signal.Peak});
#'
#'    Signal to noise ratio (\code{SNR});
#'
#'    the minimum value of the top 10\% power spectrum (\code{Signal.Min});
#'
#'    the quantile Q1 and Q2 of the top 10\% power spectrum (\code{Singal.Q1}
#'    and \code{Signal.Q2})
#'
#'    the maximum value of the top 10\% power spectrum (\code{Signal.Max}).
#'
#'
#' 3. Features based on secondary structure sequence:
#'
#'    Log-Distance.acguD.lncRNA (\code{Dot_lnc.dist});
#'
#'    Log-Distance.acguD.protein-coding transcripts (\code{Dot_pct.dist});
#'
#'    Distance-Ratio.acguD (\code{Dot_Dist.Ratio});
#'
#'    Log-Distance.acgu-ACGU.lncRNA (\code{SS.lnc.dist});
#'
#'    Log-Distance.acgu-ACGU.protein-coding transcripts (\code{SS.pct.dist});
#'
#'    Distance-Ratio.acgu-ACGU (\code{SS.Dist.Ratio});
#'
#'    Minimum free energy (\code{MFE});
#'
#'    Percentage of Unpair-Pair (\code{UP.PCT})
#'
#' @section References:
#' Siyu Han, Yanchun Liang, Qin Ma, Yangyi Xu, Yu Zhang, Wei Du, Cankun Wang & Ying Li.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing
#' sequence intrinsic composition, structural information, and physicochemical property.
#' \emph{Briefings in Bioinformatics}, 2019, 20(6):2009-2027.
#'
#' @importFrom seqinr s2c
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @seealso \code{\link{svm_tune}}, \code{\link{build_model}},
#'          \code{\link{make_frequencies}}, \code{\link{run_RNAfold}}, \code{\link{read_SS}}.
#' @examples
#' \dontrun{
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' ### Extract features with pre-build frequencies.file:
#' my_features <- extract_features(Seqs, label = "Class.of.the.Sequences",
#'                                 SS.features = FALSE, format = "DNA",
#'                                 frequencies.file = "mouse",
#'                                 parallel.cores = 2)
#'
#' ### Use your own frequencies file by assign frequencies list to parameter
#' ### "frequencies.file".
#' }
#' @export

extract_features <- function(Sequences, label = NULL, SS.features = FALSE, format = "DNA",
                             frequencies.file = "human", parallel.cores = 2) {

        if(SS.features & format != "SS") stop("Error: Secondary structure file is required for the selected mode.")

        message("+ Check the format of sequences. ", Sys.time(), "\n")
        if(format == "DNA") {
                DNA.seq <- Sequences
        } else if(format == "SS") {
                DNA.seq   <- sapply(Sequences, get.Seq_DNA)
        } else stop("- Error: Wrong format name.")

        if(class(frequencies.file) == "character") {
                if(frequencies.file == "human") {
                        Internal.data = Internal.human
                } else if(frequencies.file == "mouse") {
                        Internal.data = Internal.mouse
                } else if(frequencies.file == "wheat") {
                        Internal.data = Internal.wheat
                } else stop("- Error: Wrong frequencies.file name.")
        } else  Internal.data = frequencies.file

        if (parallel.cores == 2) message("Users can try to set parallel.cores = -1 to use all cores!", "\n")

        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)
        cl <- parallel::makeCluster(parallel.cores)
        parallel::clusterExport(cl, varlist = c("Internal.data", "find_ORF"), envir = environment())

        if(SS.features) {
                message("+ Extracting secondary structure features:")
                message("- Processing sequences.")
                UP.seq    <- sapply(Sequences, get.Seq_UP)
                SS.seq    <- sapply(Sequences, get.Seq_acguACGU)
                acguD.seq <- sapply(Sequences, get.Seq_acguD)

                message("- LogDist.acguD k = 4")
                acguD    <- parallel::parSapply(cl, acguD.seq, LogDist.acguD,
                                                wordsize = 4, step = 1,
                                                hexamer.lnc = Internal.data$acguD.lnc,
                                                hexamer.cds = Internal.data$acguD.pct)

                message("- LogDist.acguACGU k = 3")
                acguACGU <- parallel::parSapply(cl, SS.seq, LogDist.acguACGU,
                                                wordsize = 3, step = 1,
                                                hexamer.lnc = Internal.data$acguACGU.lnc,
                                                hexamer.cds = Internal.data$acguACGU.pct)
                message("- MFE")
                MFE      <- as.numeric(as.character(Sequences[3,]))

                message("- Percentage of Unpair-Pair", "\n")
                UP.percentage <- parallel::parSapply(cl, UP.seq, seqinr::count,
                                                     wordsize = 2, freq = TRUE,
                                                     alphabet = c("U", "P"))
                SS.group <- data.frame(t(acguD), t(acguACGU), MFE = MFE, UP.PCT = UP.percentage[3,])
        }

        message("+ Extracting sequence and EIIP features:")
        message("- ORF and LogDist.Seq k = 6")
        Seq.ORF  <- parallel::parSapply(cl, DNA.seq, LogDist.DNA, wordsize = 6, step = 3,
                                        hexamer.lnc = Internal.data$DNA.lnc,
                                        hexamer.cds = Internal.data$DNA.cds)

        message("- EIIP", "\n")
        EIIP.DFT <- parallel::parSapply(cl, DNA.seq, EIIP.DFT)
        parallel::stopCluster(cl)
        features <- data.frame(t(Seq.ORF), t(EIIP.DFT))

        message("+ Feature extraction completed. ", Sys.time(), "\n")
        if(SS.features) features <- cbind(features, SS.group)
        if(!is.null(label)) features <- cbind(label, features)
        features
}

###### build_model ######

#' Build Users' Own Model
#' @description This function is used to build new models with users' own data.
#'
#' @param lncRNA.seq Long non-coding sequences. Can be a FASTA file loaded by
#' \code{\link[seqinr]{seqinr-package}} or secondary structure
#' sequences file (Dot-Bracket Notation) obtained from function
#' \code{\link{run_RNAfold}}. If \code{lncRNA.seq} is secondary structure
#' sequences file, parameter \code{lncRNA.format} should be defined as \code{"SS"}.
#'
#' @param mRNA.seq mRNA sequences. FASTA file loaded by \code{\link[seqinr]{read.fasta}} or
#' secondary structure sequences (Dot-Bracket Notation) obtained from function
#' \code{\link{run_RNAfold}}. If \code{mRNA.seq} is secondary structure sequences
#' file, parameter \code{mRNA.format} should be defined as \code{"SS"}.
#'
#' @param frequencies.file String or a list obtained from function
#' \code{\link{make_frequencies}}. Input species name \code{"human"},
#' \code{"mouse"} or \code{"wheat"} to use pre-build frequencies files. Or assign
#' a users' own frequencies file (Please refer to function
#' \code{\link{make_frequencies}} for more information).
#'
#' @param SS.features Logical. If \code{SS.features = TRUE}, secondary structure
#' features will be used to build the model. In this case, \code{lncRNA.seq} and
#' \code{mRNA.seq} should be secondary structure sequences (Dot-Bracket Notation)
#' obtained from function \code{\link{run_RNAfold}} and parameter
#' \code{lncRNA.format} and \code{mRNA.format} should be set as \code{"SS"}.
#'
#' @param lncRNA.format String. Define the format of \code{lncRNA.seq}. \code{"DNA"}
#' for DNA sequences and \code{"SS"} for secondary structure sequences. Only when
#' both \code{mRNA.format} and \code{lncRNA.format} are set as \code{"SS"}, can
#' the model with secondary structure features be built (\code{SS.features = TRUE}).
#'
#' @param mRNA.format String. Define the format of \code{mRNA.seq}. Can be
#' \code{"DNA"} or \code{"SS"}. \code{"DNA"} for DNA sequences and \code{"SS"}
#' for secondary structure sequences. When this parameter is defined as \code{"DNA"},
#' only the model without secondary structure features can be built. In this case,
#' parameter \code{SS.features} should be set as \code{FALSE}.
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}, users can set as \code{-1} to run
#' this function with all cores. During the process of svm tuning, if the number
#' of \code{parallel.cores} is more than the \code{folds.num} (number of the folds
#' for cross-validation), the number of \code{parallel.cores} will be set as
#' \code{folds.num} automatically.
#'
#' @param folds.num Integer. Specify the number of folds for cross-validation.
#' (Default: \code{10})
#'
#' @param seed Integer. Used to set the seed for cross-validation. (Default: \code{1})
#'
#' @param gamma.range The range of gamma. (Default: \code{2 ^ seq(-5, 0, 1)})
#'
#' @param cost.range The range of cost. (Default: \code{c(1, 4, 8, 16, 24, 32)})
#'
#' @param ... Additional arguments passed to function \code{\link{svm_tune}} for customised SVM model training.
#'
#' @return Returns a svm model.
#'
#' @author HAN Siyu
#'
#' @details This function is used to build a new model with users' own sequences.
#' Users can use function \code{\link{lnc_finder}} to predict the sequences with
#' new models.
#'
#' For the details of \code{frequencies.file}, please refer to function
#' \code{\link{make_frequencies}}.
#'
#' For the details of the features, please refer to function
#' \code{\link{extract_features}}.
#'
#' For the details of svm tuning, please refer to function \code{\link{svm_tune}}.
#'
#' @section References:
#' Siyu Han, Yanchun Liang, Qin Ma, Yangyi Xu, Yu Zhang, Wei Du, Cankun Wang & Ying Li.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing
#' sequence intrinsic composition, structural information, and physicochemical property.
#' \emph{Briefings in Bioinformatics}, 2019, 20(6):2009-2027.
#'
#' @importFrom seqinr s2c
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom caret createFolds
#' @importFrom caret confusionMatrix
#' @importFrom e1071 svm
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @seealso \code{\link{make_frequencies}}, \code{\link{lnc_finder}},
#'          \code{\link{extract_features}}, \code{\link{svm_tune}},
#'          \code{\link[e1071]{svm}}.
#' @examples
#' \dontrun{
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' ### Build the model with pre-build frequencies.file:
#' my_model <- build_model(lncRNA.seq = Seqs[1:5], mRNA.seq = Seqs[6:10],
#'                         frequencies.file = "human", SS.features = FALSE,
#'                         lncRNA.format = "DNA", mRNA.format = "DNA",
#'                         parallel.cores = 2, folds.num = 2, seed = 1,
#'                         gamma.range = (2 ^ seq(-5, -1, 2)),
#'                         cost.range = c(2, 6, 12, 20))
#'
#' ### Users can use default values of gamma.range and cost.range to find the
#' best parameters.
#' ### Use your own frequencies file by assigning frequencies list to parameter
#' ### "frequencies.file".
#' }
#' @export

build_model <- function(lncRNA.seq, mRNA.seq, frequencies.file, SS.features = FALSE,
                        lncRNA.format = "DNA", mRNA.format = "DNA", parallel.cores = 2,
                        folds.num = 10, seed = 1, gamma.range = (2 ^ seq(-5, 0, 1)),
                        cost.range = c(1, 4, 8, 16, 24, 32), ...) {

        message("+ Extract features of non-coding sequences.")
        lnc.data <- extract_features(lncRNA.seq, label = "NonCoding", SS.features = SS.features,
                                     format = lncRNA.format, frequencies.file = frequencies.file,
                                     parallel.cores = parallel.cores)

        message("+ Extract features of protein-coding sequences.")
        pct.data <- extract_features(mRNA.seq, label = "Coding", SS.features = SS.features,
                                     format = mRNA.format, frequencies.file = frequencies.file,
                                     parallel.cores = parallel.cores)

        dataset <- rbind(lnc.data, pct.data)
        dataset$label <- as.factor(dataset$label)

        svm.mod <- svm_tune(dataset, gamma.range = gamma.range, cost.range = cost.range,
                            seed = seed, folds.num = folds.num, return.model = TRUE,
                            parallel.cores = parallel.cores, ... = ...)

        # message("+ Build the model with the whole dataset.")
        # svm.mod  <- e1071::svm(label ~ ., data = dataset, scale = TRUE, probability = TRUE, kernel = "radial",
        #                 gamma = best.parameters[[1]][[1]], cost = best.parameters[[1]][[2]])

        message("+ Process completed.")
        svm.mod
}

###### lnc_finder ######

#' Long Non-coding RNA Identification
#' @description This function is used to predict sequences are non-coding transcripts
#' or protein-coding transcripts.
#'
#' @param Sequences Unevaluated sequences. Can be a FASTA file loaded by
#' \code{\link[seqinr]{seqinr-package}} or secondary structure sequences
#' (Dot-Bracket Notation) obtained from function \code{\link{run_RNAfold}}. If
#' \code{Sequences} is secondary structure sequences file, parameter \code{format}
#' should be defined as \code{"SS"}.
#'
#' @param SS.features Logical. If \code{SS.features = TRUE}, secondary structure
#' features will be used.
#'
#' @param format String. Define the format of the \code{Sequences}. Can be
#' \code{"DNA"} or \code{"SS"}. \code{"DNA"} for DNA sequences and \code{"SS"}
#' for secondary structure sequences.
#'
#' @param frequencies.file String or a list obtained from function
#' \code{\link{make_frequencies}}. Input species name \code{"human"}, \code{"mouse"}
#' or \code{"wheat"} to use pre-build frequencies files. Or assign a users' own
#' frequencies file (See function \code{\link{make_frequencies}}).
#'
#' @param svm.model String or a svm model obtained from function \code{\link{build_model}}
#' or \code{\link{svm_tune}}. Input species name \code{"human"}, \code{"mouse"}
#' or \code{"wheat"} to use pre-build models. Or assign a users' own model (See
#' function \code{\link{build_model}}).
#'
#' @param parallel.cores Integer. The number of cores for parallel computation.
#' By default the number of cores is \code{2}. Users can set as \code{-1} to run
#' this function with all cores.
#'
#' @return Returns a data.frame. Including the results of prediction (\code{Pred});
#' coding potential (\code{Coding.Potential}) and the features. For the details
#' of the features, please refer to function \code{\link{extract_features}}.
#'
#' @author HAN Siyu
#'
#' @details Considering that it is time consuming to obtain secondary structure
#' sequences, users can input nucleotide sequences and predict these sequences
#' without secondary structure features (Set \code{SS.features} as \code{FALSE}).
#'
#' Please note that:
#'
#' \code{SS.features} can improve the performance when the species of unevaluated
#' sequences is identical to the species of the sequences that used to build the
#' model.
#'
#' However, if users are trying to predict sequences with the model trained on
#' other species, \code{SS.features} may lead to low accuracy.
#'
#' For the details of \code{frequencies.file}, please refer to function
#' \code{\link{make_frequencies}}.
#'
#' For the details of the features, please refer to function
#' \code{\link{extract_features}}.
#'
#' @section References:
#' Siyu Han, Yanchun Liang, Qin Ma, Yangyi Xu, Yu Zhang, Wei Du, Cankun Wang & Ying Li.
#' LncFinder: an integrated platform for long non-coding RNA identification utilizing
#' sequence intrinsic composition, structural information, and physicochemical property.
#' \emph{Briefings in Bioinformatics}, 2019, 20(6):2009-2027.
#'
#' @importFrom seqinr s2c
#' @importFrom seqinr count
#' @importFrom seqinr getSequence
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel clusterExport
#' @importFrom parallel parSapply
#' @importFrom parallel stopCluster
#' @seealso \code{\link{build_model}}, \code{\link{make_frequencies}},
#'          \code{\link{extract_features}}, \code{\link{run_RNAfold}}, \code{\link{read_SS}}.
#' @examples
#' \dontrun{
#' data(demo_DNA.seq)
#' Seqs <- demo_DNA.seq
#'
#' ### Input one sequence:
#' OneSeq <- Seqs[1]
#' result_1 <- lnc_finder(OneSeq, SS.features = FALSE, format = "DNA",
#'                        frequencies.file = "human", svm.model = "human",
#'                        parallel.cores = 2)
#'
#' ### Or several sequences:
#' data(demo_SS.seq)
#' Seqs <- demo_SS.seq
#' result_2 <- lnc_finder(Seqs, SS.features = TRUE, format = "SS",
#'                        frequencies.file = "mouse", svm.model = "mouse",
#'                        parallel.cores = 2)
#'
#' ### A complete work flow:
#' ### Calculate second structure on Windows OS,
#' RNAfold.path <- '"E:/Program Files/ViennaRNA/RNAfold.exe"'
#' SS.seq <- run_RNAfold(Seqs, RNAfold.path = RNAfold.path, parallel.cores = 2)
#'
#' ### Predict the sequences with secondary structure features,
#' result_2 <- lnc_finder(SS.seq, SS.features = TRUE, format = "SS",
#'                        frequencies.file = "mouse", svm.model = "mouse",
#'                        parallel.cores = 2)
#'
#' ### Predict sequences with your own model by assigning a new svm.model and
#' ### frequencies.file to parameters "svm.model" and "frequencies.file"
#' }
#' @export

lnc_finder <- function(Sequences, SS.features = FALSE, format = "DNA", frequencies.file = "human",
                       svm.model = "human", parallel.cores = 2) {

        if(class(svm.model)[1] == "character") {
                if(SS.features) {
                        if(svm.model == "human") {
                                svm.mod <- human.mod
                        } else if(svm.model == "mouse") {
                                svm.mod <- mouse.mod
                        } else if(svm.model == "wheat") {
                                svm.mod <- wheat.mod
                        } else stop("Wrong svm.model name.")
                } else {
                        if(svm.model == "human") {
                                svm.mod <- human.mod.no_ss
                        } else if(svm.model == "mouse") {
                                svm.mod <- mouse.mod.no_ss
                        } else if(svm.model == "wheat") {
                                svm.mod <- wheat.mod.no_ss
                        } else stop("Wrong svm.model name.")
                }
        } else svm.mod <- svm.model

        parallel.cores <- ifelse(parallel.cores == -1, parallel::detectCores(), parallel.cores)

        seq.data <- extract_features(Sequences, SS.features = SS.features, format = format,
                                     frequencies.file = frequencies.file, parallel.cores = parallel.cores)

        message("+ Predicting...", "\n")
        pred.res <- stats::predict(svm.mod, seq.data, probability = TRUE)
        results  <- data.frame(Pred = pred.res)
        prob.res <- data.frame(Result = results, Coding.Potential = (attr(pred.res, "probabilities")[,2]))
        output   <- cbind(prob.res, seq.data)
        message("+ Process completed.", "\n")
        output
}

Try the LncFinder package in your browser

Any scripts or data that you put into this service are public.

LncFinder documentation built on Dec. 11, 2021, 9:39 a.m.