#' @title Import CDS as Biostrings or data.table object
#' @description This function reads an organism specific CDS stored in a
#' defined file format.
#' @param file a character string specifying the path to the file storing
#' the CDS.
#' @param format a character string specifying the file format used to store
#' the CDS, e.g. \code{format = "fasta"} (default) or \code{format = "gbk"}.
#' @param obj.type a character string specifying the object type in which the
#' genomic sequence shall be represented.
#' Either as \code{obj.type = "Biostrings"} (default) or as
#' \code{obj.type = "data.table"}.
#' @param delete_corrupt a logical value specifying whether potential CDS
#' sequences whose length cannot be divided by 3 shall be
#' excluded from the dataset. Default is \code{delete_corrupt = FALSE}.
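#' @param ... additional arguments that are passed to
#' \code{\link[Biostrings]{readBStringSet}} (for \code{obj.type = "Biostrings"})
#' or \code{\link[Biostrings]{readDNAStringSet}}
#' (for \code{obj.type = "data.table"}).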
#' @author Hajk-Georg Drost
#' @details The \code{read_cds} function takes a string specifying the path
#' to the CDS file of interest as first argument.
#' It is possible to read in different CDS file standards such as
#' \emph{fasta} or \emph{genbank}.
#' CDS stored in fasta files can be downloaded from
#' http://www.ensembl.org/info/data/ftp/index.html.
#' @return A data.table storing the gene id in the first column and the
#' corresponding sequence as string in the second column.
#' @family cds
#' @family readers
#' @import Biostrings
#' @export
read_cds <-
             format = "fasta",
             obj.type = "Biostrings",
             delete_corrupt = FALSE,
             ...) {
        if (!is.element(format, c("fasta", "gbk")))
            stop("Please choose a file format that is
                 supported by this function.",
                 call. = FALSE)

        if (!is.element(obj.type, c("Biostrings", "data.table")))
                "Please specify a valid object type: obj.type = 'Biostrings'
                (default) or obj.type = 'data.table'.",
                call. = FALSE

        if (!file.exists(file))
            stop("The file path you specified does not seem to exist: '", file,"'.", call. = FALSE)

        geneids <- seqs <- NULL

        if (obj.type == "Biostrings") {
                cds_file <-
                    Biostrings::readBStringSet(filepath = file,
                                               format = format, ...)
            }, error = function(e) {
                        "File ",
                        " could not be read properly. \n",
                        "Please make sure that ",
                        " contains only CDS sequences and is in ",
                        " format."
                    call. = FALSE


        if (obj.type == "data.table") {
            geneids <- seqs <- NULL

                cds_file <-
                    Biostrings::readDNAStringSet(filepath = file, format = format, ...)

                cds_names <-
                    as.vector(unlist(sapply(cds_file@ranges@NAMES, function(x) {
                        return(strsplit(x, " ")[[1]][1])

                cds.dt <-
                    data.table::data.table(geneids = cds_names ,
                                           seqs = tolower(as.character(cds_file)))

                data.table::setkey(cds.dt, geneids)

                mod3 <-
                    function(x) {
                        return((nchar(x) %% 3) == 0)

                all_triplets <- as.logical(cds.dt[ , mod3(seqs)])

                n_seqs <- nrow(cds.dt)

            }, error = function(e) {
                    "File ",
                    " could not be read properly.",
                    "Please make sure that ",
                    " contains only CDS sequences and is in ",
                    " format."

            if (!all(all_triplets)) {
                    "There seem to be ",
                    " coding sequences in your input dataset which cannot be properly divided in base triplets, because their sequence length cannot be divided by 3."
                corrupted_file <-
                    paste0(basename(file), "_corrupted_cds_seqs.fasta")

                    "A fasta file storing all corrupted coding sequences for inspection was generated and stored at '",
                    file.path(getwd(), corrupted_file),
                corrupted_seqs <- as.data.frame(cds.dt[which(!all_triplets)])
                seq_vector <- corrupted_seqs$seqs
                names(seq_vector) <- corrupted_seqs$geneids
                corrupted_seqs_biostrings <- Biostrings::DNAStringSet(seq_vector, use.names = TRUE)
                Biostrings::writeXStringSet(corrupted_seqs_biostrings, filepath = corrupted_file)

                if (delete_corrupt) {
                        "You chose option 'delete_corrupt = TRUE', thus corrupted coding sequences were removed.",
                        "If after consulting the file '",
                        "' you still wish to retain all coding sequences please specify the argument 'delete_corrupt = FALSE'."
                    return(cds.dt[-which(!all_triplets) , list(geneids, seqs)])

                if (!delete_corrupt) {
                        "You chose option 'delete_corrupt = FALSE', thus corrupted coding sequences were retained for subsequent analyses.")
                        "The following modifications were made to the CDS sequences that were not divisible by 3:")
                        "- If the sequence had 1 residue nucleotide then the last nucleotide of the sequence was removed.")
                        "- If the sequence had 2 residue nucleotides then the last two nucleotides of the sequence were removed.")
                        "If after consulting the file '",
                        "' you wish to remove all corrupted coding sequences please specify the argument 'delete_corrupt = TRUE'."

                    mod3_residue_1 <-
                        function(x) {
                            return((nchar(x) %% 3) == 1)

                    mod3_residue_2 <-
                        function(x) {
                            return((nchar(x) %% 3) == 2)

                    residue_1 <- cds.dt[ , mod3_residue_1(seqs)]
                    residue_2 <- cds.dt[ , mod3_residue_2(seqs)]

                    residue_1_seqs <- as.character(cds.dt[which(residue_1) , seqs])
                    residue_2_seqs <- as.character(cds.dt[which(residue_2) , seqs])

                    residue_1_seqs_vec <- as.character(sapply(residue_1_seqs, function(x) {
                        stringr::str_sub(x, 1, nchar(x) - 1)

                    residue_2_seqs_vec <- as.character(sapply(residue_2_seqs, function(x) {
                        stringr::str_sub(x, 1, nchar(x) - 2)

                    cds.dt[which(residue_1) , seqs := residue_1_seqs_vec]
                    cds.dt[which(residue_2) , seqs := residue_2_seqs_vec]

                    all_triplets_new <- cds.dt[ , mod3(seqs)]

                    if (any(!all_triplets_new)) {
                        stop("Something went wring during the trimming process. Not all sequences were trimmed properly.", call. = FALSE)
                    } else {
                        message("All corrupted CDS were trimmed.")

                    n_seqs_new <- nrow(cds.dt)

                    if(!(n_seqs == n_seqs_new))

                         stop("After trimming corrupted CDS some sequences seem to be lost. Please check what might have gone wrong with the sequence trimming.", call. = FALSE)
            } else {

