R/read_proteome.R

Defines functions read_proteome

Documented in read_proteome

#' @title Import Proteome as Biostrings or data.table object
#' @description This function reads an organism specific proteome stored in a
#' defined file format.
#' @param file a character string specifying the path to the file storing
#' the proteome.
#' @param format a character string specifying the file format used to store the
#' proteome, e.g. \code{format = "fasta"} (default) or \code{format = "gbk"}.
#' The value is handed unchanged to
#' \code{\link[Biostrings]{readAAStringSet}}.
#' @param obj.type a character string specifying the object type in which the
#' proteome sequences shall be represented.
#' Either as \code{obj.type = "Biostrings"} (default) or as
#' \code{obj.type = "data.table"}.
#' @param ... additional arguments that are passed on to
#' \code{\link[Biostrings]{readAAStringSet}}.
#' @author Hajk-Georg Drost
#' @details This function takes a string specifying the path to the
#' proteome file of interest as first argument.
#'
#' It is possible to read in different proteome file standards such as
#' \emph{fasta} or \emph{genbank}.
#'
#' The file is always read with \code{\link[Biostrings]{readAAStringSet}}.
#' If that call fails, an error is raised that names the file and the
#' requested format and appends the message of the underlying error.
#'
#' For \code{obj.type = "data.table"} the result has two columns:
#' \code{geneids} (the sequence name up to the first space) and \code{seqs}
#' (the sequence as a lower-case character string), keyed by \code{geneids}.
#' @return Either a \code{Biostrings} or \code{data.table} object.
#' @family readers
#' @family proteome
#' @import Biostrings
#' @export
read_proteome <-
    function(file,
             format = "fasta",
             obj.type = "Biostrings",
             ...) {
        if (!is.element(format, c("fasta", "gbk"))) {
            # Message is assembled from pieces so that no source-code line
            # break or indentation leaks into the text shown to the user.
            stop(
                "Please choose a file format that is supported by ",
                "this function.",
                call. = FALSE
            )
        }

        if (!is.element(obj.type, c("Biostrings", "data.table"))) {
            stop(
                "Please specify a valid object type: ",
                "obj.type = 'Biostrings' (default) or ",
                "obj.type = 'data.table'.",
                call. = FALSE
            )
        }

        if (!file.exists(file)) {
            stop(
                "The file path you specified does not seem to exist: '",
                file,
                "'.",
                call. = FALSE
            )
        }

        # Both output types start from the same AAStringSet, so the file is
        # read exactly once here instead of once per branch.
        proteome <- tryCatch(
            Biostrings::readAAStringSet(
                filepath = file,
                format = format,
                ...
            ),
            error = function(e) {
                # Keep the reader's own message so the root cause stays
                # visible next to the generic hint.
                stop(
                    "File ", file, " could not be read properly. \n",
                    "Please make sure that ", file,
                    " contains only amino acid sequences and is in ",
                    format, " format.\n",
                    "Original error: ", conditionMessage(e),
                    call. = FALSE
                )
            }
        )

        if (obj.type == "Biostrings") {
            return(proteome)
        }

        # obj.type == "data.table": the gene id is the sequence name up to
        # the first space. as.character() maps missing names (NULL) to
        # character(0) so that strsplit() always receives a character vector.
        proteome_names <- vapply(
            strsplit(as.character(names(proteome)), " ", fixed = TRUE),
            function(parts) parts[1],
            character(1),
            USE.NAMES = FALSE
        )

        proteome.dt <- data.table::data.table(
            geneids = proteome_names,
            seqs = tolower(as.character(proteome))
        )

        # setkeyv() takes the column name as a string, so no dummy
        # `geneids <- NULL` binding is needed for non-standard evaluation.
        data.table::setkeyv(proteome.dt, "geneids")

        proteome.dt
    }
HajkD/biomartr documentation built on Dec. 9, 2023, 7:25 p.m.