R/makeTxDbFromGFF.R

Defines functions makeTxDbFromGFF .detect_file_format .prepareGFFMetadata .rename_by_dbxrefTag .tidy_seqinfo .make_Seqinfo_from_chrominfo

Documented in makeTxDbFromGFF

### =========================================================================
### makeTxDbFromGFF()
### -------------------------------------------------------------------------


.make_Seqinfo_from_chrominfo <- function(chrominfo, circ_seqs=NULL)
{
    if (is(chrominfo, "Seqinfo"))
        return(chrominfo)
    if (!is.data.frame(chrominfo))
        stop(wmsg("'chrominfo' must be a data.frame, a Seqinfo object, ",
                  "or NULL"))
    .REQUIRED_COLS <- c("chrom", "length")
    .OPTIONAL_COLS <- "is_circular"
    check_colnames(chrominfo, .REQUIRED_COLS, .OPTIONAL_COLS, "chrominfo")
    ans <- Seqinfo(as.character(chrominfo$chrom), chrominfo$length)
    if (!has_col(chrominfo, "is_circular")) {
        is_circ <- GenomeInfoDb:::make_circ_flags_from_circ_seqs(
                                            seqlevels(ans),
                                            circ_seqs)
    } else if (!is.null(circ_seqs)) {
        stop(wmsg("'circ_seqs' should not be specified when 'chrominfo' ",
                  "has an \"is_circular\" column"))
    } else {
        is_circ <- chrominfo$is_circular
    }
    isCircular(ans) <- is_circ
    ans
}

.tidy_seqinfo <- function(gr, circ_seqs=NULL, chrominfo=NULL)
{
    if (is.null(chrominfo)) {
        seqlevels <- seqlevels(gr)
        seqlevels[rankSeqlevels(seqlevels)] <- seqlevels
        seqlevels(gr) <- seqlevels
        isCircular(gr) <- GenomeInfoDb:::make_circ_flags_from_circ_seqs(
                                                   seqlevels(gr),
                                                   circ_seqs)
        return(gr)
    }
    si <- .make_Seqinfo_from_chrominfo(chrominfo, circ_seqs)
    suppressWarnings(seqinfo(gr) <- merge(seqinfo(gr), si))
    dangling_seqlevels <- setdiff(seqlevelsInUse(gr), seqlevels(si))
    if (length(dangling_seqlevels) != 0L) {
        in1string <- paste0(dangling_seqlevels, collapse=", ")
        stop(wmsg("'chrominfo' must describe at least all the chromosomes ",
                  "of the genomic features imported from the file. ",
                  "Chromosomes missing from 'chrominfo': ", in1string))
    }
    seqlevels(gr) <- seqlevels(si)
    gr
}

.rename_by_dbxrefTag <- function(gr, dbxrefTag) {
    dbxref <- unlist(gr$Dbxref, use.names=FALSE)
    m <- grepl(paste0("^", dbxrefTag, ":"), dbxref)
    gr$Name[S4Vectors:::quick_togroup(gr$Dbxref)[m]] <- dbxref[m]
    gr
}

.prepareGFFMetadata <- function(file, dataSource=NA, organism=NA,
                                taxonomyId=NA, miRBaseBuild=NA, metadata)
{
    message("Prepare the 'metadata' data frame ... ", appendLF=FALSE)
    if (!isSingleStringOrNA(dataSource))
        stop("'dataSource' must be a single string or NA")
    if (!isSingleStringOrNA(organism))
        stop("'organism' must be a single string or NA")
    if (!isSingleStringOrNA(miRBaseBuild))
        stop("'miRBaseBuild' must be a single string or NA")
    if (identical(dataSource, NA)) {
        if (is.character(file)) {
            dataSource <- file
        } else {
            dataSource <- as.character(file)
            if (inherits(file, "connection"))
                dataSource <- showConnections(all=TRUE)[dataSource,
                                                        "description"]
        }
    }
    if (!is.na(taxonomyId)) {
        GenomeInfoDb:::check_tax_id(taxonomyId)
    } else if (!is.na(organism)) {
        taxonomyId <- GenomeInfoDb:::lookup_tax_id_by_organism(organism)
    }
    df <- data.frame(
        name=c("Data source", "Organism", "Taxonomy ID", "miRBase build ID"),
        value=c(dataSource, organism, taxonomyId, miRBaseBuild))
    metadata <- rbind(df, metadata)
    message("OK")
    metadata
}

.detect_file_format <- function(file)
{
    if (is(file, "connection"))
        file <- summary(file)$description
    if (isSingleString(file)) {
        file2 <- try(FileForFormat(file), silent=TRUE)
        if (inherits(file2, "try-error"))
            return(tools::file_ext(file))
        file <- file2
    }
    if (is(file, "BiocFile")) {
        if (is(file, "GFF3File"))
            return("gff3")
        if (is(file, "GTFFile"))
            return("gtf")
        desc <- rtracklayer:::resourceDescription(file)
        if (is(file, "CompressedFile")) {
            ## Mimic what import,CompressedFile,missing,missing method does.
            desc <- tools::file_path_sans_ext(desc)
        }
        format <- tools::file_ext(desc)
        return(format)
    }
    stop(wmsg("Invalid 'file'. Must be a path to a file, or an URL, ",
              "or a connection object, or a GFF3File or GTFFile object."))
}

### Based on makeTxDbFromGRanges().
makeTxDbFromGFF <- function(file,
                            format=c("auto", "gff3", "gtf"),
                            dataSource=NA,
                            organism=NA,
                            taxonomyId=NA,
                            circ_seqs=NULL,
                            chrominfo=NULL,
                            miRBaseBuild=NA,
                            metadata=NULL,
                            dbxrefTag)
{
    format <- match.arg(format)
    if (format == "auto") {
        format <- .detect_file_format(file)
        if (!(format %in% c("gff3", "gff", "gtf")))
            stop(wmsg("Cannot detect whether 'file' is a GFF3 or GTF file. ",
                      "Please use the 'format' argument to specify the ",
                      "format (\"gff3\" or \"gtf\")."))
    }
    if (format == "gff3") {
        colnames <- GFF3_COLNAMES
    } else if (format == "gtf") {
        colnames <- GTF_COLNAMES
    } else { # format == "gff"
        ## We don't know a priori if the file is GFF3 or GTF.
        ## TODO: Maybe use sniffGFFVersion() to detect whether the file is
        ## GFF3 or GTF (maybe do this in .detect_file_format()).
        colnames <- union(GFF3_COLNAMES, GTF_COLNAMES)
    }

    message("Import genomic features from the file as a GRanges object ... ",
            appendLF=FALSE)
    gr <- import(file, format=format, colnames=colnames,
                       feature.type=GFF_FEATURE_TYPES)
    gr <- .tidy_seqinfo(gr, circ_seqs, chrominfo)
    if (!missing(dbxrefTag)) {
        gr <- .rename_by_dbxrefTag(gr, dbxrefTag)
    }

    message("OK")

    metadata <- .prepareGFFMetadata(file, dataSource, organism, taxonomyId,
                                    miRBaseBuild, metadata)

    message("Make the TxDb object ... ", appendLF=FALSE)
    txdb <- makeTxDbFromGRanges(gr, metadata=metadata)
    message("OK")
    txdb
}
Bioconductor/GenomicFeatures documentation built on Aug. 29, 2021, 1:03 p.m.