R/makeEnsemblFasta.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Documented in ensemblFastaToFaFile makeEnsemblFastaToAHM

### =========================================================================
### makeEnsemblFastaAHM() and ensemblFastaToFaFile()
### -------------------------------------------------------------------------
###

## Adjust this expression in order to save painful-reprocessing of older files.
## .ensemblReleaseRegex <- ".*release-(69|7[[:digit:]]|8[[:digit:]])"
## .ensemblReleaseRegex <- ".*release-(79|8[[:digit:]])"
## for a speed run just do one set
## .ensemblReleaseRegex <- ".*release-81"

## list directories below url/dir satisfying regex
.ensemblDirUrl <-
    function(url, dir, regex)
{
    lst <- .listRemoteFiles(url)
    releases <- paste0(url, lst)
    paste(grep(regex, releases, value=TRUE), dir, sep="/")
}

## NOTE: httr >= 1.2.0 doesn't support ftp last modified date and size
## FIXME: This should be combined with .httrFileInfo() and .ftpFileInfo()
.ensemblMetadataFromUrl <- function(sourceUrl, twobit=FALSE, http=FALSE) {
    releaseRegex <- ".*(release-[[:digit:]]+).*"
    if (!twobit){
        title <- sub("\\.gz$", "", basename(sourceUrl))
    }else{
        title <- sub("\\.fa\\.gz$", ".2bit", basename(sourceUrl))
    }
    root <- setNames(rep(NA_character_, length(sourceUrl)), title)

    releaseNum <- sub("release-", "", sub(releaseRegex, "\\1", sourceUrl[1]))

    # as of release 96 a file is present with species index for mappings
    species_index <- GenomeInfoDb:::fetch_species_index_from_Ensembl_FTP(release=releaseNum)

    species <- vapply(strsplit(sourceUrl, '/'), function(x) x[[7]], character(1))
    genome <- vapply(species, FUN.VALUE=character(1), USE.NAMES=FALSE,
                     FUN=function(spc, tbl){
                         message(spc, "\n")
                         tbl[tbl$species == spc, "assembly"]
                     }, tbl=species_index)
    taxonomyId <- vapply(species, FUN.VALUE=integer(1), USE.NAMES=FALSE,
                     FUN=function(spc, tbl){
                         message(spc, "\n")
                         tbl[tbl$species == spc, "taxonomy_id"]
                     }, tbl=species_index)

    species <- sub("_", " ", species,fixed=TRUE)

    if (http) {
       ftpInfo <- .httrFileInfo(sourceUrl)
       sourceSize <- ftpInfo$size
       sourceLastModDate <- ftpInfo$date
    } else {
        sourceSize <- as.numeric(NA)
        sourceLastModDate <- as.POSIXct(NA)
    }

    list(annotationHubRoot = root, title=title, species = species,
         taxonomyId = as.integer(taxonomyId),
         genome = genome,
         sourceSize=sourceSize,
         sourceLastModifiedDate=sourceLastModDate,
         sourceVersion = sub(releaseRegex, "\\1", sourceUrl))
}

.ensemblFastaTypes <-
    c("cdna\\.all", "dna_rm\\.toplevel", "dna_sm\\.toplevel",
      "dna\\.toplevel", "ncrna", "pep\\.all")

## get urls
.ensemblFastaSourceUrls <-
    function(baseUrl, baseDir, regex, baseTypes=.ensemblFastaTypes)
{
    want <- .ensemblDirUrl(baseUrl, baseDir, regex)

    .processUrl <- function(url) {
        listing <- .ftpDirectoryInfo(url)

        subdirIdx <- grepl(".*/.*:", listing)
        subdir <- sub("^.{2}(.*):$", "\\1", listing[subdirIdx])
        fileTypes <- paste(baseTypes, collapse="|")
        pat <- sprintf(".*(%s)\\.fa\\.gz$", fileTypes)

        fastaIdx <- grepl(pat, listing)
        fasta <- sub(".* ", "", listing[fastaIdx])

        ## match subdir w/ fasta
        subdir <- subdir[cumsum(subdirIdx)[fastaIdx]]

        ## Prefer "primary_assembly" to "toplevel" resources.
        organisms <- unique(sub("(.+?)\\..*", "\\1", fasta, perl=TRUE))
        keepIdxList <- sapply(organisms, function(x) {
            orgFiles <- fasta[grep(paste0("^", x, "\\."), fasta)]
            reBoth <- paste0("dna", c("_rm", "_sm", ""),
                "\\.(primary_assembly|toplevel)\\.")
            toplevelIdx <-
                sapply(reBoth, function(x) length(grep(x, orgFiles)) > 1)
            reToplevel <- paste0("dna", c("_rm", "_sm", ""),
                "\\.toplevel\\.")[toplevelIdx]

            isRedundant <-
                sapply(reToplevel, function(x) grepl(x, orgFiles))
            retVal <- rep(TRUE, length(orgFiles))
            if (!is.null(dim(isRedundant))) {
              retVal <- !apply(isRedundant, 1, any)
            }

            retVal
        })
        keepIdx <- base::unlist(keepIdxList)
        fasta <- fasta[keepIdx]
        subdir <- subdir[keepIdx]

        sprintf("%s%s/%s", url, subdir, fasta)
    }
    res <- base::unlist(lapply(want, .processUrl), use.names=FALSE)

    if (length(res) == 0) {
        txt <- sprintf("no fasta files at %s",
                       paste(sQuote(want), collapse=", "))
        stop(paste(strwrap(txt, exdent=2), collapse="\n"))
    }
    res
}

## metadata generator
makeEnsemblFastaToAHM <-
    function(currentMetadata, baseUrl = "ftp://ftp.ensembl.org/pub/",
             baseDir = "fasta/", release,
             justRunUnitTest = FALSE, BiocVersion = BiocManager::version())
{
    time1 <- Sys.time()
    regex <- paste0(".*release-", release)
    sourceUrl <- .ensemblFastaSourceUrls(baseUrl, baseDir, regex)
    if (justRunUnitTest)
        sourceUrl <- sourceUrl[1:5]

    sourceFile <- sub(baseUrl, "ensembl/", sourceUrl)
    meta <- .ensemblMetadataFromUrl(sourceUrl)
    dnaType <- local({
        x <- basename(dirname(sourceFile))
        sub("(dna|rna)", "\\U\\1", x, perl=TRUE)
    })
    description <- paste("FASTA", dnaType, "sequence for", meta$species)

    ## rdatapaths db table needs an extra row for the index file
    rdataPath <- sub(".gz$", ".bgz", sourceFile)
    rdps <- rep(rdataPath, each=3)
    rdatapaths <- split(rdps, f=as.factor(rep(1:length(rdataPath),each=3)))
    ## second record of each set becomes the '.fai' file
    rdatapaths <- lapply(rdatapaths,
                         function(x){x[2] <- paste0(x[2],".fai") ; x[2] <-
                                         paste0(x[3],".gzi") ; return(x)})

    Map(AnnotationHubMetadata,
        Description=description,
        Genome=meta$genome,
        RDataPath=rdatapaths,
        SourceUrl=sourceUrl,
        SourceVersion=meta$sourceVersion,
        Species=meta$species,
        TaxonomyId=meta$taxonomyId,
        Title=meta$title,
        SourceSize=meta$sourceSize,
        SourceLastModifiedDate=meta$sourceLastModifiedDate,
        MoreArgs=list(
          BiocVersion=BiocVersion,
          Coordinate_1_based = TRUE,
          DataProvider="Ensembl",
          Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
          SourceType="FASTA",
          DispatchClass="FaFile",
          RDataClass=c("FaFile", "FaFile", "FaFile"),
          RDataDateAdded=Sys.time(),
          Recipe="AnnotationHubData:::ensemblFastaToFaFile",
          Tags=c("FASTA", "ensembl", "sequence")))
}

## Used in makeEnsemblFastaAHM() and makeGencodeFastaToAHM():
## Unzips .gz file, indexes it and saves as .rz and .rz.fai.
.fastaToFaFile <- function(ahm)
{
    ## target output file
    faOut <- outputFile(ahm)[[1]]
    srcFile <- sub('.bgz$','.gz',faOut)
    ## unzip and index
    bgzip(srcFile)
    indexFa(faOut)
}

ensemblFastaToFaFile <- function(ahm)
{
    .fastaToFaFile(ahm)
}

## create dispatch class and newResources() method
makeAnnotationHubResource("EnsemblFastaImportPreparer", makeEnsemblFastaToAHM)

Any scripts or data that you put into this service are public.

AnnotationHubData documentation built on April 17, 2021, 6:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/makeEnsemblFasta.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions ensemblFastaToFaFile .fastaToFaFile makeEnsemblFastaToAHM .ensemblFastaSourceUrls .ensemblMetadataFromUrl .ensemblDirUrl

Documented in ensemblFastaToFaFile makeEnsemblFastaToAHM

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData Transform public data resources into Bioconductor Data Structures

R/makeEnsemblFasta.R In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions ensemblFastaToFaFile .fastaToFaFile makeEnsemblFastaToAHM .ensemblFastaSourceUrls .ensemblMetadataFromUrl .ensemblDirUrl

Documented in ensemblFastaToFaFile makeEnsemblFastaToAHM

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/makeEnsemblFasta.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures