R/makeNCBIToOrgDbs.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Documented in makeNCBIToOrgDbsToAHM

### =========================================================================
### makeNCBIToOrgDbs ('non-standard' OrgDbs)
### -------------------------------------------------------------------------
###

## This recipe makes 'non-standard' OrgDb sqlite files from data
## at ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/. These OrgDbs are less
## comprehensive than the 'standard' packages available in the
## Bioconductor repo. This code generates 1000 sqlite files.

## This recipe should be run right before a new release. The
## biocversion should be the current devel version, soon to roll over
## to the new release.

## The 'standard' OrgDbs are generated with makeStandardOrgDbsToSqlite.R.

.NCBIMetadataFromUrl <- function(baseUrl, justRunUnitTest, biocVersion) {
    load(system.file('extdata','viableIDs.rda', package='AnnotationForge'))
    ids <- results

    if (justRunUnitTest) ids <- head(ids)
    ## FIXME: need different solution; this subset produces NAs
    if (length(biocVersion) > 1) {
        stop(paste("'biocVersion' must be a single value. Make sure new",
                   "'OrgDbs' go into the CORRECT Bioconductor version!"))
    }
    ## Marc's note:
    ## need to find an alternative to this... old school table of tax Ids
    if (!exists("specData")) {
    load(system.file("data", "specData.rda", package = "GenomeInfoDbData"))
    }
    sd <- specData[!is.na(specData[[3]]),]
    ## need to find offenders
    lookup <- function(id){
        message(paste0("looking up value for: ", id))
        GenomeInfoDb:::lookup_organism_by_tax_id(id, all=TRUE)
    }
    ## Some taxonomy IDs cannot be looked up at all - so discard
    ids <- as.numeric(ids[ids %in% sd$tax_id])
    res <- lapply(ids,lookup)
    taxonomyId <-
        as.integer(as.character(unlist(lapply(res, function(x){x$tax_id}))))
    genus <- unlist(lapply(res, function(x){x$genus}))
    species <- unlist(lapply(res, function(x){x$species}))
    genus <- gsub(" ", "_", genus)
    genus <- gsub("/", "|", genus)
    species <- gsub(" ", "_", species)
    species <- gsub("/", "|", species)

    oriSpecies <- paste(genus, species)
    fullSpecies <- gsub(" ", "_", oriSpecies)

    title <- paste0("org.", fullSpecies, ".eg", ".sqlite")
    rDataPath <- paste0("ncbi/uniprot/",biocVersion,"/",title)

    genome <- setNames(rep("NCBI genomes", length(fullSpecies)), title)
    dateMessage <- paste0('NCBI gene annotations as of ', as.character(date()))
    sourceVersion <- rep(dateMessage, length(fullSpecies))
    description <- paste("NCBI gene ID based annotations about", oriSpecies)
    sourceUrls <- c(baseUrl,"ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.gz")
    sourceUrl <- rep(list(sourceUrls), length(fullSpecies))

    tryCatch({
        aws <- system2("aws",
                       args=paste0("s3 ls s3://annotationhub/ncbi/uniprot/",
                           biocVersion," --recursive"), stdout=TRUE, stderr=TRUE)
    }, error=function(e){
        aws <- character(0)
    }, finally={
        if (!exists("aws"))  aws <- character(0)
        aws <- gsub("\\s+", " ", stringr::str_trim(aws))
        aws <- aws[-1]
    })

    if (length(aws)){
        s3titles <- sapply(strsplit(sapply(strsplit(aws, " "),"[[", 4), "/"),"[[",4)
        subset <- (title %in% s3titles)
        if(any(subset)){
            lst <- lapply(list(title=title, species = oriSpecies,
                               taxonomyId = taxonomyId, genome = genome, sourceUrl=sourceUrl,
                               sourceVersion = sourceVersion,
                               description=description, rDataPath=rDataPath), "[", subset)
        }else{
            lst <- list(title=title, species = oriSpecies,
                        taxonomyId = taxonomyId, genome = genome, sourceUrl=sourceUrl,
                        sourceVersion = sourceVersion,
                        description=description, rDataPath=rDataPath)
        }

    }else{

        lst <- list(title=title, species = oriSpecies,
                    taxonomyId = taxonomyId, genome = genome, sourceUrl=sourceUrl,
                    sourceVersion = sourceVersion,
                    description=description, rDataPath=rDataPath)
    }

    lst
}

needToRerunNonStandardOrgDb <- function(biocVersion =  BiocManager::version(),
                                        baseUrl =
                                        "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/"){
    load(system.file('extdata','viableIDs.rda', package='AnnotationForge'))
    ids <- results

    ## FIXME: need different solution; this subset produces NAs
    if (length(biocVersion) > 1) {
        stop(paste("'biocVersion' must be a single value. Make sure new",
                   "'OrgDbs' go into the CORRECT Bioconductor version!"))
    }
    ## Marc's note:
    ## need to find an alternative to this... old school table of tax Ids
    if (!exists("specData")) {
    load(system.file("data", "specData.rda", package = "GenomeInfoDbData"))
    }
    sd <- specData[!is.na(specData[[3]]),]
    ## need to find offenders
    lookup <- function(id){
        message(paste0("looking up value for: ", id))
        GenomeInfoDb:::lookup_organism_by_tax_id(id, all=TRUE)
    }
    ## Some taxonomy IDs cannot be looked up at all - so discard
    ids <- as.numeric(ids[ids %in% sd$tax_id])
    res <- lapply(ids,lookup)
    taxonomyId <-
        as.integer(as.character(unlist(lapply(res, function(x){x$tax_id}))))
    genus <- unlist(lapply(res, function(x){x$genus}))
    species <- unlist(lapply(res, function(x){x$species}))
    genus <- gsub(" ", "_", genus)
    genus <- gsub("/", "|", genus)
    species <- gsub(" ", "_", species)
    species <- gsub("/", "|", species)

    oriSpecies <- paste(genus, species)
    fullSpecies <- gsub(" ", "_", oriSpecies)

    title <- paste0("org.", fullSpecies, ".eg", ".sqlite")

    tryCatch({
        aws <- system2("aws",
                       args=paste0("s3 ls s3://annotationhub/ncbi/uniprot/",
                           biocVersion," --recursive"), stdout=TRUE, stderr=TRUE)
    }, error=function(e){
        aws <- character(0)
        stop("Cannot access AWS. Unable to determine")
    }, finally={
        if (!exists("aws"))  aws <- character(0)
        aws <- gsub("\\s+", " ", stringr::str_trim(aws))
        aws <- aws[-1]
    })

    if (length(aws)){
        s3titles <- sapply(strsplit(sapply(strsplit(aws, " "),"[[", 4), "/"),"[[",4)
        subset <- !(title %in% s3titles)
        res <- any(subset)
    }else{
        res <- TRUE
    }
    res
}

## STEP 1: make function to process metadata into AHMs
makeNCBIToOrgDbsToAHM <-
    function(currentMetadata, justRunUnitTest = FALSE,
             BiocVersion =  BiocManager::version(),
             baseUrl = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/") {
    meta <- .NCBIMetadataFromUrl(baseUrl, justRunUnitTest,
                                 biocVersion=BiocVersion[[1]])

    message("Processing ", length(meta[[1]]), " files.")

    Map(AnnotationHubMetadata,
        Description=meta$description,
        Genome=meta$genome,
        SourceUrl=meta$sourceUrl,
        SourceVersion=meta$sourceVersion,
        Species=meta$species,
        TaxonomyId=meta$taxonomyId,
        Title=meta$title,
        RDataPath=meta$rDataPath,
        MoreArgs=c(currentMetadata, list(
            BiocVersion=package_version(BiocVersion),
            Coordinate_1_based = TRUE,
            DataProvider = baseUrl,
            Maintainer = paste("Bioconductor Package Maintainer",
                               "<maintainer@bioconductor.org>"),
            RDataClass = "OrgDb",
            DispatchClass = "SQLiteFile",
            SourceType="NCBI/UniProt",
            RDataDateAdded = Sys.time(),
            Recipe = "AnnotationHubData:::NCBIToOrgDbs",
            Tags = c("NCBI", "Gene", "Annotation"))))
}

## STEP 2: Make a recipe function that takes an AnnotationHubRecipe object.
NCBIToOrgDbs <- function(ahm){
    fullSpecies <- ahm@Species
    genus <- unlist(strsplit(fullSpecies,split=" "))[1]
    species <- unlist(strsplit(fullSpecies,split=" "))[2]
    dbname <- makeOrgPackageFromNCBI(version="1.0.0",
                                     maintainer=ahm@Maintainer,
                                     author=ahm@Maintainer,
                                     outputDir=ahm@HubRoot,
                                     tax_id=as.character(ahm@TaxonomyId),
                                     genus=genus,
                                     species=species,
                                     NCBIFilesDir=ahm@HubRoot,
                                     databaseOnly=TRUE,
                                     rebuildCache=TRUE)
    file.rename(from=file.path(ahm@HubRoot, dbname), to=outputFile(ahm))
    outputFile(ahm)
}

## STEP 3: Call the helper to set up the newResources() method
makeAnnotationHubResource("NCBIImportPreparer",
                          makeNCBIToOrgDbsToAHM)

Any scripts or data that you put into this service are public.

AnnotationHubData documentation built on April 17, 2021, 6:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/makeNCBIToOrgDbs.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions NCBIToOrgDbs makeNCBIToOrgDbsToAHM needToRerunNonStandardOrgDb .NCBIMetadataFromUrl

Documented in makeNCBIToOrgDbsToAHM

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData Transform public data resources into Bioconductor Data Structures

R/makeNCBIToOrgDbs.R In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions NCBIToOrgDbs makeNCBIToOrgDbsToAHM needToRerunNonStandardOrgDb .NCBIMetadataFromUrl

Documented in makeNCBIToOrgDbsToAHM

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/makeNCBIToOrgDbs.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures