R/makedbSNPVCF.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

### link to data description:
### http://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf/

### ----------------------------------------------------------------------
### March 2016

### Files in AH that point to ftp.ncbi.nih.gov/snp/organisms/*
### are no longer available:

### We have 38 of them:

### > length(query(hub, c("dbsnp", "vcf")))
### [1] 38
### 
### 
### Of the 38 full urls seen with query(hub, c("dbsnp", "vcf"))$sourceurl,
### there are 5 unique base directories:
### 
### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/
### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b141_GRCh37p13/
### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b142_GRCh37p13/
### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b142_GRCh38/
### ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b141_GRCh38/
### 
### On the web site (ftp://ftp.ncbi.nih.gov/snp/organisms/) there 7 base
### directories. b141 is no longer there and b144, b146 have been added.

### The recipe has been updated to look in 
###    ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/
###    ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/
### ----------------------------------------------------------------------

.dbSNPBaseUrl <-"ftp://ftp.ncbi.nlm.nih.gov/"

## Chosing files from archive so all files have a date stamp.
## It looks like files with no stamp in the current
## directory are either 'in progress' or 'subject to change'.
.getdbSNP <- function(justRunUnitTest) {
    baseUrl <- paste0(.dbSNPBaseUrl, "pub/clinvar/")
    paths <- c(GRCh37="vcf_GRCh37/archive_1.0/2016/", 
               GRCh38="vcf_GRCh38/archive_1.0/2016/")
    files <- c("clinvar_20160203", "clinvar_20160203_papu", 
               "common_and_clinical_20160203", 
               "common_no_known_medical_impact_20160203")
    urls <- setNames(paste0(baseUrl, paths), names(paths))

    if (justRunUnitTest)
        urls <- urls[1]
  
    genome <- rep(names(urls), each=length(files))
    df <- .ftpFileInfo(url=urls, extension=paste0(files, ".vcf.gz"))
    df$genome <- gsub("GRCh37clinical", "GRCh37", genome)
    df <- cbind(df, title=basename(df$fileurl), stringsAsFactors = FALSE)
    rownames(df) <- NULL
 
    map <- c(
        `All` = .expandLine("VCF of all variations that meet the criteria
        to be in a  VCF file.  This file is created once per dbSNP build."),
        `All_papu` = .expandLine("VCF of all variations found in the 
        psuedoautosomal region (PAR), alternate loci, patch sequences and 
        unlocalized or unplaced contigs(papu)"),
        `common_all` = .expandLine("VCF of all variations that are polymorphic
        in a least one population the 1000 Genomes project or any of the 
        following handles: 1000GENOMES, CSHL-HAPMAP, EGP_SNPS NHLBI-ESP,
        PGA-UW-FHCRC. A variation is polymorphic if the minor allele 
        frequency is at least 0.01 and the minor allele is present in 
        at least two samples."),
        `clinvar` = .expandLine("VCF of variations from clinvar where 'YYYYMMDD' 
        represents the date the file was created. This file is created 
        weekly."), 
        `common_and_clinical` = .expandLine("Variations from common_all.vcf.gz
        that are clinical.  A clinical variation is one the appears in 
        clinvar_YYYYMMDD.vcf.gz with at least one of the following clinical
        significance codes: 4 - probable-pathogenic, 5 - pathogenic, 
        6 - drug-response, 7 - histocompatibility, 255 - other, 
        This file is created weekly."),
        `common_no_known_medical_impact` = .expandLine("Variations from 
        common_all.vcf.gz that do not meet the clinical criteria described 
        above.  This file is created weekly."))
 
    description <- character(length(title))
    for (i in seq_along(map))
        description[grep(names(map)[i], df$title)] <- map[[i]]

    cbind(df, description, stringsAsFactors = FALSE)
}

makedbSNPVCF <- function(currentMetadata, justRunUnitTest=TRUE,
                         BiocVersion=BiocManager::version()) {
    rsrc <- .getdbSNP(justRunUnitTest)

    ## input_sources table
    sourceSize <- as.numeric(rsrc$size)
    sourceUrls <- rsrc$fileurl
    sourceVersion <- gsub(" ", "_", rsrc$date) 
    sourceLastModifiedDate <- rsrc$date
 
    ## resources table
    title <- rsrc$title
    description <- rsrc$description
    genome <- rsrc$genome
 
    ## rdatapath should have 2 entries -for the VCF and its TabixFile
    rdatapath <- sub(.dbSNPBaseUrl, "", rsrc$fileurl)
    rdps <- rep(rdatapath, each=2) 
    rdatapaths <- split(rdps, f=as.factor(rep(seq_along(rdatapath),each=2)))
    rdatapaths <- lapply(rdatapaths,
                         function(x){x[2] <- paste0(x[2],".tbi") ; return(x)}) 
 
    tags <- lapply(genome, 
        function(tag) c("dbSNP", tag, "VCF")
    )
 
    Map(AnnotationHubMetadata,
        SourceSize=sourceSize,
        SourceUrl=sourceUrls,
        SourceVersion=sourceVersion,
        SourceLastModifiedDate=sourceLastModifiedDate,
 
        Description=description,
        Title=title,
        Genome=genome,
        Tags=tags,
        RDataPath=rdatapaths,
 
        MoreArgs=list(
            BiocVersion=BiocVersion,
            # input sources 
            SourceType= "VCF",
 
            # resources
            Species="Homo sapiens",
            TaxonomyId=9606L, 
            DataProvider = "dbSNP",
            Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
            Coordinate_1_based = FALSE,
            Location_Prefix = .dbSNPBaseUrl,
            RDataDateAdded = Sys.time(),

            #rdata table
            DispatchClass= "dbSNPVCFFile" ,
            RDataClass = c("VcfFile", "VcfFile"),

            Recipe = "AnnotationHubData:::ncbi_dbSNPVCFFile"))
}


## recipe
ncbi_dbSNPVCFFile <- function(ahm)
{
    ## The tbi file exists online, just download it.
    faIn <- normalizePath(inputFiles(ahm))  # file on ftp site
    faOut1 <- normalizePath(outputFile(ahm))[1] # vcf.gz file on localDir
    faOut2 <- outputFile(ahm)[2]  # vcf.gz.tbi file on localDir
 
    if(!file.exists(faOut2)) {
        tbiFile <- paste0(metadata(ahm)$Location_Prefix, 
                          metadata(ahm)$RDataPath[2]) 
        tbi <- download.file(tbiFile, faOut2)
    }
    faOut2
}

makeAnnotationHubResource("dbSNPVCFPreparer", makedbSNPVCF, quiet=TRUE)

Any scripts or data that you put into this service are public.

AnnotationHubData documentation built on April 17, 2021, 6:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/makedbSNPVCF.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions ncbi_dbSNPVCFFile makedbSNPVCF .getdbSNP

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData Transform public data resources into Bioconductor Data Structures

R/makedbSNPVCF.R In AnnotationHubData: Transform public data resources into Bioconductor Data Structures

Defines functions ncbi_dbSNPVCFFile makedbSNPVCF .getdbSNP

Try the AnnotationHubData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

AnnotationHubData
Transform public data resources into Bioconductor Data Structures

R/makedbSNPVCF.R
In AnnotationHubData: Transform public data resources into Bioconductor Data Structures