# Create reference ranges for internal package use. The data used to create
# the reference ranges was downloaded from Pharmvar v5.1.14 located at:
# https://www.pharmvar.org/download
#
# -------------------------------------------------------------------------
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(VariantAnnotation))
# Extract reference ranges from VCF files ---------------------------------
genes <- dir(path = here("data-raw", "pharmvar-5.1.14"))
reference_ranges <- vector("list", length(genes))
for (i in seq_along(genes)) {
files <- list.files(
path = here("data-raw", "pharmvar-5.1.14", genes[[i]], "GRCh38"),
pattern = "*.vcf",
full.names = TRUE
)
# Name files by their allele/suballele
names(files) <- gsub(".vcf", "", basename(files))
# Read in all vcfs for the given gene
message(paste("Reading in VCF files for", genes[[i]]))
vcfs <- parallel::mclapply(files, readVcf, genome = "GRCh38", mc.cores = 6)
# Extract the GRanges for the alleles and collapse into a single object
message("Extracting rowRanges...")
rowranges <- GRangesList(lapply(vcfs, rowRanges))
rowranges <- unlist(rowranges, use.names = TRUE)
# Extract only the unique ranges from the object -- drop names to avoid confusion
uniq <- unique(rowranges)
names(uniq) <- NULL
# Drop all unused levels
seqlevels(uniq, pruning.mode = "coarse") <- unique(as.character(seqnames(uniq)))
reference_ranges[[i]] <- uniq
message("Done.")
}
names(reference_ranges) <- genes
# Create haplotype reference GRanges --------------------------------------
haplotype_files <- list.files(
path = here("data-raw"),
pattern = "*.haplotypes.tsv",
recursive = TRUE,
full.names = TRUE
)
haplotype_files <- haplotype_files[grepl("GRCh38", haplotype_files)]
haplotypes <- rbindlist(lapply(haplotype_files, fread))
# Reference positions contain no information
haplotypes <- haplotypes[ReferenceSequence != "REFERENCE"]
chrom_info <- setDT(getChromInfoFromNCBI("GRCh38"))
haplotypes <- merge(
haplotypes,
chrom_info,
by.x = "ReferenceSequence",
by.y = "RefSeqAccn",
all.x = TRUE,
all.y = FALSE
)
# Create a GRanges object from the data.frame
haplotypes_gr <- makeGRangesFromDataFrame(
haplotypes,
keep.extra.columns = TRUE,
seqnames.field = "UCSCStyleName",
start.field = "Variant Start",
end.field = "Variant Stop"
)
genome(haplotypes_gr) <- "GRCh38"
# Save to package
usethis::use_data(reference_ranges, haplotypes_gr, overwrite = TRUE, internal = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.