data-raw/create-reference-data.R

# Create reference ranges for internal package use. The data used to create
# the reference ranges was downloaded from Pharmvar v5.1.14 located at:
# https://www.pharmvar.org/download
#
# -------------------------------------------------------------------------
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(VariantAnnotation))


# Extract reference ranges from VCF files ---------------------------------
genes <- dir(path = here("data-raw", "pharmvar-5.1.14"))
reference_ranges <- vector("list", length(genes))
for (i in seq_along(genes)) {
  files <- list.files(
    path = here("data-raw", "pharmvar-5.1.14", genes[[i]], "GRCh38"),
    pattern = "*.vcf",
    full.names = TRUE
  )

  # Name files by their allele/suballele
  names(files) <- gsub(".vcf", "", basename(files))

  # Read in all vcfs for the given gene
  message(paste("Reading in VCF files for", genes[[i]]))
  vcfs <- parallel::mclapply(files, readVcf, genome = "GRCh38", mc.cores = 6)

  # Extract the GRanges for the alleles and collapse into a single object
  message("Extracting rowRanges...")
  rowranges <- GRangesList(lapply(vcfs, rowRanges))
  rowranges <- unlist(rowranges, use.names = TRUE)

  # Extract only the unique ranges from the object -- drop names to avoid confusion
  uniq <- unique(rowranges)
  names(uniq) <- NULL

  # Drop all unused levels
  seqlevels(uniq, pruning.mode = "coarse") <- unique(as.character(seqnames(uniq)))

  reference_ranges[[i]] <- uniq
  message("Done.")
}
names(reference_ranges) <- genes


# Create haplotype reference GRanges --------------------------------------

haplotype_files <- list.files(
  path = here("data-raw"),
  pattern = "*.haplotypes.tsv",
  recursive = TRUE,
  full.names = TRUE
)
haplotype_files <- haplotype_files[grepl("GRCh38", haplotype_files)]
haplotypes <- rbindlist(lapply(haplotype_files, fread))

# Reference positions contain no information
haplotypes <- haplotypes[ReferenceSequence != "REFERENCE"]

chrom_info <- setDT(getChromInfoFromNCBI("GRCh38"))
haplotypes <- merge(
  haplotypes,
  chrom_info,
  by.x = "ReferenceSequence",
  by.y = "RefSeqAccn",
  all.x = TRUE,
  all.y = FALSE
)

# Create a GRanges object from the data.frame
haplotypes_gr <- makeGRangesFromDataFrame(
  haplotypes,
  keep.extra.columns = TRUE,
  seqnames.field = "UCSCStyleName",
  start.field = "Variant Start",
  end.field = "Variant Stop"
)
genome(haplotypes_gr) <- "GRCh38"

# Save to package
usethis::use_data(reference_ranges, haplotypes_gr, overwrite = TRUE, internal = TRUE)
coriell-research/pgx documentation built on June 4, 2022, 11:08 a.m.