inst/scripts/download_extdata.R

#!/usr/bin/env R
# By Migdal 2018
# Downloads HLA alignments files
# If alignment files contain sequences for multiple genes those will be split into separate files (eg. DRB genes)
library("dplyr")
library("RCurl")
library("XML")
library("rvest")
devtools::load_all()

out_dir <- "alignments"
hla_alignmnets_url <- "ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/alignments/"
hla_alignmnets_db <- getURL(hla_alignmnets_url,
                            verbose=TRUE,
                            ftp.use.epsv= FALSE,
                            dirlistonly = TRUE
)
hla_alignmnets_db <- strsplit(hla_alignmnets_db, "\n")[[1]] %>%
  grep("_prot.txt", ., value = TRUE) %>%
  grep("ClassI_prot.txt", ., invert = TRUE, value = TRUE)
for (aln in hla_alignmnets_db) {
    download.file(paste0(hla_alignmnets_url, aln), destfile = paste0(out_dir, aln))
}
# check if alignemnt files contains mixed records
multi_aln <- lapply(
  paste0(out_dir, hla_alignmnets_db),
  function(aln_file) {
    numbers <- rownames(readHlaAlignments(aln_file))
    numbers <- vapply(strsplit(numbers, "\\*"), `[[`, 1, FUN.VALUE = character(length = 1))
    return(length(unique(numbers)) > 1)
  }
) %>%
  unlist()
for (aln_file in paste0(out_dir, hla_alignmnets_db)[multi_aln]) {
  aln <-readHlaAlignments(aln_file)
  genes <- rownames(aln)
  ref_name <- genes[1]
  genes <- vapply(strsplit(genes, "\\*"), `[[`, 1, FUN.VALUE = character(length = 1))
  genes <- unique(genes)
  raw_aln <- readLines(aln_file)
  ref_seq_ids <- grep(ref_name, raw_aln, fixed = TRUE)
  for (i in 1:length(genes)) {
    keep <- ! grepl(paste(genes[-i], collapse = "|"), raw_aln)
    keep[ref_seq_ids] <- TRUE
    writeLines(text = raw_aln[keep],
               con = paste0(out_dir, genes[i], "_prot.txt"),
               sep = "\n"
    )
  }
  unlink(aln_file)
}
Genentech/MiDAS documentation built on Feb. 5, 2024, 11:52 a.m.