# Title: coil_functions.R
# Author: Stuart Lee
# Description: Helper functions for extracting barcodes
# for use in COI and estMOI L - see
# http://www.broadinstitute.org/infect/malaria/coil/
#' Parse estMOI log files in a directory into tidy data frame
#'
#' @param path path containing .txt files containing estMOI estimates
#' @note Assumes that outPrefix set as option in estMOI is set to sample identifer.
#' @return a data.frame
#' @export
readEstMOI <- function(path) {
# regex to get outPrefix
estmoi_pattern<- "(.*\\/)(.*)(.moi.*.txt)"
estmoi_files <- list.files(path, pattern = ".txt$", full.names = TRUE)
readerMOI <- function(file) {
out_df <- read.table(file, header = FALSE, sep = "\t",
comment.char = "", skip = 1, fill = TRUE,
na.strings = c("", NA))
colnames(out_df) <- c("K", "count", "proportion", "estimate")
out_df$sample.id <- gsub(estmoi_pattern, "\\2", file)
out_df
}
estmoi_data <- do.call(rbind, lapply(estmoi_files, readerMOI))
estmoi_data
}
#' Extract barcode for use in COIL program
#'
#' @param gdsfile a \code{\link[SeqArray]{SeqVarGDSClass}} object
#' @param variant.id a numeric vector of variant identifiers
#' @param barcode.file character name of output file
#'
#' @details Generate SNP barcode using supplied variant ids. This
#' is constructed by collapsing alleles over the variant ids, forming
#' a barcode for each sample. Then a barcode file containing
#' sample ids + barcode is generated and can be supplied to the COIL web-app
#' or to the command line COIL program.
#' @references Galinsky, Kevin, et al. "COIL: a methodology for evaluating
#' malarial complexity of infection using likelihood from single nucleotide
#' polymorphism data." Malaria journal 14.1 (2015): 4.
#' @note Need to look at how COIL constructed it's orginal barcode. Esp. het calls.
#' @return data.frame containing barcode for each sample
#' @importFrom SeqArray seqSetFilter seqGetData seqGetFilter
#' @importFrom SeqVarTools getGenotypeAlleles
#' @export
extractBarcode <- function(gdsfile, variant.id, barcode.file) {
stopifnot(inherits(gdsfile, "SeqVarGDSClass"))
old.filter <- seqGetFilter(gdsfile)
seqSetFilter(gdsfile, variant.id = variant.id)
gt <- getGenotypeAlleles(gdsfile)
sample.id <- seqGetData(gdsfile, "sample.id")
# find hets and recode
findHets <- apply(gt, 2,
function(y) unlist(lapply(strsplit(y, "/"),
function(x) x[1] != x[2])))
gt[!findHets & !is.na(findHets)] <- substr(gt[!findHets & !is.na(findHets)], 1, 1)
gt[findHets & !is.na(findHets)] <- "N"
gt[is.na(findHets)] <- "X"
# paste haplotypes together
barcode <- apply(gt, 1, paste, collapse = "")
# add in sample.id
stopifnot(length(sample.id) == length(barcode))
final_barcode <- data.frame(sample.id, barcode)
# write out barcode file
datafile <- file(barcode.file, open = "wt")
on.exit(close(datafile))
# construct header and write it out
header <- paste0("# COIL barcode generated by moimix ", Sys.time(), "\n",
"#ID Barcode")
writeLines(header, con = datafile)
write.table(final_barcode, datafile, col.names = FALSE, row.names = FALSE, quote = FALSE)
# reset filter
seqSetFilter(gdsfile, sample.id = sample.id, variant.sel = old.filter$variant.sel)
# return the final barcode
invisible(final_barcode)
}
#' Helper functions for extracting variant IDs corresponding to previously published
#' barcodes.
#' @param gdsfile a \code{\link[SeqArray]{SeqVarGDSClass}} object
#' @param publication a character string representing a publication
#' @importFrom SeqArray granges
#' @importFrom IRanges findOverlaps
#' @importFrom S4Vectors subjectHits
#' @export
getBarcodeVariants <- function(gdsfile, publication = "Volkman2008") {
stopifnot(inherits(gdsfile, "SeqVarGDSClass"))
stopifnot(publication %in% c("Volkman2008", "Harrison2016"))
# set-up GRanges object
all_coordinates <- granges(gdsfile)
publication_coordinates <- barcodes(publication)
# subsetByOverlaps
overlaps <- findOverlaps(all_coordinates, publication_coordinates)
in_all <- subjectHits(overlaps)
if (length(in_all) == 0) {
message(paste("No overlaps found between call-set and",
publication,
"barcode. Returning an empty GenomicRanges object"))
}
return(all_coordinates[in_all])
}
#' Generate GenomicRanges object from GDS file
#'
#' @param publication a character string indicating malaria publication (default Volkmann2008)
#' @details Return a GenomicRanges object containing Pf barcode information from Volkmann, 2008.
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @export
barcodes <- function(publication) {
if(publication == "Volkman2008"){
volkman.data <- matrix(c("Pf_01_000130573",1,130573, "Pf_01_000539044",1,539044,
"Pf_02_000842803",2,842803,"Pf_04_000282592",4,282592,
"Pf_05_000931601",5,931601,"Pf_06_000145472",6,145472,
"Pf_06_000937750",6,937750,"Pf_07_000277104", 7,277104,
"Pf_07_000490877", 7,490877,"Pf_07_000545046",7,545046,
"Pf_07_000657939", 7,657939,"Pf_07_000671839",7,671839,
"Pf_07_000683772",7,683772,"Pf_07_000792356",7,792356,
"Pf_07_001415182",7,1415182,"Pf_08_000613716",8,613716,
"Pf_09_000634010",9,634010, "Pf_10_000082376",10,82376,
"Pf_10_001403751",10,1403751,"Pf_11_000117114",11,117114,
"Pf_11_000406215",11,406215, "Pf_13_000158614",13,158614,
"Pf_13_001429265",13,1429265,"Pf_14_000755729",14,755729),
nrow = 24, ncol = 3, byrow = TRUE)
volkman.data <- data.frame(volkman.data, stringsAsFactors = FALSE)
names(volkman.data) <- c("id", "chromosome", "position")
volkman.data$chromosome <- paste0("Pf3D7_",
formatC(as.numeric(volkman.data$chromosome),
width = 2, format = "d", flag = "0"),
"_v3")
volkman.data$position <- as.integer(volkman.data$position)
makeGRangesFromDataFrame(volkman.data,
start.field = "position",
end.field = "position",
keep.extra.columns = TRUE)
} else {
stop("No barcode corresponding to that publication.")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.