R/coil_functions.R

Defines functions barcodes getBarcodeVariants extractBarcode readEstMOI

Documented in barcodes extractBarcode getBarcodeVariants readEstMOI

# Title: coil_functions.R
# Author: Stuart Lee
# Description: Helper functions for extracting barcodes
# for use in COI and estMOI L - see 
# http://www.broadinstitute.org/infect/malaria/coil/

#' Parse estMOI log files in a directory into tidy data frame
#' 
#' @param path path containing .txt files containing estMOI estimates
#' @note Assumes that outPrefix set as option in estMOI is set to sample identifer.
#' @return a data.frame
#' @export
readEstMOI <- function(path) {
    # regex to get outPrefix
    estmoi_pattern<-  "(.*\\/)(.*)(.moi.*.txt)"
    estmoi_files <- list.files(path, pattern = ".txt$", full.names = TRUE)
    
    readerMOI <- function(file) {
        out_df <- read.table(file, header = FALSE, sep = "\t",
                             comment.char = "", skip = 1, fill = TRUE,
                             na.strings = c("", NA))
        colnames(out_df) <- c("K", "count", "proportion", "estimate")
        out_df$sample.id <- gsub(estmoi_pattern, "\\2", file) 
        out_df
        
    }
    estmoi_data <- do.call(rbind,  lapply(estmoi_files, readerMOI))
    estmoi_data
    
}
 
#' Extract barcode for use in COIL program
#' 
#' @param gdsfile a \code{\link[SeqArray]{SeqVarGDSClass}} object
#' @param variant.id a numeric vector of variant identifiers
#' @param barcode.file character name of output file
#' 
#' @details Generate SNP barcode using supplied variant ids. This
#' is constructed by collapsing alleles over the variant ids, forming
#' a barcode for each sample. Then a barcode file containing
#' sample ids + barcode is generated and can be supplied to the COIL web-app
#' or to the command line COIL program. 
#' @references Galinsky, Kevin, et al. "COIL: a methodology for evaluating 
#' malarial complexity of infection using likelihood from single nucleotide 
#' polymorphism data." Malaria journal 14.1 (2015): 4. 
#' @note Need to look at how COIL constructed it's orginal barcode. Esp. het calls.
#' @return data.frame containing barcode for each sample
#' @importFrom SeqArray seqSetFilter seqGetData seqGetFilter
#' @importFrom SeqVarTools getGenotypeAlleles
#' @export
extractBarcode <- function(gdsfile, variant.id, barcode.file) {
    stopifnot(inherits(gdsfile, "SeqVarGDSClass"))
    
    old.filter <- seqGetFilter(gdsfile)
    seqSetFilter(gdsfile, variant.id = variant.id)
    
    gt <- getGenotypeAlleles(gdsfile)
    sample.id <- seqGetData(gdsfile, "sample.id")
    # find hets and recode 
    findHets <- apply(gt, 2, 
                      function(y) unlist(lapply(strsplit(y, "/"), 
                                                function(x) x[1] != x[2])))
    gt[!findHets & !is.na(findHets)] <- substr(gt[!findHets & !is.na(findHets)], 1, 1)

    gt[findHets & !is.na(findHets)] <- "N"

    gt[is.na(findHets)] <- "X"
    # paste haplotypes together
    barcode <- apply(gt, 1, paste, collapse = "")
    # add in sample.id
    stopifnot(length(sample.id) == length(barcode))
    final_barcode <- data.frame(sample.id, barcode)
    # write out barcode file
    datafile <- file(barcode.file, open = "wt")
    on.exit(close(datafile))
    # construct header and write it out
    header <- paste0("# COIL barcode generated by moimix ", Sys.time(), "\n",
                     "#ID Barcode")
    writeLines(header, con = datafile)
    write.table(final_barcode, datafile, col.names = FALSE, row.names = FALSE, quote = FALSE)
    
    # reset filter
    seqSetFilter(gdsfile, sample.id = sample.id, variant.sel = old.filter$variant.sel)
    # return the final barcode
    invisible(final_barcode)
}


#' Helper functions for extracting variant IDs corresponding to previously published
#' barcodes.
#' @param gdsfile a \code{\link[SeqArray]{SeqVarGDSClass}} object
#' @param publication a character string representing a publication
#' @importFrom SeqArray granges
#' @importFrom IRanges findOverlaps 
#' @importFrom S4Vectors subjectHits 
#' @export 
getBarcodeVariants <- function(gdsfile, publication = "Volkman2008") {
    stopifnot(inherits(gdsfile, "SeqVarGDSClass"))
    stopifnot(publication %in% c("Volkman2008", "Harrison2016"))
    # set-up GRanges object
    all_coordinates <- granges(gdsfile)
    publication_coordinates <- barcodes(publication)
    
    # subsetByOverlaps
    overlaps <- findOverlaps(all_coordinates, publication_coordinates)

    in_all <- subjectHits(overlaps)
    if (length(in_all) == 0) {
        message(paste("No overlaps found between call-set and",
                      publication, 
                      "barcode. Returning an empty GenomicRanges object"))
    }
    return(all_coordinates[in_all])
    
}

#' Generate GenomicRanges object from GDS file
#' 
#' @param publication a character string indicating malaria publication (default Volkmann2008)
#' @details Return a GenomicRanges object containing Pf barcode information from Volkmann, 2008.
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @export
barcodes <- function(publication) {
    
    if(publication == "Volkman2008"){
        volkman.data <- matrix(c("Pf_01_000130573",1,130573, "Pf_01_000539044",1,539044,
                                 "Pf_02_000842803",2,842803,"Pf_04_000282592",4,282592,
                                 "Pf_05_000931601",5,931601,"Pf_06_000145472",6,145472,
                                 "Pf_06_000937750",6,937750,"Pf_07_000277104", 7,277104,
                                 "Pf_07_000490877", 7,490877,"Pf_07_000545046",7,545046,
                                 "Pf_07_000657939", 7,657939,"Pf_07_000671839",7,671839,  
                                 "Pf_07_000683772",7,683772,"Pf_07_000792356",7,792356,
                                 "Pf_07_001415182",7,1415182,"Pf_08_000613716",8,613716,
                                 "Pf_09_000634010",9,634010, "Pf_10_000082376",10,82376,  
                                 "Pf_10_001403751",10,1403751,"Pf_11_000117114",11,117114,
                                 "Pf_11_000406215",11,406215, "Pf_13_000158614",13,158614,
                                 "Pf_13_001429265",13,1429265,"Pf_14_000755729",14,755729), 
                               nrow = 24, ncol = 3, byrow = TRUE)
        volkman.data <- data.frame(volkman.data, stringsAsFactors = FALSE)
        names(volkman.data) <- c("id", "chromosome", "position")
        volkman.data$chromosome <- paste0("Pf3D7_", 
                                          formatC(as.numeric(volkman.data$chromosome), 
                                                  width = 2, format = "d", flag = "0"),
                                          "_v3")
        volkman.data$position <- as.integer(volkman.data$position)
        makeGRangesFromDataFrame(volkman.data, 
                                 start.field = "position", 
                                 end.field = "position",
                                 keep.extra.columns = TRUE)
    } else {
        stop("No barcode corresponding to that publication.")
    }
}
sa-lee/moimix documentation built on April 23, 2020, 10:32 a.m.