DOQTL: Genotyping and QTL Mapping in DO Mice

Documented in do2sanger do2sanger.helper

################################################################################ 
# Read in the genotypes of DO mice, assign the most probable genotype based on 
# the posterior probabilities and map the Sanger SNPs onto the DO samples.
# These functions split the DO genotypes at the midpoint between MUGA markers.
# Daniel Gatti
# Dan.Gatti@jax.org
# Sept. 7, 2013
################################################################################
# Arguments: do.files: character vector of *.genotype.probs.Rdata files that 
#                      contain the posterior probabilities.
#            snps: data.frame containing 4 columns containing the SNP IDs,
#                  the chromosome, the Mb location and the cM location.
#            snp.file: character containing the full path to the Sanger SNP VCF file.
#                      Defaults to the SNP file at ftp.jax.org.
#            return.val: Either "numeric" to return the SNPs coded as 0, 1 or 2 or
#                        "character" to return the SNPs as character allele calls.
do2sanger = function(do.files, snps, output.file = "do2sanger.txt", 
            snp.file = "ftp://ftp.jax.org/SNPtools/variants/mgp.v4.snps.dbSNP.vcf.gz",
            return.val = c("numeric", "character")) {
  if(missing(do.files)) {
    stop("do.files cannot be missing. Please enter a set of DO sample files.")
  } # if(missing(do.files))
  if(missing(snps)) {
    stop(paste("snps cannot be missing because we need the locations of the",
         "SNPs. Please provide SNP locations that match the SNPS in the",
         "do.files."))
  } # if(missing(do.files))
  # Read in the first file to get the genotype and SNP dimensions.
  load(do.files[1])
  states = colnames(prsmth)
  snps = snps[snps[,1] %in% rownames(prsmth),]
  prsmth = prsmth[rownames(prsmth)[[3]] %in% snps[,1],]
  gt = matrix("", length(do.files), nrow(prsmth), dimnames = list(
       sub("\\.genotype\\.probs\\.Rdata$", "", do.files), rownames(prsmth)))
  # Read in the DO files and call the maximum genotypes.
  print("Calling DO genotypes...")
  for(i in 1:length(do.files)) {
    load(do.files[i])
    gt[i,] = states[apply(prsmth, 1, which.max)]
  } # for(i)
  # Split up the MUGA markers by chromosomes.
  markers = snps[snps[,1] %in% rownames(prsmth),]
  markers = split(x = markers, f = markers[,2])
#  sanger = get.variants(file = snp.file, chr = 1, start = 0, end = 4,
#           strains = c("129S1/SvImJ", "A/J", "C57BL/6J", "CAST/EiJ", 
#           "NOD/ShiLtJ", "NZO/HlLtJ", "PWK/PhJ", "WSB/EiJ"))
  sanger = read.vcf(vcf.file = snp.file, chr = 1, start = 0, end = 4,
           strains = c("A_J", "129S1_SvImJ", "CAST_EiJ", 
           "NOD_ShiLtJ", "NZO_HlLtJ", "PWK_PhJ", "WSB_EiJ"))
  keep = setdiff(1:ncol(sanger), grep("qual", colnames(sanger)))
  for(c in 1:length(markers)) {
    curr.chr = names(markers)[c]
    print(paste("CHR", curr.chr))
    num.snps = 0
    # Start writing SNPs at the beginning of the chromosome 1.
    print("Mapping Sanger SNPs to DO samples...")
    con = file(description = paste("chr", curr.chr, ".", output.file, sep = ""),
          open = "w")
    writeLines(paste("ID", "CHR", "GRC38.bp", "Ref", "Alt", 
               paste(rownames(gt), collapse = "\t"), sep = "\t"), con)
    gc()
    # Start of Chr.
    result = do2sanger.helper(snp.file = snp.file, chr = curr.chr, start = 0, 
             end = mean(markers[[curr.chr]][1:2,3]), geno = gt[,markers[[curr.chr]][1,1]],
             keep = keep, return.val = return.val)
    if(!is.null(result)) {
      result = apply(result, 1, paste, collapse = "\t")
      writeLines(result, con)
      num.snps = num.snps + nrow(result)
    } # if(!is.null(result))
    # Middle SNPs.
    for(i in 2:(nrow(markers[[curr.chr]]) - 1)) {
      if(i %% 100 == 0) {
        print(i)
      } # if(i %% 10 == 0)
      if(all(!is.na(markers[[curr.chr]][(i-1):(i+1),3]))) {
        result = do2sanger.helper(snp.file = snp.file, chr = curr.chr,
                 start = mean(markers[[curr.chr]][(i-1):i,3]), 
                 end = mean(markers[[curr.chr]][i:(i+1),3]), 
                 geno = gt[,markers[[curr.chr]][i,1]], keep = keep,
                 return.val = return.val) 
        if(!is.null(result)) {
          result = apply(result, 1, paste, collapse = "\t")
          writeLines(result, con)      
          num.snps = num.snps + nrow(result)
        } # if(!is.null(result))
      } # if(all(!is.na(markers[[curr.chr]][(i-1):(i+1),3])))
    gc()
    } # for(i)
    # End of Chr.
    if(all(!is.na(markers[[curr.chr]][(i-1):(i+1),3]))) {
      result = do2sanger.helper(snp.file = snp.file, chr = curr.chr,
               start = mean(markers[[curr.chr]][(nrow(markers[[curr.chr]])-1):nrow(markers[[curr.chr]]),3]), 
               end = 200, geno = gt[,markers[[curr.chr]][nrow(markers[[curr.chr]]),1]], keep = keep,
               return.val = return.val)
      if(!is.null(result)) {
        result = apply(result, 1, paste, collapse = "\t")
        writeLines(result, con)
        num.snps = num.snps + nrow(result)
      } # if(!is.null(result))
    } # if(all(!is.na(markers[[curr.chr]][(i-1):(i+1),3])))
    close(con)
    gc()
    print(paste("CHR", curr.chr, ":", num.snps))
  } # for(c)
} # do2sanger()
##########
# Helper function to get Sanger SNPs and place them in the DO samples.
do2sanger.helper = function(snp.file, chr, start, end, geno, keep, return.val) {
#  sanger = get.variants(file = snp.file, chr = chr, start = start, end = end,
#           strains = c("129S1/SvImJ", "A/J", "C57BL/6J", "CAST/EiJ", 
#           "NOD/ShiLtJ", "NZO/HlLtJ", "PWK/PhJ", "WSB/EiJ"), quality = 1)
  sanger = read.vcf(vcf.file = snp.file, chr = chr, start = start, 
           end = end, strains = c("A_J", "129S1_SvImJ", "NOD_ShiLtJ",
           "NZO_HlLtJ", "CAST_EiJ", "PWK_PhJ", "WSB_EiJ"))
  qual = grep("qual", colnames(sanger))
  qual = matrix(as.numeric(unlist(sanger[,qual])), nrow = nrow(sanger))
  remove = which(rowMeans(is.na(qual)) > 0.01)
  qual   = qual[-remove,]
  sanger = sanger[-remove,]
  sanger = sanger[rowMeans(qual) == 1,]
  if(length(sanger) > 0 && nrow(sanger) > 0) {
    sanger = sanger[,keep]
    # Add in C57BL/6J.
    colnames(sanger) = sub("^X", "", colnames(sanger))
    sanger = data.frame(sanger[,1:5], "A_J" = sanger[,"A_J"], 
             "C57BL_6J" = paste0(sanger$REF, sanger$REF),
             "129S1_SvImJ" = sanger[,"129S1_SvImJ"], 
             "NOD_ShiLtJ" = sanger[,"NOD_ShiLtJ"],
             "NZO_HlLtJ" = sanger[,"NZO_HlLtJ"],
             "CAST_EiJ" = sanger[,"CAST_EiJ"],
             "PWK_PhJ" = sanger[,"PWK_PhJ"],
             "WSB_EiJ" = sanger[,"WSB_EiJ"])
    colnames(sanger) = sub("^X", "", colnames(sanger))
    geno.cols = (ncol(sanger) - 7):ncol(sanger)
    map = do.colors[,1:2]
    map[,2] = sub("/", "_", map[,2])
    map = map[match(colnames(sanger)[geno.cols], map[,2]),]
    colnames(sanger)[geno.cols] = map[,1]
  
    # Here, we assume that the founders are homozygous. This may not be the case 
    # and this is an area for improvement. 
    sanger[,geno.cols] = apply(sanger[,geno.cols], 2, substring, 1, 1)
    hdr = sanger[,1:5]
    sanger = as.matrix(sanger[,-1:-5])
    # Turn the sanger SNPs into 0 for reference allele and 1 for alternate allele.
    sanger = matrix(as.numeric(sanger != hdr[,4]), nrow(sanger), ncol(sanger),
             dimnames = dimnames(sanger))
    # Create a 2 x n matrix with founder haplotypes for each DO founder.
    samples = names(geno)
    geno = matrix(unlist(strsplit(geno, split = "")), nrow = 2, dimnames = 
           list(NULL, samples))
    if(return.val == "numeric") {
      # Convert the founder haplotypes to numeric genotypes.
##### NOTE: We are assuming bi-allelic SNPs here! This may not be the case
#####       for all Sanger SNPs. 
      geno = matrix(sanger[,geno[1,]] + sanger[,geno[2,]],
             nrow(sanger), ncol(geno), dimnames = list(NULL, samples))
      # Code the major allele as 0 .
      num.zero = rowMeans(geno == 0, na.rm = TRUE)
      num.two  = rowMeans(geno == 2, na.rm = TRUE)
      swap.rows = which(num.two > num.zero)
      geno[swap.rows,] = 2 - geno[swap.rows,]
    } # if(return.val == "numeric")
    return(cbind(hdr, geno)) 
  } else {
    return(NULL)
  } # else
} # do2sanger.helper()