R/phase.R

Defines functions phaseFilter phasePosterior phaseWrite phaseReadPair phaseReadSample phase

Documented in phase phaseFilter phasePosterior phaseReadPair phaseReadSample phaseWrite

#' @name phase 
#' @title PHASE
#' @description Run PHASE to estimate the phase of loci in diploid data.
#' 
#' @param g a \linkS4class{gtypes} object.
#' @param loci vector or data.frame of loci in 'g' that are to be phased. If a 
#'   data.frame, it should have columns named 
#'   \code{locus} (name of locus in 'g'),  
#'   \code{group} (number identifying loci in same linkage group), and
#'   \code{position} (integer identifying location of each locus in a
#'      linkage group).
#' @param positions position along chromosome of each locus.
#' @param type type of each locus.
#' @param num.iter number of PHASE MCMC iterations.
#' @param thinning number of PHASE MCMC iterations to thin by.
#' @param burnin number of PHASE MCMC iterations for burnin.
#' @param model PHASE model type.
#' @param ran.seed PHASE random number seed.
#' @param final.run.factor optional.
#' @param save.posterior logical. Save posterior sample in output list?
#' @param in.file name to use for PHASE input file.
#' @param out.file name to use for PHASE output files.
#' @param delete.files logical. Delete PHASE input and output files when done?
#' @param ph.res result from \code{phase.run}.
#' @param thresh minimum probability for a genotype to be selected (0.5 - 1).
#' @param keep.missing logical. T = keep missing data from original data set. 
#'   F = Use estimated genotypes from PHASE.
#'  
#' @note PHASE is not included with \code{strataG} and must be downloaded 
#'   separately. Additionally, it must be installed such that it can be run from 
#'   the command line in the current working directory. See the vignette 
#'   for \code{external.programs} for installation instructions. 
#' 
#' @details
#' \tabular{ll}{
#'   \code{phase} \tab runs PHASE assuming that the executable is installed 
#'     properly and available on the command line.\cr
#'   \code{phaseWrite} \tab writes a PHASE formatted file.\cr
#'   \code{phaseReadPair} \tab reads the '_pair' output file.\cr
#'   \code{phaseReadSample} \tab reads the '_sample' output file.\cr
#'   \code{phaseFilter} \tab filters the result from \code{phase.run} to 
#'     extract one genotype for each sample.\cr
#'   \code{phasePosterior} \tab create a data.frame of all genotypes for each 
#'     posterior sample.\cr
#' }
#'  
#' @return
#' \describe{
#'  \item{phase}{a list containing:
#'    \tabular{ll}{
#'      \code{locus.name} \tab new locus name, which is a combination of loci
#'         in group.\cr
#'      \code{gtype.probs} \tab a data.frame listing the estimated genotype 
#'        for every sample along with probability.\cr
#'      \code{orig.gtypes} \tab the original gtypes object for the
#'         composite loci.\cr
#'      \code{posterior} \tab a list of \code{num.iter} data.frames 
#'         representing posterior sample of genotypes for each sample.\cr
#'    }}
#'  \item{phaseWrite}{a list with the input filename and the 
#'    \linkS4class{gtypes} object used.}
#'  \item{phaseReadPair}{a data.frame of genotype probabilities.}
#'  \item{phaseReadSample}{a list of data.frames representing the 
#'    posterior sample of genotypes for one set of loci for each sample.}
#'  \item{phaseFilter}{a matrix of genotypes for each sample.}
#'  \item{phasePosterior}{a list of data.frames representing the posterior 
#'    sample of all genotypes for each sample.}
#' }
#' 
#' @references Stephens, M., and Donnelly, P. (2003). A comparison of Bayesian
#'   methods for haplotype reconstruction from population genotype data.
#'   American Journal of Human Genetics 73:1162-1169. Available at:
#'   \url{http://stephenslab.uchicago.edu/software.html#phase}
#' 
#' @author Eric Archer \email{eric.archer@@noaa.gov}
#' 
#' @examples \dontrun{
#' data(bowhead.snps)
#' data(bowhead.snp.position)
#' snps <- df2gtypes(bowhead.snps, ploidy = 2, description = "Bowhead SNPS")
#' summary(snps)
#' 
#' # Run PHASE on all data
#' phase.results <- phase(snps, bowhead.snp.position, num.iter = 100, 
#'   save.posterior = FALSE)
#' 
#' # Filter phase results
#' filtered.results <- phaseFilter(phase.results, thresh = 0.5)
#' 
#' # Convert phased genotypes to gtypes
#' ids <- rownames(filtered.results)
#' strata <- bowhead.snps$Stock[match(ids, bowhead.snps$LABID)]
#' filtered.df <- cbind(id = ids, strata = strata, filtered.results)
#' phased.snps <- df2gtypes(filtered.df, ploidy = 2, description = "Bowhead phased SNPs")
#' summary(phased.snps)
#' }
#' 
#' @export
#' 
phase <- function(g, loci, positions = NULL, type = NULL,
  num.iter = 100000, thinning = 100, burnin = 100000, model = "new", 
  ran.seed = NULL, final.run.factor = NULL, save.posterior = FALSE, 
  in.file = "phase_in", out.file = "phase_out", delete.files = TRUE) { 
  
  if(getPloidy(g) != 2) stop("'g' must be diploid")
  
  # check loci format
  if(!is.data.frame(loci)) {
    if(!(is.character(loci) & is.vector(loci))) {
      stop("'loci' must be a data.frame or character vector")
    }
    if(is.null(positions)) positions <- rep(1, getNumLoci(g))
    if(length(positions) != length(loci)) {
      stop("'positions' must be same length as 'loci'")
    }
    loci <- data.frame(locus = loci, position = positions, group = 1)
  }
  loci$group <- as.character(loci$group)
  loci$position <- as.numeric(loci$position)
  
  if(is.null(type)) type <- rep("S", length(unique(loci$group)))
  if(length(type) != length(unique(loci$group))) {
    stop("'type' must be same length as number of locus groups")
  }
  names(type) <- unique(loci$group)
  
  result <- lapply(unique(loci$group), function(grp) {
    lets <- paste(sample(c(0:9, letters), 10, replace = TRUE), collapse = "")
    in.file <- paste("phase_in_", lets, sep = "")
    out.file <- paste("phase_out_", lets, sep = "")
    
    # Write input file
    group.df <- loci[loci$group == grp, ]  
    locus.type <- rep(type[grp], nrow(group.df))
    in.file.data <- phaseWrite(
      g, 
      loci = group.df$locus, 
      positions = group.df$position, 
      type = locus.type, in.file
    )

    # Set parameters
    M.opt <- switch(model, new = "-MR", old = "-MS", hybrid = "-MQ", "")           
    S.opt <- ifelse(is.null(ran.seed), "", paste("-S", ran.seed, sep = ""))
    X.opt <- ifelse(
      is.null(final.run.factor), 
      "", 
      paste("-X", final.run.factor, sep = "")
    )
    s.opt <- ifelse(save.posterior, "-s", "")
    in.file.opt <- paste("\"", in.file, "\"", sep = "")
    out.file.opt <- paste("\"", out.file, "\"", sep = "")
    iter.params <- paste(trunc(num.iter), trunc(thinning), trunc(burnin))
    phase.cmd <- paste(
      "PHASE", M.opt, S.opt, X.opt, s.opt, 
      in.file.opt, out.file.opt, iter.params
    )
    
    # Run Phase
    err.code <- system(phase.cmd)  
    if(err.code == 127) {
      stop("You do not have PHASE installed.") 
    } else if(!err.code == 0) {
      stop(paste("Error running PHASE. Error code", err.code, "returned."))
      cat("\n")
    }
    
    # Read output
    opts <- options(warn = -1)
    gtype.probs <- phaseReadPair(paste(out.file, "_pairs", sep = ""))
    if(is.null(gtype.probs)) {
      alleles <- rep(NA, nrow(g$genotypes))
      gtype.probs <- data.frame(
        id = getIndNames(g), a1 = alleles, a2 = alleles, pr = rep(1, getNumInd(g))
      )
    }
    new.locus.name <- paste(group.df$locus, collapse = "_")
    alleles <- paste(new.locus.name, 1:2, sep = ".")
    colnames(gtype.probs)[1:3] <- c("id", alleles) 
    rownames(gtype.probs) <- NULL
    options(opts)
    
    locus.result <- list(
      locus.name = new.locus.name, gtype.probs = gtype.probs, 
      orig.gtypes = in.file.data$gtypes
    )  
    
    if(save.posterior) {
      file <- paste(out.file, "_sample", sep = "")
      l.type <- paste(locus.type, collapse = "")
      locus.result$posterior <- phaseReadSample(file, l.type)
      for(i in 1:length(locus.result$posterior)) {
        colnames(locus.result$posterior[[i]]) <- c("id", alleles)
      }
    }
    
    if(delete.files) {
      file.remove(c(dir(pattern = in.file), dir(pattern = out.file)))
    }
    
    locus.result  
  })
  
  names(result) <- lapply(result, function(x) x$locus.name)
  class(result) <- c("phase.result", class(result))
  result
}


#' @rdname phase
#' @export
#' 
phaseReadSample <- function(out.file, type) {
  if(!file.exists(out.file)) return(NULL)
  post.file <- scan(file = out.file, what = "character", 
                    sep = "\n", quiet = TRUE)
  iter.start <- grep(type, post.file) + 1
  lapply(iter.start, function(start) {
    num.samples <- as.integer(post.file[start - 3])
    end <- start + (num.samples * 3) - 3
    as.data.frame(t(sapply(seq(start, end, by = 3), function(i) {
      id <- strsplit(post.file[i], " ")[[1]][2]    
      hap1 <- gsub(" ", "", post.file[i + 1])
      hap2 <- gsub(" ", "", post.file[i + 2])
      c(id, hap1, hap2)
    })), stringsAsFactors = FALSE)
  })
}


#' @rdname phase
#' @export
#' 
phaseReadPair <- function(out.file) {   
  if(!file.exists(out.file)) return(NULL)
  pair.file <- scan(file = out.file, what = "character", 
                    sep = "\n", quiet = TRUE)
  
  id.start <- grep("IND:", pair.file)
  gtype.probs <- lapply(1:length(id.start), function(i) {
    id.end <- ifelse(i == length(id.start), 
                     length(pair.file), 
                     id.start[i + 1] - 1)
    id <- sub("IND: ", "", pair.file[id.start[i]])
    t(sapply((id.start[i] + 1):id.end, function(j) {
      line.split <- unlist(strsplit(pair.file[j], " , "))
      names(line.split) <- c("hap1", "hap2", "pr")
      c(id = id, line.split)
    }))
  })             
  gtype.probs <- as.data.frame(do.call(rbind, gtype.probs), 
                               stringsAsFactors = FALSE)  
  gtype.probs$pr <- as.numeric(as.character(gtype.probs$pr))
  gtype.probs
}


#' @rdname phase
#' @export
#' 
phaseWrite <- function(g, loci, positions = NULL, 
                       type = rep("S", length(loci)), in.file = "phase_in") {
  
  if(getPloidy(g) != 2) stop("'g' must be diploid")
  
  # Make sure locus.names and locus.positions are sorted properly
  if(is.null(positions)) positions <- rep(1, length(getNumLoci(g)))
  asc.order <- order(positions)
  loci <- loci[asc.order]
  positions <- positions[asc.order]
  
  sub.g <- g[, loci, ]
  write(c(
    getNumInd(sub.g), 
    length(loci),
    paste("P", paste(positions, collapse = " ")),
    paste(type, collapse = ""), ""
  ), file = in.file)
  
  g.mat <- as.matrix(sub.g, ids = TRUE, strata = FALSE)
  ids <- g.mat[, "id"]
  g.mat <- g.mat[, -1]
  g.mat[is.na(g.mat)] <- "?"
  for(i in 1:nrow(g.mat)) {
    write(c(
      ids[i],
      paste(g.mat[i, seq(1, ncol(g.mat) - 1, 2)], collapse = " "),
      paste(g.mat[i, seq(2, ncol(g.mat), 2)], collapse = " ")
    ), file = in.file, append = TRUE)
  }
  
  invisible(list(filename = in.file, gtypes = sub.g))
}


#' @rdname phase
#' @export
#' 
phasePosterior <- function(ph.res, keep.missing = TRUE) {    
  if(!"phase.result" %in% class(ph.res)) {
    stop("'ph.res' is not a result from 'phase.run'.")
  }
  
  num.iter <- length(ph.res[[1]]$posterior)
  lapply(1:num.iter, function(iter) {
    do.call(cbind, lapply(1:length(ph.res), function(locus) {
      ph.res <- ph.res[[locus]]
      post.df <- ph.res$posterior[[iter]]
      
      if(keep.missing) {
        for(i in 1:nrow(post.df)) {
          ids <- which(getIndNames(ph.res$orig.gtypes) == post.df[i, 1])
          if(any(is.na(as.array(ph.res$orig.gtypes, ids = ids)))) {
            post.df[i, 2:3] <- NA
          }
        }
      }
      
      cols <- if(locus == 1) {1:3} else {2:3}
      post.df[, cols]
    })) 
  })
}


#' @rdname phase
#' @export
#' 
phaseFilter <- function(ph.res, thresh = 0.5, keep.missing = TRUE) {
  if(!"phase.result" %in% class(ph.res)) {
    stop("'ph.res' is not a result from 'phase.run'.")
  }
  
  filtered <- lapply(ph.res, function(x) {
    gtype.probs <- x$gtype.probs
    pr.vec <- unique(gtype.probs[, 1])
    locus.filtered <- do.call(rbind, lapply(pr.vec, function(i) {
      this.id <- gtype.probs[gtype.probs[, 1] == i, ]
      max.index <- which.max(this.id$pr)
      if(length(max.index) == 0) return(this.id[1, ])
      kept.line <- this.id[max.index, ]
      if(as.numeric(kept.line$pr) < thresh) kept.line[, 2:3] <- c(NA, NA)
      kept.line
    })) 
    rownames(locus.filtered) <- NULL
    
    if(keep.missing) {
      for(i in 1:nrow(locus.filtered)) {
        ids <- setdiff(getIndNames(x$orig.gtypes), locus.filtered[i, 1])
        id.mat <- as.matrix(x$orig.gtypes, strata = FALSE)
        id.mat <- id.mat[id.mat[, "id"] %in% ids, , drop = FALSE]
        if(any(is.na(id.mat))) locus.filtered[i, 2:3] <- NA
      }
    }
    
    locus.filtered
  })
  
  ids <- data.frame(
    id = sort(unique(unlist(lapply(filtered, function(x) x$id))))
  )
  filtered <- as.matrix(do.call(cbind, lapply(filtered, function(x) {
    merge(ids, x, by = "id", all.x = TRUE)[, 2:3]
  })))
  rownames(filtered) <- ids$id
  colnames(filtered) <- paste(rep(names(ph.res), each = 2), ".", 1:2, sep = "")
  filtered
}
EricArcher/strataG documentation built on Feb. 12, 2023, 4:11 a.m.