R/H_support_functions.R

Defines functions getHap buildHAPnames buildHAPsets haplo.em.fitter geno.count.pairs setupGeno locus haplo.em.control haplo.em

Documented in getHap

#' Haplotype EM Computation
#'
#' haplo.em function port from haplo.stats v 1.8.2
#' @param geno The genotype columns of the loci set being analyzed.
#' @param locus.label Character vector of unique loci being analyzed.
#' @param miss.val Character vector of genos column names.
#' @param weight Weights for observations (rows of geno matrix)
#' @param control List of control parameters. The default is constructed by the function haplo.em.control.
#' @note This function is for internal BIGDAWG use only.
haplo.em  <- function(geno, locus.label=NA, miss.val=c(0,NA), weight=NULL, control = haplo.em.control() ) {
  
  
  n.loci <- ncol(geno)/2
  n.subject <- nrow(geno)
  subj.id <- 1:n.subject
  
  if(n.loci<2) stop("Must have at least 2 loci for haplotype estimation!")
  
  # set up weight
  if(any(is.null(weight))){
    weight <- rep(1,n.subject)
  }
  
  if(any(weight<0)){
    stop("negative weights not allowed")
  }
  
  if(length(weight)!=n.subject){
    stop("Length of weight != number of subjects (nrow of geno)")
  }
  
  # Create locus label if not included
  if(all(is.na(locus.label))) locus.label<- paste("loc-",1:n.loci,sep="")
  
  if(length(locus.label)!=n.loci){
    stop("length of locus.label != n.loci")
  }
  
  
  ## recode geno to integer values, accounting for missing values
  ## replaced loci with setupGeno (JPS: 10/20/2011)
  temp.geno <- setupGeno(geno, miss.val=miss.val, locus.label=locus.label)
  
  
  # Compute the max number of pairs of haplotypes over all subjects
  max.pairs <- geno.count.pairs(temp.geno)
  max.haps <- 2*sum(max.pairs)
  
  
  ## This system-max for integer values is used in haplo.em.control
  ## for setting max.haps.limit
  #intMax <- .C("checkIntMax",
  #             intMax = as.integer(0),
  #             PACKAGE="intMax
  
  if(max.haps > control$max.haps.limit) max.haps <- control$max.haps.limit
  
  # check whether to delete some rows - now defunct, but use 
  # dummy to not break code that uses this in returned list
  rows.rem <- numeric(0) 
  
  
  geno.vec <- as.vector(temp.geno)
  geno.vec <- ifelse(is.na(geno.vec),0,geno.vec)
  
  allele.labels <- attr(temp.geno, "unique.alleles")
  
  
  if(length(allele.labels)!=n.loci)
    stop("Number of loci in alleles list != n.loci")
  
  n.alleles <- numeric(n.loci)
  a.freq <- vector("list",n.loci)
  
  for(i in 1:n.loci){
    n.alleles[i] <- length(allele.labels[[i]])
    j <- (i-1)*2 + 1
    p <- table(temp.geno[,c(j, (j+1))], exclude=NA)
    p <- p/sum(p)
    a.freq[[i]] <- list(p=p)
  }
  
  if(is.null(control$loci.insert.order)) {
    control$loci.insert.order <- 1:n.loci
  }
  
  
  # need zero-offset for loci-insert-order when
  # pass to C
  
  loci.insert.order <- (control$loci.insert.order - 1)
  
  
  if(length(loci.insert.order) != n.loci){
    stop("length of loci.insert.order != n.loci")
  }
  
  if(sum( abs(sort(loci.insert.order) - (0:(n.loci-1)))) > 0){
    stop("All loci are not accounted for in  loci.insert.order")
  }
  
  if(control$insert.batch.size > n.loci){
    control$insert.batch.size <- n.loci
  }
  
  
  if(!is.null(control$iseed)) {
    set.seed(control$iseed) } else
    {  runif(1)
      control$iseed <- .Random.seed
    }
  
  
  # The seeds for the ranAS183 random number generator used in the C function
  # hwe_sim must be between 1 and 30000, but bigger is better (we think), so we
  # add 10000
  
  seed.array <- runif(3)
  
  iseed1 = 10000 + 20000*seed.array[1]
  iseed2 = 10000 + 20000*seed.array[2]
  iseed3 = 10000 + 20000*seed.array[3]
  
  
  fit <- haplo.em.fitter(
    n.loci,
    n.subject,
    weight,
    geno.vec,
    n.alleles,
    max.haps,
    max.iter=control$max.iter,
    loci.insert.order,		      
    min.posterior=control$min.posterior,
    tol=control$tol,
    insert.batch.size=control$insert.batch.size,
    random.start=control$random.start,            
    iseed1=iseed1,          
    iseed2=iseed2,
    iseed3=iseed3,
    verbose=control$verbose)
  
  
  # if n.try > 1 try, remaining tries are random starts for posteriors
  if(control$n.try > 1){
    for(i in 2:control$n.try){
      
      seed.array <- runif(3)
      
      iseed1 = 10000 + 20000*seed.array[1]
      iseed2 = 10000 + 20000*seed.array[2]
      iseed3 = 10000 + 20000*seed.array[3]
      
      fit.new <- haplo.em.fitter(
        n.loci,
        n.subject,
        weight,
        geno.vec,
        n.alleles,
        max.haps,
        max.iter=control$max.iter,
        loci.insert.order,		      
        min.posterior=control$min.posterior,
        tol=control$tol,
        insert.batch.size=control$insert.batch.size,
        random.start=1,            
        iseed1=iseed1,          
        iseed2=iseed2,
        iseed3=iseed3,
        verbose=control$verbose)
      
      if(fit.new$tmp1$lnlike > fit$tmp1$lnlike)
      { fit <- fit.new
      }
    } 
  }
  
  tmp1 <- fit$tmp1
  tmp2 <- fit$tmp2
  
  u.hap <- matrix(tmp2$u.hap,nrow=tmp2$n.u.hap,byrow=TRUE)
  
  
  # code alleles for haplotpes with original labels
  # use I() to keep char vectors to factors in making a data.frame
  haplotype <- data.frame(I(allele.labels[[1]][u.hap[,1]]))
  for(j in 2:n.loci){
    haplotype <- cbind(haplotype, I(allele.labels[[j]][u.hap[,j]]))
  }
  
  # haplotype <- data.frame(haplotype)
  names(haplotype) <- locus.label
  
  
  # convert from 0-offset in C to 1-offset in S, and recode hap codes
  # to 1,2,..., n_uhap
  
  hap1code  <- tmp2$hap1code  + 1
  hap2code  <- tmp2$hap2code  + 1
  uhapcode  <- tmp2$u.hap.code + 1
  
  n1 <- length(uhapcode)
  n2 <- length(hap1code)
  
  tmp <- as.numeric(factor(c(uhapcode, hap1code, hap2code)))
  uhapcode <- tmp[1:n1]
  hap1code <- tmp[(n1+1):(n1+n2)]
  hap2code <- tmp[(n1+n2+1):(n1+2*n2)]
  
  uhap.df <- data.frame(uhapcode, tmp2$hap.prob, u.hap)
  
  names(uhap.df) <- c("hap.code","hap.prob",locus.label)
  
  indx.subj = tmp2$indx.subj + 1
  
  # in rare cases of very low LD, a subject gets removed by the trimming
  # of haplotypes add warning and offer options in this case
  
  if(length(unique(tmp2$indx.subj)) < n.subject) {
    unique.subj <- unique(indx.subj)
    rows.rem <- c(rows.rem, which(is.na(match(1:n.subject, unique.subj))))
    warning("Subject(s) ", paste(rows.rem,sep=','), " removed in trimming steps.\n Try decreasing min.posterior control parameter to reduce trimming.\n")
  }
  
  subj.used.id <-  subj.id[indx.subj]
  
  
  # compute lnlike if no LD. This is a rough approximation which will
  # be accurate only if all haplotypes are considered in the list
  # of enumerated haplotypes. If there is no trimming
  # (min.posterior = 0), and it is possible to enumerate all
  # possible pairs of haplotypes, then hap.prob.noLD will sum
  # to 1. But, with trimming, this may not occur, so we 
  # rescale to force them to sum to 1. This may not lead to
  # an accurate test for no LD by the likelihood ratio statistic.
  
  hap.prob.noLD <- a.freq[[1]]$p[u.hap[,1]]
  df.noLD <- length(a.freq[[1]]$p) - 1
  
  for(j in 2:n.loci){
    hap.prob.noLD <- hap.prob.noLD *  a.freq[[j]]$p[u.hap[,j]]
    df.noLD <- df.noLD + length(a.freq[[j]]$p) - 1
  }
  hap.prob.noLD <-  hap.prob.noLD/sum(hap.prob.noLD)
  
  prior.noLD <- hap.prob.noLD[hap1code]*hap.prob.noLD[hap2code]
  prior.noLD <- ifelse(hap1code!=hap2code, 2*prior.noLD, prior.noLD)
  ppheno.noLD <- tapply(prior.noLD, indx.subj, sum)
  lnlike.noLD <- sum(log(ppheno.noLD))
  
  lr <- 2*(tmp1$lnlike - lnlike.noLD)
  df.LD <- sum(tmp2$hap.prob > 0.0000001) - 1
  df.lr <-  df.LD - df.noLD
  
  
  obj <- list(
    lnlike=tmp1$lnlike,
    lnlike.noLD=lnlike.noLD,
    lr = lr,
    df.lr = df.lr,
    hap.prob = tmp2$hap.prob,
    hap.prob.noLD = hap.prob.noLD,
    converge = tmp1$converge,
    locus.label = locus.label,
    indx.subj = indx.subj,    
    subj.id = subj.used.id, 
    post = tmp2$post,
    hap1code = hap1code,
    hap2code = hap2code,
    haplotype = haplotype,
    nreps = table(indx.subj),
    rows.rem = rows.rem,
    max.pairs=max.pairs,
    control=control)
  
  
  class(obj) <- "haplo.em"
  
  return(obj)
  
}

#' Haplotype EM Computation Control Parameters
#'
#' haplo.em.control function port from haplo.stats v 1.8.2
#' @param loci.insert.order Numeric vector with specific order to insert the loci. If this value is NULL, the insert order will be in sequential order (1, 2, ..., No. Loci).
#' @param insert.batch.size Number of loci to be inserted in a single batch.
#' @param min.posterior Minimum posterior probability of a haplotype pair, conditional on observed marker genotypes. Posteriors below this minimum value will have their pair of haplotypes "trimmed" off the list of possible pairs. If all markers in low LD, we recommend using the default. If markers have at least moderate LD, can increase this value to use less memory.
#' @param tol If the change in log-likelihood value between EM steps is less than the tolerance (tol), it has converged.
#' @param max.iter Maximum number of iterations allowed for the EM algorithm before it stops and prints an error. If the error is printed, double max.iter.
#' @param random.start If random.start = 0, then the inititial starting values of the posteriors for the first EM attempt will be based on assuming equal posterior probabilities (conditional on genotypes). If random.start = 1, then the initial starting values of the first EM attempt will be based on assuming a uniform distribution for the initial posterior probabilities.
#' @param n.try Number of times to try to maximize the lnlike by the EM algorithm. The first try uses, as initial starting values for the posteriors, either equal values or uniform random variables, as determined by random.start. All subsequent tries will use random uniform values as initial starting values for the posterior probabilities.
#' @param iseed An integer or a saved copy of .Random.seed. This allows simulations to be reproduced by using the same initial seed.
#' @param max.haps.limit Maximum number of haplotypes for the input genotypes. It is used as the amount of memory to allocate in C for the progressive-insertion E-M steps. Within haplo.em, the first step is to try to allocate the sum of the result of geno.count.pairs(), if that exceeds max.haps.limit, start by allocating max.haps.limit. If that is exceeded in the progressive-insertions steps, the C function doubles the memory until it can no longer request more.
#' @param verbose Logical, if TRUE, print procedural messages to the screen. If FALSE, do not print any messages.
#' @note This function is for internal BIGDAWG use only.
haplo.em.control <- function(loci.insert.order=NULL, insert.batch.size = 6, min.posterior=1e-9, tol=0.00001, max.iter=5000, random.start=0, n.try = 10, iseed=NULL, max.haps.limit = 2e6, verbose=0) {
  
  if(min.posterior < 0 | min.posterior > .8) {
    warning("The value of min.posterior is out of range, the devault value of 1e-9 is used instead")
    min.posterior <-  1e-9
  }
  
  if(tol < 0 | tol > .5) {
    warning("The value of tol is out of range, the default value of 0.00001 is used instead")
    tol <- 0.00001
  }
  
  
  if(max.iter < 0) { 
    warning("The value of max.iter is < 0, the default value of 5000 is used instead")
    max.iter <- 5000
  }
  
  if(random.start !=0  & random.start != 1){
    warning("The value of random.start is not valid, the default value of 0 is used instead")
    random.start <- 0
  }
  
  if(n.try < 1){
    warning("The value of n.try is not valid, the default value of 10 is used instead")
    n.try <- 10
  }
  
  # max.haps.limit is used to allow user to specify max number of hap pairs.
  # -If run-time is an issue with a small data set, set to a low number
  # -If set too-low for number of haplotypes, the C functions must request more, which takes extra time
  
  # Employ the system's intMax as the upper limit allowed by integer index (int representation in R)
  # In most situations, the process is unable to allocate memory for intMax,
  # which is roughly 5-6G of total memory in haplo.em test cases
  intMax <- .C("checkIntMax",
               intMax = as.integer(0),
               PACKAGE="BIGDAWG")$intMax
  
  if(max.haps.limit < 1000 | max.haps.limit > intMax) {
    warning("The value of max.haps.limit is not valid, the default value of 2e6 is used instead")
    max.haps.limit = 2e6
  }
  
  
  return(list(loci.insert.order=loci.insert.order, 
              insert.batch.size = insert.batch.size,
              min.posterior=min.posterior, 
              tol=tol, 
              max.iter=max.iter, 
              random.start=random.start,
              n.try=n.try,
              iseed=iseed,
              max.haps.limit=max.haps.limit,
              verbose=verbose))
}

#' Creates an object of class "locus"
#'
#' locus function port from haplo.stats v 1.8.2
#' @param allele1 A vector containing the labels for 1 allele for a set of individuals, or optionally a matrix with 2 columns each containing an allele for each person.
#' @param allele2 A vector containing the labels for the second allele for a set of individuals. If allele 1 is a matrix, allele 2 need not be specified.
#' @param chrom.label A label describing the chromosome the alleles belong to
#' @param locus.alias A vector containing one or more aliases describing the locus.
#' @param x.linked A logical value denoting whether the chromosome is x linked
#' @param sex A vector containing the gender of each individual (required if x.linked=T)
#' @param male.code The code denoting a male in the sex vector
#' @param female.code The code denoting a female in the sex vector
#' @param miss.val a vector of codes denoting missing values
#' @note This function is for internal BIGDAWG use only.
locus <- function(allele1,allele2, chrom.label=NULL, locus.alias=NULL, x.linked=FALSE, sex=NULL, male.code="M", female.code="F", miss.val=NA) {
  
  # Title: Create an object of locus class
  
  if (missing(allele1))
    stop("Error: allele1 is missing")
  
  if (missing(allele2)) {
    if (!is.matrix(allele1)) {
      stop("Error: allele1 not a matrix when allele2 is missing")
    }
    else {
      if (ncol(allele1)!=2) 
        stop("Error: allele1 must be matrix with 2 columns when allele2 is missing")
      allele2 <- allele1[,2]
      allele1 <- allele1[,1]
    }
  }
  
  if (length(allele1)!=length(allele2))
    stop("Error: allele vectors not of the same length") 
  
  # convert factor to character:
  if ((any(is.factor(allele1))) | (any(is.factor(allele2)))){
    allele1 <- as.character(allele1)
    allele2 <- as.character(allele2)
  }
  
  # fix miss.val so NA is kept as a missing code
  if (!any(is.na(miss.val)))
    miss.val <- c(miss.val,NA)
  
  n <- length(allele1)
  t <- factor(c(allele1,allele2),exclude=miss.val)
  a1 <- as.numeric(t[1:n])
  a2 <- as.numeric(t[(n+1):(2*n)])
  
  geno <- cbind(a1,a2)
  attr(geno,"chrom.label") <- chrom.label
  attr(geno,"locus.alias") <- locus.alias
  attr(geno,"x.linked") <- x.linked
  
  if(exists("is.R") && is.function(is.R) && is.R()) {
    class(geno) <- "model.matrix"
  } else {
    oldClass(geno) <- "model.matrix"
  }
  
  attr(geno,"allele.labels") <- levels(t)
  if (x.linked) {
    attr(geno,"male.code") <- male.code
    attr(geno,"female.code") <- female.code
    # Stop if x-linked related errors
    x.sexcheck(geno,sex,stop=TRUE)
  }
  
  return(geno)
}

#' Locus Object Creation from Genotype Matrix
#'
#' setupGeno function port from haplo.stats v 1.8.2
#' @param geno Matrix of alleles
#' @param miss.val A vector of codes denoting missing values for allele1 and allele2
#' @param locus.label vector of labels for the loci
#' @note This function is for internal BIGDAWG use only.
setupGeno <- function(geno, miss.val = c(0,NA), locus.label=NULL) {
  
  # Title: Create an object of class model.matrix
  loci <- NULL
  unique.alleles <- list(NULL)
  if(missing(geno))
    stop("Error: Required argument geno is missing")
  if(!is.data.frame(geno) & !is.matrix(geno))
    stop("Error: Argument geno is not a data.frame or matrix")
  if(ncol(geno) %% 2)
    stop("Error: Number of columns in geno object is not an even number")
  exp.num.loc <- ncol(geno)/2
  
  if(is.null(locus.label)) locus.label <- paste("loc",1:exp.num.loc, sep="-")
  
  for(i in 1:exp.num.loc) {
    
    tmp <- locus(geno[, (i * 2 - 1)], geno[, i * 2], chrom.label = NULL,
                 locus.alias=locus.label[i], x.linked = FALSE, sex = NULL,
                 male.code = "M", female.code = "F", miss.val = miss.val)
    ualleles <- attr(tmp, "allele.labels")
    tmp <- as.matrix(tmp)
    dimnames(tmp)[[2]][1] <- paste(locus.label[i], dimnames(tmp)[[2]][1], sep = ".")
    dimnames(tmp)[[2]][2] <- paste(locus.label[i], dimnames(tmp)[[2]][2], sep = ".")
    loci <- cbind(loci, tmp)
    unique.alleles[[i]] <- ualleles
  }
  
  # assign class model.matrix so it can be put into a data.frame
  # and to hold the unique.alleles attributes
  # negative is that subsetting does not work.
  if(exists("is.R") && is.function(is.R) && is.R()) {
    class(loci) <- "model.matrix"
  }
  else {
    oldClass(loci) <- "model.matrix"
  }
  
  attr(loci, "unique.alleles") <- unique.alleles
  
  return(loci)
}

#' Counts of Total Haplotype Pairs
#'
#' geno.count.pairs function port from haplo.stats v 1.8.2
#' @param geno Matrix of alleles
#' @note This function is for internal BIGDAWG use only.
geno.count.pairs <- function(geno) {
  
  # Given a matrix of genotypes (pairs of columns for each locus),
  # compute number of possible pairs of haplotypes for each subject (the rows
  # of the geno matrix).
  
  # Assume missing values are all coded to NA, done easily by geno.recode()
  
  # define function to iteratively update number of hets
  # across loci, accounting for all possible combinations
  # of missing alleles across loci
  
  haplo.count.pairs <- function(nhet.vec,nhom.vec){
    
    n.loci <- length(nhet.vec)
    cum.het <- 0
    wt.het <- 1
    
    for(locus in 1:n.loci){
      n.het <- nhet.vec[locus]
      n.hom <- nhom.vec[locus]
      
      if(n.het>=1 & n.hom>=1){
        cum.het <- as.vector(rbind(cum.het+1, cum.het))
        wt.het  <- as.vector(rbind(wt.het*n.het, wt.het*n.hom))
      }
      
      if(n.het>=1 & n.hom==0){
        cum.het <- cum.het + 1
        wt.het <- wt.het * n.het
      }
    }
    
    cum.het <- ifelse(cum.het==0,1,cum.het) # patch in case no hets
    n.pairs <- sum(2^(cum.het-1)*wt.het)
    return(n.pairs)
  }
  
  # initial setup for counting pairs of haplotypes
  # assume missing values are all coded to NA
  
  nc <- ncol(geno)
  n.loci <- nc/2
  nr <- nrow(geno)
  n.pairs <- numeric(nr)
  odd <- seq(from=1,to=2*n.loci,by=2)
  
  # find number of alleles at each locus
  
  n.alleles <- numeric(n.loci)
  for (i in 1:n.loci) {
    a1 <- geno[,2*i-1]
    a2 <- geno[,2*i]
    temp <- c(a1,a2)
    temp <- temp[!is.na(temp)]
    n.alleles[i] <- length(unique(temp))
  }
  
  # count pairs of haplotypes for subjects without any missing alleles
  
  any.miss <- apply(is.na(geno),1,any)
  
  if(sum(!any.miss) > 0){
    indx.nomiss <- (1:nr)[!any.miss]
    geno.nomiss <- geno[indx.nomiss,,drop=FALSE]
    h1 <- geno.nomiss[,odd,drop=FALSE]
    h2 <- geno.nomiss[,(odd+1),drop=FALSE]
    n.het <- apply(h1!=h2,1,sum)
    n.het <- ifelse(n.het==0,1,n.het)
    n.pairs[indx.nomiss] <- 2^(n.het-1)
  }
  
  # count pairs for subjects with missing alleles
  
  if(sum(any.miss) > 0){
    
    indx.miss <- (1:nr)[any.miss]
    n.subj.miss <- length(indx.miss)
    geno.miss <- geno[indx.miss,,drop=FALSE]
    
    # count number of possible het, hom at each locus
    loc <- 0
    het.mat <- hom.mat <- NULL
    
    for(i in odd){
      a1 <- geno.miss[,i]
      a2 <- geno.miss[,(i+1)]
      loc <- loc+1
      n.a <- n.alleles[loc]
      n.miss <- 1*(is.na(a1)) + 1*(is.na(a2))
      n.het <- ifelse(a1!=a2,1,0)
      n.het <- ifelse(n.miss==1,(n.a-1), n.het)
      n.het <- ifelse(n.miss==2,n.a*(n.a-1)/2, n.het)
      
      n.hom <- ifelse(a1==a2,1,0)
      n.hom <- ifelse(n.miss==1,1,n.hom)
      n.hom <- ifelse(n.miss==2,n.a,n.hom)
      
      het.mat <- cbind(het.mat,n.het)
      hom.mat <- cbind(hom.mat,n.hom)
    }
    
    pairs.miss <- NULL
    for(i in 1:n.subj.miss){
      pairs.miss <- c(pairs.miss, haplo.count.pairs(het.mat[i,], hom.mat[i,]))
    }
    
    n.pairs[indx.miss] <- pairs.miss
  }
  
  return(n.pairs)
}

#' Locus Object Creation from Genotype Matrix
#'
#' setupGeno function port from haplo.stats v 1.8.2
#' @param n.loci number of loci in genotype matrix
#' @param n.subject number of subjects in the sample
#' @param weight numeric weights
#' @param geno.vec vectorized genotype matrix
#' @param n.alleles numeric vector giving number of alleles at each marker
#' @param max.haps maximum unique haplotypes in the sample
#' @param max.iter maximum iterations to perform in the fitter
#' @param loci.insert.order order to insert loci for progressive insertion
#' @param min.posterior after insertion and maximization, discard haplotype pairs per person that do not meet minimum posterior prob
#' @param tol convergence tolerance for E-M steps
#' @param insert.batch.size number of markers to insert per batch
#' @param random.start	
#' @param iseed1 random seed for algorithm
#' @param iseed2 random seed for algorithm
#' @param iseed3 random seed for algorithm
#' @param verbose logical, print long, verbose output from E-M steps?
#' @note This function is for internal BIGDAWG use only.
haplo.em.fitter  <- function(n.loci, n.subject, weight, geno.vec, n.alleles, max.haps, max.iter, loci.insert.order, min.posterior, tol, insert.batch.size, random.start, iseed1, iseed2, iseed3, verbose) {
  
  converge <- 0
  min.prior <- 0.0
  n.unique <- 0
  lnlike <- 0.0
  n.u.hap <- 0
  n.hap.pairs <- 0
  
  on.exit( 
    .C("haplo_free_memory", PACKAGE="BIGDAWG") 
  )
  
  tmp1 <- .C("haplo_em_pin",
             n.loci=as.integer(n.loci),
             n.subject=as.integer(n.subject),
             weight=as.double(weight),
             geno.vec=as.integer(geno.vec),
             n.alleles = as.integer(n.alleles),
             max.haps = as.integer(max.haps),
             max.iter=as.integer(max.iter),
             loci.insert.order=as.integer(loci.insert.order),
             min.prior=as.double(min.prior),
             min.posterior=as.double(min.posterior),
             tol=as.double(tol),
             insert.batch.size=as.integer(insert.batch.size),
             converge=as.integer(converge),
             lnlike=as.double(lnlike),
             n.u.hap=as.integer(n.u.hap),
             n.hap.pairs=as.integer(n.hap.pairs),
             random.start=as.integer(random.start),            
             iseed1=as.integer(iseed1),          
             iseed2=as.integer(iseed2),
             iseed3=as.integer(iseed3),
             verbose=as.integer(verbose),
             PACKAGE="BIGDAWG")
  
  
  tmp2 <- .C("haplo_em_ret_info",
             n.u.hap=as.integer(tmp1$n.u.hap),
             n.loci=as.integer(tmp1$n.loci),
             n.pairs=as.integer(tmp1$n.hap.pairs),
             hap.prob=as.double(numeric(tmp1$n.u.hap)),
             u.hap=as.integer(numeric(tmp1$n.u.hap*tmp1$n.loci)),
             u.hap.code=as.integer(numeric(tmp1$n.u.hap)),
             indx.subj=as.integer(numeric(tmp1$n.hap.pairs)),
             post=as.double(numeric(tmp1$n.hap.pairs)),
             hap1code=as.integer(numeric(tmp1$n.hap.pairs)),
             hap2code=as.integer(numeric(tmp1$n.hap.pairs)),
             PACKAGE="BIGDAWG")
  
  obj <- list(tmp1=tmp1, tmp2=tmp2)
  
  return(obj)
  
}

#' Haplotype List Builder
#'
#' Builds table of haplotypes from combinations
#' @param Combn Combination of loci to extraction from genos
#' @param genos The genotype columns of the loci set being analyzed.
#' @param loci Character vector of unique loci being analyzed.
#' @param loci.ColNames Character vector of genos column names.
#' @note This function is for internal BIGDAWG use only.
buildHAPsets <- function(Combn,genos,loci,loci.ColNames) {
  
  # Range in matrix
  Set.H <- loci.ColNames %in% loci[Combn]
  return(genos[,Set.H])
  
}

#' Haplotype Name Builder
#'
#' Builds table of names for HAPsets
#' @param Combn Combination of loci to extraction from genos
#' @param loci Character vector of unique loci being analyzed.
#' @note This function is for internal BIGDAWG use only.
buildHAPnames <- function(Combn,loci) {
  
  return(paste(loci[Combn],collapse="~"))

}

#' Haplotype Table Maker
#'
#' Builds table of haplotypes
#' @param HaploEM Haplotype output object from haplo.stat::haplo.em function.
#' @param SID Index number (i.e., row number) of sample ID from genotype matrix.
#' @note This function is for internal BIGDAWG use only.
getHap <- function(SID,HaploEM) {
  
  # SID subject number
  # haplotype object from Haplo.em
  
  # Range in matrix
  Range <- which(HaploEM$indx.subj==SID)
  HapGet <- which.max(HaploEM$post[Range])
  
  # Which haplotype (when more than one possibility)
  Hap1.no <- HaploEM$hap1code[Range][HapGet]
  Hap2.no <- HaploEM$hap2code[Range][HapGet]
  
  # Combine into a haplotype string
  Hap1 <- paste(HaploEM$haplotype[Hap1.no,],collapse="~")
  Hap2 <- paste(HaploEM$haplotype[Hap2.no,],collapse="~")
  
  # Output haplotype
  return(c(Hap1,Hap2))
  
}
pappasd/BIGDAWG documentation built on Aug. 19, 2020, 7:21 p.m.