HapSampler: QTL-marker haplotype builder

#' @useDynLib HapSampler
#' @title Haplotype Sampler
#' @aliases hapsampler
#' @description 
#'  Marker-QTL haplotypes are sampled, and their probabilities calculated via
#'  MCMC methods. 
#' @param data a data frame object generated by running \code{\link{read_data}}.
#' @param  trait the column name in the data file of the trait. This must be
#'          specified.
#' @param  nchains  the number of parallel chains to be run.
#' @param  runlength the run length of each chain
#' @param probthresh a probability theshold value.  Data on animals whose haplotype probability is less
#' than \code{probthresh} will be ignored. 
#' 
#' @details 
#' This function implements a MCMC algorithm for sampling the space of marker-QTL genotypes, 
#' conditional on the observed marker haplotypes for each animal, the trait values, and 
#' a penetrance function (i.e. the probability of the trait given the putative QTL alleles). 
#' 
#' To guard against convergence issues, parallel chains are run by setting 
#' the argument \code{nchains} to some integer value greater than 1. 
#' \code{runlength} is the run length of a chain. 
#'
#' There are several plotting functions (such as \code{\link{plotLike}}, 
#' \code{\link{plotEffects}}, and \code{\link{plotScatter}}) available for 
#' viewing the results. 
#'
#' @return 
#' A list like object of class HS is returned with elements
#' \itemize{
#' \item trait:  the trait name
#' \item nchains: the number of chains
#' \item canonical.hap: this is the haplotype that is the most common in the input data. 
#' \item haplotypes: a data.frame object containing
#' \itemize{
#'    \item Haplotype:  the haplotype index
#'    \item Count: the frequency of the haplotype in the input data file after data on those animals with a haplotype probability less than 
#'    \code{probthresh} have been removed.
#'    \item MeanProb(Q):  the probability of this haplotype carrying the Q qtl allele, averaged over the mean of the Q allele probabilities from each chain
#'     \item Prob(Q):Chain1:  the mean probability of this haplotype carrying the Q qtl allele, for chain 1. This field is repeated for each chain that is run.
#' }
#' } 
#' @examples
#'
#'     complete.name <- system.file("extdata", "dataexample.dat", package="HapSampler")
#'     # read in phenotypic data which is space separated
#'     dt <- read_data(path=dirname(complete.name),
#'                                  file=basename(complete.name))
#'     
#'     #  perform analysis for the NAVEL trait
#'     hapres <- hapsampler(data=dt, trait="NAVEL", nchains=2, runlength=5)
#'     
#'     # a plot of the log likelihood for each chain against run length
#'     plotLike(hapres)
#'
#' @export
 hapsampler <- function (data = NULL, trait = NULL, 
                         nchains=3,
                         runlength=30,
                         probthresh = 0.95) 
{
    ## performing checks
    if(!is.numeric(nchains))
       stop(" nchains must be a counting number.")
    if(nchains < 1)
       stop(" nchains must be greater than 0.")
    if(!is.numeric(runlength))
       stop(" runlength must be a  counting number.")
    if(runlength < 1)
       stop(" runlength must be greater than 0.")
    if (is.null(data)) 
        stop(" The name of the output file from read_data() needs to be specified.")
    if (!is.data.frame(data)) 
        stop("data argument must be of type data.frame. Run read_data() to input data.")
    if (is.null(trait)) 
        stop(" The name of the trait in the data file must be specified. ")
    indx <- which(names(data) == trait)
    if (length(indx) == 0) 
        stop(" The name of the trait is not a column name in the data file.")

    if (probthresh < 0 || probthresh > 1) 
        stop(" probthresh must be a deciminal value betwen 0 and 1.")
    indx <- which(names(data) == "prob")
    if (length(indx) == 0) 
        stop(" The data object  must contain a column heading called prob.")

pen.resample <- runlength ## to be consistent with John's original naming


    ## create directory structure
    if (.Platform$OS.type == "unix") {
        dir_path <- paste(getwd(), "/", sep = "")
    } else {
        dir_path <- paste(getwd(), "\\", sep = "")
    }
    datapath <- paste(dir_path, "TmpData", sep = "")
    dir.create(datapath, showWarnings = FALSE, recursive = FALSE, mode = "0777")

    datapath <- paste(dir_path, "Temp", sep = "")
    dir.create(datapath, showWarnings = FALSE, recursive = FALSE, mode = "0777")


    data <- subset(data, data$prob > 0.95)
    haps <- c(data$hap1, data$hap2)
    nhap <- max(haps)
    ct <- array(NA, nhap)
    for (i in 1:nhap) ct[i] <- sum(haps == i)
    canonical.hap <- which(ct == max(ct))

    #id <- as.character(data[[id]])
    # adding a generic animal id
    id <- paste("Animal",1:nrow(data), sep="")
    cphen <- data[[trait]]
    h1 <- with(data, hap1)
    h2 <- with(data, hap2)
    maxhap <- max(max(h1), max(h2))
    #pen.resample <- 100
    nburn <- as.integer(pen.resample/2)
    nkeep <- pen.resample - nburn
    nsamp <- as.integer(2000)
    #nchains <- 3
    n <- array(NA, 3)
    m0 <- 0
    t0 <- 0
    a <- 0
    b <- 0
    nanis <- length(h1)
    min.n <- nanis/40
    cat(" set min.n to 2 for testing only .... \n")

 

for (i in 1:nchains) {

  hapstore <- array(0,maxhap)
  phap1store <- array(0,nanis)
  phap2store <- array(0,nanis)
  chain <- NULL
  mustore <- array(0,3)
  sigstore <- array(0,3)
  kept <- 0

  # starting values
  samp.pen.mu <- array(NA,3)
  samp.pen.mu[1] <- min(cphen,na.rm=T)
  samp.pen.mu[3] <- max(cphen,na.rm=T)
  samp.pen.mu[2] <- mean(samp.pen.mu,na.rm=T)



  samp.pen.sd <- array(sqrt(var(cphen,na.rm=T)),3)/5


  j <- 1
  nfail <- 0
  while (j <= pen.resample) {
    cat(" Performing run ",j, "in chain ", i, "\n")
    
    if (j == 1) {
      hap.assign <- sample.int(2,maxhap,replace=T) # In here rather than outside
                                        # loop to guard against an invalid 
                                        # first random sample.
    }

    xxx <- .forfortran.cont(id,cphen,h1,h2,
        samp.pen.mu,samp.pen.sd,hap.assign,nsamp)

   ## Running code
   .Fortran("runf90code")

   ## Read results
   samplog <- read.table("Temp/samplog.txt",header=T)
   pi_i <- samplog$pi_i[dim(samplog)[1]]
   haplog <- read.table("Temp/haplog.txt",header=T)
   anilog <- read.table("Temp/anilog.txt",header=T)
   g <- anilog$last1 + anilog$last2 - 2

   ## Sample parameters for normal distributions
   fail <- FALSE
   vmu <- vsig <- array(NA,3)

   for (zz in 1:3) {  # For each genotype class  (AA, AB, BB)
      ss <- subset(anilog,g == zz-1)
      y <- ss$pheno[!is.na(ss$pheno)]
      n[zz] <- length(y)

      ## cat(zz, "n[zz],  = ", n[zz], "and min.n is ", min.n, " .. \n")
      if (n[zz] < min.n) { # Guard against low allele frequency spaces
        fail <- TRUE
        ## cat(" in here don't know why ... ", n[zz], min.n, "\n")
      } else {

        # Initialise
        tau <- 1
        mtau <- a+n[zz]/2

        for (qq in 1:10) {  # Keep last of 10 samples

          v <- 1/(tau*n[zz] + t0)
          m <- v*(tau*sum(y)+t0*m0)
          mu <- rnorm(1,m,sqrt(v))

          tau <- rgamma(1,mtau,b+sum((y-mu)^2)/2)
          sigma <- 1/tau
          # print(c(mu,sigma))
        }
        vmu[zz] <- mu
        vsig[zz] <- sigma
      }  ## if else
    }  ## end for each zz

    if (!fail) {
      print("in here ")
      nfail <- 0
      samp.pen.mu <- vmu
      samp.pen.sd <- vsig
      hap.assign <- haplog$last
      chain <- rbind(chain,c(j,pi_i,samp.pen.mu,samp.pen.sd))
      j <- j + 1

      if (j > nburn) {  # Accumulate
        kept <- kept + 1
        hapstore <- hapstore + haplog$ppoll
        phap1store <- phap1store + anilog$phap1
  
        phap2store <- phap2store + anilog$phap2
        mustore <- mustore + samp.pen.mu
        sigstore <- sigstore + samp.pen.sd
      }
    } else {
      nfail <- nfail + 1
    }  ## if (!fail) else
    if (nfail > 10) {
stop(" Hapsampler terminated. Bad run. Rerun.")
}
  }  ##  while (j <= pen.resample)

 #############

  ofn <- paste("Temp/anisols_",i,".txt",sep="")

  ## phap1 average haplotype probability across iterations of chain that is being kept. 
  ## phap2 average haplotype probability across iterations of chain that is being kept. 
  anisols <- data.frame(
        id = anilog$id,
        pheno = anilog$pheno,
        hap1 = anilog$hap1,
        hap2 = anilog$hap2,
        phap1 = phap1store/kept,
        phap2 = phap2store/kept)

  write.table(anisols,ofn,quote=F,row.names=F)

  #############

  ofn <- paste("Temp/mu_",i,".txt",sep="")

  musols <- data.frame(
        muAA = mustore[1]/kept,
        muAB = mustore[2]/kept,
        muBB = mustore[3]/kept,
        sdAA = sigstore[1]/kept,
        sdAB = sigstore[2]/kept,
        sdBB = sigstore[3]/kept)

  write.table(musols,ofn,quote=F,row.names=F)

  ofn <- paste("Temp/hapsols_",i,".txt",sep="")

  hapsols <- data.frame(
        haplotype = haplog$haplotype,
        P = hapstore/kept)

  write.table(hapsols,ofn,quote=F,row.names=F)

  #############

  ofn <- paste("Temp/chain_",i,".txt",sep="")

  colnames(chain) <- c("sample","loglik","AAmu","ABmu",
                                        "BBmu","AAsd","ABsd","BBsd")

  write.table(chain,ofn,quote=F,row.names=F)







} ## for (i in 1:nchains)



#------------------------------------------
# Generate summary of haps and their probs
# and return with list object
#------------------------------------------
 # Flips
  # Identify those that need to be flipped
  flip <- array(FALSE,nchains)

  for (repl in 1:nchains) {
    cf <- read.table(paste("Temp/hapsols_",repl,
                                        ".txt",sep=""),header=T)$P
    if (cf[canonical.hap]  < 0.5) flip[repl] <- T
  }


 # First need haplotype counts
  ## anisols contains average hap prob across iterations that are being kept. 
  anis = read.table("Temp/anisols_1.txt",header=T)
  haps <- c(anis$hap1,anis$hap2)

  nhap <- max(haps)

  ct <- array(NA,nhap)
  for (i in 1:nhap) {
    ct[i] <- sum(haps == i)
  }

  repl <- 1
  csum <- data.frame(
    haplotype = read.table(paste("Temp/hapsols_",repl,
                                        ".txt",sep=""),header=T)$haplotype)
chain <- NULL
  for (repl in 1:nchains) {
    cf <- read.table(paste("Temp/hapsols_",repl,
                                        ".txt",sep=""),header=T)$P
    if (flip[repl]) {cf <- 1 - cf}
    chain <- cbind(chain,cf)
  }

 csum <- cbind(csum,
    data.frame(
        count = ct,
        PA = rowMeans(chain)))


  ## form haplotype structure
  ohp <- csum
  ohp <- cbind(ohp,chain)
  ohp <- subset(ohp,ohp$count > 0)
  nhap <- dim(ohp)[1]

  # Order probabilities for plotting
  ohp <- ohp[order(ohp$PA, decreasing = TRUE),]
  names(ohp) <- c("Haploype", "Count", "MeanProb(Q)", paste("Prob(Q):Chain", 1:nchains, sep=""))
  # change classes
  for(ii in 1:ncol(ohp))
     ohp[[ii]] <- as.numeric(ohp[[ii]]) 
 
  cat("\n\n Summary of findings ... \n\n")
  if(nrow(ohp) < 20){
    print(head(ohp), row.names=FALSE)
  } else {
    print(head(ohp, n=20), row.names=FALSE)
  } 
  cat(" Full results contained in HS object created by running hapsampler() \n")
  cat(" Run completed. \n\n")

  res <- list(trait=trait,
               nchains=nchains,
               nburn=nburn,
               canonical.hap = canonical.hap,
               haplotypes=ohp)
  class(res) <- "HS"   ## hapsampler class

  return(res)

}  ## end hapsampler 






#' @title HS
#' @description tests if object is a HapSampler list structure
#' @param   x  object to be tested.
#' @return  A logical value is returned. 
#' @export
is.HS <- function(x)
   inherits(x, "HS")