#' @useDynLib HapSampler
#' @title Haplotype Sampler
#' @aliases hapsampler
#' @description
#' Marker-QTL haplotypes are sampled, and their probabilities calculated via
#' MCMC methods.
#' @param data a data frame object generated by running \code{\link{read_data}}.
#' @param trait the column name in the data file of the trait. This must be
#' specified.
#' @param nchains the number of parallel chains to be run.
#' @param runlength the run length of each chain
#' @param probthresh a probability theshold value. Data on animals whose haplotype probability is less
#' than \code{probthresh} will be ignored.
#'
#' @details
#' This function implements a MCMC algorithm for sampling the space of marker-QTL genotypes,
#' conditional on the observed marker haplotypes for each animal, the trait values, and
#' a penetrance function (i.e. the probability of the trait given the putative QTL alleles).
#'
#' To guard against convergence issues, parallel chains are run by setting
#' the argument \code{nchains} to some integer value greater than 1.
#' \code{runlength} is the run length of a chain.
#'
#' There are several plotting functions (such as \code{\link{plotLike}},
#' \code{\link{plotEffects}}, and \code{\link{plotScatter}}) available for
#' viewing the results.
#'
#' @return
#' A list like object of class HS is returned with elements
#' \itemize{
#' \item trait: the trait name
#' \item nchains: the number of chains
#' \item canonical.hap: this is the haplotype that is the most common in the input data.
#' \item haplotypes: a data.frame object containing
#' \itemize{
#' \item Haplotype: the haplotype index
#' \item Count: the frequency of the haplotype in the input data file after data on those animals with a haplotype probability less than
#' \code{probthresh} have been removed.
#' \item MeanProb(Q): the probability of this haplotype carrying the Q qtl allele, averaged over the mean of the Q allele probabilities from each chain
#' \item Prob(Q):Chain1: the mean probability of this haplotype carrying the Q qtl allele, for chain 1. This field is repeated for each chain that is run.
#' }
#' }
#' @examples
#'
#' complete.name <- system.file("extdata", "dataexample.dat", package="HapSampler")
#' # read in phenotypic data which is space separated
#' dt <- read_data(path=dirname(complete.name),
#' file=basename(complete.name))
#'
#' # perform analysis for the NAVEL trait
#' hapres <- hapsampler(data=dt, trait="NAVEL", nchains=2, runlength=5)
#'
#' # a plot of the log likelihood for each chain against run length
#' plotLike(hapres)
#'
#' @export
hapsampler <- function (data = NULL, trait = NULL,
nchains=3,
runlength=30,
probthresh = 0.95)
{
## performing checks
if(!is.numeric(nchains))
stop(" nchains must be a counting number.")
if(nchains < 1)
stop(" nchains must be greater than 0.")
if(!is.numeric(runlength))
stop(" runlength must be a counting number.")
if(runlength < 1)
stop(" runlength must be greater than 0.")
if (is.null(data))
stop(" The name of the output file from read_data() needs to be specified.")
if (!is.data.frame(data))
stop("data argument must be of type data.frame. Run read_data() to input data.")
if (is.null(trait))
stop(" The name of the trait in the data file must be specified. ")
indx <- which(names(data) == trait)
if (length(indx) == 0)
stop(" The name of the trait is not a column name in the data file.")
if (probthresh < 0 || probthresh > 1)
stop(" probthresh must be a deciminal value betwen 0 and 1.")
indx <- which(names(data) == "prob")
if (length(indx) == 0)
stop(" The data object must contain a column heading called prob.")
pen.resample <- runlength ## to be consistent with John's original naming
## create directory structure
if (.Platform$OS.type == "unix") {
dir_path <- paste(getwd(), "/", sep = "")
} else {
dir_path <- paste(getwd(), "\\", sep = "")
}
datapath <- paste(dir_path, "TmpData", sep = "")
dir.create(datapath, showWarnings = FALSE, recursive = FALSE, mode = "0777")
datapath <- paste(dir_path, "Temp", sep = "")
dir.create(datapath, showWarnings = FALSE, recursive = FALSE, mode = "0777")
data <- subset(data, data$prob > 0.95)
haps <- c(data$hap1, data$hap2)
nhap <- max(haps)
ct <- array(NA, nhap)
for (i in 1:nhap) ct[i] <- sum(haps == i)
canonical.hap <- which(ct == max(ct))
#id <- as.character(data[[id]])
# adding a generic animal id
id <- paste("Animal",1:nrow(data), sep="")
cphen <- data[[trait]]
h1 <- with(data, hap1)
h2 <- with(data, hap2)
maxhap <- max(max(h1), max(h2))
#pen.resample <- 100
nburn <- as.integer(pen.resample/2)
nkeep <- pen.resample - nburn
nsamp <- as.integer(2000)
#nchains <- 3
n <- array(NA, 3)
m0 <- 0
t0 <- 0
a <- 0
b <- 0
nanis <- length(h1)
min.n <- nanis/40
cat(" set min.n to 2 for testing only .... \n")
for (i in 1:nchains) {
hapstore <- array(0,maxhap)
phap1store <- array(0,nanis)
phap2store <- array(0,nanis)
chain <- NULL
mustore <- array(0,3)
sigstore <- array(0,3)
kept <- 0
# starting values
samp.pen.mu <- array(NA,3)
samp.pen.mu[1] <- min(cphen,na.rm=T)
samp.pen.mu[3] <- max(cphen,na.rm=T)
samp.pen.mu[2] <- mean(samp.pen.mu,na.rm=T)
samp.pen.sd <- array(sqrt(var(cphen,na.rm=T)),3)/5
j <- 1
nfail <- 0
while (j <= pen.resample) {
cat(" Performing run ",j, "in chain ", i, "\n")
if (j == 1) {
hap.assign <- sample.int(2,maxhap,replace=T) # In here rather than outside
# loop to guard against an invalid
# first random sample.
}
xxx <- .forfortran.cont(id,cphen,h1,h2,
samp.pen.mu,samp.pen.sd,hap.assign,nsamp)
## Running code
.Fortran("runf90code")
## Read results
samplog <- read.table("Temp/samplog.txt",header=T)
pi_i <- samplog$pi_i[dim(samplog)[1]]
haplog <- read.table("Temp/haplog.txt",header=T)
anilog <- read.table("Temp/anilog.txt",header=T)
g <- anilog$last1 + anilog$last2 - 2
## Sample parameters for normal distributions
fail <- FALSE
vmu <- vsig <- array(NA,3)
for (zz in 1:3) { # For each genotype class (AA, AB, BB)
ss <- subset(anilog,g == zz-1)
y <- ss$pheno[!is.na(ss$pheno)]
n[zz] <- length(y)
## cat(zz, "n[zz], = ", n[zz], "and min.n is ", min.n, " .. \n")
if (n[zz] < min.n) { # Guard against low allele frequency spaces
fail <- TRUE
## cat(" in here don't know why ... ", n[zz], min.n, "\n")
} else {
# Initialise
tau <- 1
mtau <- a+n[zz]/2
for (qq in 1:10) { # Keep last of 10 samples
v <- 1/(tau*n[zz] + t0)
m <- v*(tau*sum(y)+t0*m0)
mu <- rnorm(1,m,sqrt(v))
tau <- rgamma(1,mtau,b+sum((y-mu)^2)/2)
sigma <- 1/tau
# print(c(mu,sigma))
}
vmu[zz] <- mu
vsig[zz] <- sigma
} ## if else
} ## end for each zz
if (!fail) {
print("in here ")
nfail <- 0
samp.pen.mu <- vmu
samp.pen.sd <- vsig
hap.assign <- haplog$last
chain <- rbind(chain,c(j,pi_i,samp.pen.mu,samp.pen.sd))
j <- j + 1
if (j > nburn) { # Accumulate
kept <- kept + 1
hapstore <- hapstore + haplog$ppoll
phap1store <- phap1store + anilog$phap1
phap2store <- phap2store + anilog$phap2
mustore <- mustore + samp.pen.mu
sigstore <- sigstore + samp.pen.sd
}
} else {
nfail <- nfail + 1
} ## if (!fail) else
if (nfail > 10) {
stop(" Hapsampler terminated. Bad run. Rerun.")
}
} ## while (j <= pen.resample)
#############
ofn <- paste("Temp/anisols_",i,".txt",sep="")
## phap1 average haplotype probability across iterations of chain that is being kept.
## phap2 average haplotype probability across iterations of chain that is being kept.
anisols <- data.frame(
id = anilog$id,
pheno = anilog$pheno,
hap1 = anilog$hap1,
hap2 = anilog$hap2,
phap1 = phap1store/kept,
phap2 = phap2store/kept)
write.table(anisols,ofn,quote=F,row.names=F)
#############
ofn <- paste("Temp/mu_",i,".txt",sep="")
musols <- data.frame(
muAA = mustore[1]/kept,
muAB = mustore[2]/kept,
muBB = mustore[3]/kept,
sdAA = sigstore[1]/kept,
sdAB = sigstore[2]/kept,
sdBB = sigstore[3]/kept)
write.table(musols,ofn,quote=F,row.names=F)
ofn <- paste("Temp/hapsols_",i,".txt",sep="")
hapsols <- data.frame(
haplotype = haplog$haplotype,
P = hapstore/kept)
write.table(hapsols,ofn,quote=F,row.names=F)
#############
ofn <- paste("Temp/chain_",i,".txt",sep="")
colnames(chain) <- c("sample","loglik","AAmu","ABmu",
"BBmu","AAsd","ABsd","BBsd")
write.table(chain,ofn,quote=F,row.names=F)
} ## for (i in 1:nchains)
#------------------------------------------
# Generate summary of haps and their probs
# and return with list object
#------------------------------------------
# Flips
# Identify those that need to be flipped
flip <- array(FALSE,nchains)
for (repl in 1:nchains) {
cf <- read.table(paste("Temp/hapsols_",repl,
".txt",sep=""),header=T)$P
if (cf[canonical.hap] < 0.5) flip[repl] <- T
}
# First need haplotype counts
## anisols contains average hap prob across iterations that are being kept.
anis = read.table("Temp/anisols_1.txt",header=T)
haps <- c(anis$hap1,anis$hap2)
nhap <- max(haps)
ct <- array(NA,nhap)
for (i in 1:nhap) {
ct[i] <- sum(haps == i)
}
repl <- 1
csum <- data.frame(
haplotype = read.table(paste("Temp/hapsols_",repl,
".txt",sep=""),header=T)$haplotype)
chain <- NULL
for (repl in 1:nchains) {
cf <- read.table(paste("Temp/hapsols_",repl,
".txt",sep=""),header=T)$P
if (flip[repl]) {cf <- 1 - cf}
chain <- cbind(chain,cf)
}
csum <- cbind(csum,
data.frame(
count = ct,
PA = rowMeans(chain)))
## form haplotype structure
ohp <- csum
ohp <- cbind(ohp,chain)
ohp <- subset(ohp,ohp$count > 0)
nhap <- dim(ohp)[1]
# Order probabilities for plotting
ohp <- ohp[order(ohp$PA, decreasing = TRUE),]
names(ohp) <- c("Haploype", "Count", "MeanProb(Q)", paste("Prob(Q):Chain", 1:nchains, sep=""))
# change classes
for(ii in 1:ncol(ohp))
ohp[[ii]] <- as.numeric(ohp[[ii]])
cat("\n\n Summary of findings ... \n\n")
if(nrow(ohp) < 20){
print(head(ohp), row.names=FALSE)
} else {
print(head(ohp, n=20), row.names=FALSE)
}
cat(" Full results contained in HS object created by running hapsampler() \n")
cat(" Run completed. \n\n")
res <- list(trait=trait,
nchains=nchains,
nburn=nburn,
canonical.hap = canonical.hap,
haplotypes=ohp)
class(res) <- "HS" ## hapsampler class
return(res)
} ## end hapsampler
#' @title HS
#' @description tests if object is a HapSampler list structure
#' @param x object to be tested.
#' @return A logical value is returned.
#' @export
is.HS <- function(x)
inherits(x, "HS")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.