Nothing
## fitNormDistr-methods
##
##
###############################################################################
##' @name fitNormDistr
##' @aliases fitNormDistr
##' @title fitNormDistr
##' @rdname fitNormDistr-methods
##' @docType methods
##' @description fit normal distributions to diffusion coefficient caclulated
##' by Dcoef method and saves seed state as a attribute of the result
##'
##' @usage
##' fitNormDistr(dcoef,components=NULL,log.transform=FALSE,binwidth=NULL,
##' combine.plot=FALSE,output=FALSE, proportion=NULL, means=NULL,
##' sd=NULL, constrain=FALSE)
##' @param dcoef diffusion coefficient calculated from Dcoef().
##' @param components parameter specifying the number of components to fit. If
##' NULL (default), a components analysis is done to determine the most likely
##' components and this number is then used for subsequent analysis.
##' @param log.transform logical indicate if log10 transformation is needed,
##' default F.
##' @param binwidth binwidth for the combined plot. If NULL (default), will
##' automatic assign binwidth.
##' @param combine.plot Logical indicate if all the plot should be combined
##' into one, with same scale (/same axises breaks), same color theme, and same
##' bin size for comparison.
##' @param output logical indicate if output file should be generated.
##' @param proportion numeric vector with estimates of each component's
##' proportion of the whole data.
##' @param means numeric vector with estimates of mean(mu) values for each
##' component.
##' @param sd numeric vector with estimates of standard deviation(sigma)
##' values for each component.
##' @param constrain logical indicate if mean and std deviation are set to the
##' given value. This will not work for the unimodal distribution.
##' @details
##' Components analysis uses the likelihood ratio test (LRT) to assess the
##' number of mixture components.
##' Bad Random seed generation may cause normalmixEM to crash. Using another
##' seed will solve the issue.
##'
##' Note: Ensure that a random number generator seed has been manually set!
##' The seed is stored as an attribute of the returned object of fitNormDistr()
##' and using the same seed makes results repeatable (see examples).
##'
##' @return
##' \describe{
##' \item{proportions}{The proportions of mixing components.}
##' \item{mean}{The Means of the components.}
##' \item{sd}{The Standard Deviations (SD) of components if not log
##' transformed; if log transformed, it is then interpreted as Coefficient of
##' Variation (CV).}
##' \item{loglik}{The log likelihood, useful for compare different fitting
##' result, the bigger the better fit.}
##' }
##'
##' @examples
##' # compare folders
##' folder1=system.file("extdata","SWR1",package="sojourner")
##' folder2=system.file("extdata","HTZ1",package="sojourner")
##' trackll=compareFolder(folders=c(folder1,folder2), input=3)
##' MSD=msd(trackll=trackll)
##' dcoef=Dcoef(MSD,dt=6,plot=TRUE,output=FALSE)
##'
##' # set unique seed (use any number)
##' set.seed(123)
##'
##' # fit dcoef (function automatically saves seed state as an attribute of
##' # the result)
##' a=fitNormDistr(dcoef,components=NULL,log.transform=FALSE,
##' combine.plot=FALSE, output=FALSE)
##'
##' # to repeat results of 'a', load seed attribute of a into RNG state
##' .Random.seed=attr(a,"seed")
##' # or, reset the seed with same unique number
##' # set.seed(123)
##'
##' b=fitNormDistr(dcoef,components=NULL,log.transform=FALSE,
##' combine.plot=FALSE,output=FALSE)
##'
##' # if 'a' and 'b' are the same
##' mapply(identical,a[[1]],b[[1]])
##'
##' #try with log transformation
##' set.seed(234)
##' c=fitNormDistr(dcoef,components=2,log.transform=TRUE,combine.plot=FALSE,
##' output=FALSE)
##'
##' # trying with some parameters provided(this will be applied to all dcoef
##' # results).
##' # with constrain = FALSE, this will be used as the starting values for the
##' # EM-algorithm
##' # normally we should deal with only one dataset when working with
##' # constrains, since it will apply to all of them.
##' folder3=system.file("extdata","HSF", package="sojourner")
##' trackll=compareFolder(c(folder3),input=3)
##' MSD=msd(trackll=trackll)
##' dcoef=Dcoef(MSD,dt=6,plot=TRUE,output=FALSE)
##'
##' # try with constrain=TRUE, the values will be forced to equal the provided
##' # ones.
##' set.seed(345)
##' e=fitNormDistr(dcoef,means=c(0.3,0.5), constrain=TRUE)
##' @export fitNormDistr
###############################################################################
# fit normal distribution to diffusion coefficient
#
# TO DO: combine.plot=TRUE requires additional testing
#function that deals with one-component normal distribution fitting
.singlecompFit=function(data, mean=NULL, sd=NULL, constrain=FALSE){
if (constrain == TRUE){warning(
"single component fitting cannot be constrained.")}
# if(!is.null(mean) & !is.null(sd)){stop("Please provide only one of
# the mean or sd constraints.")}
# fixlist = list(c(mean=mean, sd=sd))
# fit.info = fitdist(data,"norm", fix.arg=list(mean=mean))
#} else{fit.info = fitdist(data=data, distr="norm")}
#This dummy matrix is used to make it work out with the gg.mixEM function
#in Plotting.Helpers.R
fit.info = fitdist(data=data, distr="norm")
dummy.posterior = matrix()
colnames(dummy.posterior) = c("comp1")
#a list with components similar to that of mixEM so that it can be used in
# gg.mixEM
fit.output = list(x=as.vector(data), lambda = c(1),
mu = fit.info$estimate[1], sigma = fit.info$sd[1],
loglik = fit.info$loglik, restarts=0, ft="normalmixEM",
posterior= dummy.posterior)
return(list(fit.output))
}
#uses mclust package's mclustBootstrapLRT() function to estimate the number of
# components in distribution
.getCompNum=function(data){
cat("\ncomponents analysis\n")
components.test=mclust::mclustBootstrapLRT(data,model = "V",verbose=TRUE)
print(components.test)
components.int=length(components.test$p.value)
cat("\n\nmost likely components",components.int,
"at significant level 0.05\n\n")
return(components.int)
}
.getSummaryResult=function(result, log.transform){
out=list(
data.frame(proportion=result$lambda),
#why returning it back from log?
#data.frame(mean=if(log.transform) 10^(result$mu)
# else result$mu),
data.frame(mean=result$mu),
data.frame(sd=result$sigma),
log.lik=result$loglik
)
out=list(out)
out=t(do.call(cbind.data.frame,out))
colnames(out) = NULL
return(out)
}
.fitNormDistr=function(dcoef,components=NULL,log.transform=FALSE,binwidth=NULL,
combine.plot=FALSE,output=FALSE,proportion=NULL,
means=NULL,sd=NULL,constrain=FALSE){
cat(paste("\nIMPORTANT: Ensure a seed has been manually set!",
"See help docs for more info.\n"))
name=names(dcoef)
len=length(dcoef)
#parameters that help initial settings.
params = list(proportion, means, sd)
mixmdl.lst=list()
length(mixmdl.lst)=len
names(mixmdl.lst)=name
# store the standard error, currently only for components more than 2
## This part deals with input related errors
#collection of parameter vector lengths
param.len = lapply(params, length)
param.len=param.len[param.len!=0]
if(!is.null(components) & (any(param.len != components))){
stop(paste("Invalid parameter size: number of components",
"and parameter vector length does not match"))
}
for (i in seq_len(len)){
data=dcoef[[name[i]]][,"slope"]
# log transformation
if (log.transform == TRUE) {
data=log10(data)
data=data[!is.na(data)]
}
components.int=0
#components test
if (is.null(components)&length(param.len) == 0){
components.int = .getCompNum(data)
}else{
if(length(param.len) == 0){
components.int=components
}else if(!all(as.vector(param.len) == param.len[[1]])){
stop("Parameter vector lengths do not match")
}
if(is.null(components)){
components.int=param.len[[1]]
} else{components.int=components}
}
if (components.int == 1){ #The single component is a special case here
# since it cannot be a mixEM class object
if(constrain==TRUE){
oneComp = .singlecompFit(data=data, mean=means, sd=sd,
constrain=TRUE)
} else {oneComp=.singlecompFit(data=data)}
fit.output=oneComp[[1]]
plot.mixEM=gg.mixEM(fit.output,binwidth=binwidth,reorder=FALSE)
mixmdl=fit.output
mixmdl.lst[[i]] = fit.output
} else{
if(constrain == TRUE) {
mixmdl=normalmixEM(data,k=components.int,maxit=1e4,
epsilon=1e-10,lambda=proportion,
sd.constr=sd,
mean.constr=means)
} else {
mixmdl=normalmixEM(data,k=components.int,maxit=1e4,
epsilon=1e-10,lambda=proportion,mu=means,
sigma=sd)
}
# reorder also for plotting and for output result
# this is recordered on mean value
mixmdl=reorderEM(mixmdl)
print(summary(mixmdl))
plot.mixEM=gg.mixEM(mixmdl,binwidth=binwidth,reorder=TRUE)
# approximate standard error using parametic bootstrap
cat("\napproximating standard error by parametic bootstrap...\n\n")
# file="/dev/null" # this only for mac
mixmdl.lst[[i]]=mixmdl
}
suppressMessages(plot(plot.mixEM))
}
result.lst=list()
length(result.lst)=len
names(result.lst)=name
for (i in seq_len(len)){
result=.getSummaryResult(mixmdl.lst[[i]], log.transform)
result.lst[[i]]=result
}
print(result.lst)
if (combine.plot == TRUE){
# same scale, same binwidth, same breaks
ss=same.scale(mixmdl.lst)
# auto binwidth, smaller of the two
if (is.null(binwidth)) {
binwidth.vec=vapply(mixmdl.lst,function(mdl){
auto.binwidth(mdl$x)
},double(1))
binwidth=min(binwidth.vec)
cat("\ncombined binwidth =",binwidth,"\n")
}
if(components.int == 1){reorder=FALSE}
else{reorder=TRUE}
plot.lst=lapply(mixmdl.lst,function(x){
p=gg.mixEM(x,binwidth=binwidth,reorder=reorder)+
# this only plot polygon but not histogram when ylim is added
# +xlim(ss$scale.x)+ylim(ss$scale.y)
coord_cartesian(xlim = ss$scale.x,ylim=ss$scale.y)+
# makes integer breaks at the maxium n
scale_x_continuous(breaks=scales::pretty_breaks(n=10))
return(p)
})
# plot
do.call(gridExtra::grid.arrange,plot.lst)
# save
cmb.plot=gridExtra::marrangeGrob(plot.lst, nrow=2, ncol=1)
}
# output
if (output == TRUE){
result.df=do.call(rbind.data.frame,result.lst)
logTrans=""
if(log.transform){logTrans=".logtrans"}
fileName=paste("FitNormDistr-",
.timeStamp(name[1]),logTrans,"___.csv",sep="")
cat("\nOutput FitNormDistr.\n")
write.csv(file=fileName,result.df)
# output plot
if (combine.plot == TRUE){
fileName=paste("FitNormDistr-combinePlot-",
.timeStamp(name[1]),logTrans,"___.pdf",sep="")
cat("\nOutput FitNormDistr plot.\n")
# gridExtra::marrangeGrob non-interactive use, multipage pdf
# width=NULL,height=NULL default to graphic device size
ggsave(filename=fileName,cmb.plot,width=8,height=8)
}
}
# use invisible() so the user would not be overwhelmed by the numbers
# at the same time programmers can assign the value and use it
return(invisible(mixmdl.lst))
}
fitNormDistr=function(dcoef,components=NULL,log.transform=FALSE,binwidth=NULL,
combine.plot=FALSE,output=FALSE, proportion=NULL,
means=NULL, sd=NULL, constrain=FALSE){
# collects current seed (recommended that a unique seed is set beforehand,
# e.g. set.seed(123))
my_seed=.Random.seed
result = .fitNormDistr(dcoef=dcoef,components=components,
log.transform=log.transform,binwidth=binwidth,
combine.plot=combine.plot,output=output,
proportion=proportion,means=means,sd=sd,
constrain=constrain)
# saves seed as attribute of result
attr(result,"seed")=my_seed
return(result)
}
# Comment:
# the calculation of means for the bootstrapped sample seems to be directly
# using fitted posterior distributions, rather than fitted 100 times.
# TODO:
# constrain for the single component case should be implemented.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.