#' clean, a function to check data and inputs.
#'
#' @param data See \code{\link{PersonAlytic}}.
#' @param ids See \code{\link{PersonAlytic}}.
#' @param dv See \code{\link{Palytic}}.
#' @param time See \code{\link{PersonAlytic}}.
#' @param phase See \code{\link{PersonAlytic}}.
#' @param ivs See \code{PersonAlytic}.
#' @param fixed See \code{\link{Palytic}}.
#' @param random See \code{\link{Palytic}}.
#' @param formula See \code{\link{Palytic}}.
#' @param correlation See \code{\link{PersonAlytic}}.
#' @param family See \code{\link{PersonAlytic}}.
#' @param dvs See \code{\link{PersonAlytic}}.
#' @param target_ivs See \code{PersonAlytic}.
#' @param standardize Logical. Should all variables be standardized? Only applies
#' to \code{dvs}, \code{ivs}, and \code{target_ivs}.
#' @param alignPhase See \code{\link{PersonAlytic}}.
#'
#' @author Stephen Tueller \email{stueller@@rti.org}
#'
#' @examples
#' \dontrun{
#' data <- clean(data=OvaryICT, ids='Mare', dv='follicles',
#' time='Time', phase='Phase')
#' # illustrate dropping cases with < 2 observations
#' data <- clean(data=OvaryICT[29:nrow(OvaryICT),], ids='Mare', dv='follicles',
#' time='Time', phase='Phase')
#' }
#'
#' @keywords internal
clean <- function(data ,
ids ,
dv ,
time ,
phase = NULL ,
ivs = NULL ,
fixed = NULL ,
random = NULL ,
formula = NULL ,
correlation = NULL ,
family = NO() ,
dvs = NULL ,
target_ivs = NULL ,
standardize = list(dv=FALSE, iv=FALSE, byids=FALSE) ,
sortData = TRUE ,
alignPhase = "none" ,
debugforeach = debugforeach )
{
# check that the family is a gamlss.family object
if(! "gamlss.family" %in% class(family))
{
stop("\n`family`=", family, " is not a 'gamlss.family' family.")
}
# check time
if(! "raw" %in% names(time))
{
time <- c(time, raw = time[1])
}
# check that variables are in the data set
vars <- unique( c(ids, dv, time$raw, phase, unlist(ivs), unlist(dvs),
unlist(target_ivs), all.vars(fixed), all.vars(random),
all.vars(formula)) )
vars <- vars[which(vars!='1' & vars!='0')]
#vars <- gsub(" ", "", vars) # this will be a problem if var names have spaces...
wvars <- which( ! vars %in% names(data) )
if( length(wvars) > 0 )
{
stop( paste('\n`', vars[wvars], '` is not in the data\n'))
}
# check id variable
data[[ids]] <- checkID(data[[ids]], ids)
# check phase variable
if(!is.null(phase))
{
if(any(is.na(data[[phase]]))) data[[phase]] <- fixPhaseNA(data[[phase]])
}
# check correlation structure
invisible( iscorStruct(correlation) )
# check for infinite values
isinf <- unlist( lapply(data[,vars], function(x) any(is.infinite(x))) )
if(any(isinf))
{
whichinf <- which(isinf)[1]
stop('\nThe variable `', vars[whichinf], '`` contains infinite values. Change to `NA` and try',
'\nagain.\n\n')
}
# see issue #12 on github
# check that time is monotonically increasing - this fails for
# time that is a trigonometric function of raw time
#by(data[[time$raw]], data[[ids]], function(x) all( diff(x) > 0) )
# check for missing data and perform missing data handling.
# I'm hesitant to do it here b/c it changes the data, do it on the fly
# in the fitting methods
# check variance - note that this also must be done for each ids in loops,
# but follow Curelator analyses and return per-person errors in output rather
# than stopping analyses
novar <- lapply(data[,unlist(vars[2:length(vars)])],
function(x) all(duplicated(x)[-1L]) )
novar <- unlist(novar)
if( any(novar) )
{
stop( paste('\n`', names(novar)[novar], '` has zero variance' ))
}
# Standardize the data. Note that dvs and target_ivs (as part of ivs) are
# resubmitted to clean() for each Palytic object created in htp, so we don't
# need to standardize them here
if(!is.list(standardize))
{
stop("`standardize=", standardize, "`,\n",
"but `standardize` should be a named logical list with at least one of ",
"`dvs`, `ivs`, or `byids`. For example, `standardize=list(dvs=FALSE,",
"ivs=TRUE,byids=TRUE)`.")
}
data <- pstand(data, standardize, dv, ivs, family, ids)
# sort the data
# -- this is why ids must be numeric and
# -- time must be monotonically increasing
# this doesn't work for trigonometric functions of time
if(sortData)
{
if(debugforeach)
{
print(time)
}
data <- data[order(data[[ids]], data[[time$raw]]),]
}
# if any cases have only 1 observation, "drop" them but only in n=1
if(length(unique(data[[ids]]))==1)
{
tuid <- table(data[[ids]])
toofew <- names(tuid)[ tuid<2 ]
if(length(toofew)>0)
{
warning(paste('\nThe following have less than 2 observations and will be dropped:\n\n'),
paste(paste(ids, toofew), collapse='\n'))
# don't actually drop rows, this causes mismatch when the user provides
# a subgroup variable as it will be longer than nrow(data)
for(i in seq_along(dvs))
{
data[[dvs[i]]][toofew] <- NA
}
}
}
# align the data
if(!is.null(phase))
{
if(alignPhase == 'align') data <- alignPhases(dat = data, id = ids,
phase = phase, time = time$raw)
if(alignPhase == 'piecewise')
{
data <- data.frame(data, pwtime(time = data[[time$raw]],
phase = data[[phase]])
)
}
}
return(data)
}
#' fixPhaseNA - function to carry last value forward if phase is missing
#' (this is common in example data sets Ty has worked with)
#'
#' @param phase The \code{phase} variable
#'
#' @author Stephen Tueller \email{stueller@@rti.org}
#'
#' @keywords internal
fixPhaseNA <- function(phase)
{
for(i in 2:length(phase))
{
if(is.na(phase[i])) phase[i] <- phase[i-1]
}
phase
}
#' Function to check ID variable and if possible, coerce to numeric
#'
#' @param x The ID variable values as a vector.
#' @param ids The ID variable name. See \code{\link{PersonAlytic}}.
#'
#' @author Stephen Tueller \email{stueller@@rti.org}
#'
#' @keywords internal
checkID <- function(x, ids)
{
if(!is.numeric(x))
{
if(is.character(x))
{
# length unique x
lux <- length(unique(x))
# length unique numeric x
lunx <- length(unique(as.numeric(factor(x))))
if(lux==lunx)
{
warning('`', ids, '` is character but will be forced to numeric')
return(as.numeric(x))
}
if(lux!=lunx)
{
stop('`', ids, '` is character but cannot be forced to numeric.',
' For example, id="1.0" and id="1.00" are not unique after',
' conversion to numeric.')
}
}
if(!is.character(x))
{
stop( paste('\n`', ids, '` must be numeric\n', sep='') )
}
}
if(is.numeric(x)) return(x)
}
#' Function to standardize data with options from the \code{standardize} parameter.
#'
#' @param data See \code{\link{PersonAlytic}}.
#' @param standardize See \code{\link{PersonAlytic}}.
#' @param dv See \code{\link{PersonAlytic}}.
#' @param ivs See \code{\link{PersonAlytic}}.
#' @param family See \code{\link{PersonAlytic}}.
#' @param ids See \code{\link{PersonAlytic}}.
#'
#' @author Stephen Tueller \email{stueller@@rti.org}
#'
#' @keywords internal
pstand <- function(data, standardize, dv, ivs, family, ids)
{
# determine which variables to standardize
dostand <- list()
# only standardize dv if requested and if dv is normal
if( !is.null(standardize$dv) &
(is.null(family) | as.character(family)[1]=="c(\"NO\", \"Normal\")") )
{
if(standardize$dv) dostand$dv=dv
}
# ivs
if( !is.null(standardize$iv) & length(ivs) > 0 )
{
ivs_stand <- list()
for(i in seq_along(ivs))
{
if(is.numeric(data[[ivs[[i]]]])) ivs_stand[[i]] <- ivs[[i]]
}
if(standardize$iv) dostand$iv=ivs_stand
}
# turn dostand into a character vector
dostand <- unlist(dostand)
# group standardization
dogroup <- FALSE
if(!is.null(standardize$byids))
{
if(standardize$byids) dogroup <- TRUE
}
for(i in dostand)
{
if(!dogroup) data[[i]] <- scale(data[[i]])
if( dogroup)
{
data[[i]] <- unlist( by(as.numeric( data[[i]] ), data[[ids]], scale))
}
}
return( data )
}
#' align the time variable to be zero at the transition between
#' the first and second phase
#'
#' @param data See \code{\link{PersonAlytic}}.
#' @param id See \code{\link{PersonAlytic}}.
#' @param phase See \code{\link{PersonAlytic}}.
#' @param time See \code{\link{PersonAlytic}}.
#' @param do.plot Logical. Should the resulting data be plotted?
#'
#' @author Stephen Tueller \email{stueller@@rti.org}
#'
#' @export
alignPhases <- function(dat, id, phase, time, do.plot=FALSE)
{
time.old <- dat[,time]
### only align if phase order is the same for everyone
phase.order <- aggregate(dat[,time], by=list(dat[,id], dat[,phase]), min, na.rm=TRUE)
phase.order <- aggregate(phase.order$x, by=list(phase.order$Group.1), order)
phase.is.ordered <- all(phase.order$x[1,1]==phase.order$x[,1])
u.id <- unique(dat[,id])
### order phases if they are unordered
if(!phase.is.ordered)
{
phase.Levels <- levels(as.factor(dat[,phase]))
### this automatically takes the levels alphabetically except for the
# selected phase.Level; user input will be needed in the future
phase.Levels <- c(phase.Level, phase.Levels[phase.Levels!=phase.Level])
min.times <- max.times <- vector("list", 0)
### rescale in.traphase times to start at 0
for(i in seq_along(phase.Levels))
{
temp <- dat[dat[,phase]==phase.Levels[i], c(id,time)]
min.times[[i]] <- aggregate(temp[,time], list(temp[,id]), min, na.rm=TRUE)
max.times[[i]] <- aggregate(temp[,time], list(temp[,id]), max, na.rm=TRUE)
for(j in 1:length(u.id))
{
temp.time <- temp[temp[,id]==u.id[j],time]
temp.time <- temp.time - min.times[[i]]$x[min.times[[i]]$Group.1==u.id[j]]
temp[temp[,id]==u.id[j], time] <- temp.time
rm(temp.time)
}
dat[dat[,phase]==phase.Levels[i], time] <- temp[,time]
}
### reaccumulate time for the phase order in phase.Levels
for(i in seq_along(phase.Levels)[-1])
{
### get the interphase gap times
max.pmi <- cbind(min.times[[i-1]]$Group.1, apply(cbind(min.times[[i-1]]$x,
min.times[[i]]$x),1,max))
min.pmi <- cbind(max.times[[i-1]]$Group.1, apply(cbind(max.times[[i-1]]$x,
max.times[[i]]$x),1,min))
dif.pmi <- data.frame(id=min.times[[i-1]]$Group.1, diff=max.pmi[,2] - min.pmi[,2])
### accumulate
temp_im1 <- dat[dat[,phase]==phase.Levels[i-1], c(id,time)]
max.times <- aggregate(temp_im1[,time], list(temp_im1[,id]), max, na.rm=TRUE)
temp <- dat[dat[,phase]==phase.Levels[i], c(id,time)]
min.times <- aggregate(temp[,time], list(temp[,id]), min, na.rm=TRUE)
for(j in 1:length(u.id))
{
temp.time <- temp[temp[,id]==u.id[j],]
temp.time <- temp.time + (max.times$x[max.times$Group.1==u.id[j]] +
dif.pmi$diff[dif.pmi$id==u.id[j]])
temp[temp[,id]==u.id[j], time] <- temp.time[,time]
rm(temp.time)
}
dat[dat[,phase]==phase.Levels[i], time] <- temp[,time]
}
### re-sort the data
dat <- dat[order(dat[,id], dat[,time]),]
}
### alignment to the first phase transition
if(do.plot)
{
par(mfrow=c(2,2))
hist(dat[,time])
time.old <- dat[,time]
}
for(i in u.id)
{
w <- dat[,id]==i
phase.i <- factor(dat[w,phase])
if(is.numeric(phase.i)) phase.t <- table(phase.i!=min(phase.i))
if(is.factor(phase.i)) phase.t <- table(phase.i)
time.i <- dat[w, time]
time.p <- time.i - time.i[ phase.t[1]+1 ]
#time.p2 <- -(phase.t[1]):(phase.t[2]-1)
dat[w,time] <- time.p
#cbind(time.i, time.p, time.p2, phase.i)
}
if(do.plot)
{
plot(time.old, dat[,time])
hist(dat[,time])
par(mfrow=c(1,1))
}
return(dat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.