Nothing
#' ctmaShapeRawData
#'
#' @description Raw data objects are re-shaped (dealing with missing time points, wrong time intervals etc)
#'
#' @param dataFrame an R object containing data
#' @param id the identifier of subjects if data are in long format
#' @param inputDataFrameFormat "wide" or "long"
#' @param inputTimeFormat "time" (default) or "delta"
#' @param missingValues Missing value indicator, e.g., -999 or NA (default)
#' @param n.manifest Number of process variables (e.g, 2 in a bivariate model)
#' @param manifest.per.latent n.manifest per latent factor. Frequently 1 manifest per latent, but e.g. c(2,3,1) also possible for 6 manifest loading on 3 latents
#' @param Tpoints Number of time points in the data frame
#' @param allInputVariablesNames vector of all process variable names, time dependent predictor names, time independent predictor names, and names of times/deltas. Only required if the dataFrame does not have column names.
#' @param orderInputVariablesNames = "names" vs "time" (e.g., names: X1, X2, X3, Y1, Y2, X3 vs time: X1, Y1, X2, Y2, ... ). For ctsem/CoTiMA, the output file will order by time.
#' @param targetInputVariablesNames = the process variables in the dataFrame that should be used (in "names" or in "times" order; e.g., c("X1", "X3", "Y1", "X3") ). This is used to delete variables from the data frame that are not required.
#' @param targetInputTDpredNames The actual time dependent (TD) predictor variable names, e.g, 3, or 6, or 9, ... names if Tpoints = 3. Internally, each of the 3, 6, etc represents one TDpred. One typically does NOT have TD predictors in a CoTiMA.
#' @param targetInputTIpredNames time independet (TI) predictor names names in the dataFrame. One typically does NOT have TI predictors in CoTiMA except it uses raw data only, where TIpreds are avalaible for individual cases.
#' @param targetTimeVariablesNames The time variables names in the dataFrame. They also define which Tpoints will be included in the output file , e.g., c("Time4", "Time9").
#' @param outputDataFrameFormat "long" (default) or "wide"
#' @param outputVariablesNames "Y" (default; creates Y1_T0, Y2_T0, Y1_T1, Y2_T1, etc.), but can also be, e.g., c("X", "Y"; creates X_T0, Y_T0, X_T1, Y_T1, etc.).
#' @param outputTDpredNames Will become "TD" if not specified
#' @param outputTIpredNames Will become "TI" if not specified
#' @param outputTimeVariablesNames "time" (default)
#' @param outputTimeFormat "time" (default) or "delta"
#' @param scaleTime A scalar that is used to multiply the time variable. Typical use is rescaling primary study time to the time scale use in other primary studies. For example, scaleTime=1/(60 x 60 x 24 x 365.25) rescales time provided in seconds (frequent case when imported from SPSS) into years (60sec x 60min x 24hrs x 365.25days incl. leap years).
#' @param standardization the way to standardize possible raw data ("none", "withinTimeA", "withinTimeB", "withinColumn", "withinPerson", or "overall"). Only applies if the list for specifying raw data information contains the list element 'standardize=TRUE'. 'WithinTimeA' standardizes within time points and deletes cases with missing T0 data. 'WithinTimeB' does not delete cases, and in subsequent ctsem or CoTiMA applications the user is adviced to use the argument 'sameInitialTimes=TRUE'.
#' @param minInterval A parameter (default = 0.0001) supplied to ctIntervalise. Set to smaller values than any possible observed measurement interval, but larger than 0.0001. The value is used for indicating unavailable time interval information (caused by missing values) because NA is technically not possible for time intervals.
#' @param minTolDelta Set, e.g. to 1/24, to delete variables from time points that are too close (e.g., 1hr; or even before) after another time point. Could be useful to delete values generated by unreliable responding, e.g., in diary studies. Note that minTolDelta applies to the time intervals AFTER the scaleTime argument has applied (i.e., scaleTime may need adaptation for each primary study, but minTolDelta does not).
#' @param maxTolDelta Set, e.g., to 7, to delete variables from time points that are too far after another time point (e.g., 7 days, if all participants should have responed within a week). Note that maxTolDelta applies to the time intervals AFTER the scaleTime argument has applied (i.e., scaleTime may need adaptation for each primary study, but minTolDelta does not).
#' @param negTolDelta FALSE (default) or TRUE. Delete entire cases that have at least one negative delta ('unreliable responding'; use minTolDelta to delete certain variables only)
#' @param min.val.n.Vars min.val.n.Vars = Minimum no. of valid variables. Default = 1 (retaines cases with only 1 valid variable), 0 would retain cases will all variables missing (not very useful). Retaining participants who provide a single valid variable is technically possible, but these participants contribute to the estimation of the variance/mean of this variable only. Since variance/mean are 1/0 in most CoTiMA applications, this is not very informative but at the cost of additional computational burden. Setting min.val.n.Vars = 2 is recommended.
#' @param min.val.Tpoints Minimum no. of valid Tpoints (i.e. Tpoints where min.val.n.Vars is met). Default = 1 retains participants with full set of valid variables at least at one single Tpoint (which will become T0). Setting min.val.Tpoints = 2 or higher values retains participants which provide longitudinal information. Since T0 covariances are usually not too interesting, min.val.Tpoints = 2 may be more reasonable then the default = 1.
#'
#' @examples
#' \dontrun{
#' tmpData <- data.frame(matrix(c(1, 2, 1, 2, 1, 2, 11, 26, 1,
#' NA, NA, 3, NA, 3, NA, 12, 27, 1,
#' 1, 2, 1, 2, 1, 2, NA, 24, 0 ),
#' nrow=3, byrow=TRUE))
#' colnames(tmpData) <- c("first_T0", "second_T0", "first_T1", "second_T1",
#' "TD1_0", "TD1_1",
#' "time1", "time2", "sex")
#' shapedData <- ctmaShapeRawData(dataFrame=tmpData,
#' inputDataFrameFormat="wide",
#' inputTimeFormat="time",
#' n.manifest=2,
#' Tpoints=2,
#' orderInputVariablesNames="time",
#' targetInputVariablesNames=c("first_T0", "second_T0",
#' "first_T1", "second_T1"),
#' targetInputTDpredNames=c("TD1_0", "TD1_1"),
#' targetInputTIpredNames="sex",
#' targetTimeVariablesNames=c("time1", "time2"),
#' scaleTime=1/12,
#' maxTolDelta=1.2)
#' head(shapedData)
#' }
#'
#' @importFrom ctsem ctWideToLong ctDeintervalise
#' @importFrom utils head
#' @importFrom stats diffinv
#'
#' @export ctmaShapeRawData
#'
#' @return A reshaped raw data file
#'
ctmaShapeRawData <- function(
dataFrame=NULL,
id=NULL,
inputDataFrameFormat=NULL,
inputTimeFormat="time",
missingValues=NA,
n.manifest=NULL,
manifest.per.latent=NULL,
Tpoints=NULL,
allInputVariablesNames=NULL,
orderInputVariablesNames=NULL,
targetInputVariablesNames=NULL,
targetInputTDpredNames=NULL,
targetInputTIpredNames=NULL,
targetTimeVariablesNames=NULL,
outputDataFrameFormat="long",
outputVariablesNames="Y",
outputTDpredNames=NULL,
outputTIpredNames=NULL,
outputTimeVariablesNames="time",
outputTimeFormat="time",
scaleTime=1,
minInterval=0.0001,
minTolDelta=NULL,
maxTolDelta=NULL,
negTolDelta=FALSE,
min.val.n.Vars=1,
min.val.Tpoints=1,
standardization='none'
) {
# some checks
{
# standardization
standardization <- tolower(standardization)
if (!( standardization %in% c("none", "withintimea", "withintimeb", "withincolumn", "withinperson", "overall"))) {
ErrorMsg <- "\nThe standardization argument hast to be one out of c(\"none\", \"withintimeA\", \"withintimeB\", \"withincolumn\", \"withinperson\", \"overall\"))). \nGood luck for the next try!"
stop(ErrorMsg)
}
if (!(outputTimeVariablesNames %in% c("time", "dT"))) {
ErrorMsg <- "\nThe argument \"outputTimeVariablesNames\" is currently limited to either \"time\" or \"dT\"! \nGood luck for the next try!"
stop(ErrorMsg)
}
if (!(inputDataFrameFormat %in% c("wide", "long"))) {
ErrorMsg <- "\nThe argument \"inputDataFrameFormat\" should be either \"wide\" or \"long\"! \nGood luck for the next try!"
stop(ErrorMsg)
}
if (is.null(n.manifest)) {
ErrorMsg <- "\nThe number of manifest variables has to be specified! \nGood luck for the next try!"
stop(ErrorMsg)
}
if ((is.null(Tpoints)) & (inputDataFrameFormat == 'wide')) {
ErrorMsg <- "\nThe (maximum) number of time points has to be specified! \nGood luck for the next try!"
stop(ErrorMsg)
}
if ( length(outputVariablesNames) > n.manifest) {
ErrorMsg <- "\nYou provided more outputVariablesNames than you specified n.manifest! \nGood luck for the next try!"
stop(ErrorMsg)
}
if (inputDataFrameFormat == 'wide') {
if ( !(orderInputVariablesNames) %in% c("names", "time")) {
ErrorMsg <- "\nThe argument orderInputVariablesNames has to be either \"names\" or \"time\"! \nGood luck for the next try!"
stop(ErrorMsg)
}
}
if ( !(inputTimeFormat) %in% c("time", "delta")) {
ErrorMsg <- "\nThe argument inputTimeFormat has to be either \"time\" or \"delta\"! \nGood luck for the next try!"
stop(ErrorMsg)
}
if ( !(outputTimeFormat) %in% c("time", "delta")) {
ErrorMsg <- "\nThe argument outputTimeFormat has to be either \"time\" or \"delta\"! \nGood luck for the next try!"
stop(ErrorMsg)
}
if (!(is.null(targetInputTDpredNames))) {
if (length(targetInputTDpredNames) != Tpoints) {
ErrorMsg <- "\nThe number of TD predictors names provided (\"targetInputTDpredNames\") should be equal to the number of time points (\"Tpoints\")! \nGood luck for the next try!"
stop(ErrorMsg)
}
}
if (minInterval < .00001) {
ErrorMsg <- "\nThe argument \"minInterval\" has been set to a value < .00001, which is currently not allowed! \nGood luck for the next try!"
stop(ErrorMsg)
}
if (any(is.na(missingValues))) {
Msg <- "Note: I assume that the missing values indicator in the dataFrame or dataFile is \"NA\" \n"
message(Msg)
}
if (scaleTime == 1) {
Msg <- "Note: Time is not scaled. \n"
message(Msg)
}
if (!(is.null(minTolDelta))) {
Msg <- paste0("Note: The shortest tolerated delta is ", minTolDelta, ". A subsequent time point closer to the preceeding one (afte possible time scaling) than ", minTolDelta," will be deleted. \n" )
message(Msg)
} else {
minTolDelta = minInterval*2 # just slightly above the missing indicator
}
if (!(is.null(maxTolDelta))) {
Msg <- paste0("Note: The longest tolerated Delta is ", maxTolDelta, ". All (!) subsequent time points (after possible time scaling) after T0 with an intervall larger than ", maxTolDelta," will be deleted. \n" )
message(Msg)
}
if (is.null(maxTolDelta)) {
Msg <- paste0("Note: All long deltas are specified to be acceptable (NULL). The shortest tolerate Delta is ", minTolDelta, ". \n" )
message(Msg)
}
if (is.null(manifest.per.latent)) {
Msg <- paste0("Note: The argument manifest.per.latent was not specified (NULL). I expect that there is an eual number of manifests per latent. \n
In your case I assume you have ", n.manifest, " latent variables! \n")
message(Msg)
}
if ( !(is.null(allInputVariablesNames)) & (!(is.null(colnames(dataFrame)))) ) {
if ( length(allInputVariablesNames) != length(colnames(dataFrame)) ) {
ErrorMsg <- "\nThe argument \"allInputVariablesNames\" does not equal the no. of columns of the dataFrame provided! \nGood luck for the next try!"
stop(ErrorMsg)
}
Msg <- "\nThe argument \"allInputVariablesNames\" has been provided, but the dataFrame provided has colnames, too. Take care you label variables correctly! \nGood luck for the next try!"
message(ErrorMsg)
}
if (!(is.null(minTolDelta)) & !(is.null(maxTolDelta))) {
if (minTolDelta > maxTolDelta) {
ErrorMsg <- "\nThe argument minTolDelta has been set to a larger value than maxTolDelta ! \nGood luck for the next try!"
stop(ErrorMsg)
}
}
if (!(is.null(targetInputTDpredNames))) {
tmp1 <- length(targetInputTDpredNames); tmp1
if (tmp1/Tpoints != round(tmp1/Tpoints)) {
ErrorMsg <- "\nThe number of variables specified in targetInputTDpredNames has to be a multifold of Tpoints! \nGood luck for the next try!"
stop(ErrorMsg)
}
}
if (minTolDelta < minInterval) {
ErrorMsg <- "\nThe argument minTolDelta has been set to a smaller value than mininterval (= indicator for missing)! \nGood luck for the next try!"
stop(ErrorMsg)
}
tmp1 <- length(outputVariablesNames); tmp1
if (tmp1 < n.manifest) {tmp1 <- rep(outputVariablesNames, n.manifest) } else {tmp1 <- outputVariablesNames}
if (all(tmp1 == tmp1[1])) tmp1 <- paste0(tmp1[1], seq(1,length(tmp1),1))
#tmp2 <- rep("_T", 4); tmp2
tmp2 <- rep("_T", n.manifest); tmp2
tmp2a <- rep(0, n.manifest); tmp2a
tmp2b <- rep(1, n.manifest); tmp2b
#tmp3 <- paste0(tmp1, paste0(tmp2, c(0,0,1,1))); tmp3
tmp3 <- paste0(tmp1, paste0(tmp2, c(tmp2a, tmp2b))); tmp3
tmp3 <- paste(tmp3, collapse=" "); tmp3
Msg <- paste0("Note: Output variable names will be ", tmp3, ", etc. \n")#)
message(Msg)
}
####################################################### Shape #######################################################
### Step 1 (Read raw data. Store in R-Object. Replacing missing value indicators with NA)
tmpData <- data.frame(dataFrame)
if (!(is.na(missingValues))) {
tmp1 <- which(tmpData == missingValues, arr.ind = TRUE); tmp1
tmpData[tmp1] <- NA
}
#head(tmpData)
### Step 2a - (re-)label variables
if ( !(is.null(allInputVariablesNames)) ) {
colnames(dataFrame) <- allInputVariablesNames
}
### Step 2 - (Transpose data into wide format if they are in long format)
{
if (inputDataFrameFormat == "long") {
if (is.null(id)) {
ErrorMsg <- "\nYou have to specify the id (identifier) because you provided data in long format! \nGood luck for the next try!"
stop(ErrorMsg)
}
#ErrorMsg <- "\nUnfortunetaly, long format data as input is not yet implemented. Consider using the function ctLongToWide to make wide format data frame! \nGood luck for the next try!"
#stop(ErrorMsg)
tmpData <- tmpData[, c(id, targetTimeVariablesNames, targetInputVariablesNames, targetInputTDpredNames, targetInputTIpredNames)]
tmpData <- as.data.frame(ctsem::ctLongToWide(tmpData, id=id, time=targetTimeVariablesNames,
manifestNames = targetInputVariablesNames,
TDpredNames=targetInputTDpredNames,
TIpredNames=targetInputTIpredNames))
# determine Tpoints created
tmp <- grep("_T", colnames(tmpData))
# CHD changed 13.11.2003
#Tpoints <- length(tmp) / (n.manifest + length(targetInputTDpredNames) + length(targetInputTIpredNames)); Tpoints
Tpoints <- length(tmp) / n.manifest; Tpoints
# make new timeVariable names
targetTimeVariablesNames <- paste0("T", 0:(Tpoints-1)); targetTimeVariablesNames
# make new inputVariable names
tmp <- c()
for (i in 1:length(targetInputVariablesNames)) tmp <- c(tmp, paste0(targetInputVariablesNames[i], "_T", 0:(Tpoints-1)))
targetInputVariablesNames <- tmp
# define new order of names
orderInputVariablesNames <- 'names'
}
# CHD removed 10.11.2023
#if (standardizeWithinTime == TRUE) {
# for (c in targetInputVariablesNames) {
# tmpData[, c] <- scale(tmpData[, c])
# }
#}
}
# Step 3 (Select the desired "target variables" (at least X and Y and time) and kick out the remaining stuff.)
#c(targetInputVariablesNames, targetInputTDpredNames, targetTimeVariablesNames, targetInputTIpredNames)
tmp1 <- c(targetInputVariablesNames, targetInputTDpredNames, targetTimeVariablesNames, targetInputTIpredNames); tmp1
tmpData <- tmpData[, tmp1]
#head(tmpData, 30)
#apply(tmpData, 2, mean, na.rm=T)
# Step 5 (Rename & re-arrange variables: X_T0, Y_T0, X_T1, Y_T1, ... time1, time2, ...)
tmp1 <- length(outputVariablesNames); tmp1
if (tmp1 < n.manifest) {tmp1 <- rep(outputVariablesNames, n.manifest) } else {tmp1 <- outputVariablesNames}
if (all(tmp1 == tmp1[1])) tmp1 <- paste0(tmp1[1], seq(1,length(tmp1),1))
newOutputVariablesNames <- tmp1; newOutputVariablesNames
tmp2 <- sort(rep(seq(1, Tpoints, 1)-1, n.manifest)); tmp2
allOutputVariablesNames <- paste0(tmp1, "_T", tmp2); allOutputVariablesNames
# TD preds
if (!(is.null(targetInputTDpredNames))) {
n.TDpred <- length(targetInputTDpredNames)/Tpoints; n.TDpred
if (is.null(outputTDpredNames)) {
outputTDpredNames <- c()
generalTDpredNames <- c()
for (i in 1:n.TDpred) {
generalTDpredNames <- c(generalTDpredNames, paste0("TD", i))
for (j in 0:(Tpoints-1)) {
outputTDpredNames <- c(outputTDpredNames, paste0("TD", i, "_T", j)); outputTDpredNames
}
}
}
} else {
n.TDpred <- 0
generalTDpredNames <- c()
outputTDpredNames <- c()
}
# TI preds
if (!(is.null(targetInputTIpredNames))) {
n.TIpred <- length(targetInputTIpredNames); n.TIpred
if (is.null(outputTIpredNames)) {
outputTIpredNames <- paste0("TI", seq(1, length(targetInputTIpredNames), 1)); outputTIpredNames
}
} else {
n.TIpred <- 0
outputTIpredNames <- c()
}
# time
allOutputTimeVariablesNames <- paste0("time", seq(0, (Tpoints-1), 1)); allOutputTimeVariablesNames
# CHD 4.4.23 original order
if (orderInputVariablesNames == "names") {
if (is.null(manifest.per.latent)) manifest.per.latent <- rep(1, n.manifest)
tmp1 <- stats::diffinv(manifest.per.latent*Tpoints)+1; tmp1
start <- tmp1[-length(tmp1)]; start
end <- start + manifest.per.latent-1; end
variableOrder <- c()
while(end[length(end)] <= n.manifest*Tpoints) {
for (m in 1:length(manifest.per.latent)) {
variableOrder <- c(variableOrder, start[m]:end[m])
}
start <- start + manifest.per.latent
end <- end + manifest.per.latent
}
targetInputVariablesNames <- targetInputVariablesNames[variableOrder]
}
tmpData <- tmpData[, c(targetInputVariablesNames, targetInputTDpredNames, targetTimeVariablesNames, targetInputTIpredNames)]
if (inputTimeFormat == "delta") {
dT0 <- data.frame(matrix(0, ncol=1, nrow=dim(tmpData)[1]))
colnames(dT0) <- "dT0"
tmpData <- cbind(tmpData[, c(targetInputVariablesNames, targetInputTDpredNames)],
dT0,
tmpData[, c(targetTimeVariablesNames, targetInputTIpredNames)])
}
colnames(tmpData) <- c(allOutputVariablesNames, outputTDpredNames, allOutputTimeVariablesNames, outputTIpredNames)
#head(tmpData)
#apply(tmpData, 2, mean, na.rm=T)
#apply(tmpData, 2, sd, na.rm=T)
#
#### Step 5b (make time out of delta if necessary)
if (inputTimeFormat == "delta") {
if (length(targetTimeVariablesNames) >= Tpoints) {
ErrorMsg <- "\nYou specified time to be provided as time lags (deltas). The number of \"targetTimeVariablesNames\" provided exceeds the time lags in the data set! \nGood luck for the next try!"
stop(ErrorMsg)
}
for (i in 1:(Tpoints-1)) {
tmpData[, paste0("time", i)] <- tmpData[ , paste0("time", i-1)] + tmpData[ , paste0("time", i)]
if (length(tmp1) > 0) tmpData[tmp1, paste0("time", i)] <- 0
}
allOutputTimeVariablesNames <- colnames(tmpData)[grep("time", colnames(tmpData))]; allOutputTimeVariablesNames
tmp1 <- which(tmpData[, allOutputTimeVariablesNames[-1]] == 0, arr.ind = TRUE)
tmpData[, allOutputTimeVariablesNames[-1]][tmp1] <- NA
}
# check
if ( length(grep("time", colnames(tmpData)[c(allOutputVariablesNames, outputTDpredNames, outputTIpredNames)])) > 0) {
if (minTolDelta > maxTolDelta) {
ErrorMsg <- "\nThe name part \"time\" is only allowed in \"time\" or \"delta\" variables - not in latents, TIpreds, TDpreds! \nGood luck for the next try!"
stop(ErrorMsg)
}
}
## at this stage, the variables should by in the order Y1_T0, Y2_T0, ..., Y1_T1, Y2_T1, ... TD1, TD2,... time1, time2, ... TI1, TI2, ...
#head(tmpData)
# Step 6: Delete variables from time points for which no time stamp is available (without time information, ctsem is impossible)
counter <- -1
for (i in allOutputTimeVariablesNames) {
counter <- counter + 1
tmp1 <- which(is.na(tmpData[, i])); tmp1
tmp2 <- grep(paste0("T", counter), allOutputVariablesNames); tmp2
tmpData[tmp1, allOutputVariablesNames[tmp2]] <- NA
}
# Step 6b - Scale time intervals
tmpData[ , allOutputTimeVariablesNames] <- tmpData[ , allOutputTimeVariablesNames] * scaleTime
#head(tmpData)
# Step 6c - Delete all cases where all time stamps are missing
if (inputTimeFormat == "time") { # if it is "delta" there should be at lease one time point
tmp1 <- apply(tmpData[, allOutputTimeVariablesNames], 1, sum, na.rm=TRUE)
tmp2 <- which(tmp1 == 0)
if (length(tmp2) > 0) tmpData <- tmpData[-tmp2, ]
}
# Step 6d - Delete all cases where all process variables are missing
tmp1 <- apply(tmpData[, allOutputVariablesNames], 1, sum, na.rm=TRUE)
tmp2 <- which(tmp1 == 0)
if (length(tmp2) > 0) tmpData <- tmpData[-tmp2, ]
# Intermediate Step: delete cases for which conditions min.val.n.Vars and min.val.Tpoints are not met
tmp1 <- apply(tmpData[, allOutputVariablesNames], 1, function(x) sum(!(is.na(x))))
tmp2 <- which(tmp1 < min.val.n.Vars)
if(length(tmp2) > 0 ) tmpData <- tmpData[-tmp2, ]
# min.val.Tpoints
validTpoints <- matrix(1, nrow=nrow(tmpData), ncol=Tpoints)
for (i in 0:(Tpoints-1)) {
tmp1 <- grep(paste0("T", i), colnames(tmpData))
tmp2 <- apply(tmpData[, tmp1], 1, function(x) sum(!(is.na(x))))
tmp3 <- which(tmp2 == 0)
validTpoints[tmp3, i+1] <- 0
}
tmp1 <- apply(validTpoints, 1, function(x) sum(x))
tmp2 <- which(tmp1 < min.val.Tpoints)
if(length(tmp2) > 0 ) tmpData <- tmpData[-tmp2, ]
#head(tmpData)
#apply(tmpData, 2, sd, na.rm=T)
# CHD 3.11.2023
### Step 2b - standardize within time points if requested
if (standardization == "withintimea") {
Msg <- "Variables are standardized within time points. This implies that all cases will be deleted that have missing target variables at T0.\n"
message(Msg)
tmp1 <- grep("_T0", colnames(tmpData)); tmp1
tmp2 <- apply(tmpData[, tmp1], 1, sum, na.rm=T)
tmp3 <- which(tmp2 == 0); head(tmp3)
if (length(tmp3) > 0) tmpData <- tmpData[-tmp3, ]
for (c in allOutputVariablesNames) {
tmpData[, c] <- scale(tmpData[, c])
}
}
### Step 2b - standardize within time points if requested
if (standardization == "withintimeb") {
Msg <- "Variables are standardized within time points. All cases are retained. The user is advised to use 'sameInitialTimes=TRUE' in ctsem or CoTiMA applications.\n"
message(Msg)
for (c in allOutputVariablesNames) {
tmpData[, c] <- scale(tmpData[, c])
}
}
# Step 6e - Shift data left if 1st time point is missing (otherwise lags will be not computed correctly later)
tmpData2 <- tmpData
n.TDpredPerWave <- length(targetInputTDpredNames)/Tpoints; n.TDpredPerWave
for (t in 1:(Tpoints-1)) {
# which T0 time stamp is missing
tmp1 <- which(is.na(tmpData2[, allOutputTimeVariablesNames[1]])); tmp1
# which substantive T0 variables are all missing
tmp2 <- which(is.na(tmpData2[, allOutputVariablesNames[1:n.manifest]]), arr.ind = TRUE)
tmp2 <- which(table(tmp2[, 1]) == n.manifest)
tmp2 <- as.numeric(names(tmp2)); tmp2
#combine
tmp1 <- c(tmp1, tmp2); tmp1
# shift substantive variables (allOutputVariablesNames)
tmpData2[tmp1, allOutputVariablesNames[1:((Tpoints-1)*n.manifest)]] <- tmpData2[tmp1, allOutputVariablesNames[(n.manifest+1):((Tpoints)*n.manifest)]]
tmpData2[tmp1, allOutputVariablesNames[(n.manifest*(Tpoints-t)+1):((Tpoints+1-t)*n.manifest)]] <- NA
# shift TDpreds (outputTDpredNames)
tmpData2[tmp1, outputTDpredNames[1:((Tpoints-1)*n.TDpredPerWave)]] <- tmpData2[tmp1, outputTDpredNames[(n.TDpredPerWave+1):((Tpoints)*n.TDpredPerWave)]]
tmpData2[tmp1, outputTDpredNames[(n.TDpredPerWave*(Tpoints-t+1)):((Tpoints-t+1)*n.TDpredPerWave)]] <- NA
# shift time variables
tmpData2[tmp1, allOutputTimeVariablesNames[1:(Tpoints-t)]] <- tmpData2[tmp1, allOutputTimeVariablesNames[(2):(Tpoints-t+1)]]
tmpData2[tmp1, allOutputTimeVariablesNames[Tpoints+1-t]] <- NA
}
#head(tmpData2)
tmpData <- tmpData2
# Step 6 Shift data left if all process variables are missing at a time point (even if time stamp is available)
if (Tpoints > 2) {
for (tt in 2:(Tpoints-1)) {
for (t in tt:(Tpoints-1)) {
# which substantive T1 variables are all missing
tmp2 <- which(is.na(tmpData2[, allOutputVariablesNames[((tt-1)*(n.manifest)+1):((tt-1)*(n.manifest)+n.manifest)]]), arr.ind = TRUE)
tmp2 <- which(table(tmp2[, 1]) == n.manifest)
tmp2 <- as.numeric(names(tmp2))
# shift substantive variables (allOutputVariablesNames)
tmpData2[tmp2, allOutputVariablesNames[((tt-1)*(n.manifest)+1):((Tpoints-1)*n.manifest)]] <- tmpData2[tmp2, allOutputVariablesNames[(tt*(n.manifest)+1):((Tpoints)*n.manifest)]]
tmpData2[tmp2, allOutputVariablesNames[(n.manifest*(Tpoints-1)+1):(n.manifest*(Tpoints-1)+n.manifest)]] <- NA
# shift TDpreds (outputTDpredNames)
tmpData2[tmp2, outputTDpredNames[((tt-1)*(n.TDpredPerWave)+1):((Tpoints-1)*n.TDpredPerWave)]] <- tmpData2[tmp2, outputTDpredNames[(tt*n.TDpredPerWave+1):((Tpoints)*n.TDpredPerWave)]]
tmpData2[tmp2, outputTDpredNames[(n.TDpredPerWave*(Tpoints-1)+1):(n.TDpredPerWave*(Tpoints-1)+n.TDpredPerWave)]] <- NA
# shift time variables
tmpData2[tmp2, allOutputTimeVariablesNames[tt:(Tpoints-1)]] <- tmpData2[tmp2, allOutputTimeVariablesNames[(tt+1):(Tpoints)]]
tmpData2[tmp2, allOutputTimeVariablesNames[(Tpoints)]] <- NA
# delete last time stamp if process variables are missing at last time point
tmp2 <- which(is.na(tmpData2[, allOutputVariablesNames[((Tpoints-1)*(n.manifest)+1):((Tpoints)*(n.manifest))]]), arr.ind = TRUE)
tmp2 <- which(table(tmp2[, 1]) == n.manifest)
tmp2 <- as.numeric(names(tmp2))
tmpData2[tmp2, allOutputTimeVariablesNames[(Tpoints)]] <- NA
}
}
# delete time stamps and TDpreds if process variables are missing
for (t in tt:(Tpoints-0)) {
tmp2 <- which(is.na(tmpData2[, allOutputVariablesNames[((t-1)*n.manifest+1):(t*n.manifest)]]), arr.ind = TRUE)
tmp2 <- which(table(tmp2[, 1]) == n.manifest)
tmp2 <- as.numeric(names(tmp2))
tmpData2[tmp2, allOutputTimeVariablesNames[(t)]] <- NA
}
}
tmpData <- tmpData2
#head(tmpData)
#apply(tmpData, 2, sd, na.rm=T)
### Step 6f - Determine possible lags that
# - are longer than maxTolDelta
# - are shorter than minTolDelta
# and delete time points. Further, determine possible lags that
# - are negative
# and delete this cases (if negTolDelta is not set to TRUE)
#
# all possible lags (last value in name indicates the time point (0, 1, ... involved))
tmp1 <- grep("time", colnames(tmpData)); tmp1
timeMat <- tmpData[, tmp1]
# test 1 wave lags first, then 2 wave lags, ... The first hit is the critical time point
lagWidth <- 0
for (j in 1:(Tpoints-1)) {
lagWidth <- lagWidth + 1; lagWidth
for (i in 1:(Tpoints-lagWidth)) {
currentLags <- timeMat[,(i+lagWidth)]- timeMat[,i]; currentLags
targetTimePoint <- i+lagWidth-1; targetTimePoint # 0, 1,
timeVariableToDelete <- allOutputTimeVariablesNames[targetTimePoint+1]; timeVariableToDelete
timePointsToDelete <- paste0("T", targetTimePoint); timePointsToDelete # just fro grepping the correct variable names
variablesToDelete <- allOutputVariablesNames[c(grep(timePointsToDelete, allOutputVariablesNames))];variablesToDelete
TDpredsToDelete <- outputTDpredNames[grep(timePointsToDelete, outputTDpredNames)]; TDpredsToDelete
# delete variables involved in too short intervals
targetCases <- which(currentLags < minTolDelta); targetCases
#tmpData[targetCases, c(variablesToDelete, TDpredsToDelete, timeVariableToDelete)]
if ( length(targetCases) > 0) tmpData[targetCases, c(variablesToDelete, TDpredsToDelete, timeVariableToDelete)] <- NA
# delete variables involved in too long intervals
targetCases <- which(currentLags > maxTolDelta); targetCases
#tmpData[targetCases, c(variablesToDelete, TDpredsToDelete, timeVariableToDelete)]
if ( length(targetCases) > 0) tmpData[targetCases, c(variablesToDelete, TDpredsToDelete, timeVariableToDelete)] <- NA
# delete cases if a single delta is negative
targetCases <- which(currentLags < 0); targetCases
if ( (negTolDelta == FALSE) & (length(targetCases) > 0) ) {
tmpData <- tmpData[-targetCases, ]
timeMat <- timeMat[-targetCases, ]
}
}
}
#apply(tmpData, 2, sd, na.rm=T)
# further standardization
if (standardization == "withincolumn") {
targetCols <- c()
for (i in outputVariablesNames) targetCols <- c(targetCols, grep(i, colnames(tmpData)))
for (i in targetCols) tmpData[, i] <- scale(tmpData[, i])
}
#
if (standardization == "withinperson") {
for (i in outputVariablesNames) {
#i <- outputVariablesNames[1]; i
targetCols <- grep(i, colnames(tmpData)); targetCols
#tmpData[1:2, targetCols]
tmpData[ , targetCols] <- t(apply(tmpData[ , targetCols], 1, scale))
}
}
#
if (standardization == "overall") {
for (i in outputVariablesNames) {
targetCols <- grep(i, colnames(tmpData)); targetCols
overallM <- mean(unlist(tmpData[, targetCols]), na.rm=T); overallM
overallSD <- sd(unlist(tmpData[, targetCols]), na.rm=T); overallSD
tmpData[ , targetCols] <- (tmpData[ , targetCols] - overallM)/overallSD
}
}
#head(tmpData)
#round(apply(tmpData, 2, mean, na.rm=T), 2)
#round(apply(tmpData, 2, sd, na.rm=T), 2)
if (! ((outputDataFrameFormat == "wide") & (outputTimeFormat == "time")) ) { # do nothing if it is wide and time (except possibly changing time name at the end)
#ctIntervalise requires datawide
#ctWideToLong requires datawide
#ctDeintervalise requires datalong
#tmpData2 <- tmpData
#tmpData <- tmpData2
#head(tmpData)
tmpData <- ctIntervalise(datawide=tmpData,
Tpoints=Tpoints,
n.manifest=n.manifest,
n.TDpred = n.TDpred,
n.TIpred = n.TIpred,
manifestNames = newOutputVariablesNames,
TDpredNames = generalTDpredNames,
TIpredNames = outputTIpredNames)
if (outputDataFrameFormat == "long") {
# without intervalising it does not work correctly
tmpData <- ctsem::ctWideToLong(tmpData, Tpoints = Tpoints, n.manifest = n.manifest,
n.TDpred = n.TDpred, n.TIpred = n.TIpred,
manifestNames = newOutputVariablesNames,
TDpredNames = generalTDpredNames, TIpredNames = outputTIpredNames)
tmpData <- data.frame(tmpData)
# delete cases where time is missing
tmp1 <- which(tmpData$dT == minInterval)
if (length(tmp1) > 0) tmpData <- tmpData[-tmp1, ]
# delete cases where all process variables are missing )probably not necessary)
tmp1 <- apply(tmpData[, outputVariablesNames], 1, function(x) sum(!(is.na(x))))
tmp2 <- which(tmp1 == 0)
if(length(tmp2) > 0 ) tmpData <- tmpData[-tmp2, ]
#head(tmpData, 50)
#tmpData3 <- tmpData
}
#head(tmpData)
#apply(tmpData, 2, sd, na.rm=T)
#outputTimeFormat
if (outputTimeFormat == "time") {
allIds <- unique(tmpData$id); allIds
for (i in allIds) {
#i <- allIds[1]; i
currentData <- tmpData[tmpData$id == i,]
##currentData
if (length(currentData$dT) > 1) {
for (j in 2:length(currentData$dT)) {
#j <- (length(currentData$time):2)[1]; j
currentData$dT[j] <- currentData$dT[j] + currentData$dT[j-1]
}
}
tmpData[tmpData$id == i,] <- currentData
}
}
head(tmpData, 50)
#apply(tmpData, 2, mean, na.rm=T)
} # end if (!(outputDataFrameFormat == "wide") & (outputTimeFormat == "time"))
#head(tmpData)
#apply(tmpData, 2, sd, na.rm=T)
# correction of time names
if (outputTimeVariablesNames != "time") {
tmp1 <- grep("time", colnames(tmpData)); tmp1
colnames(tmpData) <- gsub("time", outputTimeVariablesNames, colnames(tmpData))
}
if (outputTimeVariablesNames == "time") {
tmp1 <- grep("dT", colnames(tmpData)); tmp1
colnames(tmpData) <- gsub("dT", outputTimeVariablesNames, colnames(tmpData))
}
#head(tmpData, 50)
return(tmpData)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.