R/esFinal.R

#' esFinal
#
#' @description esFinal generates the final ESM dataset.
#
#' @param esDf a data.frame. A single ESM dataset. It must contain the 2 columns that hold the date-time object for when an ESM questionnaire was started and finished, respectively.
#
#' @param maxRows a numeric value. The number of data lines (per participant)  in the final event sampling dataset; must be equal for all participants. If no number is entered the maximum number across all participants is used.
#
#' @param RELEVANTVN_ES a list. This list is generated by function \code{\link{setES}} and it is extended once either by function \code{\link{genDateTime}} or by function \code{\link{splitDateTime}}.
#
#' @param RELEVANTINFO_ES a list. This list is generated by function \code{\link{setES}}.
#
#' @param esOpt a data.frame. This data.frame is generated as part of a list returned by the function \code{\link{esAssign}}. The name of this list's element is \code{ESopt}. Extract this element and pass it to the argument \code{esOpt} of this function.
#
#' @param complianceRate integer. This value must be set to a value between 0 and 100. If you want to have the completion rates for all participants and you also want to make sure that all participants are kept in the final ESM dataset, simply set the argument \code{complianceRate} to 0.
#
#' @details The empty rows will either denote ESM questionnaires that were missed by the participant or it will denote fillers, i.e. rows of empty data to fill up the number of rows to be equal across all participants The number of maximum rows per participant either are computed by searching the actual maximum number of questionnaires started by the participant, or by what the user defines to be the maximum number of questionnaires (no less than 2).
#
#' @return The user receives a list containing 3 elements:
#' \enumerate{
#' \item ESfinal, i.e. with empty rows of data added and with 2 additional columns MISSED and FILLER. MISSED refers to questionnaires that should have been answered by the participants but weren't. FILLER refers to empty rows of data due to all participants having to have equally many rows of data for multilevel modeling. See \strong{Details} for more information..
#' \item ESrateFinal, i.e. the average completion rates per participant, both per prompt and overall.
#' \item ESfinalOut, i.e. ESM data that contains participants that won't be used for statistical analyses due to having less than a minimum number of answered ESM questionnaires. If no participant is deselected a character string is returned that affirms that no deselection took place.
#' }
#' Unlike the interim result returned by the function \code{\link{esAssign}} (whenever there was at least one daily prompt) the data.frame \code{ESrateFinal} shows the final completion rates per participant.
#
#' @examples
#' # o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o
#' # Prerequisites in order to execute esFinal. Start ------------------
#' # RELEVANTINFO_ES is delivered with the package
#' # Use example list delivered with the package
#' RELEVANTVN_ES <- RELEVANTVN_ESext
#' # tbsqDf is a raw ESM dataset, delivered with the package.
#' # Prerequisites in order to execute esFinal. End --------------------
#' # -------------------------------------------------------
#' # Run function 29 of 29; see esmprep functions' hierarchy.
#' # -------------------------------------------------------
#' # tbsqDf is the result of function 'computeTimeBetween'.
#' esDfFin <- esFinal(tbsqDf, esOpt=esAssigned[["ESopt"]], complianceRate=50,
#' RELEVANTINFO_ES, RELEVANTVN_ES)
#' # o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o
#
#' @seealso Exemplary code (fully executable) in the documentation of \code{\link{esmprep}} (function 29 of 29).
#
#' @export
#
esFinal <- function(esDf, esOpt=NULL, complianceRate=NULL, RELEVANTINFO_ES=NULL, RELEVANTVN_ES=NULL, maxRows=NULL) {
	
	# Except for arg 'maxRows' no arg must be NULL
	anyMissing <- c(is.null(esDf), is.null(esOpt), is.null(complianceRate), is.null(RELEVANTINFO_ES), is.null(RELEVANTVN_ES))
	if(any(anyMissing)) {
		whichMissing <- paste0(c("esDf", "esOpt", "complianceRate", "RELEVANTINFO_ES", "RELEVANTVN_ES")[anyMissing], collapse=", ")
		stop(paste0("All arguments (except for 'maxRows') are obligatory. ", whichMissing, " need to be passed to the function."))
	}
	
	# No obligatory args are missing. Are all args of the correct class?
	anyWrongClass <- c(is.data.frame(esDf), is.data.frame(esOpt), is.numeric(complianceRate), is.list(RELEVANTINFO_ES), is.list(RELEVANTVN_ES))
	if(any(!anyWrongClass)) {
		whichWrongClass <- paste0(c("esDf", "esOpt", "complianceRate", "RELEVANTINFO_ES", "RELEVANTVN_ES")[anyWrongClass], collapse=", ")
		stop(paste0("The class of ", whichWrongClass, " is not correct. Enter ?esFinal in R console, hit enter, and then check which class is expected."))
	}
	
	# Check the specific classes.
	if(!is.data.frame(esDf) | !is.data.frame(esOpt)) {
        stop("Arguments 'esDf' and 'esOpt' both must be of class data.frame.")
    }
    		
    if(!is.numeric(complianceRate) || complianceRate%%1!=0) {
    	stop("Argument 'complianceRate' must be an integer value. Often it is set to 50, i.e. only participants who answered at least 50 percent of the possible number of ESM questionnaires are kept in the final dataset.")
    	
    } else {
    		if(all((0:100 %in% complianceRate)==FALSE)) {
    		stop("Argument 'complianceRate' must be between 0 and 100.")
    	}
    	
    		complianceRate <- as.integer(complianceRate)
    }
	
	# Error handling function for all set-up lists generated by setES and setREF.
    # Both lists RELEVANTVN_ES and RELEVANTVN_REF get extended either by function
    # genDateTime or by function splitDateTime!
    SETUPLISTCheck(RELEVANTINFO_ES=RELEVANTINFO_ES,
    			   RELEVANTVN_ES=RELEVANTVN_ES,
    			   RELEVANTVN_REF=NULL)
	
    variablesUsedIn_esFinal <- c("CV_ES", "CV_ESDAY", "ID",
                                 "PROMPT",
                                 RELEVANTVN_ES[["ES_START_DATETIME"]],
                                 RELEVANTVN_ES[["ES_END_DATETIME"]])

    if(any(is.na(match(variablesUsedIn_esFinal,	 names(esDf))))) {
        stop(paste0("In order to compute the time lag the variable ",
                    variablesUsedIn_esFinal,
                    " must be part of the data.frame that is passed to this function.\n\n"))
    }
    
    if(all(grepl("ID", names(esOpt))==FALSE)) {
		stop("The most important column name of 'esOpt' is missing, namely the column name 'ID', containing the identification code of each participant. For the argument 'esOpt' use part of the return value of the function 'esAssign', which returns a list, of which one element's name is 'ESopt'. Extract that element of the list and pass it to the argument 'esOpt' in this function.")
	}

	# If the user doesn't set a maximum number of questionnaires.
    if(is.null(maxRows)) {
    		# Compute the actual maximum number of questionnaires
        maxRows <- max(esDf[,"CV_ES"])
    # If maxRows is not a number or of it is not an integer number.
    } else if(!is.numeric(maxRows) || maxRows%%1 > 0) {
    		stop("The argument 'maxRows' must be an integer number.")
    	# If the number of maxRows is greater than the actual maximum.
    } else if(maxRows > max(esDf[,"CV_ES"])) {
    		stop(paste0("The maximum number of ESM questionnaires set for all participants cannot be greater than the actual maximum number of ESM questionnaires. The actual maximum number is: ", max(esDf[,"CV_ES"]), "."))
    } else if(maxRows < 2) {
    		stop(paste0("The argument maxRows was set to ", maxRows, ". The function expects at least the value 2, otherwise ESM and multilevel modeling obviously is not in place."))
    }
	
	# IDs from esDf (use sort, in case any user wants to play the game of checking
	# whether the function is super-robust against any manipulations.)
    idsES <- sort(as.character(unique(esDf[,"ID"])))
	
	# IDs from argument esOpt (use sort for the same reason as in idsES)
	idsOpt <- sort(as.character(unique(esOpt[,"ID"])))
	
	# If no participant was lost because of any procedure between function
	# 'esAssign' and function 'esFinal':
	if(all(c(idsOpt %in% idsES, idsES %in% idsOpt))) {
		
		finalizeDf <- data.frame(ID=rep(idsES, each = maxRows))
	    finalizeDf[,"CV_ES"] <- rep(1:maxRows, times = length(idsES))
	
	    # Merge (use default sort = TRUE)
	    esFinal <- merge(finalizeDf, esDf, all = TRUE)
		
		# This for-loop inserts coherent values into the variables CV_ESDAY
		# and PROMPT, which by now are NA, because of inserting empty rows
		# whenever a questionnaire is either a missing or a filler.
		
		if(!is.null(esOpt) & !is.null(complianceRate)) {
			missed <- filler <- avrgCompletionRateFinal <- c()
		}
		
	    for(i in 1:length(idsES)) {
			
	        idx_i <- which(esFinal[,"ID"] == idsES[i])
	        emptyLines_i <- is.na(esFinal[idx_i, "CV_ESDAY"])
	        idx_emptyLines_i <- which(emptyLines_i == TRUE)
	        len_emptyLines_i <- length(idx_emptyLines_i)
			
			if(!is.null(esOpt) & !is.null(complianceRate)) {

				idx_opt_i <- which(esOpt[,"ID"] == idsES[i])
							
				missedTemp <- rep(0, times=length(idx_i))
				fillerTemp <- rep(0, times=length(idx_i))
				
				if(length(idx_opt_i) >= length(idx_i)) {
					missedTemp[is.na(esFinal[idx_i, "PROMPT"])] <- 1
					
					if(length(idx_opt_i) > length(idx_i)) {
						message(paste0(idsES[i], " optimally should have answered ", length(idx_opt_i), " ESM questionnaires, which is more than the maximum number of actually answered ESM questionnaires across all participants, which is ", length(idx_i), "."))
					}
					
				} else {
					
					idx_iMissed <- is.na(esFinal[idx_i, "PROMPT"])
					idx_iMissed[(length(idx_opt_i)+1):length(idx_i)] <- FALSE
					
					missedTemp[idx_iMissed] <- 1
					fillerTemp[(length(idx_opt_i)+1):length(idx_i)] <- 1
				}
				
				missed <- c(missed, missedTemp)
				filler <- c(filler, fillerTemp)
			}
			
	        # If person i has filled out EVERY questionnaire there is no need to
	        # fill up anything in the variable 'CV_ESDAY'
	        if(len_emptyLines_i == 0) {
	
	            # Don't do anything.
	
	        # Else fill up any NAs in the variable 'CV_ESDAY' and 'PROMPT'.
	        } else {
	        	
	            for(j in idx_emptyLines_i) {
					
					# If 1 line prior to THIS line j of participant i is NOT the
					# last prompt of the day.
	                if(esFinal[ (idx_i [j] - 1) , "PROMPT"] != RELEVANTINFO_ES[["MAXPROMPT"]]) {
						
						# CV_ESDAY must NOT change (remain within same ESM-day)
	                    esFinal [ idx_i [j] , "CV_ESDAY"] <-
	                        esFinal [ (idx_i [j] - 1) , "CV_ESDAY"]
						
						# Increase variable PROMPT by 1 (all erroneous lines
						# must have been removed by now).
	                    esFinal [ idx_i [j] , "PROMPT"] <-
	                        esFinal [ (idx_i [j] - 1) , "PROMPT"] + 1
					
					# Else: THIS line j of participant i is NOT the last prompt
					# of the day. Therefore ...
	                } else {
						
						# Increase CV_ESDAY by 1 (start a new ESM-day)
	                    esFinal [ idx_i [j] , "CV_ESDAY"] <-
	                        esFinal [ (idx_i [j] - 1) , "CV_ESDAY"] + 1
						
						# Set variable PROMPT to 1. In the full ESM dataset
						# all prompts for each participant must be present.
	                    esFinal [ idx_i [j] , "PROMPT"] <- 1
	                }
	            }
	    	}
	    	
	    	if(!is.null(esOpt) & !is.null(complianceRate)) {
	    	
		    	actualPromptsFac <- factor(esFinal[idx_i[fillerTemp==0 & missedTemp==0], "PROMPT"], levels=1:RELEVANTINFO_ES[["MAXPROMPT"]])
		    
			    tblActual_i <- table(actualPromptsFac)
		        actual_i <- as.numeric(tblActual_i)
		
		        tblOptim_i <- table(esOpt[idx_opt_i,"PROMPT"])
		        optim_i <- as.numeric(tblOptim_i)
		        # Efficiency for person i (ES commitment for person i); not rounded.
		        efficiency_temp0 <- actual_i / optim_i * 100
				
				avrgCompletionRateFinal <- c(avrgCompletionRateFinal, efficiency_temp0, mean(efficiency_temp0))
			}
	    }
    	
	# Else: There is at least one participant in the ESM dataset gone missing.
	} else if(any(!(idsOpt %in% idsES))) {
		
		idsGone <- idsOpt[!(idsOpt %in% idsES)]
		message(paste0(idsGone, " has been removed from the ESM dataset, which is unlikely, yet not impossible. The removal can only have happened after executing the function 'intolerable'. Therefore, check the return value of the function 'intolerable', i.e. display the list element which is named 'intoleranceDf'. There you should find ", idsGone, "."))
		
		idsES <- idsOpt[idsOpt %in% idsES]
		
		finalizeDf <- data.frame(ID=rep(idsES, each = maxRows))
	    finalizeDf[,"CV_ES"] <- rep(1:maxRows, times = length(idsES))
	
	    # Merge (use default sort = TRUE)
	    esFinal <- merge(finalizeDf, esDf, all = TRUE)
		
		# This for-loop inserts coherent values into the variables CV_ESDAY
		# and PROMPT, which by now are NA, because of inserting empty rows
		# whenever a questionnaire is either a missing or a filler.
		
		if(!is.null(esOpt) & !is.null(complianceRate)) {
			missed <- filler <- avrgCompletionRateFinal <- c()
		}
		
	    for(i in 1:length(idsES)) {
			
	        idx_i <- which(esFinal[,"ID"] == idsES[i])
	        emptyLines_i <- is.na(esFinal[idx_i, "CV_ESDAY"])
	        idx_emptyLines_i <- which(emptyLines_i == TRUE)
	        len_emptyLines_i <- length(idx_emptyLines_i)

	        if(!is.null(esOpt) & !is.null(complianceRate)) {
	        	
		        idx_opt_i <- which(esOpt[,"ID"] == idsES[i])
				
				missedTemp <- rep(0, times=length(idx_i))
				fillerTemp <- rep(0, times=length(idx_i))
				
				if(length(idx_opt_i) >= length(idx_i)) {
					missedTemp[is.na(esFinal[idx_i, "PROMPT"])] <- 1
					
					if(length(idx_opt_i) > length(idx_i)) {
						message(paste0(idsES[i], " optimally should have answered ", length(idx_opt_i), " ESM questionnaires, which is more than the maximum number of actually answered ESM questionnaires across all participants, which is ", length(idx_i), "."))
					}
					
				} else {
					
					idx_iMissed <- is.na(esFinal[idx_i, "PROMPT"])
					idx_iMissed[(length(idx_opt_i)+1):length(idx_i)] <- FALSE
					
					missedTemp[idx_iMissed] <- 1
					fillerTemp[(length(idx_opt_i)+1):length(idx_i)] <- 1
				}
				missed <- c(missed, missedTemp)
				filler <- c(filler, fillerTemp)
			}
			
	        # If person i has filled out EVERY questionnaire there is no need to
	        # fill up anything in the variable 'CV_ESDAY'
	        if(len_emptyLines_i == 0) {
	
	            # Don't do anything.
	
	        # Else fill up any NAs in the variable 'CV_ESDAY' and 'PROMPT'.
	        } else {
	        	
	            for(j in idx_emptyLines_i) {
					
					# If 1 line prior to THIS line j of participant i is NOT the
					# last prompt of the day.
	                if(esFinal[ (idx_i [j] - 1) , "PROMPT"] != RELEVANTINFO_ES[["MAXPROMPT"]]) {
						
						# CV_ESDAY must NOT change (remain within same ESM-day)
	                    esFinal [ idx_i [j] , "CV_ESDAY"] <-
	                        esFinal [ (idx_i [j] - 1) , "CV_ESDAY"]
						
						# Increase variable PROMPT by 1 (all erroneous lines
						# must have been removed by now).
	                    esFinal [ idx_i [j] , "PROMPT"] <-
	                        esFinal [ (idx_i [j] - 1) , "PROMPT"] + 1
					
					# Else: THIS line j of participant i is NOT the last prompt
					# of the day. Therefore ...
	                } else {
						
						# Increase CV_ESDAY by 1 (start a new ESM-day)
	                    esFinal [ idx_i [j] , "CV_ESDAY"] <-
	                        esFinal [ (idx_i [j] - 1) , "CV_ESDAY"] + 1
						
						# Set variable PROMPT to 1. In the full ESM dataset
						# all prompts for each participant must be present.
	                    esFinal [ idx_i [j] , "PROMPT"] <- 1
	                }
	            }
	       	}
	       	
	       	if(!is.null(esOpt) & !is.null(complianceRate)) {
	       		
		       	actualPromptsFac <- factor(esFinal[idx_i[fillerTemp==0 & missedTemp==0], "PROMPT"], levels=1:RELEVANTINFO_ES[["MAXPROMPT"]])
		    
			    tblActual_i <- table(actualPromptsFac)
		        actual_i <- as.numeric(tblActual_i)
		
		        tblOptim_i <- table(esOpt[idx_opt_i,"PROMPT"])
		        optim_i <- as.numeric(tblOptim_i)
		            
		        # Efficiency for person i (ES commitment for person i); not rounded.
		        efficiency_temp0 <- actual_i / optim_i * 100
				
				avrgCompletionRateFinal <- c(avrgCompletionRateFinal, efficiency_temp0, mean(efficiency_temp0))
			}
	    }
	}
	
	if(!is.null(esOpt) & !is.null(complianceRate)) {
		
    		ESrateFinal <- data.frame(ID=idsES, matrix(avrgCompletionRateFinal, nrow=length(idsES), byrow=TRUE))
    		colnames(ESrateFinal)[2:ncol(ESrateFinal)] <- c(paste0("PROMPT", 1:RELEVANTINFO_ES[["MAXPROMPT"]]), "MEAN")
    }
    
    if(!is.null(esOpt) & !is.null(complianceRate)) {
    
	    esFinal[,"MISSED"] <- missed
	    esFinal[,"FILLER"] <- filler
	    
	    # Deselect all IDs with a compliance rate that is less than what
	    # was specified by the user (see argument complianceRate)
	    idsOut <- esFinal$ID %in% ESrateFinal$ID[ESrateFinal$MEAN < complianceRate]
	    
	    cat("!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!\n")
	    cat("Don't forget to write the final ESM dataset on your computer's hard disk!\n")
	    cat("!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!\n")
	    
	    if(all(idsOut == FALSE)) {
	    		list(ESfinal=esFinal, ESrateFinal=ESrateFinal, ESfinalOut=paste0("No participant has been removed from the final ESM dataset due to having answered less than ", complianceRate, " percent of the possible maximum number of ESM questionnaires."))
	    } else {
	    		list(ESfinal=esFinal[!idsOut,], ESrateFinal=ESrateFinal, ESfinalOut=esFinal[idsOut,])
	    }
	    
    } else {
    		
    		cat("!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!\n")
	    cat("Don't forget to write the final ESM dataset on your computer's hard disk!\n")
	    cat("!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!\n")
	    
    		list(ESfinal=esFinal, ESrateFinal="Not selected.", ESfinalOut="Not selected.")
    }
}
mmiche/esmprep documentation built on July 7, 2019, 8:23 p.m.