R/esPlausible.R
In esmprep: Data Preparation During and After the Use of the Experience Sampling Methodology (ESM)

#' esPlausible
#
#' @description esPlausible helps detecting implausibilities in the raw ESM datasets.
#
#' @param dfList a list. Each element of the list must be a data.frame. Each data.frame is a separate raw ESM dataset/an ESM questionnaire version. If there is just one ESM version the list therefore contains one data.frame.
#
#' @param itemVecList a list. Each list element must be a vector. Each vector element must contain all the variable names of the respective ESM questionnaire version. Use function \code{\link{esItems}} to generate \code{itemVecList}.
#
#' @details \code{esPlausible} makes no sense if there is only one ESM questionnaire version. However, if there is more than one version it makes sense. The list which is returned by \code{esPlausible} contains 4 elements:
#' \enumerate{
#' \item PlausibNames shows which item names occur in the different ESM questionnaire versions
#' \item plausibClass shows the class of each item in each of the different ESM questionnaire versions (R built-in class registration)
#' \item plausibRowNa shows the number of rows in each of the different ESM questionnaire versions and the percentage of existing data therein
#' \item plausibMinMax shows the minimum and the maximum value for all items containing numeric data.
#' }
#
#' @return A list of dataframes, each containing separate information regarding the plausibility of the ESM datasets. See \strong{Details} for more information.
#
#' @examples
#' # o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o
#' # Prerequisites in order to execute esPlausible. Start --------------
#' # Use example list delivered with the package
#' RELEVANTVN_ES <- RELEVANTVN_ESext
#' # keyLsNew is a list of datasets, also delivered with the package
#' rmInvLs <- rmInvalid(keyLsNew, RELEVANTVN_ES)
#' plausibItems <- esItems(dfList=rmInvLs[["dfValid"]], RELEVANTVN_ES)
#' # Prerequisites in order to execute esPlausible. End ----------------
#' # -------------------------------------------------------
#' # Run function 12 of 29; see esmprep functions' hierarchy.
#' # -------------------------------------------------------
#' # Help checking the plausibility of items in the raw ESM datasets. rmInvLs[["dfValid"]]
#' # is one of the results from function 'rmInvalid'.
#' # plausibItems is the result of function 'esItems'.
#' plausibLs <- esPlausible(dfList=rmInvLs[["dfValid"]], itemVecList=plausibItems)
#' # Display the results (4 data frames) to the console
#' # plausibNames gives an overview of the item names across all ESM versions
#' plausibLs[["plausibNames"]]
#' # plausibClass gives an overview of the variable types of all items
#' plausibLs[["plausibClass"]]
#' # plausibRowNa shows for each ESM version the number of lines in the raw
#' # ESM datasets and how much percent of the data is missing (NAs).
#' plausibLs[["plausibRowNa"]]
#' # plausibMinMax shows for each numeric variable the minimum and maximum.
#' plausibLs[["plausibMinMax"]]
#' # o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o=o
#
#' @seealso Exemplary code (fully executable) in the documentation of \code{\link{esmprep}} (function 12 of 29).
#
#' @export
#
esPlausible <- function(dfList, itemVecList) {

    dfCheck <- sapply(dfList, FUN = is.data.frame)

    # If at least one of the elements in dfList is not of type data.frame
    # stop the processing.
    if(any(dfCheck == FALSE)) {
        stop("Error in 1st argument 'dfList', which must be of type 'list'. All elements in the list must be of type 'data.frame'.")
    }

    if(class(itemVecList) != "list") {
        stop("Error in 2nd argument 'itemVecList', which must be of type 'list'. All elements in the list must be of type 'vector'.")
    }

    if(any(!is.character(unlist(itemVecList)))) {
        stop("Error in 2nd argument 'itemVecList'. All elements in each vector must be of type 'character'.")
    }

    namesAll0 <- c()
    for(i in 1:length(itemVecList)) {
        namesAll0 <- c(namesAll0, itemVecList[[i]])
    }
    namesAll <- unique(namesAll0)

    df_temp <- data.frame(namesAll)
    colNamesDf_temp <- c("names", "class", "naPercent", "min", "max", "nRows")

    # j <- 1
    for(j in 1:length(dfList)) {
    		# Match which variable in version j are among all versions variable names
        idxMatchNames <- match(itemVecList[[j]], namesAll)
        # Generate temporary variables containing missing values
        names_temp <- class_temp <- na_temp <- min_temp <- max_temp <- nRow_temp <- rep(NA, times = length(namesAll))
        # Assign the variable names from version j to the temporary variable
        # names_temp
        names_temp[idxMatchNames] <- itemVecList[[j]]
        # Assign the first 3 letters of the class (as registered by R) to the
        # variables of version j
        class_temp[idxMatchNames] <- substring(unlist(lapply(dfList[[j]][,itemVecList[[j]]], class)), 1, 3)
        
        for(k in 1:length(itemVecList[[j]])) {
        		# Generate the current (k_th) variable of version j
        		vn_temp <- itemVecList[[j]][k]
        		# Extract relevant values with summary function
            smry_k <- summary(dfList[[j]][, vn_temp])
            # If there are values returned by summary then
            # is.na(smry_k["Class"]) returns TRUE
            if(is.na(smry_k["Class"])) {
            	# Insert the most relevant values from summary to the
            	# temporary variables
                na_temp[idxMatchNames][k] <- as.numeric(smry_k["NA's"])
                min_temp[idxMatchNames][k] <- as.numeric(smry_k["Min."])
                max_temp[idxMatchNames][k] <- as.numeric(smry_k["Max."])
            }
        }
        nRow_temp[idxMatchNames] <- rep(nrow(dfList[[j]]), times = length(idxMatchNames))
        na_temp[idxMatchNames][which(is.na(na_temp[idxMatchNames]))] <- 0
        df_temp_j <- data.frame(names_temp, class_temp, na_temp, min_temp, max_temp, nRow_temp)
        colnames(df_temp_j) <- paste0(colNamesDf_temp, j)
        df_temp <- data.frame(df_temp, df_temp_j)
    }

    # -----------------------------------------------------------------
    # Extract information per variable and per ES questionnaire version
    # -----------------------------------------------------------------

    # 1 Extract names of variables
    # ----------------------------
    colNamesIdx <- which(substring(names(df_temp), 1, nchar(colNamesDf_temp[1])) == colNamesDf_temp[1])
    df_temp[,"namesCheck"] <- apply(df_temp[,c(1,colNamesIdx)], MARGIN = 1, function(x) all(x[2:(length(colNamesIdx+1))][!is.na(x[2:(length(colNamesIdx+1))])]==x[1]))

    # 2 Extract class of variables
    # ----------------------------
    classesIdx <- which(substring(names(df_temp), 1, nchar(colNamesDf_temp[2])) == colNamesDf_temp[2])

    classesNotNa <- apply(df_temp[,classesIdx], MARGIN = 1, function(x) which(!is.na(x[1:length(classesIdx)])))

    checkClass <- c()
    for(m in 1 : nrow(df_temp)) {
        if(length(classesNotNa[[m]])==1) {
            checkClass <- c(checkClass, TRUE)
        } else {
            consistantClass <- all(as.character(unlist(df_temp[m,classesIdx][classesNotNa[[m]][-1]]))==as.character(unlist(df_temp[m,classesIdx][classesNotNa[[m]][1]])))
            checkClass <- c(checkClass, consistantClass)
        }
    }
    df_temp[,"classCheck"] <- checkClass
    names(df_temp)

    # Extract number of missing values in variables
    # ---------------------------------------------
    nasIdx <- which(substring(names(df_temp), 1, nchar(colNamesDf_temp[3])) == colNamesDf_temp[3])

    # Extract minimum value in variables
    # ----------------------------------
    minimIdx <- which(substring(names(df_temp), 1, nchar(colNamesDf_temp[4])) == colNamesDf_temp[4])

    # Extract maximum value in variables
    # ----------------------------------
    maximIdx <- which(substring(names(df_temp), 1, nchar(colNamesDf_temp[5])) == colNamesDf_temp[5])

    # Extract number of values in questionnaire versions
    # --------------------------------------------------
    rowsIdx <- which(substring(names(df_temp), 1, nchar(colNamesDf_temp[6])) == colNamesDf_temp[6])

    # Compute number of missing values in percent
    # -------------------------------------------
    df_temp[,nasIdx] <- round(df_temp[,nasIdx]/df_temp[,rowsIdx]*100, digits = 1)

    df_tempOrder <- c(names(df_temp)[c(1,colNamesIdx)], "namesCheck", names(df_temp)[classesIdx], "classCheck", names(df_temp)[c(rowsIdx, nasIdx, minimIdx, maximIdx)])
    ncol(df_temp) == length(df_tempOrder)

    namesGrep <- grep(pattern="names", df_tempOrder)
    namesOrder <- df_tempOrder[namesGrep[-1]]

    classGrep <- grep(pattern="class", df_tempOrder)
    classOrder <- df_tempOrder[c(1,classGrep)]

    nRowsGrep <- grep(pattern="nRows", df_tempOrder)
    naPercentGrep <- grep(pattern="naPercent", df_tempOrder)
    nRowsNaPercent <- as.vector(rbind(nRowsGrep, naPercentGrep))
    rowNaOrder <- df_tempOrder[c(1, nRowsNaPercent)]

    minGrep <- grep(pattern="min", df_tempOrder)
    maxGrep <- grep(pattern="max", df_tempOrder)
    minMax <- as.vector(rbind(minGrep, maxGrep))
    minMaxOrder <- df_tempOrder[c(1, minMax)]

    list(plausibNames=df_temp[,namesOrder], plausibClass=df_temp[,classOrder], plausibRowNa=df_temp[,rowNaOrder], plausibMinMax=df_temp[,minMaxOrder])
}

Any scripts or data that you put into this service are public.

esmprep documentation built on July 5, 2019, 5:03 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

esmprep
Data Preparation During and After the Use of the Experience Sampling Methodology (ESM)

R/esPlausible.R
In esmprep: Data Preparation During and After the Use of the Experience Sampling Methodology (ESM)

Try the esmprep package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

esmprep Data Preparation During and After the Use of the Experience Sampling Methodology (ESM)

R/esPlausible.R In esmprep: Data Preparation During and After the Use of the Experience Sampling Methodology (ESM)

Try the esmprep package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

esmprep
Data Preparation During and After the Use of the Experience Sampling Methodology (ESM)

R/esPlausible.R
In esmprep: Data Preparation During and After the Use of the Experience Sampling Methodology (ESM)