R/edsurvey.data.frame.list.R

Defines functions append.edsurvey.data.frame.list extractCovs edsurvey.data.frame.list

Documented in append.edsurvey.data.frame.list edsurvey.data.frame.list

#' @title EdSurvey Dataset Vectorization
#'
#' @description The \code{edsurvey.data.frame.list} function creates an
#'              \code{edsurvey.data.frame.list} object from a series of
#'              \code{edsurvey.data.frame} objects.
#'              \code{append.edsurvey.data.frame.list} creates an
#'              \code{edsurvey.data.frame.list} from two
#'              \code{edsurvey.data.frame} or \code{edsurvey.data.frame.list} objects.
#'
#'              An \code{edsurvey.data.frame.list} is useful for looking at
#'              data, for example, across time or graphically, and reduces
#'              repetition in function calls.
#'              The user may specify a variable that varies across the
#'              \code{edsurvey.data.frame} objects that is
#'              then included in further output.
#'
#' @param datalist   a list of \code{edsurvey.data.frame}s to be combined
#' @param cov        a character vector that indicates what varies across
#'                   the \code{edsurvey.data.frame} objects.
#'                   Guessed if not supplied. For example,
#'                   if several \code{edsurvey.data.frame}s for several
#'                   different countries are supplied, then \code{cov} would
#'                   be set to the country.
#' @param labels     a character vector that specifies labels. Must be the
#'                   same length
#'                   as \code{datalist}. Not needed if \code{cov} exists or can be
#'                   guessed. See Examples.
#' @param sdfA       an \code{edsurvey.data.frame} or an \code{edsurvey.data.frame.list} to be combined
#' @param sdfB       an \code{edsurvey.data.frame} or an \code{edsurvey.data.frame.list} to be combined
#' @param labelsA    a character vector that specifies \code{labels} for \code{sdfA} when creating
#'                   the new \code{edsurvey.data.frame.list}.
#' @param labelsB    a character vector that specifies \code{labels} for \code{sdfB} when creating
#'                   the new \code{edsurvey.data.frame.list}.
#'
#' @details
#' The \code{edsurvey.data.frame.list} can be used in place of an
#' \code{edsurvey.data.frame} in function calls, and results are returned
#' for each of the component \code{edsurvey.data.frame}s, with the
#' organization of the results varying by the particular method.
#'
#' An \code{edsurvey.data.frame.list} can be created from several
#' \code{edsurvey.data.frame} objects that are related;
#' for example, all are NAEP mathematics assessments but have one or more
#' differences (e.g.,  they are all from different years).
#' Another example could be data from multiple countries for an
#' international assessment.
#'
#' When \code{cov} and \code{labels} are both missing, \code{edsurvey.data.frame.list}
#' attempts to guess what variables may be varying and uses those. When there are no
#' varying covariates, generic labels are automatically generated.
#'
#' @return
#' \code{edsurvey.data.frame.list} returns an \code{edsurvey.data.frame.list} with
#' elements
#' \item{datalist}{a list of \code{edsurvey.data.frame} objects}
#' \item{covs}{a character vector of key variables that vary within
#'                    the \code{edsurvey.data.frame.list}.
#'                    When labels are included, they will be included in
#'                    \code{covs}. In the unusual circumstance that \code{sdfA} or \code{sdfB}
#'                    is an \code{edsurvey.data.frame.list}
#'                    has \code{covs}, and labels are not supplied, the \code{covs}
#'                    are simply pasted together with colons between them.}
#'
#' \code{append.edsurvey.data.frame.list} returns an \code{edsurvey.data.frame.list} with
#' elements
#' \item{datalist}{a list of \code{edsurvey.data.frame} objects}
#' \item{covs}{a character vector of key variables that vary within
#'                    the \code{edsurvey.data.frame.list}.
#'                    When labels are included, they will be included in
#'                    \code{covs}.}
#'
#' @author Paul Bailey, Huade Huo
#'
#' @example man/examples/edsurvey.data.frame.list.R
#' @aliases append.edsurvey.data.frame.list
#' @export
edsurvey.data.frame.list <- function(datalist, cov = NULL, labels = NULL) {
  # Search for covariates if no labels or covariates are provided
  searching <- ifelse(is.null(cov) & is.null(labels), TRUE, FALSE)
  if (searching) {
    # these are the attributes that might vary
    cov <- c(
      "subject", "year", "assessmentCode",
      "dataType", "gradeLevel", "survey", "achievementLevels",
      "country"
    )
  }
  # the eventual covs result
  covs <- NULL
  # if the user did not provide the cov (no "s") matrix
  if (!is.null(cov)) {
    # for each column of cov (c)
    covs <- extractCovs(datalist, cov, searching)
  }

  if (!is.null(labels)) {
    if (length(labels) != length(datalist)) {
      makeError <- FALSE
      if (!inherits(labels, "data.frame")) {
        makeError <- TRUE
      }
      if (nrow(labels) != length(datalist)) {
        makeError <- TRUE
      }
      if (makeError) {
        stop(paste0("Length of argument ", sQuote("labels"), " must be the same as the length of the ", sQuote("datalist"), " argument."))
      }
    }
    if (is.null(covs)) {
      if (is.data.frame(labels)) {
        covs <- labels
      } else {
        covs <- data.frame(stringsAsFactors = FALSE, labels = labels)
      }
    } else {
      covs$labels <- labels
    }
  }
  # final results
  res <- list(datalist = datalist, covs = covs)
  class(res) <- c("edsurvey.data.frame.list", "edsurvey.data")
  return(res)
}

extractCovs <- function(dataList, cov, searching) {
  covs <- sapply(cov, function(c) {
    # for each edsurvey.data.frame (z)
    sapply(dataList, function(z) {
      # grab attribute c from edsurvey.data.frame z
      thisAttr <- getAttributes(z, c)
      # if this attribute is a character or numeric
      if (inherits(thisAttr, "character") | inherits(thisAttr, "numeric")) {
        # if the attribute is a vector with more than one element we need to reduce it to a single element.
        if (length(thisAttr) > 1) {
          thisAttr <- paste(thisAttr, collapse = "; ")
        }
      } else {
        thisAttr <- "" # return a blank character value in case of a missing cov value otherwise its returned as a list and won't load into the data.frame
      }
      thisAttr
    }, simplify = TRUE)
  }, simplify = FALSE)
  # make proposed covs
  covs <- data.frame(stringsAsFactors = FALSE, covs)

  if (searching) {
    for (i in ncol(covs):1) {
      if (length(unique(covs[ , i])) == 1) {
        if (i == 1 & ncol(covs) == 1) {
          # we have removed all of the columns.
          # So, the user asked us to automatically identify attributes that varried and we could not.
          # warn them and just use LETTERS as labels
          warning("Cannot identify attributes that vary across elements in datalist. Using generated labels instead.")
          let <- LETTERS
          while (length(let) < nrow(covs)) {
            let <- paste0(rep(let, each = 26), rep(LETTERS, length(let)))
          }
          covs$labels <- let[1:nrow(covs)]
        }
        # if there is no variation in the column, get rid of it
        covs <- covs[ , -i, drop = FALSE]
      }
    }
  }
  return(covs)
}

# @author Huade Huo and Paul Bailey
#
#' @rdname edsurvey.data.frame.list
#' @export
append.edsurvey.data.frame.list <- function(sdfA, sdfB, labelsA = NULL, labelsB = NULL) {
  # return a list of sdfs from either an edsurvey.data.frame.list or a single edsurvey.data.frame
  getDataList <- function(sdf) {
    if (inherits(sdf, c("edsurvey.data.frame.list"))) {
      return(unlist(lapply(sdf[[1]], list), recursive = FALSE))
    } else {
      return(list(sdf))
    }
  }

  getDataLabel <- function(sdf, labels, name) {
    if (!is.null(labels)) {
      labels <- as.data.frame(labels)
      if (inherits(sdf, c("edsurvey.data.frame.list"))) {
        if (nrow(labels) != length(sdf[[1]])) {
          stop(paste0("Provided labels", name, " has ", nrow(labels), " labels, while sdf", name, " has ", length(sdf[[1]]), " elements."))
        }
        return(labels)
      } else {
        # single item
        if (nrow(labels) != 1) {
          stop(paste0("Provided labels", name, " has ", nrow(labels), " labels, while sdf", name, " has 1 element."))
        }
        return(labels)
      }
    } else {
      # no labels provided
      if (inherits(sdf, c("edsurvey.data.frame.list"))) {
        return(sdf[[2]])
      } else {
        return(data.frame(labels = paste0("label ", name)))
      }
    }
  }
  cov <- c(
    "subject", "year", "assessmentCode",
    "dataType", "gradeLevel", "survey", "achievementLevels",
    "country"
  )
  # get data list
  dataList <- c(
    getDataList(sdfA),
    getDataList(sdfB)
  )
  # if there are no provided labels and the existing labels appear to be autogenerated
  # simply autogenerate new labels.
  if (is.null(labelsA) & is.null(labelsB)) {
    if (inherits(sdfA, "edsurvey.data.frame.list") & inherits(sdfB, "edsurvey.data.frame")) {
      if (all(colnames(sdfA$covs) %in% cov)) {
        labs <- extractCovs(dataList = dataList, cov = cov, searching = TRUE)
        return(edsurvey.data.frame.list(
          datalist = dataList,
          labels = labs
        ))
      }
      # sdfA is an esdfl, sdfB is an esdf
    }
    if (inherits(sdfB, "edsurvey.data.frame.list") & inherits(sdfA, "edsurvey.data.frame")) {
      if (all(colnames(sdfB$covs) %in% cov)) {
        # sdfB is an esdfl, sdfA is an esdf
        labs <- extractCovs(dataList = dataList, cov = cov, searching = TRUE)
        return(edsurvey.data.frame.list(
          datalist = dataList,
          labels = labs
        ))
      }
    }
  }
  # get current labels
  labA <- getDataLabel(sdfA, labelsA, "A")
  labB <- getDataLabel(sdfB, labelsB, "B")
  cnA <- colnames(labA)
  cnB <- colnames(labB)
  for (i in seq_along(sdA <- setdiff(cnA, cnB))) {
    labB[ , sdA[i]] <- NA
  }
  for (i in seq_along(sdB <- setdiff(cnB, cnA))) {
    labA[ , sdB[i]] <- NA
  }
  labs <- rbind(labA, labB)
  # labels may not be unique, if so, start over
  if (nrow(unique(labs)) < nrow(labs)) {
    labs <- extractCovs(dataList, cov, searching = TRUE)
  }
  # bring it all together.
  return(edsurvey.data.frame.list(
    datalist = dataList,
    labels = labs
  ))
}

Try the EdSurvey package in your browser

Any scripts or data that you put into this service are public.

EdSurvey documentation built on Nov. 2, 2023, 6:25 p.m.