survey/R/formhub.R

library(stringr)
library(plyr)

setClass("formhubData", 
         representation(form="data.frame"),
         contains="data.frame")

#' Produce a data.frame out of a formhubDataObj
#'
#' @param the formhub object which will be possibly co-erced to a dataframe.
#' @export
#' @return A data.frame represntation of this formhub oject
#' @examples
#' #' Produce a SpatialPointsDataFrame if data has a column of type `gps` or `geopoint`.
#' Otherwise, return NA.
#'
#' @param the formhub object which will be possibly co-erced to a SpatialPointsDataFrame object.
#' @export
#' @return A SpatialPointsDataFrame representation of this formhub Object
#' @examples
#' good_eats_data <- as.data.frame(formhubDownload("good_eats", "mberg"))
#' class(ge_spdf) # "data.frame"
as.data.frame.formhubData <- function(fhD, ...) {
   data.frame(setNames(fhD@.Data, names(fhD)), stringsAsFactors=F)
}

#' Produce a SpatialPointsDataFrame if data has a column of type `gps` or `geopoint`.
#' Otherwise, return NA.
#'
#' @param the formhub object which will be possibly co-erced to a SpatialPointsDataFrame object.
#' @export
#' @return A SpatialPointsDataFrame representation of this formhub Object
#' @examples
#' good_eats_data <- as.data.frame(formhubDownload("good_eats", "mberg"))
#' ge_spdf <- as.SpatialPointsDataFrame(good_eats_data)
#' class(ge_spdf) # "SpatialPointsDataFrame"
as.SpatialPointsDataFrame <- function(formhubObj) {
  gpsfields = subset(formhubObj@form, type %in% c("gps", "geopoint"))$name
  gpsfields = gpsfields[which(gpsfields %in% names(data.frame(formhubObj)))]
  if(length(gpsfields) == 0) NA
  else {
    #TODO: deal with multiple gps questions?
    gpses <- data.frame(formhubObj)[,gpsfields[1]]
    dropRows <- gpses=="NA" | is.na(gpses)
    if(any(dropRows)) {    
      warning(paste("formhub.R: Dropping",table(dropRows)['TRUE'],"rows because GPS not present."))
      gpses <- gpses[!dropRows]
    }
    gpses_split <- apply(str_split_fixed(gpses, " ", 3)[,c(2,1)], 2, FUN=as.numeric)
    sp::SpatialPointsDataFrame(gpses_split, data.frame(formhubObj)[!dropRows,],
                           proj4string=sp::CRS("+proj=longlat +datum=WGS84 +ellps=WGS84"))
  }
}


#' Get a new dataframe, where the header contains the full questions as opposed to slugs.
#'
#' formhub Objects have some data, as well as the form, which documents how
#' the data was obtained through a survey. The data, by default, is represented by
#' slugs, ie, items in the `name` column in the original xfrom. This function
#' replaces slugs in the header with the actual question text.
#'
#' @param formhubDataObj is the formhub data object whose data slot will be renamed
#' @export
#' @return a new data frame with the column names renamed from `name`s (slugs) to `label`s(full questions)
#' @examples
#' good_eats <- formhubDownload("good_eats", "mberg")
#' names(good_eats) # still slugged names
#' summary(good_eats$rating)
#' full_header_good_eats <- replaceHeaderNamesWithLabels(good_eats)
#' names(full_header_good_eats) # not slugged anymore
#' summary(full_header_good_eats$Rating) # but data is the same
replaceHeaderNamesWithLabels <- function(formhubDataObj) {
  newNames <- lapply(names(formhubDataObj), function(nm) {
    index <- which(formhubDataObj@form$name == nm)
    if (length(index) == 0) {
        # 2nd pass: deals with data frame names where character are replaced by dots
        index <- which(str_detect(formhubDataObj@form$name, paste0('^',nm,'$'))) 
    } 
    # replace the name if exactly one other label matches
    if(length(index) == 1 && !is.na(formhubDataObj@form$label[[index]])) {
       formhubDataObj@form$label[[index]]
    } else {
      nm
    }
  })
  setNames(data.frame(formhubDataObj), newNames)
}

#' Get a column of a formhubData object, but with its values replaced by labels
#'
#' @param formhubDataObj is the formhub data object to operate on
#' @param colname is the column name we want to revalue and return
#' @export
#' @return A vector of re-valued data
#' @examples
#' good_eats <- formhubDownload("good_eats", "mberg")
#' replaceColumnNamesWithLabels(good_eats, 'rating')
replaceColumnNamesWithLabels <- function(formhubDataObj, colname) {
    coloptions = formhubDataObj@form[colname,'options']
    if(is.na(coloptions)) stop("names and labels not found for column ", colname)
    optiondf = ldply(RJSONIO::fromJSON(coloptions))
    revalue(formhubDataObj[[colname]], setNames(optiondf$label, optiondf$name))
}

#' Remap all of the columns of the formhub data object according to the remap_list
#'
#' @param remap A vector. The name is what to map, and the value what to map to. 
#'        Example:
#'            remap = c("yes" = TRUE, "no" = FALSE, "dk" = NA)
#'        maps all "yes" values to TRUE, "no" to FALSE, and "dk" to NA
#' @param strictness One of "exact", "all_found", or "any_found"; Default = all.
#'        Defines the strictness of finding data. For example, all_found ensures
#'        that all keys in the data are found in the keys of our remapList,
#'        anyFound will replace partial matches, whereas exact ensures a full 2way match.
#' @export
#' @return A data.frame with values replaced.
#' @examples
#' good_eats <- formhubDownload("good_eats", "mberg")
#' 
remapAllColumns <- function(formhubDataObj, remap, strictness="all_found") {
  data.frame(
    llply(formhubDataObj, function(column) {
      if(is.factor(column) || is.character(column)) {
        data_keys = levels(as.factor(column))
        new_keys = names(remap)
        strictness_criteria_met = switch(strictness,
          all_found = length(data_keys) > 0 & all(data_keys %in% new_keys),
          any_found = length(intersect(data_keys, new_keys)) > 0,
          exact = length(new_keys) == length(data_keys) &
            length(new_keys) == length(union(data_keys, new_keys)))
        if(strictness_criteria_met)
          if(is.logical(remap)) { ## SPECIAL CASE for TRUE/FALSE: cast before returning
            return(as.logical(revalue(column, remap, warn_missing=FALSE)))
          } else {
            return(revalue(column, remap, warn_missing=FALSE))
          }
      }
      return(column)
  }), stringsAsFactors=FALSE)
}

#' Get a new dataframe, where all 'name's are replaced with full labels.
#'
#' formhub Objects have some data, as well as the form, which documents how
#' the data was obtained through a survey. The data, by default, is represented by
#' slugs, ie, items in the `name` column in the original xfrom. This function
#' replaces slugs in the header with actual question text, and replaces slugs in
#' select one options with the actual resposne text.
#'
#' @param formhubDataObj is the formhub data object whose data slot will be renamed
#' @param language if this is a multi-lingual form, the language of choice for
#'        the labels should be passed in. For single-language forms language=NULL.
#' @export
#' @return a new data frames with the column names, as well as factor values, renamed from `name`s (slugs) to `label`s(full questions)
#' @examples
#' good_eats <- formhubDownload("good_eats", "mberg")
#' names(good_eats) # still slugged names
#' summary(good_eats$rating)
#' good_eats_readable <- replaceAllNamesWithLabels(good_eats)
#' names(good_eats_readable) # not slugged anymore
#' summary(good_eats_readable$`Risk Factor`) # not slugged anymore.
replaceAllNamesWithLabels <- function(formhubDataObj, language=NULL) {
  data <- data.frame(formhubDataObj)
  form <- formhubDataObj@form
  row.names(form) <- form$name
  
  l_ply(form[form$type == 'select one',]$name, function(field_name) {
    ol <- RJSONIO::fromJSON(form[field_name,'options'])
    old <- if (is.null(language)) {
        tryCatch({
            ldply(ol, rbind)
        }, error=function(e) {
            stop("If you have a multi-language form, please specify the language.") 
        })
    } else {
        tryCatch({
            ldply(ol, function(option) {
                c(name = option[['name']], label=option[['label']][[language]])
            })
        }, error=function(e) {
            stop("Language argument should be null for single-language forms.") 
        })
    }
    if (! field_name %in% names(data)) {
      col_name <- names(data)[str_detect(field_name, paste0('^', names(data), '$'))] # sometimes characters are
        # replaced by dot; we take advantage of fact that . is an all-character
        # match for regular expressions, and reassign field_name to a name that matches
    } else {
      col_name <- field_name
    }
    stopifnot(length(col_name) == 1) #form and data don't match
    levels(data[,col_name]) <<- revalue(levels(data[,col_name]), 
                                        setNames(as.character(old$label), as.character(old$name)))
  })
  stopifnot(is.data.frame(data))
  replaceHeaderNamesWithLabels(new("formhubData", data, form=form))
}

#' Download data from formhub.
#'
#' This function downloads a dataset for the given form and username, and produces a 
#' formhubData Object.
#'
#' @param formName formname on formhub.org for which we download the data
#' @param uname formhub.org username
#' @param pass formhub.org password, if the data and/or form is private
#' @param ... other parameters to pass onto formhubRead
#' @export
#' @return formhubDataObj a formhubData Object, with "data" and "form" slots
#' @examples
#' good_eats <- formhubDownload("good_eats", "mberg")
#' good_eats # is a data frame of all the data
#' good_eats@form # is the form for that data, encoded as a dataframe
#' privateData <- formhubDownload("Private_Data_For_Testing", uname="formhub_r", pass="t3st~p4ss")
formhubDownload = function(formName, uname, pass=NA, authfile=NA, url='http://formhub.org/', ...) {
  fUrl <- function(formName, uname, form=F) {
    str_c(url, uname, '/forms/', formName,
          ifelse(form,'/form.json', '/data.csv'))
  }
  dataUrl = fUrl(formName, uname)
  formUrl = fUrl(formName, uname, form=T)
  
  ## TODO: implement better authentication in formhub. until then, we use a simple authfile
  upass <- if(!is.na(authfile)) {
    scan(authfile, what=character())
  } else if(!is.na(pass)) {
    str_c(uname,pass,sep=":")
  }
  
  #TODO -- pre-flight check? below doesn't work; expects 200+ status
  #if(!url.exists(datUrl)) { stop("could not find ", dataUrl)}
  #if(!url.exists(formUrl)) { stop("could not find ", formUrl)}
  
  # get the data, depending on public or not
  dataCSVstr <- ifelse(is.na(pass) & is.na(authfile),
                 RCurl::getURI(dataUrl),
                 RCurl::getURI(dataUrl, userpwd=upass, httpauth = 1L))
  
  # get the form, depending on public or not
  # TODO: situations where data is public, form is not
  formJSON <- ifelse(is.na(pass) & is.na(authfile),
                     RCurl::getURI(formUrl),
                     RCurl::getURI(formUrl, userpwd=upass, httpauth = 1L))
  formhubRead(textConnection(dataCSVstr), formJSON, ...)
}

#' Reads data from a passed csv filename and json filename into a formhubData object.
#'
#' This function creates a formhubData object from two files: a csv data file, and a 
#' json form file. These should both be downloaded from formhub.org for the same form.
#'
#' @param csvfilename filename (or a connection object) that has the formhub data
#' @param jsonfilename filename of a json file (or a connection object) that has the form.json form
#' @param extraFormDF override the form (such as by providing a type for a calculate, a new label, etc.)
#' @param dropCols a regular expression, any column name that matches that regexp will be dropped
#' @param na.strings list of na.strings to be passed onto read.csv (default: "n/a")
#' @param keepGroupNames for a question with name foo in group bar, keepGroupName=T will generate
#'        a name foo.bar, while keepGroupName=F will generate a name bar
#' @export
#' @return formhubDataObj a formhubData Object, with "data" and "form" slots
#' @examples
#' # will need to download data.csv and form.json for a specific form on formhub, for below, download
#' http://formhub.org/mberg/forms/good_eats/data.csv http://formhub.org/mberg/forms/good_eats/form.json
#' good_eats <- formhubRead("~/Downloads/good_eats_2013_05_05.csv", "~/Downloads/good_eats.json")
#' head(good_eats) # is a data frame of all the data
#' good_eatsX <- formhubRead("~/Downloads/good_eats_2013_05_05.csv", "~/Downloads/good_eats.json",
#'              extraFormDF=data.frame(name="imei", type="integer", label="IMEI"))
#' good_eatsX@form # note that imei is now slated as type "integer" instead of type "imei"
#' str(good_eatsX$imei) # also notice that it is numeric instead of a factor
#' good_eatsWO <- formhubRead("~/Downloads/good_eats_2013_05_05.csv", "~/Downloads/good_eats.json",
#'              dropCols="submit*")
#' names(good_eatsWO) # notice how submit_date and submit_date are no longer there
#' good_eatsNA <- formhubRead("~/Downloads/good_eats_2013_05_05.csv", "~/Downloads/good_eats.json",
#'              na.strings=c("999"))
#' good_eatsNA$amount # notice that the value that was 999 is now missing. This is helpful when using values such
#'                    # as 999 to indicate no data
formhubRead = function(csvfilename, jsonfilename, extraFormDF=data.frame(), dropCols="", na.strings=c("n/a"),
                        convert.dates=TRUE, keepGroupNames=TRUE) {
  dataframe <- read.csv(csvfilename, stringsAsFactors=FALSE, header=TRUE, na.strings=na.strings)
  formDF <- form_to_df(RJSONIO::fromJSON(jsonfilename, encoding='utf-8'), keepGroupNames=keepGroupNames)
  
  # drop group names from data frame names
  dataframe <- setNames(dataframe, llply(names(dataframe), function(name) {
    if (name %in% formDF$name) {
      name
    } else { # try to deal with the fact that R munges non-alphanumeric characters into a .
      split_by_dots <- unlist(str_split(name, '\\.'))
      sbd_length <- length(split_by_dots)
      if(sbd_length == 1) { name } else {
        # first try and see if we find a match as a simple question
        searchstring <- paste0('^',split_by_dots[sbd_length],'$')
        possible_name <- formDF$name[str_detect(formDF$name, searchstring)]
        if(length(possible_name) == 1) { # we're home free
          possible_name
        } else {
          # next try to see if we match the last dot-phrase (ie, an option for multi-select)
          searchstring <- paste0('^',split_by_dots[sbd_length-1], '\\.',
                                 split_by_dots[sbd_length],'$')
          possible_name <- formDF$name[str_detect(formDF$name, searchstring)]
          if(length(possible_name) == 1) possible_name else name
        }
      }
    }
  }))

  formhubCast(dataframe, formDF, extraFormDF=extraFormDF, dropCols=dropCols,
              convert.dates=convert.dates)
}

#' Casts a dataframe to the right types based on a form-dataframe.
#'
#' This function creates a formhubData object based on a pair of dataframes: the data
#' and the form that describes the data. The column names of the data match with the "name" column of 
#' the form, and the "type" column in the form provide information for type conversion.
#'
#' @param dataDF data
#' @param formDF form data frame. See format above.
#' @param extraFormDF override the form (such as by providing a type for a calculate, a new label, etc.)
#' @param dropCols a regular expression, any column name that matches that regexp will be dropped
#' @return formhubDataObj a formhubData Object, with "data" and "form" slots
#' @examples
#' 
#' #See examples under formhubRead; this should be used through formhubRead in almost all cases
formhubCast  = function(dataDF, formDF, extraFormDF=data.frame(), dropCols="", convert.dates=TRUE) {
  dataDF <- removeColumns(dataDF, dropCols)

  extraFormDF <- colwise(as.character)(extraFormDF)
  formDF <- rbind.fill(extraFormDF, formDF)
  if(anyDuplicated(formDF$name)) {
    warning("Question names not unique in form; questions may be dropped from form df.")
    formDF <- formDF[!duplicated(formDF$name),]
  }
  row.names(formDF) <- formDF$name
  
  data = recastDataFrameBasedOnFormDF(dataDF, formDF, convert.dates=convert.dates)
  stopifnot(is.data.frame(data))
  new("formhubData", data, form=formDF)
}

#' Converts formhub form.json format to dataframe format. Dataframe has name, type, label columns.
#'
#' @param formJSON formJSON that has been freshly read from JSON using JSONIO's fromJSON function.
#' @param keepGroupNames for a question with name foo in group bar, keepGroupName=T will generate
#'        a name foo.bar, while keepGroupName=F will generate a name bar
#' @return formDF
#' @examples
#' good_eats_form_df <- form_to_df(fromJSON("~/Downloads/good_eats.json"))
form_to_df = function(formJSON, keepGroupNames=TRUE) {
  form_to_df_internal = function(thisJSON, prefix="") {
    ldply(thisJSON[["children"]], function(child) {
      nom <- if (prefix == "") { child[["name"]] } else { paste(prefix, child[["name"]], sep=".") }
      if (child[["type"]] == "group") {
        if(keepGroupNames) {
          form_to_df_internal(child, prefix=nom)
        } else {
          form_to_df_internal(child)
        }
      } else if (child[["type"]] == "select all that apply") {

        options <- child[["children"]]        
        nameprefix <- ifelse(prefix=="", child[["name"]], str_c(prefix, child[["name"]], sep="."))
        names <- paste(nameprefix, sapply(options, function(o) o['name']), sep=".")
        
        labels <- sapply(options, function(o) { paste( child[["label"]], o['label'], sep=" >> ")})
        #TODO: fix properly; this is a hack for multi-lingual labels
        labels <- if(class(labels) == 'matrix') { labels[1,] } else { labels }
        data.frame(name=names, label=labels, type="boolean", options=NA, stringsAsFactors=F)
      } else if (child[["type"]] == "select one") {
        if("children" %in% names(child)) {
            data.frame(name=nom, type=child[["type"]], options=RJSONIO::toJSON(child$children),
                   label=if("label" %in% names(child)) {child[["label"]]} else {child[["name"]]},
                   stringsAsFactors=F)
        } else if ("itemset" %in% names(child)) {
            data.frame(name=nom, type=child[["type"]],
                     options=RJSONIO::toJSON(formJSON$choices[[child[['itemset']]]]), # options are more complex with itemset
                     label=if("label" %in% names(child)) {child[["label"]]} else {child[["name"]]},
                     stringsAsFactors=F)
        }
      } else {
        data.frame(name=nom, type=child[["type"]], options=NA,
                   label=if("label" %in% names(child)) {child[["label"]]} else {child[["name"]]},
                   stringsAsFactors=F)
      }
    })
  }
  df <- form_to_df_internal(formJSON)
  df
}

#' Casts a dataframe to the right types based on a form-dataframe. Used by formhubCast
#'
#' @param df data
#' @param formdf form data frame. See format on formhubCast.
#' @return df re-casted data frame.
#' @examples
#' 
#' #See examples under formhubRead; this should be used through formhubRead in almost all cases
recastDataFrameBasedOnFormDF = function(df, formdf, convert.dates=TRUE) {
  # do this by type
  #TODO: refactor
  stopifnot(is.character(formdf$name))
  # re-type everything in df of type in types with reTypeFunc
  reTypeColumns <- function(types, reTypeFunc) { 
    cols <- c(subset(formdf, type %in% types)$name)
    colsToReType <- unique(cols[cols %in% names(df)])
    suppressWarnings(suppressMessages(
      df[colsToReType] <<- colwise(reTypeFunc)(df[colsToReType])
    ))
  }
  # lubridate doesn't handle ISO 8601 datetimes yet, so we just chuck the timezone info
  iso8601DateTimeConvert <- function(x) { lubridate::ymd_hms(str_extract(x, '^[^+Z]*(T| )[^+Z-]*')) }
  
  # some formhub dates come in the format 2011-04-24T00:20:00.000000
  iso8601DateConvert <- function(x) { lubridate::ymd(str_extract(x, '^[^T]*')) }
  
  reTypeColumns(c("integer", "decimal"), as.numeric)
  reTypeColumns(c("boolean"), as.logical)
  reTypeColumns(c("select one", "imei", "subscriberid", "simserial", "deviceid", "phonenumber"), as.factor)
  if(convert.dates) {
    reTypeColumns(c("date", "today"), iso8601DateConvert)
    reTypeColumns(c("start", "end", "datetime"), iso8601DateTimeConvert)
  }
  df
}

#' Add columns corresponding to the original, as well as medium and small thumbnails of images
#' as stored on the formhub server.
#'
#' @param the formhubData object which to create URLs from.
#' @param the formhub username of the person who owns this form.
#' @param type: "url" or "img". URL only puts image url in, img puts image tag in.
#'
#' @export
#' @return a formhubData object, with a few additional URL columns
#' @examples
#' good_eats <- as.data.frame(formhubDownload("good_eats", "mberg"))
#' good_eats_with_photos <- addPhotoURLs(good_eats, "mberg")
#' grep("URL", names(good_eats_with_photo), value=T) # the new columns
addPhotoURLs = function(formhubDataObj, formhubUsername, type="url") {
  photos <- c(subset(formhubDataObj@form, type %in% "photo")$name)
  htmlFromCol <- function(photoCol, size, type) {
    stopifnot(size %in% c("", "medium", "small"))
    if (type == "url") { 
      ifelse(is.na(photoCol), "",
           sprintf("https://formhub.org/attachment/%s?media_file=%s/attachments/%s",
                   size, formhubUsername, photoCol))
    } else if (type == "img") {
      ifelse(is.na(photoCol), "",
             sprintf('<img src="https://formhub.org/attachment/%s?media_file=%s/attachments/%s" />',
                     size, formhubUsername, photoCol))
    } else { 
      stop("Type must be either 'url' or 'img'.")
    }
  }
  tmp <- as.data.frame(llply(photos, function(photoColName) {
    photoCol <- formhubDataObj[[photoColName]]
    setNames(data.frame(
      htmlFromCol(photoCol, "", type),
      htmlFromCol(photoCol, "medium", type),
      htmlFromCol(photoCol, "small", type),
      stringsAsFactors=FALSE
    ), paste0(photoColName, c("_URL_original", "_URL_medium", "_URL_small")))
  }))
  tmp <- cbind(formhubDataObj, tmp)
  new("formhubData", tmp, form=formhubDataObj@form)
}

#' Helper function to remove columns from data based on reg-exp matching. Also takes list of strings.
#'
#' @param df data
#' @param columnNameRegExpMatcher pattern(s) to match to columns; matched columns are dropped.
#' @return a smaller data frame.
#' @examples
#' good_eats_df <- formhubDownload("good_eats", "mberg")
#' names(good_eats_form_df) # note it includes submit_date and submit_data both
#' names(removeColumns(good_eats_form_df, "submit*")) # both of which are gone now
#' names(removeColumns(good_eats_form_df, c("submit*", "_gps*")) # you can pass a list of regular expressions
removeColumns <- function(df, columnNameRegExpMatcher) {
  if (columnNameRegExpMatcher=="" || is.na(columnNameRegExpMatcher)) { 
    df 
  } else {
    orMatcher <- paste(columnNameRegExpMatcher, collapse="|")
    df[,-which(str_detect(names(df), orMatcher))]
  }
}
mlgrm/svy documentation built on May 23, 2019, 2:09 a.m.