R/read-epx.R
In epxToR: Import 'Epidata' XML Files '.epx'

Documented in abstract as.data.frame read.epx

## Read read data using the new Epidata XML format into R

## Copyright (C) 2011 David Whiting

## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 3 of the
## License, or (at your option) any later version.

## This program is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software Foundation,
## Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA


## PLEASE NOTE THAT I AM STILL EXPERIMENTING WITH THIS AND IT MIGHT
## NOT ALWAYS WORK.


status.log <- function(x) {
  ## Purpose: Simple logging mechanism. Can be useful for detecting bottlenecks etc.
  ## ----------------------------------------------------------------------
  ## Arguments: a message to be recorded in the log
  ## ----------------------------------------------------------------------
  ## Author: David Whiting, Date: 12 Jun 2011, 18:27
  right.now <- strftime(Sys.time(), format = "%Y-%m-%d %H:%M:%S")
  cat(paste(right.now, x, "\n"), file = "STATUS.LOG", append = TRUE)
}



extract.epidata.records <- function(rec, fields) {
  ## Purpose: Extract records from xml structure, allowing for missing fields.
  ## ----------------------------------------------------------------------
  ## Arguments: rec: a single record
  ## fields: a vector of field names (probably from the info table)
  ## ----------------------------------------------------------------------
  ## Author: David Whiting, Date: 12 Jun 2011, 19:59
  #require(httr)

  dd <- xmlValue(rec)
  L_ <- gsub(";","&", dd)
  L_ <- gsub('\\"','"', L_)
  URL <- parse_url(sprintf("?%s",L_))$query
  DAT <- data.frame(URL, stringsAsFactors=FALSE)

  names.of.missing.fields <- fields[!fields %in% names(DAT)]
  if (length(names.of.missing.fields)) {
    num.missing.flds <- length(names.of.missing.fields)
    missing.flds <- rep(NA, num.missing.flds)
    names(missing.flds) <- names.of.missing.fields
    DAT <- c(DAT, missing.flds)
  }
  ## Sort the fields so that they are all in the same order.
  DAT <- DAT[order(names(DAT))]
  DAT
}



## ======================================================================
## Purpose: Get the records from the XML file
## ----------------------------------------------------------------------
## Arguments: datfile: the DataFiles node from the XML file.
## flds: field information, as generated by fld.info()
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 01 Feb 2017
## ======================================================================
epidata.records <- function(datfile, flds) {
  status.log(paste(">>>> epidata.records", "start"))

  epi.records <- xmlChildren(datfile)[["Records"]]
  num.recs <- xmlSize(epi.records)
  status.log(paste("Found", num.recs, "records"))
  status.log(paste(">>>> extract.epidata.records", "start"))
  recs <- xmlApply(epi.records, extract.epidata.records, flds)
  status.log("rbind the records")
  recs <- as.data.frame(do.call(rbind, recs))
  status.log(paste("Extracted", nrow(recs), "records"))
  rownames(recs) <- NULL
  recs
}


## ======================================================================
## Purpose: Convert from epidata to R data types
## ----------------------------------------------------------------------
## Arguments: x: a vector of values
## fld.type: the epidata type (a code number)
## Settings: settings information, as returned by epidata.meta.data()
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 7 Mar 2017
## ======================================================================
convert.type <- function(x, fld.type, Settings) {

  if (!is.factor(x)) {
    if (fld.type == "ftString") {
      x <- gsub('\\"', '', x)
    } else if (fld.type == "ftInteger") {
      x <- as.integer(x)
    }else if (fld.type == "ftFloat") {
       x <-type.convert(as.character(x), dec=",")
    }else if (fld.type == "ftDMYDate") {
      x <- as.Date(as.character(x), "%d/%m/%Y")
    }
    else {
      status.log(paste("Field type not handled:", fld.type))
    }

    # if (fld.type %in% c(1, 2)) {
    #   x <- as.numeric(as.character(x))
    # } else if (fld.type %in% c(12, 13)){
    #   ## Characters, do nothing
    # } else if (fld.type == 3){
    #   ## Decimal separator hack. It should convert to whatever R is using.
    #   levels(x) <- gsub("[,.]", Sys.localeconv()[['decimal_point']], levels(x))
    #   x <- as.numeric(as.character(x))
    # } else if (fld.type %in% c(4, 7) ){
    #   ## 16/05/1968 (DD/MM/YYYY, i.e. 16th of May, 1968)
    #   dateFormat <- paste("%d", "%m", "%Y", sep = Settings$DateSeparator)
    #   x <- as.Date(x, dateFormat)
    # } else if (fld.type %in% c(5, 8) ){
    #   ## 16/05/1968 (MM/DD/YYYY, i.e. May 16th, 1968)
    #   dateFormat <- paste("%m", "%d", "%Y", sep = Settings$DateSeparator)
    #   x <- as.Date(x, dateFormat)
    # } else if (fld.type %in% c(6, 9) ){
    #   ## 16/05/1968 (YYYY/MM/DD, i.e. 1968, May 16th)
    #   dateFormat <- paste("%Y", "%m", "%d", sep = Settings$DateSeparator)
    #   x <- as.Date(x, dateFormat)
    # } else if (fld.type %in% c(10, 11) ){
    #   ## Time fields. At the moment it sets the date part to the current date.
    #   timeFormat <- paste("%H", "%M", "%S", sep = Settings$TimeSeparator)
    #   x <- as.POSIXct(strptime(x, timeFormat))
    # } else if (fld.type == 0){
    #   ## Logical - empty to NA, Y to TRUE, else to FALSE
    #   x[x == ""] <- NA
    #   x <- x == "Y"
    # } else {
    #   status.log(paste("Field type not handled:", fld.type))
    # }
  }
  x
}


## ======================================================================
## Purpose: Apply the field definition information to each field
## ----------------------------------------------------------------------
## Arguments: sections: sections node from the XML file (these are
## sections of the data entry screen.
## dat: a dataframe of records that have been extracted from the XML file.
## Settings: settings information, as returned by epidata.meta.data()
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 12 Jun 2011, 18:27
## Author: jp.decorps@epiconcept.fr, Date: 29 Jan 2017
## ======================================================================
epidata.apply.field.structure <- function(sections, dat, Settings) {

  status.log(paste(">>>> epidata.apply.field.structure", "start"))

  num.sections <- xmlSize(xmlChildren(sections))

  print(num.sections)

  for (si in 1:num.sections) {
    fields <- xmlChildren(xmlChildren(sections)[[si]])[["Fields"]]
    num.flds <- xmlSize(fields)
    if (num.flds > 1) {
      for (i in 1:num.flds) {
        field <- xmlChildren(fields)[[i]]
        A_ <- xmlAttrs(field)
        fld.id <- A_["id"]
        # fld.id <- xmlAttrs(field)
        # fld.name <- xmlValue(xmlChildren(field)[["Name"]])
        fld.type <- A_["type"]
        # fld.type <- xmlValue(xmlChildren(field)[["type"]])
        # print(fld.type)
        fld <- which(names(dat) == fld.id)
        # names(dat)[fld] <- fld.name
        names(dat)[fld] <- fld.id
        dat[, fld] <- convert.type(dat[, fld], fld.type, Settings)
      }
    }
  }
  dat
}




## ======================================================================
## Function: read.epx (constructor)
## Description: Main user function to read in the XML file.
## ----------------------------------------------------------------------
## Arguments: x: the name of an XML file.
## use.epidata.labels: If FALSE do not use the epidata value labels.
## set.missing.na: if TRUE (the default) use the epidata definition
## of missing values and set the value in R to NA. Epidata allows
## for more than one definition of missing value, and all of these
## will be mapped to NA.
## ----------------------------------------------------------------------
## Return: epx.data object
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 12 Jun 2011, 18:27
## Author: jp.decorps@epiconcept.fr, Date: 03 Feb 2017, 02:11
## ======================================================================
read.epx <- function(x,
                     use.epidata.labels = TRUE,
                     set.missing.na = TRUE) {

  #require(XML)

  unlink("STATUS.LOG")

  t1 <- Sys.time()
  status.log(paste("Parsing", x))
  y <- list()
  y[['filename']] <- x

  ## Take all the records.
  x <- xmlTreeParse(x)

  epidata <- xmlRoot(x)
  x.fld.info <- fld.info(epidata)

  y[['Settings']] <- epidata.meta.data(epidata, "Settings")
  ## Get the data files
  num.datafiles <- xmlSize(xmlChildren(epidata)["DataFiles"])
  for (i in 1:num.datafiles) {
    datfile <- xmlChildren(xmlChildren(epidata)[["DataFiles"]])[[i]]
    datfile.name <- xmlAttrs(datfile)[["id"]]
    sections <- xmlChildren(datfile)[["Sections"]]

    # status.log("Get the records")
    dat1 <- epidata.records(datfile, x.fld.info$id)
    if (nrow(dat1) > 0) {
      status.log("Apply field structure")
      dat1 <- epidata.apply.field.structure(sections, dat1, y$Settings)
      y$data[i] <- list(dat1)
      names(y$data)[i] <- datfile.name
    }
  }

  y[['field.info']] <- x.fld.info
  y[['labels']] <- get.epidata.value.labels(epidata, y$Settings)
  y[['ProjectSettings']] <- epidata.meta.data(epidata, "ProjectSettings")
#  y[['Admin']] <- epidata.meta.data(epidata, "Admin")
  y[['Study']] <- epidata.study.info(epidata)

  if (use.epidata.labels & "data" %in% names(y)) {
    status.log("Use epidata labels")
    y <- use.epidata.labels(y, set.missing.na)
  }
  duration <- round(as.numeric(difftime(Sys.time(), t1), units = "secs"), 1)
  status.log(paste("Finished in", duration, "seconds."))

  # return(y) # y

  # --- Create an epx.data object
  # structure(list("epx"=y), class = "epx")
  # epx <- list(y)
  class(y) <- "epx"
  y
}

## ======================================================================
## Function: as.data.frame
## Description: S3 method - return data.frame from epx object.
## ----------------------------------------------------------------------
## Arguments: x: an epx object from read.epx
## Return: a data.frame
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 09 Feb 2017, 02:11
## ======================================================================
as.data.frame <- function(x) UseMethod("as.data.frame", x)
as.data.frame.epx <- function(x) {
  x$data[[1]]
}

## ======================================================================
## Function: return Study infos as a data.frame
## Description: S3 method - return data.frame from epx object.
## ----------------------------------------------------------------------
## Arguments: x: an epx object from read.epx
## Return: a data.frame
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 07 Mar 2017, 02:38
## ======================================================================
abstract <- function(x) UseMethod("abstract", x)
abstract.epx <- function(x) {
  I <- c("File name", "Title", "Author", "Agency",
            "Created", "Identifier", "Modified",  "Notes", "Version")
  Z <- x$Study
  V <- lapply(x$Study, function(x) ifelse(is.null(x), NA, x))
  R <- c(V$Title[[1]], V$Author, V$Agency, V$Created, V$Identifier, V$Modified, V$Notes, V$Version)
  R <- c(x$filename, R)

  #return(R)
  df <- data.frame(I, R)
  colnames(df) <- c("Info", "Value")
  df
}

## ======================================================================
## Purpose: Get somes infos about the study
## ----------------------------------------------------------------------
## Arguments: x: an xmlRoot()
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 07 Mar 2017, 01:15
## ======================================================================
epidata.study.info <- function(x) {
  .tag <- "StudyInfo"
  l_node <- xmlElementsByTagName(x, .tag, recursive = TRUE)[[.tag]]
  .l = xmlToList(node=l_node)
  .l

}
## ======================================================================
## Purpose: Get the epidata settings information
## ----------------------------------------------------------------------
## Arguments: x: an xmlRoot()
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 29 Jan 2017, 03:35
## ======================================================================
epidata.meta.data <- function(x, tag) {

  status.log(paste("epidata.meta.data", tag))

  y <- list()
  l_node <- xmlElementsByTagName(x, tag, recursive = TRUE)[[tag]]
  l_attr <- xmlAttrs(l_node, TRUE, TRUE)
  status.log("epidata.meta.data")
  for (i in 1:xmlSize(l_attr)) {
    dd <- l_attr[[i]]
    if (length(dd) == 0) dd <- ""
    y[names(l_attr)[[i]]] <- dd
  }
  y
}






fld.info <- function(x) {
  ## Purpose: Create a table of info about the fields
  ## ----------------------------------------------------------------------
  ## Arguments: x: an xmlRoot()
  ## ----------------------------------------------------------------------
  ## Author: David Whiting, Date: 12 Jun 2011, 18:26

  status.log(paste("fld.info", "start"))

  y <- xmlElementsByTagName(x, "Field", recursive = TRUE)
  fld.id <- NULL
  fld.name <- NULL
  fld.type <- NULL
  fld.length <- NULL
  fld.decimals <- NULL
  fld.question <- NULL
  fld.valuelabel <- NULL
  for (i in 1:xmlSize(y)) {
    A_ <- xmlAttrs(y[[i]])
    fld.id <- c(fld.id, A_["id"])
    fld.name <- c(fld.name,  A_["id"])
    fld.type <- c(fld.type,  A_["type"])
    fld.valuelabel <- c(fld.valuelabel, A_["valueLabelRef"])
    # fld.length <- c(fld.length, xmlValue(xmlChildren(y[[i]])[["Length"]]))
    # fld.decimals <- c(fld.decimals, xmlValue(xmlChildren(y[[i]])[["Decimals"]]))
    # fld.question <- c(fld.question, xmlValue(xmlChildren(y[[i]])[["Question"]]))
    # fld.valuelabel <- c(fld.valuelabel, xmlValue(xmlChildren(y[[i]])[["ValueLabelId"]]))
  }
  dt <- data.frame(id = fld.id,
             name = fld.name,
             type = fld.type,
             value.labelset = fld.valuelabel
  )
  dt
}




get.epidata.value.labels <- function(x, Settings) {
  ## Purpose: Create a list of epidata labels
  ## ----------------------------------------------------------------------
  ## Arguments: x: an xmlRoot
  ## ----------------------------------------------------------------------
  ## Author: David Whiting, Date: 14 Jun 2011, 20:26
  ## ----------------------------------------------------------------------
  status.log(paste("get.epidata.value.labels", "start"))

  y <- xmlElementsByTagName(x, "ValueLabelSet", recursive = TRUE)

  if (xmlSize(y) == 0) return(NULL)

  i <- 1
  value.labels <- list()
  for (i in 1:xmlSize(y)) {
    this.valueset <- y[[i]]
    A_ <- xmlAttrs(this.valueset)
    valueset.id <- A_["id"]
    valueset.name <- A_["id"]
    valueset.type <- A_["type"]
    j <- 1
    this.value <- NULL
    this.order <- NULL
    this.label <- NULL
    this.missing <- NULL
    L_size = xmlSize(this.valueset)
    for (j in 1:L_size) {
      VLS = this.valueset[[j]]
      ATTR <- xmlAttrs(VLS)

      this.value <- c(this.value, ATTR["value"])
      this.order <- c(this.order, ATTR["order"])

      LB <- xmlValue(VLS[[1]])
      LB <- gsub('\\"','"', LB)

      if (LB == "Manquant") {
        this.missing <- c(this.missing, TRUE)
      } else {
        this.missing <- c(this.missing, FALSE)
      }
      this.label <- c(this.label, xmlValue(VLS[[1]]))
    }
    ## Convert the value to the right data type
    this.value <- convert.type(factor(this.value), valueset.type, Settings)
    these.labels <- data.frame(value = this.value, order = this.order, label = this.label, missing = this.missing)
    these.labels <- list(name = valueset.name,
                         type = valueset.type,
                         labels = these.labels)
    value.labels[valueset.id] <- list(these.labels)
  }
  value.labels
}


is.epidata.na <- function(x, value.labels, label.set) {
  ## Purpose: Determine if a value is missing or not
  ## ----------------------------------------------------------------------
  ## Arguments: x: a vector of values
  ## value.labels: a list of value labels created by get.epidata.value.labels()
  ## label.set: the name of a set of labels.
  ## ----------------------------------------------------------------------
  ## Returns: a logical vector (TRUE/FALSE)
  ## ----------------------------------------------------------------------
  ## Author: David Whiting, Date: 14 Jun 2011, 20:26

  retval <- NULL
  for (j in 1:length(x)) {
    if (is.na(x[j])) {
      this.val <- NA
    } else {
      i <- as.character(value.labels[[label.set]]$labels$value) == as.character(x[j])
      this.val <- value.labels[[label.set]]$labels$missing[i]
    }
    retval <- c(retval, this.val)
  }
  retval
}




epidata.value.label <- function(x, value.labels, label.set) {
  ## Purpose: Return the value label for a given value
  ## ----------------------------------------------------------------------
  ## Arguments: x: a vector of values
  ## value.labels: a list of value labels created by get.epidata.value.labels()
  ## label.set: the name of a set of labels.
  ## ----------------------------------------------------------------------
  ## Returns: a factor vector of value labels for a given values
  ## ----------------------------------------------------------------------
  ## Author: David Whiting, Date: 14 Jun 2011, 20:26

  retval <- NULL
  missing.levels <- NULL

  for (j in 1:length(x)) {
    if (is.na(x[j])) {
      this.val <- NA
    } else {
      VF = gsub('\\"', '', as.character(x[j]))
      i <- as.character(value.labels[[label.set]]$labels$value) == VF
      if (any(i)) {
        this.val <- as.character(value.labels[[label.set]]$labels$label[i])
      } else {
        missing.levels <- unique(c(missing.levels, as.character(x[j])))
        this.val <- as.character(x[j])
      }
    }
    retval <- c(retval, this.val)
  }
  if (!is.null(missing.levels)) {
    missing.levels <- paste(missing.levels, collapse = ", ")
    status.log(paste("Levels missing in label set ",  label.set, ": ", missing.levels, sep = ""))
  }
  as.factor(retval)
}



use.epidata.labels <- function(x, set.missing.na = TRUE) {
  ## Purpose: Recode the data using the epidata value labels
  ## ----------------------------------------------------------------------
  ## Arguments: x: an imported object with data and field info
  ## ----------------------------------------------------------------------
  ## Author: David Whiting, Date: 14 Jun 2011, 20:26
  ## ----------------------------------------------------------------------
  status.log(paste("Use epidata labels", "start"))

  for (i in 1:nrow(x$field.info)) {
    if (!is.na(x$field.info$value.labelset)[i]) {
      this.labelset <- as.character(x$field.info$value.labelset[i])
      this.field <- as.character(x$field.info$name[i])
      ## This is a bit clumsy, but I had to break it down to get my head
      ## around it.
      j <- which(names(x$data[[1]]) == this.field)
      dd <- x$data[[1]][, j]
      ## Mark the missing values first
      if (set.missing.na) {
        dd[is.epidata.na(dd, x$labels, this.labelset)] <- NA
      }
      ## Relabel the values.
      dd <- epidata.value.label(dd, x$labels, this.labelset)
      x$data[[1]][, j] <- dd
    }
  }
  x
}