Nothing
## Read read data using the new Epidata XML format into R
## Copyright (C) 2011 David Whiting
## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 3 of the
## License, or (at your option) any later version.
## This program is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software Foundation,
## Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
## PLEASE NOTE THAT I AM STILL EXPERIMENTING WITH THIS AND IT MIGHT
## NOT ALWAYS WORK.
status.log <- function(x) {
## Purpose: Simple logging mechanism. Can be useful for detecting bottlenecks etc.
## ----------------------------------------------------------------------
## Arguments: a message to be recorded in the log
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 12 Jun 2011, 18:27
right.now <- strftime(Sys.time(), format = "%Y-%m-%d %H:%M:%S")
cat(paste(right.now, x, "\n"), file = "STATUS.LOG", append = TRUE)
}
extract.epidata.records <- function(rec, fields) {
## Purpose: Extract records from xml structure, allowing for missing fields.
## ----------------------------------------------------------------------
## Arguments: rec: a single record
## fields: a vector of field names (probably from the info table)
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 12 Jun 2011, 19:59
#require(httr)
dd <- xmlValue(rec)
L_ <- gsub(";","&", dd)
L_ <- gsub('\\"','"', L_)
URL <- parse_url(sprintf("?%s",L_))$query
DAT <- data.frame(URL, stringsAsFactors=FALSE)
names.of.missing.fields <- fields[!fields %in% names(DAT)]
if (length(names.of.missing.fields)) {
num.missing.flds <- length(names.of.missing.fields)
missing.flds <- rep(NA, num.missing.flds)
names(missing.flds) <- names.of.missing.fields
DAT <- c(DAT, missing.flds)
}
## Sort the fields so that they are all in the same order.
DAT <- DAT[order(names(DAT))]
DAT
}
## ======================================================================
## Purpose: Get the records from the XML file
## ----------------------------------------------------------------------
## Arguments: datfile: the DataFiles node from the XML file.
## flds: field information, as generated by fld.info()
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 01 Feb 2017
## ======================================================================
epidata.records <- function(datfile, flds) {
status.log(paste(">>>> epidata.records", "start"))
epi.records <- xmlChildren(datfile)[["Records"]]
num.recs <- xmlSize(epi.records)
status.log(paste("Found", num.recs, "records"))
status.log(paste(">>>> extract.epidata.records", "start"))
recs <- xmlApply(epi.records, extract.epidata.records, flds)
status.log("rbind the records")
recs <- as.data.frame(do.call(rbind, recs))
status.log(paste("Extracted", nrow(recs), "records"))
rownames(recs) <- NULL
recs
}
## ======================================================================
## Purpose: Convert from epidata to R data types
## ----------------------------------------------------------------------
## Arguments: x: a vector of values
## fld.type: the epidata type (a code number)
## Settings: settings information, as returned by epidata.meta.data()
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 7 Mar 2017
## ======================================================================
convert.type <- function(x, fld.type, Settings) {
if (!is.factor(x)) {
if (fld.type == "ftString") {
x <- gsub('\\"', '', x)
} else if (fld.type == "ftInteger") {
x <- as.integer(x)
}else if (fld.type == "ftFloat") {
x <-type.convert(as.character(x), dec=",")
}else if (fld.type == "ftDMYDate") {
x <- as.Date(as.character(x), "%d/%m/%Y")
}
else {
status.log(paste("Field type not handled:", fld.type))
}
# if (fld.type %in% c(1, 2)) {
# x <- as.numeric(as.character(x))
# } else if (fld.type %in% c(12, 13)){
# ## Characters, do nothing
# } else if (fld.type == 3){
# ## Decimal separator hack. It should convert to whatever R is using.
# levels(x) <- gsub("[,.]", Sys.localeconv()[['decimal_point']], levels(x))
# x <- as.numeric(as.character(x))
# } else if (fld.type %in% c(4, 7) ){
# ## 16/05/1968 (DD/MM/YYYY, i.e. 16th of May, 1968)
# dateFormat <- paste("%d", "%m", "%Y", sep = Settings$DateSeparator)
# x <- as.Date(x, dateFormat)
# } else if (fld.type %in% c(5, 8) ){
# ## 16/05/1968 (MM/DD/YYYY, i.e. May 16th, 1968)
# dateFormat <- paste("%m", "%d", "%Y", sep = Settings$DateSeparator)
# x <- as.Date(x, dateFormat)
# } else if (fld.type %in% c(6, 9) ){
# ## 16/05/1968 (YYYY/MM/DD, i.e. 1968, May 16th)
# dateFormat <- paste("%Y", "%m", "%d", sep = Settings$DateSeparator)
# x <- as.Date(x, dateFormat)
# } else if (fld.type %in% c(10, 11) ){
# ## Time fields. At the moment it sets the date part to the current date.
# timeFormat <- paste("%H", "%M", "%S", sep = Settings$TimeSeparator)
# x <- as.POSIXct(strptime(x, timeFormat))
# } else if (fld.type == 0){
# ## Logical - empty to NA, Y to TRUE, else to FALSE
# x[x == ""] <- NA
# x <- x == "Y"
# } else {
# status.log(paste("Field type not handled:", fld.type))
# }
}
x
}
## ======================================================================
## Purpose: Apply the field definition information to each field
## ----------------------------------------------------------------------
## Arguments: sections: sections node from the XML file (these are
## sections of the data entry screen.
## dat: a dataframe of records that have been extracted from the XML file.
## Settings: settings information, as returned by epidata.meta.data()
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 12 Jun 2011, 18:27
## Author: jp.decorps@epiconcept.fr, Date: 29 Jan 2017
## ======================================================================
epidata.apply.field.structure <- function(sections, dat, Settings) {
status.log(paste(">>>> epidata.apply.field.structure", "start"))
num.sections <- xmlSize(xmlChildren(sections))
print(num.sections)
for (si in 1:num.sections) {
fields <- xmlChildren(xmlChildren(sections)[[si]])[["Fields"]]
num.flds <- xmlSize(fields)
if (num.flds > 1) {
for (i in 1:num.flds) {
field <- xmlChildren(fields)[[i]]
A_ <- xmlAttrs(field)
fld.id <- A_["id"]
# fld.id <- xmlAttrs(field)
# fld.name <- xmlValue(xmlChildren(field)[["Name"]])
fld.type <- A_["type"]
# fld.type <- xmlValue(xmlChildren(field)[["type"]])
# print(fld.type)
fld <- which(names(dat) == fld.id)
# names(dat)[fld] <- fld.name
names(dat)[fld] <- fld.id
dat[, fld] <- convert.type(dat[, fld], fld.type, Settings)
}
}
}
dat
}
## ======================================================================
## Function: read.epx (constructor)
## Description: Main user function to read in the XML file.
## ----------------------------------------------------------------------
## Arguments: x: the name of an XML file.
## use.epidata.labels: If FALSE do not use the epidata value labels.
## set.missing.na: if TRUE (the default) use the epidata definition
## of missing values and set the value in R to NA. Epidata allows
## for more than one definition of missing value, and all of these
## will be mapped to NA.
## ----------------------------------------------------------------------
## Return: epx.data object
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 12 Jun 2011, 18:27
## Author: jp.decorps@epiconcept.fr, Date: 03 Feb 2017, 02:11
## ======================================================================
read.epx <- function(x,
use.epidata.labels = TRUE,
set.missing.na = TRUE) {
#require(XML)
unlink("STATUS.LOG")
t1 <- Sys.time()
status.log(paste("Parsing", x))
y <- list()
y[['filename']] <- x
## Take all the records.
x <- xmlTreeParse(x)
epidata <- xmlRoot(x)
x.fld.info <- fld.info(epidata)
y[['Settings']] <- epidata.meta.data(epidata, "Settings")
## Get the data files
num.datafiles <- xmlSize(xmlChildren(epidata)["DataFiles"])
for (i in 1:num.datafiles) {
datfile <- xmlChildren(xmlChildren(epidata)[["DataFiles"]])[[i]]
datfile.name <- xmlAttrs(datfile)[["id"]]
sections <- xmlChildren(datfile)[["Sections"]]
# status.log("Get the records")
dat1 <- epidata.records(datfile, x.fld.info$id)
if (nrow(dat1) > 0) {
status.log("Apply field structure")
dat1 <- epidata.apply.field.structure(sections, dat1, y$Settings)
y$data[i] <- list(dat1)
names(y$data)[i] <- datfile.name
}
}
y[['field.info']] <- x.fld.info
y[['labels']] <- get.epidata.value.labels(epidata, y$Settings)
y[['ProjectSettings']] <- epidata.meta.data(epidata, "ProjectSettings")
# y[['Admin']] <- epidata.meta.data(epidata, "Admin")
y[['Study']] <- epidata.study.info(epidata)
if (use.epidata.labels & "data" %in% names(y)) {
status.log("Use epidata labels")
y <- use.epidata.labels(y, set.missing.na)
}
duration <- round(as.numeric(difftime(Sys.time(), t1), units = "secs"), 1)
status.log(paste("Finished in", duration, "seconds."))
# return(y) # y
# --- Create an epx.data object
# structure(list("epx"=y), class = "epx")
# epx <- list(y)
class(y) <- "epx"
y
}
## ======================================================================
## Function: as.data.frame
## Description: S3 method - return data.frame from epx object.
## ----------------------------------------------------------------------
## Arguments: x: an epx object from read.epx
## Return: a data.frame
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 09 Feb 2017, 02:11
## ======================================================================
as.data.frame <- function(x) UseMethod("as.data.frame", x)
as.data.frame.epx <- function(x) {
x$data[[1]]
}
## ======================================================================
## Function: return Study infos as a data.frame
## Description: S3 method - return data.frame from epx object.
## ----------------------------------------------------------------------
## Arguments: x: an epx object from read.epx
## Return: a data.frame
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 07 Mar 2017, 02:38
## ======================================================================
abstract <- function(x) UseMethod("abstract", x)
abstract.epx <- function(x) {
I <- c("File name", "Title", "Author", "Agency",
"Created", "Identifier", "Modified", "Notes", "Version")
Z <- x$Study
V <- lapply(x$Study, function(x) ifelse(is.null(x), NA, x))
R <- c(V$Title[[1]], V$Author, V$Agency, V$Created, V$Identifier, V$Modified, V$Notes, V$Version)
R <- c(x$filename, R)
#return(R)
df <- data.frame(I, R)
colnames(df) <- c("Info", "Value")
df
}
## ======================================================================
## Purpose: Get somes infos about the study
## ----------------------------------------------------------------------
## Arguments: x: an xmlRoot()
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 07 Mar 2017, 01:15
## ======================================================================
epidata.study.info <- function(x) {
.tag <- "StudyInfo"
l_node <- xmlElementsByTagName(x, .tag, recursive = TRUE)[[.tag]]
.l = xmlToList(node=l_node)
.l
}
## ======================================================================
## Purpose: Get the epidata settings information
## ----------------------------------------------------------------------
## Arguments: x: an xmlRoot()
## ----------------------------------------------------------------------
## Author: jp.decorps@epiconcept.fr, Date: 29 Jan 2017, 03:35
## ======================================================================
epidata.meta.data <- function(x, tag) {
status.log(paste("epidata.meta.data", tag))
y <- list()
l_node <- xmlElementsByTagName(x, tag, recursive = TRUE)[[tag]]
l_attr <- xmlAttrs(l_node, TRUE, TRUE)
status.log("epidata.meta.data")
for (i in 1:xmlSize(l_attr)) {
dd <- l_attr[[i]]
if (length(dd) == 0) dd <- ""
y[names(l_attr)[[i]]] <- dd
}
y
}
fld.info <- function(x) {
## Purpose: Create a table of info about the fields
## ----------------------------------------------------------------------
## Arguments: x: an xmlRoot()
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 12 Jun 2011, 18:26
status.log(paste("fld.info", "start"))
y <- xmlElementsByTagName(x, "Field", recursive = TRUE)
fld.id <- NULL
fld.name <- NULL
fld.type <- NULL
fld.length <- NULL
fld.decimals <- NULL
fld.question <- NULL
fld.valuelabel <- NULL
for (i in 1:xmlSize(y)) {
A_ <- xmlAttrs(y[[i]])
fld.id <- c(fld.id, A_["id"])
fld.name <- c(fld.name, A_["id"])
fld.type <- c(fld.type, A_["type"])
fld.valuelabel <- c(fld.valuelabel, A_["valueLabelRef"])
# fld.length <- c(fld.length, xmlValue(xmlChildren(y[[i]])[["Length"]]))
# fld.decimals <- c(fld.decimals, xmlValue(xmlChildren(y[[i]])[["Decimals"]]))
# fld.question <- c(fld.question, xmlValue(xmlChildren(y[[i]])[["Question"]]))
# fld.valuelabel <- c(fld.valuelabel, xmlValue(xmlChildren(y[[i]])[["ValueLabelId"]]))
}
dt <- data.frame(id = fld.id,
name = fld.name,
type = fld.type,
value.labelset = fld.valuelabel
)
dt
}
get.epidata.value.labels <- function(x, Settings) {
## Purpose: Create a list of epidata labels
## ----------------------------------------------------------------------
## Arguments: x: an xmlRoot
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 14 Jun 2011, 20:26
## ----------------------------------------------------------------------
status.log(paste("get.epidata.value.labels", "start"))
y <- xmlElementsByTagName(x, "ValueLabelSet", recursive = TRUE)
if (xmlSize(y) == 0) return(NULL)
i <- 1
value.labels <- list()
for (i in 1:xmlSize(y)) {
this.valueset <- y[[i]]
A_ <- xmlAttrs(this.valueset)
valueset.id <- A_["id"]
valueset.name <- A_["id"]
valueset.type <- A_["type"]
j <- 1
this.value <- NULL
this.order <- NULL
this.label <- NULL
this.missing <- NULL
L_size = xmlSize(this.valueset)
for (j in 1:L_size) {
VLS = this.valueset[[j]]
ATTR <- xmlAttrs(VLS)
this.value <- c(this.value, ATTR["value"])
this.order <- c(this.order, ATTR["order"])
LB <- xmlValue(VLS[[1]])
LB <- gsub('\\"','"', LB)
if (LB == "Manquant") {
this.missing <- c(this.missing, TRUE)
} else {
this.missing <- c(this.missing, FALSE)
}
this.label <- c(this.label, xmlValue(VLS[[1]]))
}
## Convert the value to the right data type
this.value <- convert.type(factor(this.value), valueset.type, Settings)
these.labels <- data.frame(value = this.value, order = this.order, label = this.label, missing = this.missing)
these.labels <- list(name = valueset.name,
type = valueset.type,
labels = these.labels)
value.labels[valueset.id] <- list(these.labels)
}
value.labels
}
is.epidata.na <- function(x, value.labels, label.set) {
## Purpose: Determine if a value is missing or not
## ----------------------------------------------------------------------
## Arguments: x: a vector of values
## value.labels: a list of value labels created by get.epidata.value.labels()
## label.set: the name of a set of labels.
## ----------------------------------------------------------------------
## Returns: a logical vector (TRUE/FALSE)
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 14 Jun 2011, 20:26
retval <- NULL
for (j in 1:length(x)) {
if (is.na(x[j])) {
this.val <- NA
} else {
i <- as.character(value.labels[[label.set]]$labels$value) == as.character(x[j])
this.val <- value.labels[[label.set]]$labels$missing[i]
}
retval <- c(retval, this.val)
}
retval
}
epidata.value.label <- function(x, value.labels, label.set) {
## Purpose: Return the value label for a given value
## ----------------------------------------------------------------------
## Arguments: x: a vector of values
## value.labels: a list of value labels created by get.epidata.value.labels()
## label.set: the name of a set of labels.
## ----------------------------------------------------------------------
## Returns: a factor vector of value labels for a given values
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 14 Jun 2011, 20:26
retval <- NULL
missing.levels <- NULL
for (j in 1:length(x)) {
if (is.na(x[j])) {
this.val <- NA
} else {
VF = gsub('\\"', '', as.character(x[j]))
i <- as.character(value.labels[[label.set]]$labels$value) == VF
if (any(i)) {
this.val <- as.character(value.labels[[label.set]]$labels$label[i])
} else {
missing.levels <- unique(c(missing.levels, as.character(x[j])))
this.val <- as.character(x[j])
}
}
retval <- c(retval, this.val)
}
if (!is.null(missing.levels)) {
missing.levels <- paste(missing.levels, collapse = ", ")
status.log(paste("Levels missing in label set ", label.set, ": ", missing.levels, sep = ""))
}
as.factor(retval)
}
use.epidata.labels <- function(x, set.missing.na = TRUE) {
## Purpose: Recode the data using the epidata value labels
## ----------------------------------------------------------------------
## Arguments: x: an imported object with data and field info
## ----------------------------------------------------------------------
## Author: David Whiting, Date: 14 Jun 2011, 20:26
## ----------------------------------------------------------------------
status.log(paste("Use epidata labels", "start"))
for (i in 1:nrow(x$field.info)) {
if (!is.na(x$field.info$value.labelset)[i]) {
this.labelset <- as.character(x$field.info$value.labelset[i])
this.field <- as.character(x$field.info$name[i])
## This is a bit clumsy, but I had to break it down to get my head
## around it.
j <- which(names(x$data[[1]]) == this.field)
dd <- x$data[[1]][, j]
## Mark the missing values first
if (set.missing.na) {
dd[is.epidata.na(dd, x$labels, this.labelset)] <- NA
}
## Relabel the values.
dd <- epidata.value.label(dd, x$labels, this.labelset)
x$data[[1]][, j] <- dd
}
}
x
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.