#' Read in and remove sensitive data
#'
#' \code{outbreak_dataset_read} reads data from outbreak practical file and fills in ID blanks.
#' See inst/extdata for examples of how the excel files should look like before importing. Can be one
#' of two types now.
#'
#' @param xlsx.file Full file path to where xlsx Outbreak dataset is stored.
#' @param attempt.imputation Boolean detailing whether to impute data. If FALSE (default),
#' no imputation will occur but if TRUE then it will automatically happen
#' @param fill.in.end.infection.hours Boolean detailing whether to fill in empty end infection hours. Will be needed
#' if \code{outbreak_dataset_read} throws an error associated with missing data.
#' @export
#'
#'
#'
outbreak_dataset_read <- function(xlsx.file,attempt.imputation=FALSE, fill.in.end.infection.hours=FALSE){
# read in .xlsx file of data - see inst/extdata/paper.txt for example formatting with names removed
df <- XLConnect::readWorksheetFromFile(xlsx.file,sheet=1,startRow = 2,endCol = 20,colTypes=c(rep("character",15),rep("character",5)),useCachedValues=T)
# grab seed positions
seeds <- which(df$Parent.ID=="Seed")
# Handle for other type of excel sheet provided
if(is.element("Hours.since.start.4",names(df))){
# Given variability match for these columns which are the beginning columns in the old sheet style
col.pos <- match(c("ID","Parent.ID","Reinfection","Date","Time","Hours.since.start",
"Date.1","Time.1","Hours.since.start.1","Date.2","Time.2","Hours.since.start.2",
"Attempted.","Successful","Hours.since.start.4"),names(df))
# subset by these positions
df <- df[,col.pos]
# turn into useful numeric type, supressing warnings that throw due to characters in Parent.ID which are the seeds
df[,c(1,2,6,9,12,13,14,15)] <- suppressWarnings(lapply(df[,c(1,2,6,9,12,13,14,15)],as.numeric))
} else {
# turn into useful numeric type, supressing warnings that throw due to characters in Parent.ID which are the seeds
df[,c(1,2,6,9,12,13,14,15)] <- suppressWarnings(lapply(df[,c(1,2,6,9,12,13,14,15)],as.numeric))
}
names(df)[c(4:12,15)] <- c("Infection_Date","Infection_Time","Infection_Hours.since.start",
"Symptoms_Date","Symptoms_Time","Symptoms_Hours.since.start",
"End_Infection_Date","End_Infection_Time","End_Infection_Hours.since.start",
"Onward_Infection_Hours.since.start")
df$Reinfection <- as.logical(df$Reinfection)
## ERROR CHECKING ##
## --------------------------------------------------------------------------
# If we have decided to error check
if(attempt.imputation==TRUE){
# first work out the non reinfections
non.reinfections <- which(df$Reinfection==FALSE)
## ERROR CHECK 1: Looking for empty end infections and missing parents
## --------------------------------------------------------------------------
# find missing end infections and parents and then remove the seeds
wrong_rows <- non.reinfections[which(is.na(df$End_Infection_Hours.since.start[non.reinfections]) | is.na(df$Parent.ID[non.reinfections]))]
wrong_rows <- wrong_rows[-match(seeds,wrong_rows)]
# if we are missing some end infections
if(length(wrong_rows)>0){
error.message <- paste("End Infection Hours since start column missing data at rows:\n",paste(wrong_rows+2,collapse=", "),
"\n Attempting Data Imputation...")
## Imputation
for(i in wrong_rows){
## First, does the missing data not have a parent, otherwise we work out the most likely parent
if(is.na(df$Parent.ID[i])){
## first what's the beinning infection hour and if it's not there then quit
begin.inf <- df$Infection_Hours.since.start[i]
if(!is.numeric(begin.inf)) stop("Data can't be imputed for end of infection if beginning is not known")
# which individuals had infection times before this and hadn't already ended their infection
possible.rows <- intersect(which(df$End_Infection_Hours.since.start > begin.inf),
which(df$Infection_Hours.since.start < begin.inf))
# then let's see of these how many did they infect and probabilistically given the poisson distribution who is most
# likely to have infected another individual
# recorded attempted, which will use for the dpois bounds and then find out which attempted infections
# for the possible rows is most likly to have occured if it was +1
attempted.infections <- df$Attempted.[possible.rows]
att.inf.range <- sort(unique(attempted.infections))
sorted.probs <- sort.int((dpois(att.inf.range + 1,1.8)),decreasing = TRUE,index.return = T)$ix
# Which possible rows have the greatest chance of having another infection
more.possible.rows <- possible.rows[which(attempted.infections==(att.inf.range[sorted.probs[1]]))]
# If more than one row is still here then sample
if(length(more.possible.rows)>1){
imputed.parent.row <- sample(more.possible.rows,size = 1)
} else {
imputed.parent.row <- more.possible.rows
}
# Fill in the blank parent row
df$Parent.ID[i] <- df$ID[imputed.parent.row]
# if the newly chosen parent did not have an onward infection then add the onward infection time
if(is.na(df$Onward_Infection_Hours.since.start[imputed.parent.row])){
df$Onward_Infection_Hours.since.start[imputed.parent.row] <- df$Infection_Hours.since.start[i]
}
# If the parent was known just found the row for later
} else {
imputed.parent.row <- which(df$ID == df$Parent.ID[i])
}
# Second, if they don't have and end infection then pick a likely time
# between the individuals infection begin and end infection and round to 2dp
if(is.na(df$End_Infection_Hours.since.start[i])){
# if this indvidual caused secondary infections then they must have ended after this time
children <- which(df$Parent.ID == df$ID[i])
if(length(children > 1)){
min.start <- max(df$Infection_Hours.since.start[children],na.rm = T)
} else {
min.start <- df$Infection_Hours.since.start[i]
}
df$End_Infection_Hours.since.start[i] <- round(runif(1,min=min.start,
max=min( df$Infection_Hours.since.start[i]+48,
max(df$End_Infection_Hours.since.start,na.rm=T))
), digits = 2)
}
}
}
## ERROR CHECK 2: Looking for empty begin infections
## --------------------------------------------------------------------------
wrong_rows <- non.reinfections[which(is.na(df$Infection_Hours.since.start[non.reinfections]))]
if(length(wrong_rows)>0) stop (paste("Begin Infection Hours since start column missing data at rows",paste(wrong_rows+2,collapse=", ")))
## ERROR CHECK 3: Looking for missing onward infection times given known parent ids for these individuals exist
## --------------------------------------------------------------------------
wrong_rows <- non.reinfections[which(is.na(df$Onward_Infection_Hours.since.start[match(df$Parent.ID[non.reinfections][!is.na(df$Parent.ID[non.reinfections])],df$ID)]))]
if(length(wrong_rows)>0) stop (paste("Missing time of onward infection information for individuals who cause non-reinfection infections at rows",paste(wrong_rows+2,collapse=", ")))
# If we have decided not to error check then simply fill the end infection hours in
} else if (fill.in.end.infection.dates == TRUE) {
non.reinfections <- which(df$Reinfection==FALSE)
wrong_rows <- non.reinfections[which(is.na(df$End_Infection_Hours.since.start[non.reinfections]))]
df$End_Infection_Hours.since.start[wrong_rows] <- max(df$End_Infection_Hours.since.start,na.rm=TRUE)
## Something to do with parents
## TODO:
}
# Nice format times to remove the years
df$Infection_Time <- strftime(strptime(x = as.character(df$Infection_Time),format = "%H.%M"),"%H:%M:%S")
df$Symptoms_Time <- strftime(strptime(x = as.character(df$Symptoms_Time),format = "%H.%M"),"%H:%M:%S")
df$End_Infection_Time <- strftime(strptime(x = as.character(df$End_Infection_Time),format = "%H.%M"),"%H:%M:%S")
df$Infection_Date <- strftime(df$Infection_Date,"%Y/%m/%d")
df$Symptoms_Date <- strftime(df$Symptoms_Date,"%Y/%m/%d")
df$End_Infection_Date <- strftime(df$End_Infection_Date,"%Y/%m/%d")
# Calculate epidemiological parameters
df$Latent_Period_Hours <- df$Onward_Infection_Hours.since.start - df$Infection_Hours.since.start
df$Incubation_Period_Hours <- df$Symptoms_Hours.since.start - df$Infection_Hours.since.start
df$Infectious_Period_Hours <- df$End_Infection_Hours.since.start - df$Onward_Infection_Hours.since.start
df$Generation_Time_Hours <- df$Infection_Hours.since.start - df$Infection_Hours.since.start[match(df$Parent.ID,df$ID,incomparables = NA)]
# Remove errors in Generation Times
error.pos <- which(df$Generation_Time_Hours<0)
if(length(error.pos)>0){
catch = 1
df$Generation_Time_Hours[df$Generation_Time_Hours<0] <- NA
message(paste("Warning: Some negative generation times were recorded and subsequently",
"removed. Please check rows: \n ",paste(error.pos+2,collapse = ", ")))
}
# Format data to include all necessary contacts, i.e. fill in id number in columns where omitted
counter <- 0
for ( i in 1:length(df$ID)){
if(!is.na(df$ID[i])){
counter <- counter + 1
} else {
df$ID[i] <- counter
}
}
## Let's change those who were unifected to have this column equal to false for the sake of including them
## within graph plotting
df$Reinfection[is.na(df$Reinfection)] <- FALSE
## remove unnecessary data if not required
res <- df[,-c(13,14)]
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.