#' Process input data for outbreaker
#'
#' This function performs various checks on input data given to outbreaker. It
#' takes a list of named items as input, performs various checks, set defaults
#' where arguments are missing, and return a correct list of data input. If no
#' input is given, it returns the default settings.
#'
#' Acceptables arguments for ... are:
#' \describe{
#' \item{dates}{dates a vector indicating the collection dates, provided either as
#' integer numbers or in a usual date format such as \code{Date} or
#' \code{POSIXct} format. By convention, zero will indicate the oldest date. If
#' the vector is named, the vector names will be used for matching cases to
#' contact tracing data and labelled DNA sequences.}
#'
#' \item{dna}{the DNA sequences in \code{DNAbin} format (see
#' \code{\link[ape]{read.dna}} in the ape package); this can be imported from a
#' fasta file (extension .fa, .fas, or .fasta) using \code{adegenet}'s function
#' \link[adegenet]{fasta2DNAbin}.}
#'
#' \item{ctd}{the contact tracing data provided as a matrix/dataframe of two
#' columns, indicating a reported contact between the two individuals whose ids
#' are provided in a given row of the data, or an epicontacts object. In the case
#' of the latter, linelist IDs will be used for matching dates and DNA
#' sequences.}
#'
#' \item{w_dens}{a vector of numeric values indicating the generation time
#' distribution, reflecting the infectious potential of a case t = 1, 2, ...
#' time steps after infection. By convention, it is assumed that
#' newly infected patients cannot see new infections on the same time step. If not
#' standardized, this distribution is rescaled to sum to 1.}
#'
#' \item{f_dens}{similar to \code{w_dens}, except that this is the distribution
#' of the colonization time, i_e. time interval during which the pathogen can
#' be sampled from the patient.}}
#'
#' @param ... a list of data items to be processed (see description).
#'
#' @param data optionally, an existing list of data item as returned by \code{outbreaker_data}.
#'
#' @author Thibaut Jombart (\email{thibautjombart@@gmail.com}).
#'
#' @export
#'
#' @examples
#'
#' x <- fake_outbreak
#' outbreaker_data(dates = x$sample, dna = x$dna, w_dens = x$w)
#'
outbreaker_data <- function(..., data = list(...)) {
## SET DEFAULTS ##
defaults <- list(dates = NULL,
w_dens = NULL,
f_dens = NULL,
dna = NULL,
ctd = NULL,
N = 0L,
L = 0L,
D = NULL,
max_range = NA,
can_be_ances = NULL,
log_w_dens = NULL,
log_f_dens = NULL,
contacts = NULL,
C_combn = NULL,
C_nrow = NULL,
ids = NULL,
has_dna = logical(0),
id_in_dna = integer(0))
## MODIFY DATA WITH ARGUMENTS ##
data <- modify_defaults(defaults, data, FALSE)
## Set up case ids
if(is.null(data$ids)) {
if(!is.null(names(data$dates))) {
data$ids <- names(data$dates)
} else if(!is.null(data$ctd) & inherits(data$ctd, "epicontacts")){
data$ids <- as.character(data$ctd$linelist$id)
} else {
data$ids <- as.character(seq_along(data$dates))
}
}
## CHECK DATA ##
## CHECK DATES
if (!is.null(data$dates)) {
if (inherits(data$dates, "Date")) {
data$dates <- data$dates-min(data$dates)
}
if (inherits(data$dates, "POSIXct")) {
data$dates <- difftime(data$dates, min(data$dates), units="days")
}
if (inherits(data$dates, "numeric") && any(data$dates %% 1 != 0)) {
warning("Rounding non-integer dates to nearest integer")
}
data$dates <- as.integer(round(data$dates))
data$N <- length(data$dates)
data$max_range <- diff(range(data$dates))
}
## CHECK W_DENS
if (!is.null(data$w_dens)) {
if (any(data$w_dens<0)) {
stop("w_dens has negative entries (these should be probabilities!)")
}
if (any(!is.finite(data$w_dens))) {
stop("non-finite values detected in w_dens")
}
## Remove trailing zeroes to prevent starting with -Inf temporal loglike
if(data$w_dens[length(data$w_dens)] < 1e-15) {
final_index <- max(which(data$w_dens > 1e-15))
data$w_dens <- data$w_dens[1:final_index]
}
## add an exponential tail summing to 1e-4 to 'w'
## to cover the span of the outbreak
## (avoids starting with -Inf temporal loglike)
if (length(data$w_dens) < data$max_range) {
length_to_add <- (data$max_range-length(data$w_dens)) + 10 # +10 to be on the safe side
val_to_add <- stats::dexp(seq_len(length_to_add), 1)
val_to_add <- 1e-4*(val_to_add/sum(val_to_add))
data$w_dens <- c(data$w_dens, val_to_add)
}
## standardize the mass function
data$w_dens <- data$w_dens / sum(data$w_dens)
data$log_w_dens <- matrix(log(data$w_dens), nrow = 1)
}
## CHECK F_DENS
if (!is.null(data$w_dens) && is.null(data$f_dens)) {
data$f_dens <- data$w_dens
}
if (!is.null(data$f_dens)) {
if (any(data$f_dens<0)) {
stop("f_dens has negative entries (these should be probabilities!)")
}
if (any(!is.finite(data$f_dens))) {
stop("non-finite values detected in f_dens")
}
data$f_dens <- data$f_dens / sum(data$f_dens)
data$log_f_dens <- log(data$f_dens)
}
## CHECK POTENTIAL ANCESTRIES
if(!is.null(data$dates)) {
## get temporal ordering constraint:
## canBeAnces[i,j] is 'i' can be ancestor of 'j'
## Calculate the serial interval from w_dens and f_dens
.get_SI <- function(w_dens, f_dens) {
wf <- stats::convolve(w_dens, rev(f_dens), type = 'open')
conv <- stats::convolve(rev(f_dens), rev(wf), type = 'open')
lf <- length(f_dens)
lw <- length(w_dens)
return(data.frame(x = (-lf + 2):(lw + lf - 1), d = conv))
}
## Check if difference in sampling dates falls within serial interval
## This allows for i to infect j even if it sampled after (SI < 0)
.can_be_ances <- function(date1, date2, SI) {
tdiff <- date2 - date1
out <- sapply(tdiff, function(i) return(i %in% SI$x))
return(out)
}
SI <- .get_SI(data$w_dens, data$f_dens)
data$can_be_ances <- outer(data$dates,
data$dates,
FUN=.can_be_ances,
SI = SI) # strict < is needed as we impose w(0)=0
diag(data$can_be_ances) <- FALSE
}
## CHECK DNA
if (!is.null(data$dna)) {
if (!inherits(data$dna, "DNAbin")) stop("dna is not a DNAbin object.")
if (!is.matrix(data$dna)) data$dna <- as.matrix(data$dna)
## get matrix of distances
data$L <- ncol(data$dna) # (genome length)
data$D <- as.matrix(ape::dist.dna(data$dna, model="N")) # distance matrix
storage.mode(data$D) <- "integer" # essential for C/C++ interface
## get matching between sequences and cases
if (is.null(rownames(data$dna))) {
if (nrow(data$dna) != data$N) {
msg <- sprintf(paste("numbers of sequences and cases differ (%d vs %d):",
"please label sequences"),
nrow(data$dna), data$N)
stop(msg)
}
## These need to be indices
rownames(data$D) <- colnames(data$D) <- seq_len(data$N)
## These need to match dates/ctd ids
rownames(data$dna) <- data$ids
}
data$id_in_dna <- match(data$ids, rownames(data$dna))
if(any(is.na(match(rownames(data$dna), data$ids)))) {
stop("DNA sequence labels don't match case ids")
}
} else {
data$L <- 0L
data$D <- matrix(integer(0), ncol = 0, nrow = 0)
data$id_in_dna <- rep(NA_integer_, data$N)
}
data$has_dna <- !is.na(data$id_in_dna)
## CHECK CTD
if (!is.null(data$ctd)) {
ctd <- data$ctd
if (inherits(ctd, c("matrix", "data.frame"))) {
## prevent factor -> integer conversion
ctd <- apply(ctd, 2, as.character)
if (!is.matrix(ctd)) {
ctd <- as.matrix(ctd)
}
if(ncol(ctd) != 2) {
stop("ctd must contain two columns")
}
} else if(inherits(ctd, "epicontacts")) {
## prevent factor -> integer conversion
ctd <- apply(ctd$contacts[c("from", "to")], 2, as.character)
} else {
stop("ctd is not a matrix, data.frame or epicontacts object")
}
unq <- unique(as.vector(ctd[,1:2]))
not_found <- unq[!unq %in% data$ids]
if (length(not_found) != 0) {
not_found <- sort(unique(not_found))
stop(paste("Individual(s)", paste(not_found, collapse = ", "),
"in ctd are unknown cases (idx < 1 or > N")
)
}
contacts <- matrix(0, data$N, data$N)
mtch_1 <- match(ctd[,1], data$ids)
mtch_2 <- match(ctd[,2], data$ids)
contacts[cbind(mtch_2, mtch_1)] <- 1
data$contacts <- contacts
data$C_combn <- data$N*(data$N - 1)
data$C_nrow <- nrow(ctd)
} else {
data$contacts <- matrix(integer(0), ncol = 0, nrow = 0)
}
## output is a list of checked data
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.