R/outbreaker_data.R
In outbreaker2: Bayesian Reconstruction of Disease Outbreaks by Combining Epidemiologic and Genomic Data

Documented in outbreaker_data

#' Process input data for outbreaker
#'
#' This function performs various checks on input data given to outbreaker.  It
#' takes a list of named items as input, performs various checks, set defaults
#' where arguments are missing, and return a correct list of data input. If no
#' input is given, it returns the default settings.
#'
#' Acceptables arguments for ... are:
#' \describe{
#' \item{dates}{dates a vector indicating the collection dates, provided either as
#' integer numbers or in a usual date format such as \code{Date} or
#' \code{POSIXct} format. By convention, zero will indicate the oldest date. If
#' the vector is named, the vector names will be used for matching cases to
#' contact tracing data and labelled DNA sequences.}
#'
#' \item{dna}{the DNA sequences in \code{DNAbin} format (see
#' \code{\link[ape]{read.dna}} in the ape package); this can be imported from a
#' fasta file (extension .fa, .fas, or .fasta) using \code{adegenet}'s function
#' \link[adegenet]{fasta2DNAbin}.}
#'
#' \item{ctd}{the contact tracing data provided as a matrix/dataframe of two
#' columns, indicating a reported contact between the two individuals whose ids
#' are provided in a given row of the data, or an epicontacts object. In the case
#' of the latter, linelist IDs will be used for matching dates and DNA
#' sequences}
#'
#' \item{w_dens}{a vector of numeric values indicating the generation time
#' distribution, reflecting the infectious potential of a case t = 1, 2, ...
#' time steps after infection. By convention, it is assumed that
#' newly infected patients cannot see new infections on the same time step. If not
#' standardized, this distribution is rescaled to sum to 1.}
#'
#' \item{f_dens}{similar to \code{w_dens}, except that this is the distribution
#' of the colonization time, i_e. time interval during which the pathogen can
#' be sampled from the patient.}}
#'
#' @param ... a list of data items to be processed (see description)
#'
#' @param data optionally, an existing list of data item as returned by \code{outbreaker_data}.
#'
#' @author Thibaut Jombart (\email{thibautjombart@@gmail.com})
#'
#' @export
#'
#' @examples
#'
#' x <- fake_outbreak
#' outbreaker_data(dates = x$sample, dna = x$dna, w_dens = x$w)
#'
outbreaker_data <- function(..., data = list(...)) {

  ## SET DEFAULTS ##
  defaults <- list(dates = NULL,
                   w_dens = NULL,
                   f_dens = NULL,
                   dna = NULL,
                   ctd = NULL,
                   N = 0L,
                   L = 0L,
                   D = NULL,
                   max_range = NA,
                   can_be_ances = NULL,
                   log_w_dens = NULL,
                   log_f_dens = NULL,
                   contacts = NULL,
                   C_combn = NULL,
                   C_nrow = NULL,
                   ids = NULL,
                   has_dna = logical(0),
                   id_in_dna = integer(0))

  ## MODIFY DATA WITH ARGUMENTS ##
  data <- modify_defaults(defaults, data, FALSE)

  ## Set up case ids
  if(is.null(data$ids)) {
    if(!is.null(names(data$dates))) {
      data$ids <- names(data$dates)
    } else if(!is.null(data$ctd) & inherits(data$ctd, "epicontacts")){
      data$ids <- as.character(data$ctd$linelist$id)
    } else {
      data$ids <- as.character(seq_along(data$dates))
    }
  }

  ## CHECK DATA ##
  ## CHECK DATES
  if (!is.null(data$dates)) {
    if (inherits(data$dates, "Date")) {
      data$dates <- data$dates-min(data$dates)
    }
    if (inherits(data$dates, "POSIXct")) {
      data$dates <- difftime(data$dates, min(data$dates), units="days")
    }
    if (inherits(data$dates, "numeric") && any(data$dates %% 1 != 0)) {
      warning("Rounding non-integer dates to nearest integer")
    }
    data$dates <- as.integer(round(data$dates))
    data$N <- length(data$dates)
    data$max_range <- diff(range(data$dates))
  }

  ## CHECK W_DENS
  if (!is.null(data$w_dens)) {
    if (any(data$w_dens<0)) {
      stop("w_dens has negative entries (these should be probabilities!)")
    }

    if (any(!is.finite(data$w_dens))) {
      stop("non-finite values detected in w_dens")
    }


    ## Remove trailing zeroes to prevent starting with -Inf temporal loglike
    if(data$w_dens[length(data$w_dens)] < 1e-15) {
      final_index <- max(which(data$w_dens > 1e-15))
      data$w_dens <- data$w_dens[1:final_index]
    }

    ## add an exponential tail summing to 1e-4 to 'w'
    ## to cover the span of the outbreak
    ## (avoids starting with -Inf temporal loglike)
    if (length(data$w_dens) < data$max_range) {
      length_to_add <- (data$max_range-length(data$w_dens)) + 10 # +10 to be on the safe side
      val_to_add <- stats::dexp(seq_len(length_to_add), 1)
      val_to_add <- 1e-4*(val_to_add/sum(val_to_add))
      data$w_dens <- c(data$w_dens, val_to_add)
    }

    ## standardize the mass function
    data$w_dens <- data$w_dens / sum(data$w_dens)
    data$log_w_dens <- matrix(log(data$w_dens), nrow = 1)
  }

  ## CHECK F_DENS
  if (!is.null(data$w_dens) && is.null(data$f_dens)) {
    data$f_dens <- data$w_dens
  }
  if (!is.null(data$f_dens)) {
    if (any(data$f_dens<0)) {
      stop("f_dens has negative entries (these should be probabilities!)")
    }

    if (any(!is.finite(data$f_dens))) {
      stop("non-finite values detected in f_dens")
    }

    data$f_dens <- data$f_dens / sum(data$f_dens)
    data$log_f_dens <- log(data$f_dens)
  }

  ## CHECK POTENTIAL ANCESTRIES
  if(!is.null(data$dates)) {
    ## get temporal ordering constraint:
    ## canBeAnces[i,j] is 'i' can be ancestor of 'j'
    ## Calculate the serial interval from w_dens and f_dens
    .get_SI <- function(w_dens, f_dens) {
      wf <- stats::convolve(w_dens, rev(f_dens), type = 'open')
      conv <- stats::convolve(rev(f_dens), rev(wf), type = 'open')
      lf <- length(f_dens)
      lw <- length(w_dens)
      return(data.frame(x = (-lf + 2):(lw + lf - 1), d = conv))
    }
    ## Check if difference in sampling dates falls within serial interval
    ## This allows for i to infect j even if it sampled after (SI < 0)
    .can_be_ances <- function(date1, date2, SI) {
      tdiff <- date2 - date1
      out <- sapply(tdiff, function(i) return(i %in% SI$x))
      return(out)
    }
    SI <- .get_SI(data$w_dens, data$f_dens)
    data$can_be_ances <- outer(data$dates,
                               data$dates,
                               FUN=.can_be_ances,
                               SI = SI) # strict < is needed as we impose w(0)=0
    diag(data$can_be_ances) <- FALSE
  }

  ## CHECK DNA

  if (!is.null(data$dna)) {
    if (!inherits(data$dna, "DNAbin")) stop("dna is not a DNAbin object.")
    if (!is.matrix(data$dna)) data$dna <- as.matrix(data$dna)

    ## get matrix of distances

    data$L <- ncol(data$dna) #  (genome length)
    data$D <- as.matrix(ape::dist.dna(data$dna, model="N")) # distance matrix
    storage.mode(data$D) <- "integer" # essential for C/C++ interface

    ## get matching between sequences and cases

    if (is.null(rownames(data$dna))) {
      if (nrow(data$dna) != data$N) {
        msg <- sprintf(paste("numbers of sequences and cases differ (%d vs %d):",
                             "please label sequences"),
                       nrow(data$dna), data$N)
        stop(msg)
      }

      ## These need to be indices
      rownames(data$D) <- colnames(data$D) <- seq_len(data$N)

      ## These need to match dates/ctd ids
      rownames(data$dna) <- data$ids
    }
    data$id_in_dna <- match(data$ids, rownames(data$dna))
    if(any(is.na(match(rownames(data$dna), data$ids)))) {
      stop("DNA sequence labels don't match case ids")
    }

  } else {
    data$L <- 0L
    data$D <- matrix(integer(0), ncol = 0, nrow = 0)
    data$id_in_dna <- rep(NA_integer_, data$N)
  }
  data$has_dna <- !is.na(data$id_in_dna)


  ## CHECK CTD

  if (!is.null(data$ctd)) {
    ctd <- data$ctd
    if (inherits(ctd, c("matrix", "data.frame"))) {
      ## prevent factor -> integer conversion
      ctd <- apply(ctd, 2, as.character)
      if (!is.matrix(ctd)) {
        ctd <- as.matrix(ctd)
      }
      if(ncol(ctd) != 2) {
        stop("ctd must contain two columns")
      }
    } else if(inherits(ctd, "epicontacts")) {
      ## prevent factor -> integer conversion
      ctd <- apply(ctd$contacts[c("from", "to")], 2, as.character)
    } else {
      stop("ctd is not a matrix, data.frame or epicontacts object")
    }

    unq <- unique(as.vector(ctd[,1:2]))
    not_found <- unq[!unq %in% data$ids]
    if (length(not_found) != 0) {
      not_found <- sort(unique(not_found))
      stop(paste("Individual(s)", paste(not_found, collapse = ", "),
                 "in ctd are unknown cases (idx < 1 or > N")
           )
    }
    contacts <- matrix(0, data$N, data$N)
    mtch_1 <- match(ctd[,1], data$ids)
    mtch_2 <- match(ctd[,2], data$ids)
    contacts[cbind(mtch_2, mtch_1)] <- 1

    data$contacts <- contacts
    data$C_combn <- data$N*(data$N - 1)
    data$C_nrow <- nrow(ctd)
  } else {
    data$contacts <- matrix(integer(0), ncol = 0, nrow = 0)
  }

  ## output is a list of checked data
  return(data)

}

Any scripts or data that you put into this service are public.

outbreaker2 documentation built on May 23, 2022, 5:06 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

outbreaker2
Bayesian Reconstruction of Disease Outbreaks by Combining Epidemiologic and Genomic Data

R/outbreaker_data.R
In outbreaker2: Bayesian Reconstruction of Disease Outbreaks by Combining Epidemiologic and Genomic Data

Defines functions outbreaker_data

Documented in outbreaker_data

Try the outbreaker2 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

outbreaker2 Bayesian Reconstruction of Disease Outbreaks by Combining Epidemiologic and Genomic Data

R/outbreaker_data.R In outbreaker2: Bayesian Reconstruction of Disease Outbreaks by Combining Epidemiologic and Genomic Data

Defines functions outbreaker_data

Documented in outbreaker_data

Try the outbreaker2 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

outbreaker2
Bayesian Reconstruction of Disease Outbreaks by Combining Epidemiologic and Genomic Data

R/outbreaker_data.R
In outbreaker2: Bayesian Reconstruction of Disease Outbreaks by Combining Epidemiologic and Genomic Data