R/glm_nwinf_lagged.R

Defines functions logit_nwinf logit_nwinf_lagged add_risk_factor make_nwinf_data

Documented in add_risk_factor logit_nwinf logit_nwinf_lagged make_nwinf_data

#' Creates a dataset with new infection events from herd level test results
#'
#' @param sfd a STOCfree_data object
#' @param time_of_inf a character string. When 'mid', the time of new infection is the median of the months between which the consecutive tests occurred.
#'
#' @return
#' @export
#'
make_nwinf_data <- function(sfd,
                            time_of_inf = c("mid", "first", "last")){


  month_first <- attr(sfd, "month first test")
  month_last  <- attr(sfd, "month last test")

  # test_res_col <- sfd$var_names[["test_res_col"]]

  nwinf_data <- sfd$test_data
  nwinf_data <- nwinf_data[order(nwinf_data$status_id),]

  ## test_res_col name changed
  # colnames(nwinf_data)[match(test_res_col, colnames(nwinf_data))] <- "test_res"

  nwinf_data$prev_herd_id    <- c(NA, nwinf_data$herd_id[-nrow(nwinf_data)])
  ## Previous month id
  nwinf_data$prev_month_id   <- c(NA, nwinf_data$month_id[-nrow(nwinf_data)])
  nwinf_data$prev_month_id   <- with(nwinf_data, ifelse(herd_id == prev_herd_id,
                                                        prev_month_id, NA))
  ## Previous test result
  nwinf_data$prev_test_res <- c(NA, nwinf_data$test_res[-nrow(nwinf_data)])
  nwinf_data$prev_test_res <- with(nwinf_data, ifelse(herd_id == prev_herd_id,
                                                      prev_test_res, NA))

  ## Only rows eligible for new infections are kept
  nwinf_data <- nwinf_data[!is.na(nwinf_data$test_res) &  !is.na(nwinf_data$prev_test_res) & nwinf_data$prev_test_res == 0,]

  ## New infection variable
  nwinf_data$nwinf <- with(nwinf_data, ifelse(test_res == 1, 1, 0))

  ## month_id is changed to the midpoint between previous and current test
  nwinf_data$mid_month_id <- with(nwinf_data,
                                  floor(prev_month_id + .5 *(month_id - prev_month_id)))

  ## Name of the column to keep for month_id
  ## by default, month_id is the month of the last test
  if(time_of_inf == "mid"){

    nwinf_data$month_id <- with(nwinf_data,
                                floor(prev_month_id + .5 *(month_id - prev_month_id)))

  }

  ## Name of the column to keep for month_id
  if(time_of_inf == "first"){

    nwinf_data$month_id <- nwinf_data$prev_month_id

  }

  nwinf_data <- nwinf_data[, c("status_id", "herd_id", "month_id",
                               "nwinf")]

  nwinf_data <- nwinf_data[order(nwinf_data$status_id),]
  rownames(nwinf_data) <- 1:nrow(nwinf_data)

  nwinf_list <- list(
    herd_id_corresp = sfd$herd_id_corresp,
    nwinf_data = nwinf_data)

  attr(nwinf_list, "month first test") <- month_first
  attr(nwinf_list, "month last test")  <- month_last
  attr(nwinf_list, "time of inf")      <- time_of_inf

  class(nwinf_list) <- "nwinf_data"

  nwinf_list

}


#' Adds a risk factor to a new infection dataset
#'
#' @param nwinf a dataset created with the make_nwinf_data() function
#' @param rf_data risk factor data. Ids for farms must be the same as in the original test results data.
#' @param rf_col name of the column containing the risk factor data
#' @param rf_date_col name of the column with date of risk factor occurrence
#' @param lag1 start of interval between risk factor occurrence and new infection
#' @param lag2 end of interval between risk factor occurrence and new infection
#'
#' @return
#' @export
#'
add_risk_factor <- function(nwinf = nwinf_data(),
                            rf_data,
                            rf_col = character(),
                            rf_date_col = character(),
                            lag1 = 6,
                            lag2 = 12,
                            FUN = sum){

  nwinf_data <- nwinf$nwinf_data

  ## Month of first test -> month_id = 1
  month_first <- attr(nwinf, "month first test")
  ## Month of last test
  month_last <- attr(nwinf, "month last test")

  ## risk factor column in the risk factor data renamed
  colnames(rf_data)[match(rf_col, colnames(rf_data))] <- "risk_factor"

  ## List of months used in the study
  ## month_id = 0 for the first month in the STOCfree dataset
  rf_first_month <- date_from_lag(
    date = paste0(month_first, "-01"), time_lag = -max(lag2))

  all_months <- data.frame(
    date__1 = seq(as.Date(rf_first_month),
                  as.Date(paste0(month_last, "-01")), by = "1 month"),
    rf_month_id = rep(NA),
    stringsAsFactors = FALSE
  )

  all_months$rf_month_id <- 1:nrow(all_months) -
    which(all_months$date__1 == paste0(month_first, "-01"))

  ## name of column with old herd_id in risk factor data
  risk_herd_col <- colnames(nwinf$herd_id_corresp)[1]

  ## herd_id added to risk factor data
  rf_data <- merge(rf_data, nwinf$herd_id_corresp)

  ## month of risk factor occurrence
  rf_data$date__1 <- as.Date(paste0(
    format(as.Date(rf_data[[rf_date_col]]), "%Y-%m"), "-01"))

  rf_data <- merge(rf_data, all_months)

  ## columns of interest selected
  rf_data <- rf_data[, c("herd_id", rf_date_col, "rf_month_id", "risk_factor")]

  ## sequence of lags to study
  sq_lag <- expand.grid(month_id = sort(unique(nwinf_data$month_id)),
                        rf_month_id = lag1:lag2)

  nwinf_data <- merge(
    nwinf_data[, c("herd_id", "month_id", "nwinf")],
    sq_lag)

  nwinf_data$rf_month_id <- with(nwinf_data,
                                 month_id - rf_month_id)

  nwinf_data <- merge(nwinf_data, rf_data, all.x = TRUE)

  nwinf_data$risk_factor[is.na(nwinf_data$risk_factor)] <- 0

  ## data aggregation
  aggr_data <- dplyr::group_by(nwinf_data,
                               herd_id, month_id)
  aggr_data <- dplyr::summarise(aggr_data,
                                nwinf = unique(nwinf),
                                risk_factor = FUN(risk_factor))

  ## Risk factor column renamed using risk factor name and lags
  colnames(aggr_data)[match("risk_factor", colnames(aggr_data))] <- paste(rf_col, lag1, lag2, sep = "_")

  nwinf$nwinf_data <- merge(nwinf$nwinf_data,
                            as.data.frame(aggr_data),
                            all.x = TRUE)

  nwinf

}


#' Modelling of all possible lagged relationships between risk factor occurrence and new infection
#'
#' @param sf_data a object of class STOCfree_data with test results
#' @param rf_data risk factor data
#' @param rf_herd_col name of the column containing the herd id. The ids should be the same as in the test data
#' @param rf_date_col name of the column the date of risk factor occurrence. The dates should be formatted as yyyy-mm-dd
#' @param rf_col name of the column with risk factor
#' @param lag1 an integer value representing the minimum number of months from which to consider for risk factor occurrence
#' @param lag2 an integer value representing the maximum number of months to consider for risk factor occurrence
#'
#' @details this function models a probability of new infection as a function of risk factor occurrence considered at various time intervals before a test is performed. New infection is defined, at the herd test level, as a positive test result recorded after a negative test result on the previous test. All herds with 2 test results of which the first one is negative are eligible for a new infection. The association with risk factors recorded between lag1 months and lag2 months before the second test result are modelled with logistic regression. For each combination of lag1 and lag2 the AIC of the logistic model is recorded. The best interval is the one with the lowest AIC.
#'
#'When several test results are available on a month, the first test result is used.
#'
#' @return a data.frame with a model AIC for each combination of lag1 (start of interval) and lag2 (end of interval)
#' @export
#'
logit_nwinf_lagged <- function(sf_data,
                             rf_data,
                             rf_date_col = character(),
                             rf_col = character(),
                             lag1 = 0,
                             lag2 = 36,
                             time_of_inf = c("mid", "first", "last"),
                             FUN = sum){

  ## Month of first test -> month_id = 1
  month_first <- attr(sf_data, "month first test")
  ## Month of last test
  month_last <- attr(sf_data, "month last test")
  # name of column with herd_id
  rf_herd_col <- colnames(sf_data$herd_id_corresp)[1]

  ## List of months used in the study
  ## month_id = 0 for the first month in the STOCfree dataset
  rf_first_month <- date_from_lag(
    date = paste0(month_first, "-01"), time_lag = -lag2)

  all_months <- data.frame(
    date__1 = seq(as.Date(rf_first_month),
                  as.Date(paste0(month_last, "-01")), by = "1 month"),
    month_id = rep(NA),
    stringsAsFactors = FALSE
  )

  all_months$month_id <- 1:nrow(all_months) -
    which(all_months$date__1 == paste0(month_first, "-01"))

  ## New infection data created from test results
  nwinf_data <- sf_data$test_data
  nwinf_data_dup <- which(duplicated(nwinf_data[, c("herd_id", "month_id")]))
  if(length(nwinf_data_dup) > 0) nwinf_data <- nwinf_data[-nwinf_data_dup,]

  nwinf_data <- nwinf_data[order(nwinf_data$status_id),]

  nwinf_data$prev_herd_id    <- c(NA, nwinf_data$herd_id[-nrow(nwinf_data)])
  ## Previous month id
  nwinf_data$prev_month_id   <- c(NA, nwinf_data$month_id[-nrow(nwinf_data)])
  nwinf_data$prev_month_id   <- with(nwinf_data, ifelse(herd_id == prev_herd_id,
                                                        prev_month_id, NA))
  ## Previous test result
  nwinf_data$prev_test_res <- c(NA, nwinf_data$test_res[-nrow(nwinf_data)])
  nwinf_data$prev_test_res <- with(nwinf_data, ifelse(herd_id == prev_herd_id,
                                                      prev_test_res, NA))

  ## Only rows eligible for new infections are kept
  nwinf_data <- nwinf_data[!is.na(nwinf_data$test_res) &  !is.na(nwinf_data$prev_test_res) & nwinf_data$prev_test_res == 0,]

  ## New infection variable
  nwinf_data$nwinf <- with(nwinf_data, ifelse(test_res == 1, 1, 0))

  ## month_id is changed to the midpoint between previous and current test
  nwinf_data$mid_month_id <- with(nwinf_data,
                                  floor(prev_month_id + .5 *(month_id - prev_month_id)))


  cln <- match(c("prev_month_id", "month_id", "mid_month_id"),
               colnames(nwinf_data))

  colnames(nwinf_data)[cln] <- c("first", "last", "mid")

  colnames(nwinf_data)[match(time_of_inf, colnames(nwinf_data))] <- "month_id"

  nwinf_data <- nwinf_data[, c("herd_id", "month_id", "nwinf")]

  ## Risk factor data - herd id added
  rf_data <- merge(sf_data$herd_id_corresp, rf_data, all.x = TRUE)
  rf_data$herd_id <- as.integer(rf_data$herd_id)

  ## month_id added
  rf_data$date__1 <- as.Date(paste0(format(as.Date(as.character(rf_data[[rf_date_col]])),
                                           "%Y-%m"), "-01"))
  rf_data <- merge(all_months, rf_data, by = "date__1")

  ## sequence of lags to study
  sq_lag <- expand.grid(month_id = sort(unique(nwinf_data$month_id)),
                        time_lag = lag1:lag2)

  nwinf_model_data <- merge(
    nwinf_data[, c("herd_id", "month_id", "nwinf")],
    sq_lag)

  nwinf_model_data$rf_month_id <- with(nwinf_model_data,
                                       month_id - time_lag)

  rf_data <- rf_data[, c("herd_id", "month_id", rf_col)]
  colnames(rf_data)[2] <- "rf_month_id"
  colnames(rf_data)[3] <- "risk_factor"

  nwinf_model_data <- merge(nwinf_model_data, rf_data, all.x = TRUE)
  ## removing duplicates generated by the merge function
  dup_ninf <- which(duplicated(nwinf_model_data))
  if(length(dup_ninf) > 0){

    nwinf_model_data <- nwinf_model_data[-which(duplicated(nwinf_model_data)),]

  }
  ## Filling missing values with 0s
  nwinf_model_data$risk_factor[is.na(nwinf_model_data$risk_factor)] <- 0

  ## Modelling of lagged relationships

  # Grid of all possible lags to explore
  l1l2 <- expand.grid(
    lag1 = lag1:lag2,
    lag2 = lag1:lag2,
    l = rep(NA),
    AIC = rep(NA)
  )

  l1l2 <- l1l2[l1l2$lag2 >= l1l2$lag1,]
  l1l2$l <- with(l1l2, lag2 - lag1)

  ## length and sum of response variable - used for checking AIC values
  unique_y <- nwinf_model_data$nwinf[-which(duplicated(nwinf_model_data[, c("herd_id", "month_id")]))]
  lg_y  <- length(unique_y)
  sum_y <- sum(unique_y)

  for(i in 1:nrow(l1l2)){

    l1 <- l1l2$lag1[i]
    l2 <- l1l2$lag2[i]

    model_data <- dplyr::filter(nwinf_model_data,
                                time_lag >= l1 & time_lag <= l2)
    model_data <- dplyr::group_by(model_data,
                                  herd_id, month_id,
                                  .groups = "drop")
    model_data <- dplyr::summarise(model_data,
                                   nwinf = round(median(nwinf)+10^-6),
                                   risk_factor = FUN(risk_factor))

    lg_yi  <- length(model_data$nwinf)
    sum_yi <- sum(model_data$nwinf)

    if(lg_yi == lg_y & sum_yi == sum_y){

      l1l2$AIC[i] <- AIC(glm(nwinf ~ risk_factor,
                             data = model_data,
                             family = binomial(link = "logit")))
    } else {

      l1l2$AIC[i] <- NA

    }

  }

  rownames(l1l2) <- 1:nrow(l1l2)
  l1l2

}

#' Logistic regression for the probability of new infection
#'
#' @param nwinf a dataset created with the make_nwinf_data() and add_risk_factor() functions
#' @param risk_factors a character string with rhe risk factors to include in the regression
#'
#' @return results of a logistic regression model performed with the glm function
#' @export
#'
logit_nwinf <- function(nwinf = nwinf_data(),
                      risk_factors = character){

  data <- nwinf$nwinf_data

  formula <- as.formula(paste("nwinf ~ ",
                       paste(risk_factors, collapse = " + ")))

  log_reg <- glm(formula = formula,
                 data = data,
                 family = binomial(link = "logit"))

  log_reg

  }
AurMad/STOCfree documentation built on Sept. 13, 2022, 3:20 a.m.