R/find-errors.R

#' Find problems in Biography data.
#'
#' Returns a named list of tbl_df objects, including
#' \itemize{
#' \item error_dates: Rows with one or more bad dates
#' \item error_duplicates: Rows with duplicate animals
#' }
#'
#' @param x Name of the tbl_df containing biography data generated by the function read_bio_table.
#' @export
#' @examples
#' bio_errors <- find_bio_errors(lh)
find_bio_errors <- function(x){

  if(!("tbl_df" %in% class(x)) | length(names(x)) != 17){
    message("Error: unrecognized object. Please use the tbl_df produced by the function read_bio_table.")
    return(NULL)
  }

  problems <- list()

  # Find duplicate animals
  error_duplicates <- x %>%
    dplyr::group_by(Study.Id, Animal.Id) %>%
    dplyr::summarise(n_records = n()) %>%
    dplyr::filter(n_records > 1)

  if(nrow(error_duplicates) > 0){
    problems[["error_duplicates"]] <- error_duplicates
  }

  # Find problem dates
  realistic_dates <- lubridate::new_interval(lubridate::ymd_hms("1914-01-01 00:00:00"), Sys.time())

  temp <- x %>%
    dplyr::filter(Birth.Date %within% realistic_dates &
                    Min.Birth.Date %within% realistic_dates &
                    Max.Birth.Date %within% realistic_dates &
                    Entry.Date %within% realistic_dates &
                    Depart.Date %within% realistic_dates)

  error_dates <- x %>%
    dplyr::anti_join(temp, by = c("Study.Id", "Animal.Id")) %>%
    dplyr::select(ends_with("Id"), ends_with("Date")) %>%
    suppressMessages()

  if(nrow(error_dates) > 0){
    problems[["error_dates"]] <- error_dates
  }

  if(length(problems) == 0){
    message("No errors found!")
    problems <- NULL
  }

  return(problems)
}

#' Find problems in Fertility data.
#'
#' Returns a named list of tbl_df objects, including
#' \itemize{
#' \item error_dates: Rows with one or more bad dates
#' }
#'
#' @param x Name of the tbl_df containing fertility data generated by the function read_fert_table.
#' @export
#' @examples
#' fert_errors <- find_fert_errors(fert)
find_fert_errors <- function(x){

  if(!("tbl_df" %in% class(x)) | length(names(x)) != 6){
    message("Error: unrecognized object. Please use the tbl_df produced by the function read_fert_table.")
    return(NULL)
  }

  problems <- list()

  # Find problem dates
  realistic_dates <- lubridate::new_interval(lubridate::ymd_hms("1914-01-01 00:00:00"), Sys.time())

  temp <- x %>%
    dplyr::filter(Start.Date %within% realistic_dates &
                    Stop.Date %within% realistic_dates)

  error_dates <- x %>%
    dplyr::anti_join(temp, by = c("Study.Id", "Animal.Id")) %>%
    dplyr::select(ends_with("Id"), ends_with("Date")) %>%
    suppressMessages()

  if(nrow(error_dates) > 0){
    problems[["error_dates"]] <- error_dates
  }

  if(length(problems) == 0){
    message("No errors found!")
    problems <- NULL
  }

  return(problems)
}

#' Find mothers with no corresponding Animal.Id entry.
#'
#' Returns a named list Mom.Id records that don't match the Animal.Id records for each site
#'
#' @param x Name of the tbl_df containing biography data generated by the function read_bio_table.
#' @export
#' @examples
#' mom_id_errors <- find_mom_id_errors(lh)
find_mom_id_errors <- function(x){

  if(!("tbl_df" %in% class(x)) | length(names(x)) != 17){
    message("Error: unrecognized object. Please use the tbl_df produced by the function read_bio_table.")
    return(NULL)
  }

  `%ni%` = Negate(`%in%`)

  f <- function(df){
    inds <-  levels(factor(df$Animal.Id))
    moms <-  levels(factor(df$Mom.Id))
    missing <- moms[moms %ni% inds]
    return(missing)
  }

  mom_errors <- plyr::dlply(x, .(Study.Id), f)

  return(mom_errors)
}

#' Find mothers with no corresponding Animal.Id entry.
#'
#' Returns a named list Mom.Id records that don't match the Animal.Id records for each site
#'
#' @param x Name of the tbl_df containing biography data generated by the function read_bio_table.
#' @export
#' @examples
#' first_born_errors <- find_first_born_errors(lh)
find_first_born_errors <- function(x){

  if(!("tbl_df" %in% class(x)) | length(names(x)) != 17){
    message("Error: unrecognized object. Please use the tbl_df produced by the function read_bio_table.")
    return(NULL)
  }

  problems <- list()

  fb_unknown_mother <- x %>%
    dplyr::filter(is.na(Mom.Id) & First.Born %in% c("N", "Y")) %>%
    dplyr::select(Study.Id, Mom.Id, Animal.Id, Animal.Name, Birth.Date, First.Born)

  if(nrow(fb_unknown_mother) > 0){
    problems[["unknown_mother_first_born"]] <- fb_unknown_mother
  }

  fb_multiple <- x %>%
    dplyr::filter(First.Born == "Y") %>%
    dplyr::group_by(Study.Id, Mom.Id) %>%
    dplyr::summarise(n = n()) %>%
    dplyr::filter(n > 1) %>%
    dplyr::left_join(x, by = c("Study.Id" = "Study.Id", "Mom.Id" = "Mom.Id")) %>%
    dplyr::filter(First.Born == "Y") %>%
    dplyr::select(Study.Id, Mom.Id, Animal.Id, Animal.Name, Birth.Date, First.Born)

  # Filter out special case of twins at Karisoke
  fb_multiple <- fb_multiple %>%
    dplyr::filter(!(Study.Id == "karisoke" & (Animal.Id == "I81" | Animal.Id == "I82")))

  if(nrow(fb_multiple) > 0){
    problems[["multiple_first_born"]] <- fb_multiple
  }

  return(problems)
}
camposfa/plhdbR documentation built on May 13, 2019, 11:02 a.m.