data-raw/Poop-link/poop-link-RAW.R

#Code to extract poop_ids from trapping tables
#Most code is adapted from mstrimas

#I have modified it for version control and some dependencies
library(krsp)
library(tidyverse)
library(lubridate)
library(stringr)
library(purrr)


# Raw Poop-Link -----------------------------------------------------------

#I have some home cooked functions that are hidden with the gitignore. They allow easier connection to the krsp database
#They are solely for development purposes.
#There are however, some functions in the `extract_functions.R` script that aid in generating these tables
list.files("data-raw/Poop-link/fn", full.names = TRUE, recursive = T) %>%
  walk(source)

# trapping tables
# mstrimas uses dbplyr with SQL. I have modified it to be in dplyr notation
trapping <- tbl(con, "trapping") %>%
  select(trapping_id = id, squirrel_id, date, comments, contains("tvariable")) %>%
  collect()
trapping_dba <- tbl(con, "dbatrapping") %>%
  select(trapping_id = id, squirrel_id, date, comments = TagHist, poop, ptime) %>%
  collect()

#years are treated independently due to different data collection and sample naming conventions
#2006-2007; 2009-2011 come from external files which I will document here
poop_external <- read_csv("https://raw.githubusercontent.com/KluaneRedSquirrelProject/poop/master/output/poop_external.csv") %>%
  mutate(poop_time = as.character(poop_time))
#load("data-raw/poop_external.rda") #exists in raw compressed form in this repo now.

# 2008: trappings comments column, no pv
pt_rgx <- "(?<=p[0-9]{4}[-;,:]?\\s{0,2})[0-2]?[0-9](h|:)?[0-9]{2}\\s?(am|pm)?"
poop_2008 <- trapping %>%
  filter(year(date) == 2008, comments != "") %>%
  mutate(pv = str_extract(str_to_lower(comments), "p[0-9]{4}"),
         pv_second = str_extract_all(str_to_lower(comments), "p[0-9]{4}"),
         pv_second = map_chr(pv_second,
                             ~ ifelse(length(.x) > 1, .x[2], NA))) %>%
  filter(!is.na(pv)) %>%
  mutate(poop_time = str_extract(str_to_lower(comments), pt_rgx)) %>%
  pivot_longer(cols = c("pv", "pv_second"), names_to = c("vial_number"), values_to = "poop_id") %>% #adjusted this as gather is deprecated
  filter(!(is.na(poop_id) & vial_number == "pv_second")) %>%
  mutate(vial_number = if_else(vial_number == "pv", 1L, 2L),
         poop_id = str_to_upper(poop_id),
         poop_time = clean_pt(poop_time),
         year = year(date)) %>%
  select(squirrel_id, trapping_id, year, vial_number, poop_id, poop_time,
         comments)
#2009-2011 come from poop_external

# 2012: dbatrapping designated columns
poop_2012 <- trapping_dba %>%
  filter(year(date) == 2012) %>%
  filter(poop != "") %>%
  mutate(vial_number = 1, year = year(date), ptime = clean_pt(ptime)) %>%
  select(squirrel_id, trapping_id, year, vial_number,
         poop_id = poop,  poop_time = ptime, comments)

# 2014-17: trappings comments column
poop_2014_7 <- trapping %>%
  filter(year(date) %in% 2014:2017,
         str_detect(comments, "((p|P)(v|V))|(Poop H)")) %>%
  mutate(pv = extract_pv(comments),
         pv_second = extract_pv(comments, first = FALSE),
         poop_time = extract_pt(comments)) %>%
  pivot_longer(cols = c("pv", "pv_second"), names_to = c("vial_number"), values_to = "poop_id") %>%
  filter(!(is.na(poop_id) & vial_number == "pv_second")) %>%
  mutate(vial_number = if_else(vial_number == "pv", 1L, 2L),
         year = year(date)) %>%
  select(squirrel_id, trapping_id, year, vial_number, poop_id, poop_time,
         comments)

poop_2018_on <- trapping %>%
  filter(year(date) >= 2018) %>%
  mutate(pv_second = str_extract(comments, "[0-9]{3}\\.[0-9]{2}"), #at this point some of the pv_second are axy collars so we need to filter those out
         pv_second = ifelse(str_detect(pv_second, "15|14[0-9]\\.[0-9]{2}"), NA, pv_second),
         #in some cases the pv_second is just the pv with some extra zeroes, so we will filter those out
         pv_second = ifelse(str_remove(pv_second, "^0+") == str_remove(tvariable1, "^0+"), NA, pv_second),
         pv_second = as.numeric(pv_second),
         year = year(date)) %>%
  select(contains("id"), year, pv = tvariable1, pv_second, poop_time = tvariable2, comments) %>%
  pivot_longer(cols = c("pv", "pv_second"), names_to = c("vial_number"), values_to = "poop_id") %>%
  filter(!is.na(poop_id)) %>%
  mutate(vial_number = if_else(vial_number == "pv", 1L, 2L)) %>%
  select(contains("id"), contains("poop"), year, vial_number, comments) %>%
  mutate(poop_time = clean_tvar2(poop_time),
         lens = str_length(poop_time),
         poop_id = as.character(poop_id)) %>%
  select(contains("id"), contains("poop"), year, vial_number, comments)

poop_link_RAW = bind_rows(
  poop_external,
  poop_2008,
  poop_2012,
  poop_2014_7,
  poop_2018_on
) %>%
  mutate_at(c("squirrel_id", "trapping_id", "vial_number", "poop_time"), as.integer)
usethis::use_data(poop_link_RAW, overwrite = TRUE, internal = TRUE)


# Cleaning: Remove Duplicates ----------------------------------------------------------------

poop = poop_link_RAW
poop$row_id = seq.int(nrow(poop))


problems = poop %>%
  group_by(poop_id) %>%
  filter(n() > 1,
         n_distinct(year) > 1 | n_distinct(comments) > 1) %>%
  ungroup() %>%
  arrange(poop_id) %>%
  mutate(problem_duplicate = TRUE)

poop = poop %>%
  filter(row_id %notin% problems$row_id) %>%
  group_by(poop_id) %>%
  arrange(trapping_id) %>%
  slice(1) %>%
  ungroup() %>%
  mutate(problem_duplicate = FALSE) %>%
  select(-row_id)

poop = bind_rows(poop, problems %>%  select(-row_id))


# Cleaning: Fill Poop_Time ------------------------------------------------

ptime_rgx <- "[0-9]{4}(h|[[:space:]]h)|[0-9]{1,2}:[0-9]{2}|[0-9]{4}ish|Ptime[:space:][0-9]{4}|\\([0-9]{3,4}"

guesses = poop %>%
  filter(is.na(poop_time)) %>%
  mutate(time_guess = str_match(comments, ptime_rgx)[,1],
         time_guess = gsub("Ptime|h|\\(|:", "", time_guess))


poop_link = poop %>%
  filter(!is.na(poop_time)) %>%
  bind_rows(guesses) %>%
  mutate(poop_id = gsub(" ", "", poop_id))

usethis::use_data(poop_link, overwrite = TRUE)
mwhalen18/krspfecals documentation built on Dec. 21, 2021, 11:05 p.m.