#Code to extract poop_ids from trapping tables
#Most code is adapted from mstrimas
#I have modified it for version control and some dependencies
library(krsp)
library(tidyverse)
library(lubridate)
library(stringr)
library(purrr)
# Raw Poop-Link -----------------------------------------------------------
#I have some home cooked functions that are hidden with the gitignore. They allow easier connection to the krsp database
#They are solely for development purposes.
#There are however, some functions in the `extract_functions.R` script that aid in generating these tables
list.files("data-raw/Poop-link/fn", full.names = TRUE, recursive = T) %>%
walk(source)
# trapping tables
# mstrimas uses dbplyr with SQL. I have modified it to be in dplyr notation
trapping <- tbl(con, "trapping") %>%
select(trapping_id = id, squirrel_id, date, comments, contains("tvariable")) %>%
collect()
trapping_dba <- tbl(con, "dbatrapping") %>%
select(trapping_id = id, squirrel_id, date, comments = TagHist, poop, ptime) %>%
collect()
#years are treated independently due to different data collection and sample naming conventions
#2006-2007; 2009-2011 come from external files which I will document here
poop_external <- read_csv("https://raw.githubusercontent.com/KluaneRedSquirrelProject/poop/master/output/poop_external.csv") %>%
mutate(poop_time = as.character(poop_time))
#load("data-raw/poop_external.rda") #exists in raw compressed form in this repo now.
# 2008: trappings comments column, no pv
pt_rgx <- "(?<=p[0-9]{4}[-;,:]?\\s{0,2})[0-2]?[0-9](h|:)?[0-9]{2}\\s?(am|pm)?"
poop_2008 <- trapping %>%
filter(year(date) == 2008, comments != "") %>%
mutate(pv = str_extract(str_to_lower(comments), "p[0-9]{4}"),
pv_second = str_extract_all(str_to_lower(comments), "p[0-9]{4}"),
pv_second = map_chr(pv_second,
~ ifelse(length(.x) > 1, .x[2], NA))) %>%
filter(!is.na(pv)) %>%
mutate(poop_time = str_extract(str_to_lower(comments), pt_rgx)) %>%
pivot_longer(cols = c("pv", "pv_second"), names_to = c("vial_number"), values_to = "poop_id") %>% #adjusted this as gather is deprecated
filter(!(is.na(poop_id) & vial_number == "pv_second")) %>%
mutate(vial_number = if_else(vial_number == "pv", 1L, 2L),
poop_id = str_to_upper(poop_id),
poop_time = clean_pt(poop_time),
year = year(date)) %>%
select(squirrel_id, trapping_id, year, vial_number, poop_id, poop_time,
comments)
#2009-2011 come from poop_external
# 2012: dbatrapping designated columns
poop_2012 <- trapping_dba %>%
filter(year(date) == 2012) %>%
filter(poop != "") %>%
mutate(vial_number = 1, year = year(date), ptime = clean_pt(ptime)) %>%
select(squirrel_id, trapping_id, year, vial_number,
poop_id = poop, poop_time = ptime, comments)
# 2014-17: trappings comments column
poop_2014_7 <- trapping %>%
filter(year(date) %in% 2014:2017,
str_detect(comments, "((p|P)(v|V))|(Poop H)")) %>%
mutate(pv = extract_pv(comments),
pv_second = extract_pv(comments, first = FALSE),
poop_time = extract_pt(comments)) %>%
pivot_longer(cols = c("pv", "pv_second"), names_to = c("vial_number"), values_to = "poop_id") %>%
filter(!(is.na(poop_id) & vial_number == "pv_second")) %>%
mutate(vial_number = if_else(vial_number == "pv", 1L, 2L),
year = year(date)) %>%
select(squirrel_id, trapping_id, year, vial_number, poop_id, poop_time,
comments)
poop_2018_on <- trapping %>%
filter(year(date) >= 2018) %>%
mutate(pv_second = str_extract(comments, "[0-9]{3}\\.[0-9]{2}"), #at this point some of the pv_second are axy collars so we need to filter those out
pv_second = ifelse(str_detect(pv_second, "15|14[0-9]\\.[0-9]{2}"), NA, pv_second),
#in some cases the pv_second is just the pv with some extra zeroes, so we will filter those out
pv_second = ifelse(str_remove(pv_second, "^0+") == str_remove(tvariable1, "^0+"), NA, pv_second),
pv_second = as.numeric(pv_second),
year = year(date)) %>%
select(contains("id"), year, pv = tvariable1, pv_second, poop_time = tvariable2, comments) %>%
pivot_longer(cols = c("pv", "pv_second"), names_to = c("vial_number"), values_to = "poop_id") %>%
filter(!is.na(poop_id)) %>%
mutate(vial_number = if_else(vial_number == "pv", 1L, 2L)) %>%
select(contains("id"), contains("poop"), year, vial_number, comments) %>%
mutate(poop_time = clean_tvar2(poop_time),
lens = str_length(poop_time),
poop_id = as.character(poop_id)) %>%
select(contains("id"), contains("poop"), year, vial_number, comments)
poop_link_RAW = bind_rows(
poop_external,
poop_2008,
poop_2012,
poop_2014_7,
poop_2018_on
) %>%
mutate_at(c("squirrel_id", "trapping_id", "vial_number", "poop_time"), as.integer)
usethis::use_data(poop_link_RAW, overwrite = TRUE, internal = TRUE)
# Cleaning: Remove Duplicates ----------------------------------------------------------------
poop = poop_link_RAW
poop$row_id = seq.int(nrow(poop))
problems = poop %>%
group_by(poop_id) %>%
filter(n() > 1,
n_distinct(year) > 1 | n_distinct(comments) > 1) %>%
ungroup() %>%
arrange(poop_id) %>%
mutate(problem_duplicate = TRUE)
poop = poop %>%
filter(row_id %notin% problems$row_id) %>%
group_by(poop_id) %>%
arrange(trapping_id) %>%
slice(1) %>%
ungroup() %>%
mutate(problem_duplicate = FALSE) %>%
select(-row_id)
poop = bind_rows(poop, problems %>% select(-row_id))
# Cleaning: Fill Poop_Time ------------------------------------------------
ptime_rgx <- "[0-9]{4}(h|[[:space:]]h)|[0-9]{1,2}:[0-9]{2}|[0-9]{4}ish|Ptime[:space:][0-9]{4}|\\([0-9]{3,4}"
guesses = poop %>%
filter(is.na(poop_time)) %>%
mutate(time_guess = str_match(comments, ptime_rgx)[,1],
time_guess = gsub("Ptime|h|\\(|:", "", time_guess))
poop_link = poop %>%
filter(!is.na(poop_time)) %>%
bind_rows(guesses) %>%
mutate(poop_id = gsub(" ", "", poop_id))
usethis::use_data(poop_link, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.