R/read_scrape_data.R

#' Read data extracted by webscraper
#'
#' Reads either time series or latest data from the web scraper runs.
#'
#' @param all_dates logical, get all data from all dates recorded by webscraper
#' @param date_cutoff date, the earliest acceptable date of data to pull
#' when all_dates is FALSE; applies only to the .Confirmed and .Deaths variables
#' @param window integer, the number of days back from today within which data
#' is acceptable when all_dates is FALSE; applies to all variables EXCEPT
#' .Confirmed and .Deaths
#' @param window_pop integer, how far back (in days) to look for a facility's
#' last reported value when filling NAs in Residents.Population
#' @param coalesce_func function, how to combine redundant rows
#' @param drop_noncovid_obs logical, drop rows missing all COVID variables
#' @param debug logical, print debug messages on the number of rows retained
#' at each processing step
#' @param state character vector, states to limit data to
#' @param wide_data logical, return wide data as opposed to long
#'
#' @return dataframe with scraped data
#'
#' @examples
#' \dontrun{
#' read_scrape_data(all_dates = FALSE)
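#' # illustrative only (hypothetical parameter combination): long-format
#' # data limited to a couple of states
#' read_scrape_data(all_dates = TRUE, state = c("California", "Texas"), wide_data = FALSE)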
#' }
#' read_scrape_data(all_dates = TRUE, state = "Wyoming")
#'
#' @export

read_scrape_data <- function(
    all_dates = FALSE, date_cutoff = DATE_CUTOFF, window = 31, window_pop = 90, 
    coalesce_func = sum_na_rm, drop_noncovid_obs = TRUE, debug = FALSE, 
    state = NULL, wide_data = TRUE){

    remote_loc <- stringr::str_c(
        SRVR_SCRAPE_LOC, "summary_data/aggregated_data.csv")

    jnk <- read.csv(remote_loc, nrows=1, check.names=FALSE)
    # all columns are character columns unless otherwise denoted
    ctypes <- rep("c", ncol(jnk))
    names(ctypes) <- names(jnk)
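    # readr col_types shorthand: "c" = character, "d" = double, "D" = Date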
    # columns that start with Residents or Staff hold numeric (double) values
    ctypes[stringr::str_starts(names(ctypes), "Residents|Staff")] <- "d"
    # date is date type
    ctypes[names(ctypes) == "Date"] <- "D"

    dat_df <- remote_loc %>%
        readr::read_csv(col_types = paste0(ctypes, collapse = "")) %>%
        mutate(State = translate_state(State)) %>%
        # rename this variable for clarity
        rename(jurisdiction_scraper = jurisdiction) %>%
        # the following steps are time intensive so it's better to do them
        # while the data is wide
        mutate(Name = clean_fac_col_txt(Name, to_upper = TRUE)) %>%
        mutate(
            pop_scraper = stringr::str_detect(id, "pop"),
            historical_covid = stringr::str_detect(id, "pre-nov")
            ) %>%
        clean_facility_name(debug = debug) %>%
        # if Jurisdiction is NA (no match in facility_spellings),
        # make it scraper jurisdiction
        mutate(Jurisdiction = ifelse(
            (is.na(Jurisdiction) & !is.na(jurisdiction_scraper)),
            jurisdiction_scraper, Jurisdiction)) %>%
        # now we can pivot the data long
        tidyr::pivot_longer(starts_with(c("Residents", "Staff")))
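    # pivoting long yields the default `name` (measure, e.g. Residents.Confirmed)
    # and `value` columns that the grouping steps below operate on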

    if(debug){
        message(stringr::str_c(
            "Base data frame contains ", nrow(dat_df), " rows."))
    }

    if(!is.null(state)){
        filt_df <- dat_df %>%
            filter(State %in% state & !is.na(value)) %>%
            data.table::as.data.table()

        if(debug){
            message(stringr::str_c(
                "State specific data frame contains ", nrow(filt_df), " rows."))
        }
    }
    else {
        filt_df <- data.table::as.data.table(dat_df)[!is.na(value), ]
    }

    # resolve population issues, prioritize dedicated pop scrapers
    pop_full_df <- filt_df[name == "Residents.Population",]
    pop_full_df[,
                singlepop := length(unique(pop_scraper)) == 1,
                by = list(
                    Date, Name, State, jurisdiction_scraper, Facility.ID, name)]
    pop_sub <- pop_full_df[pop_scraper | singlepop,]
    pop_sub[,singlepop := NULL]
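    # within each facility-date group, only rows from a dedicated population
    # scraper remain unless that group was reported by a single scraper type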

    # resolve duplicate scraper issues, prioritize the historical (pre-nov) scrapers
    cov_full_df <- filt_df[name != "Residents.Population",]
    cov_full_df[,
                singlescrape := length(unique(historical_covid)) == 1,
                by = list(
                    Date, Name, State, jurisdiction_scraper, Facility.ID, name)]
    cov_sub <- cov_full_df[singlescrape | historical_covid,]
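    # when both scraper vintages report the same facility-date-measure,
    # only the historical (pre-nov) rows are kept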

    # combine scrapers together
    var_sub_df <- bind_rows(pop_sub,cov_sub) %>%
        select(-pop_scraper, -historical_covid, -singlescrape)

    # Coalesce values together using the passed in coalesce function
    metric_df <- var_sub_df %>%
        select(Date, Name, State, jurisdiction_scraper, Facility.ID, name, value)

    metric_coal_df <- metric_df[,.(value = coalesce_func(value)), by = list(
        Date, Name, State, jurisdiction_scraper, Facility.ID, name
    )]
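    # metric_coal_df now has a single value per facility, date, and measure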

    # for facilities with missing population data for some days
    # we want to hold valid data for window_pop days
    base_df <- metric_coal_df[
        name == "Residents.Population",
        list(Name, State, jurisdiction_scraper, Facility.ID)] %>%
        unique()
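    # base_df has one row per facility that has ever reported Residents.Population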

    if(nrow(base_df) > 0){

        fill_dates <- tibble(
            Date = seq.Date(lubridate::ymd("2020-04-01"), Sys.Date(), by = "day")
            )

        full_date_df <- bind_rows(lapply(1:nrow(base_df), function(i){
            bind_cols(fill_dates, base_df[i,])
            })) %>%
            mutate(name = "Residents.Population") %>%
            left_join(
                metric_coal_df[name == "Residents.Population",],
                by = c(
                    "Date", "Name", "State", "jurisdiction_scraper",
                    "Facility.ID", "name")
                ) %>%
            data.table::as.data.table() %>%
            mutate(CDate = ifelse(is.na(value), NA_integer_, Date))

        full_date_df[,
            CDate := last_not_na(CDate),
            by = list(Name, State, jurisdiction_scraper, Facility.ID)]
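        # CDate now holds, for each row, the numeric date of the most recent
        # non-missing population value, carried forward within each facility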

        sub_date_df <- full_date_df[(as.numeric(Date) - CDate) <= window_pop,]
        sub_date_df[,
            value := last_not_na(value),
            by = list(Name, State, jurisdiction_scraper, Facility.ID)
            ]
        sub_date_df[,DDate := Date]
        sub_date_df[,Date := lubridate::as_date(CDate)]
        sub_date_df[,CDate := NULL]
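        # Date now points to the day the carried-forward value was actually
        # observed (so the merge with non-metric metadata below lines up),
        # while DDate keeps the calendar day being filled; Date is restored
        # from DDate after that merge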

        metric_coal_pop_fix_df <- rbindlist(list(
            metric_coal_df[name != "Residents.Population",] %>%
                mutate(DDate = lubridate::as_date(NA)),
            sub_date_df
            ))
    }
    else{
        metric_coal_pop_fix_df <- metric_coal_df %>%
            mutate(DDate = lubridate::as_date(NA))
    }

    # for ambiguous non-metric values, such as source, use the first value
    non_metric_df <- var_sub_df %>%
        select(-value)

    non_metric_coal_df <- non_metric_df[,lapply(.SD, first), by = list(
        Date, Name, State, jurisdiction_scraper, Facility.ID, name
    )]
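    # one row of non-metric metadata (e.g. source) per facility, date, and measure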

    # merge the metric and non-metric groups together
    comb_df <- left_join(
        metric_coal_pop_fix_df, non_metric_coal_df,
        by = c(
            "Date", "Name", "State", "jurisdiction_scraper",
            "Facility.ID", "name")) %>%
        mutate(Date = lubridate::as_date(ifelse(is.na(DDate), Date, DDate))) %>%
        select(-DDate)

    if(debug){
        message(stringr::str_c(
            "Coalesced data frame contains ", nrow(comb_df), " rows."))
    }

    # merge on facility level info
    out_df <- merge_facility_info(comb_df) %>%
        rename(Measure = name) %>%
        select(-id)

    if(!all_dates){
        out_df <- out_df %>%
            # only keep values newer than date cutoff for confirmed, deaths 
            # only keep values within window for other variables 
            filter(
                (Date >= date_cutoff & stringr::str_ends(Measure, "Confirmed|Deaths")) |
                    (Date >= (Sys.Date() - window))
            ) %>% 
            group_by(Facility.ID, jurisdiction_scraper, State, Name, Measure) %>%
            arrange(Facility.ID, jurisdiction_scraper, State, Name, Measure, Date) %>%
            # keep only last observed value
            filter(row_number() == n()) %>%
            ungroup()
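        # out_df now holds only the most recent in-window observation of each
        # measure for each facility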
    }

    if(wide_data){
        if(!all_dates){
            out_df <- out_df %>%
                group_by(
                    Facility.ID, jurisdiction_scraper, State, Name) %>%
                # make all the sources the same for wide data only
                mutate(source = first(source)) %>%
                # make all the dates the same (only for wide data when all_dates is FALSE)
                mutate(Date = max(Date)) %>%
                ungroup() %>%
                group_by(State, Date, Measure, jurisdiction_scraper) %>%
                mutate(has_statewide = "STATEWIDE" %in% Name) %>%
                # if statewide and facility-level counts exist for a measure, only take the max date
                filter(!(has_statewide) | Date == max(Date)) %>%
                # if statewide and facility-level counts still exist for a
                # measure, only use the non-statewide rows
                mutate(has_statewide = "STATEWIDE" %in% Name) %>%
                mutate(has_other = any("STATEWIDE" != Name, na.rm=T)) %>%
                filter(!(has_other & has_statewide & Name == "STATEWIDE")) %>%
                ungroup() %>%
                select(-has_statewide, -has_other)
        }else{
            out_df <- out_df %>%
                group_by(Facility.ID, jurisdiction_scraper, State, Name, Measure) %>%
                arrange(Facility.ID, jurisdiction_scraper, State, Name, Measure, Date) %>%
                # make all the sources the same for wide data only
                group_by(Facility.ID, jurisdiction_scraper, State, Name, Date) %>%
                mutate(source = first(source)) %>%
                ungroup()
        }

        out_df <- out_df %>%
            tidyr::pivot_wider(names_from = Measure, values_from = value)
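        # each Measure is now its own column (e.g. Residents.Confirmed);
        # the remaining columns identify the facility and date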

        if(drop_noncovid_obs){
            rowAny <- function(x) rowSums(x) > 0

            out_df <- out_df %>%
                # drop rows missing COVID data (e.g. only with population data)
                filter(rowAny(across(ends_with(c(
                    ".Confirmed", ".Deaths", ".Recovered", ".Tadmin", ".Tested", ".Active",
                    ".Negative", ".Pending", ".Quarantine", ".Initiated", ".Completed", ".Vadmin")),
                    ~ !is.na(.x))))
        }

        out_df <- out_df %>%
            arrange(Facility.ID, jurisdiction_scraper, State, Name, Date) %>%
            reorder_cols()
    }

    if(debug){
        message(stringr::str_c(
            "Named data frame contains ", nrow(out_df), " rows."))
    }
    
    out_df <- assign_web_group(out_df)

    return(out_df)
}