uk-covid-datatools: Functions for sourcing, cleaning and calculating Rt for uk covid timeseries data

#' NHS datasets
#' @export
NHSDatasetProvider = R6::R6Class("NHSDatasetProvider", inherit=CovidTimeseriesProvider, public = list(
    
    initialize = function(providerController, ...) {
      super$initialize(providerController, ...)
    },
    
    #' @description Load google mobility file aligned with LAD codes
    #' @return google mobility 0 custom format
    getGoogleMobility = function(...) {
      self$getDaily("GOOGLE_MOBILITY", ..., orElse = function (...) {
        Global_Mobility_Report <- readr::read_csv(
          "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv", 
          col_types = cols(
            sub_region_1 = col_character(), sub_region_2 = col_character(), iso_3166_2_code = col_character(), metro_area = col_character(),
            census_fips_code = col_character(), date = col_date(format = "%Y-%m-%d"))) %>% filter(country_region_code == "GB")
        mobility_to_LAD = readr::read_csv("https://raw.githubusercontent.com/datasciencecampus/google-mobility-reports-data/master/geography/google_mobility_lad_lookup_200903.csv")
        out = Global_Mobility_Report %>% inner_join(mobility_to_LAD, by=c("country_region_code","sub_region_1","sub_region_2"))
        return(out %>% rename(code = lad19cd, name=la_name) %>% mutate(codeType = ifelse(flag_2018,"LAD18","LAD19")) %>% select(-place_id,-census_fips_code,-metro_area,-country_region_code,-country_region) )
      })
    },
    
    #' @description Load curated significant dates file
    #' @return public 111 data including pathways and online, pre and post april. Codes are CCG20 codes.
    getSignificantDates = function(signif = Inf, ...) {
      return(ukcovidtools::ukCovidDates %>% filter(Significance <= signif))
    },
    
    #' @description Load public 111 summary file
    #' @return public 111 data including pathways and online, pre and post april. Codes are CCG20 codes.
    getPublicOneOneOne = function(...) {
      self$getDaily("PUBLIC-111", ..., orElse = function (...) covidTimeseriesFormat({
        # https://digital.nhs.uk/data-and-information/publications/statistical/mi-potential-covid-19-symptoms-reported-through-nhs-pathways-and-111-online/latest
        # Load landing page
        tmp = xml2::read_html("https://digital.nhs.uk/data-and-information/publications/statistical/mi-potential-covid-19-symptoms-reported-through-nhs-pathways-and-111-online/latest")
        links = tmp %>% rvest::html_nodes(xpath="//a") %>% rvest::html_attr(name = "href")
        url = function(str) {return(links[links %>% stringr::str_detect(str)])}
        
        #Load files matched from landing page
        pathwaysPreApril = readr::read_csv(url("NHS%20Pathways%20Covid-19%20data%20CCG%20mapped"), 
                col_types = readr::cols(SiteType = readr::col_character(),  `Call Date` = readr::col_date(format = "%d/%m/%Y")))
        pathways = readr::read_csv(url("NHS%20Pathways%20Covid-19%20data%202020"), 
                col_types = readr::cols(SiteType = readr::col_character(), `Call Date` = readr::col_date(format = "%d/%m/%Y")))
        online = readr::read_csv(url("111%20Online%20Covid-19%20data_2020"), 
                col_types = readr::cols(`journeydate` = readr::col_date(format = "%d/%m/%Y")))
        onlinePreApril = readr::read_csv(url("111%20Online%20Covid-19%20data_CCG%20mapped"), 
                col_types = readr::cols(`journeydate` = readr::col_date(format = "%d/%m/%Y")))
        
        out = bind_rows(
          # pathways spreadsheets
          pathwaysPreApril %>% 
            dplyr::select(-CCGCode, -CCGName) %>% 
            dplyr::rename(source = SiteType, date = `Call Date`, gender = `Gender`, ageCat = `AgeBand`, code = `April20 mapped CCGCode`, name=`April20 mapped CCGName`, incidence = `TriageCount`) %>%
            dplyr::group_by(source,date,gender,ageCat,code,name) %>% 
            dplyr::summarise(incidence = sum(incidence)) %>% 
            dplyr::ungroup(),
          
          pathways %>% 
            dplyr::filter(`Call Date` > max(pathwaysPreApril$`Call Date`,na.rm=TRUE)) %>% 
            dplyr::rename(source = SiteType, date = `Call Date`, gender = `Sex`, ageCat = `AgeBand`, code = `CCGCode`, name=`CCGName`, incidence = `TriageCount`) %>% 
            dplyr::ungroup(),
          
          onlinePreApril %>% 
            dplyr::select(-CCGCode, -CCGName) %>% 
            dplyr::mutate(source = "online") %>% 
            dplyr::rename(date = `journeydate`, gender = `gender`, ageCat = `ageband`, code = `April20 mapped CCGCode`, name=`April20 mappedCCGName`, incidence = `Total`) %>%
            dplyr::group_by(source,date,gender,ageCat,code,name) %>% 
            dplyr::summarise(incidence = sum(incidence)) %>% 
            dplyr::ungroup(),
          
          online %>% 
            dplyr::filter(journeydate > max(onlinePreApril$journeydate,na.rm=TRUE)) %>% dplyr::mutate(source = "online") %>% 
            dplyr::rename(date = `journeydate`, gender = `sex`, ageCat = `ageband`, code = `ccgcode`, name=`ccgname`, incidence = `Total`) %>% 
            dplyr::ungroup()
        )
        
        # standardise age categories
        out2 = out %>% dplyr::mutate(
          gender = self$normaliseGender(gender),
          ageCat = case_when(
            ageCat == "70-120 years" ~ "70+",
            ageCat == "70+ years" ~ "70+",
            ageCat == "19-69 years" ~ "19-69",
            TRUE ~ NA_character_
          ))
        out2 = out2 %>% 
          dplyr::filter(!is.na(ageCat) & code != "NULL" & name != "NULL")
        out2 = out2 %>% 
          #dplyr::group_by(gender,ageCat,code,name,source) %>% 
          #dplyr::arrange(date) %>% 
          dplyr::mutate(codeType = "CCG20", type="incidence",statistic="triage",subgroup=NA_character_, note=NA_character_, source=paste0(source,"-public")) %>% 
          dplyr::rename(value=incidence)
        # out2 = out2 %>% dplyr::select(-name) %>% self$codes$findNamesByCode(outputCodeTypeVar = NULL)
        return(out2 %>% self$fillAbsent() %>% self$fixDatesAndNames(0) %>% self$complete())
      }))
    },
    
    
    # TODO: https://www.health-ni.gov.uk/publications/daily-dashboard-updates-covid-19-august-2020
    # Wales: http://www2.nphs.wales.nhs.uk:8080/CommunitySurveillanceDocs.nsf/3dc04669c9e1eaa880257062003b246b/77fdb9a33544aee88025855100300cab/$FILE/Rapid%20COVID-19%20surveillance%20data.xlsx
    # Scotland: https://www.opendata.nhs.scot/dataset/covid-19-in-scotland
    
    getTomWhiteCases = function(truncate=4,...) {
      self$getDaily("TOM-WHITE-CASES", ..., orElse = function (...) covidTimeseriesFormat({
        walesUAtoHealthBoard = self$codes$getMappings() %>% filter(fromCodeType=="UA",toCodeType=="LHB")
        covid_19_cases_uk <- readr::read_csv("https://github.com/geeogi/covid-19-uk-data/raw/master/data/covid-19-cases-uk.csv", 
                                      col_types = readr::cols(Date = readr::col_date(format = "%Y-%m-%d")), 
                                      na = c("","NaN","NA"))
        tmp_cases_uk = covid_19_cases_uk %>% 
          dplyr::mutate(value = as.numeric(TotalCases)) %>%
          dplyr::rename(code = AreaCode, date = Date) %>%
          dplyr::mutate(code = case_when(
            !is.na(code) ~ code,
            Country == "England" ~ "E99999999",
            Country == "Scotland" ~ "S99999999",
            Country == "Wales" ~ "W99999999",
            Country == "Northern Ireland" ~ "N99999999",
            TRUE ~ NA_character_)) %>% 
          dplyr::left_join(walesUAtoHealthBoard, by=c("code"="fromCode")) %>%
          dplyr::mutate(
            code = if_else(is.na(toCode),code,toCode)
          ) %>% 
          dplyr::group_by(code,date) %>% 
          dplyr::summarise(value = sum(value, na.rm=TRUE)) %>% 
          dplyr::filter(!is.na(code)) %>% 
          self$codes$findNamesByCode() %>%
          dplyr::ungroup() %>%
          dplyr::mutate(
            statistic = "case",
            source = "covid-19-cases-uk-tom-white",
            gender = NA_character_,
            ageCat = NA_character_,
            type = "cumulative",
            subgroup = NA_character_
          ) %>%
          #dplyr::group_by(code,codeType,name,source,subgroup,statistic,gender,ageCat,type) %>% 
          #tidyr::complete(date = as.Date(min(date):max(date),"1970-01-01")) %>%
          #tidyr::fill(value) %>%
          self$complete() %>%
          dplyr::ungroup()
        return(tmp_cases_uk %>% self$fillAbsent() %>% self$fixDatesAndNames(truncate) %>% self$complete())
      }))
    },
      
    #browser()
    getTomWhiteIndicators = function(truncate=4,...) {
      self$getDaily("TOM-WHITE-INDIC", ..., orElse = function (...) covidTimeseriesFormat({
        covid_19_indicators_uk <- readr::read_csv("https://github.com/geeogi/covid-19-uk-data/raw/master/data/covid-19-indicators-uk.csv", 
                                           col_types = readr::cols(Date = readr::col_date(format = "%Y-%m-%d")))
        
        country_totals = covid_19_indicators_uk %>% 
          dplyr::filter(Date > as.Date("2020-02-12")) %>%
          dplyr::group_by(Indicator,Country) %>% 
          tidyr::complete(Date = as.Date(min(Date):max(Date),"1970-01-01")) %>%
          tidyr::fill(Value) %>% 
          dplyr::ungroup() %>%
          tidyr::pivot_wider(names_from = Country, values_from = Value) %>%
          dplyr::mutate(`England` = ifelse(is.na(`England`),`UK`-`Northern Ireland`-`Wales`-`Scotland`,`England`)) %>%
          dplyr::mutate(missing = `UK`-`England`-`Northern Ireland`-`Wales`-`Scotland`)
          
        country_totals2 = country_totals %>%
          tidyr::pivot_longer(cols=c(-Indicator,-Date), names_to = "name", values_to = "value") %>%
          dplyr::mutate(statistic = case_when(
            Indicator == "ConfirmedCases" ~ "case",
            Indicator == "Deaths" ~ "death",
            Indicator == "Tests" ~ "test",
            ),
            type = ifelse(name == "missing", "bias", "cumulative"),
            name = ifelse(name=="missing" | name=="UK","United Kingdom", name)
          ) %>%
          self$codes$findCodesByName() %>%
          dplyr::rename(date = Date) %>%
          dplyr::select(-Indicator,-name.original) %>%
          dplyr::mutate(
            source = "covid-19-indicators-uk",
            gender = NA_character_,
            ageCat = NA_character_,
            subgroup = NA_character_
          )
        
        country_totals2 %>% 
          # dplyr::group_by(code,codeType,name,source,subgroup,statistic,gender,ageCat,type) %>% 
          # tidyr::complete(date = as.Date(min(date):max(date),"1970-01-01")) %>%
          # tidyr::fill(value) %>%
          self$complete() %>%
          dplyr::ungroup()
        
        return(country_totals2 %>% self$fixDatesAndNames(truncate))
        
      }))
    },
  
    getNHSDeaths = function(truncate = 7,...) {
      self$getDaily("NHS-DEATHS", ..., orElse = function (...) covidTimeseriesFormat({
        # Load landing page
        tmp = xml2::read_html("https://www.england.nhs.uk/statistics/statistical-work-areas/covid-19-daily-deaths/")
        links = tmp %>% rvest::html_nodes(xpath="//a") %>% rvest::html_attr(name = "href")
        url = function(str) {return(links[links %>% stringr::str_detect(str)])}
        #Load files matched from landing page
        file = self$downloadDaily(id = "NHS_DEATHS",url = url("COVID-19-total-announced-deaths(?!.*-weekly-file)(.*)\\.xlsx"),type="xlsx")
        NHS_DEATHS <- suppressMessages(readxl::read_excel(file, sheet = "Tab4 Deaths by trust", skip = 15))
        NHS_DEATHS = NHS_DEATHS %>% 
          dplyr::select(-c(1,2,5)) %>%
          dplyr::rename(code=Code,name=Name) %>%
          dplyr::mutate(
            code = ifelse(name=="ENGLAND","E92000001",code),
            codeType = ifelse(name=="ENGLAND","CTRY","NHS trust")
          ) %>% 
          dplyr::filter(!is.na(code))
          
        out = NHS_DEATHS %>% 
          tidyr::pivot_longer(cols=c(-code,-codeType, -name), names_to = "date", values_to = "value") %>% 
          dplyr::mutate(date=suppressWarnings(as.Date(as.numeric(date),"1899-12-30"))) %>% 
          dplyr::filter(!is.na(date)) %>%
          dplyr::mutate(
            source = "COVID-19-total-announced-deaths",
            subgroup=NA_character_,
            statistic="death",
            type="incidence",
            gender=NA_character_,
            ageCat=NA_character_
          )
        out = out %>%  self$fillAbsent() %>% self$fixDatesAndNames(truncate) %>% self$complete()
        return(out)
        
      }))
    },
    
    
    # getPHEAdmissions = function(...) {
    #   csv = "https://api.coronavirus.data.gov.uk/v2/data?areaType=nhsTrust&metric=cumAdmissions&metric=newAdmissions&format=csv"
    #   # https://coronavirus.data.gov.uk/api/v1/data?filters=areaType=nhstrust;areaName=Alder%2520Hey%2520Children%27s%2520NHS%2520Foundation%2520Trust&structure=%7B%22areaType%22:%22areaType%22,%22areaName%22:%22areaName%22,%22areaCode%22:%22areaCode%22,%22date%22:%22date%22,%22hospitalCases%22:%22hospitalCases%22%7D&format=csv
    #   tmp = readr::read_csv(csv)
    #   tmp = tmp %>%
    #     dplyr::rename(code = areaCode, name = areaName, code) %>%
    #     dplyr::select(-areaType) %>%
    #     tidyr::complete(date,tidyr::nesting(code,name,type)) %>% 
    #     
    # }
    
    
    #' @description Get UK outbreak timeseries data
    #' @return a covidTimeserisFormat data frame with several timeseries in it
    getPHEDashboard = function(truncate = 4,...) {
      self$getDaily("PHE-DASH", ..., orElse = function (...) covidTimeseriesFormat({
        ph_cases = readr::read_csv("https://coronavirus.data.gov.uk/downloads/csv/coronavirus-cases_latest.csv", 
                  col_types = readr::cols(`Specimen date` = readr::col_date(format = "%Y-%m-%d")))
        ph_cases = ph_cases %>% select(date = `Specimen date`, code = `Area code`, name=`Area name`,cumulative_cases = `Cumulative lab-confirmed cases`, daily_cases = `Daily lab-confirmed cases`, type=`Area type`)
        full_ph_cases = ph_cases %>% 
          tidyr::complete(date,tidyr::nesting(code,name,type)) %>% 
          dplyr::group_by(code,name,type) %>% 
          dplyr::arrange(date) %>% 
          tidyr::fill(cumulative_cases) %>% 
          dplyr::mutate(daily_cases=ifelse(is.na(daily_cases),0,daily_cases), cumulative_cases=ifelse(is.na(cumulative_cases),0,cumulative_cases)) %>% 
          dplyr::mutate(codeType = case_when(
            type == "Nation" ~ "CTRY",
            type == "Region" ~ "RGN",
            type == "Upper tier local authority" ~ "UTLA",
            type == "Lower tier local authority" ~ "LAD",
            type == "utla" ~ "UTLA",
            type == "ltla" ~ "LAD",
            type == "nation" ~ "CTRY",
            type == "region" ~ "RGN"
          )) %>%
          dplyr::ungroup() %>%
          dplyr::select(-type)
        
        # missingness2 = full_ph_cases %>% filter(type == "Nation" & name == "England") %>% select(-type) %>% 
        #   left_join(
        #     full_ph_cases %>% filter(codeType == "RGN") %>% select(-type) %>% group_by(date) %>% summarise(regions_cumulative_cases = sum(cumulative_cases), regions_daily_cases = sum(daily_cases), regions_count = n())
        #   ) %>% left_join(
        #     full_ph_cases %>% filter(codeType == "UTLA") %>% select(-type) %>% group_by(date) %>% summarise(utla_cumulative_cases = sum(cumulative_cases), utla_daily_cases = sum(daily_cases), utla_count = n())
        #   ) %>% left_join(
        #     full_ph_cases %>% filter(codeType == "LAD") %>% select(-type) %>% group_by(date) %>% summarise(ltla_cumulative_cases = sum(cumulative_cases), ltla_daily_cases = sum(daily_cases), ltla_count = n())
        #   )
        
        # full_ph_cases %>% group_by(code,name,codeType) %>% 
        #   arrange(date) %>% mutate(prev_cum =lag(cumulative_cases,default=0)) %>% filter(daily_cases != cumulative_cases-prev_cum)
        # 
        full_ph_cases = full_ph_cases %>% 
          dplyr::group_by(code,name,codeType) %>% 
          dplyr::mutate(daily_cases = cumulative_cases-lag(cumulative_cases,default=0)) %>% 
          dplyr::ungroup()
        
        LTLA_to_PHEC = readr::read_csv(self$codes$mapping$LAD19CD_PHEC19CD$url) %>% 
          dplyr::select(LAD19CD,PHEC19CD,PHEC19NM) %>% distinct()
        
        LTLA_to_NHSER = readr::read_csv(self$codes$mapping$LAD19CD_CCG19CD$url) %>% 
          dplyr::left_join(readr::read_csv(self$codes$mapping$CCG19CD_NHSER19CD$url), by="CCG19CD") %>% 
          dplyr::select(LAD19CD,NHSER19CD,NHSER19NM) %>% distinct() 
        # There are 6 LTLAS that span NHSER boundaries. Assign them to an individual one (this is minorly lossy).
        LTLA_to_NHSER = LTLA_to_NHSER %>% 
          dplyr::group_by(LAD19CD) %>% 
          dplyr::filter(row_number()==1)
        
        # zero rows
        # LTLA_to_NHSER %>% group_by(LAD19CD) %>% count() %>% filter(n>1)
        # LTLA_to_PHEC %>% group_by(LAD19CD) %>% count() %>% filter(n>1)
        # 
        # zero rows
        # england_ltla %>% anti_join(LTLA_to_NHSER, by=c("code"="LAD19CD"))
        # england_ltla %>% anti_join(LTLA_to_PHEC, by=c("code"="LAD19CD"))
        # 
        
        # add in NHSER
        tmp2 = full_ph_cases %>% 
          dplyr::filter(codeType == "LAD") %>% 
          dplyr::inner_join(LTLA_to_NHSER, by=c("code"="LAD19CD")) %>% 
          dplyr::select(-code,-name) %>% 
          dplyr::rename(code = NHSER19CD, name = NHSER19NM) %>% 
          dplyr::group_by(date,code,name) %>% 
          dplyr::summarise(cumulative_cases = sum(cumulative_cases), daily_cases = sum(daily_cases)) %>% 
          dplyr::mutate(codeType = "NHSER")
        
        # add in PHEC
        tmp4 = full_ph_cases %>% 
          dplyr::filter(codeType == "LAD") %>% 
          dplyr::inner_join(LTLA_to_PHEC, by=c("code"="LAD19CD")) %>% 
          dplyr::select(-code,-name) %>% 
          dplyr::rename(code = PHEC19CD, name = PHEC19NM) %>% 
          dplyr::group_by(date,code,name) %>% 
          dplyr::summarise(cumulative_cases = sum(cumulative_cases), daily_cases = sum(daily_cases)) %>% 
          dplyr::mutate(codeType = "PHEC")
        
        out = dplyr::bind_rows(full_ph_cases, tmp2, tmp4) %>% 
          dplyr::mutate(statistic = "case",source="PHE dashboard", subgroup=NA,ageCat = NA,gender=NA) %>% 
          dplyr::rename(incidence = daily_cases, cumulative=cumulative_cases) %>% 
          tidyr::pivot_longer(cols=c(incidence,cumulative), names_to ="type")
        
        return(out  %>% self$fillAbsent() %>% self$fixDatesAndNames(truncate) %>% self$complete())
      }))
    },
    
    getPHEApiNations = function(...) {
      self$getDaily("PHE-API-CTRY", ..., orElse = function (...) covidTimeseriesFormat({
        ctryCodes = dpc$codes$getCodes() %>% filter(codeType=="CTRY" & status=="live") %>% pull(code)
        tmp = bind_rows(lapply(ctryCodes, FUN=function(x) self$getPHEApi(areaType = "nation", areaCode=x)))
        return(tmp)
      }))
    },
    
    getPHEApiNHSRegions = function(...) {
      self$getDaily("PHE-API-NHSER", ..., orElse = function (...) covidTimeseriesFormat({
        nhserCodes = dpc$codes$getCodes() %>% filter(codeType=="NHSER" & status=="live") %>% pull(code)
        tmp = bind_rows(lapply(nhserCodes, FUN=function(x) self$getPHEApi(areaType = "nhsRegion", areaCode=x)))
        return(tmp)
      }))
    },
    
    getPHEApiNHSTrusts = function(...) {
      self$getDaily("PHE-API-NHS-TRUST", ..., orElse = function (...) covidTimeseriesFormat({
        #nhserCodes = dpc$codes$getCodes() %>% filter(codeType=="NHS trust" & status=="live") %>% pull(code)
        tmp = self$getPHEApi(areaType = "nhsTrust")
        return(tmp)
      }))
    },
    
    
    #' Extracts paginated data by requesting all of the pages
    #' and combining the results.
    #'
    #' @param filters    API filters. See the API documentations for 
    #'                   additional information.
    #'                   
    #' @param structure  Structure parameter. See the API documentations 
    #'                   for additional information.
    #'                   
    #' @return list      Comprehensive list of dictionaries containing all 
    #'                   the data for the given ``filter`` and ``structure`.`
    getPHEPaginatedData = function (filters, structure) {
      
      endpoint     <- "https://api.coronavirus.data.gov.uk/v1/data"
      results      <- list()
      current_page <- 1
      
      repeat {
        
        httr::GET(
          url   = endpoint,
          query = list(
            filters   = paste(filters, collapse = ";"),
            structure = jsonlite::toJSON(structure, auto_unbox = TRUE),
            page      = current_page
          ),
          httr::timeout(10)
        ) -> response
        
        # Handle errors:
        if ( response$status_code >= 400 ) {
          err_msg = httr::http_status(response)
          stop(err_msg)
        } else if ( response$status_code == 204 ) {
          break
        }
        
        # Convert response from binary to JSON:
        json_text <- httr::content(response, "text")
        dt        <- jsonlite::fromJSON(json_text)
        results   <- rbind(results, dt$data)
        
        if ( is.null( dt$pagination$`next` ) ){
          break
        }
        
        current_page <- current_page + 1;
        
      }
      
      return(results)
      
    },
    
    
    getPHEApi = function(areaType = "nation", areaName= NULL, areaCode = NULL, truncate=4, ...) {
      
      # Create filters:
      filters = sprintf("areaType=%s", areaType)
      if(!identical(areaName,NULL)) filters = c(filters, sprintf("areaName=%s", areaName))
      if(!identical(areaCode,NULL)) filters = c(filters, sprintf("areaCode=%s", areaCode))
      
      # Create the structure as a list or a list of lists:
      structure <- list(
        date = "date", 
        codeType = "areaType", 
        name = "areaName", 
        code = "areaCode", 
        case = "newCasesBySpecimenDate",
        death = "newDeaths28DaysByDeathDate",
        admission = "newAdmissions",
        cumAdmission = "cumAdmissions",
        hospitalCases = "hospitalCases",
        icuCases = "covidOccupiedMVBeds"
      )
      
      ## https://coronavirus.data.gov.uk/api/v1/data?filters=areaType=nhstrust;areaName=Alder%2520Hey%2520Children%27s%2520NHS%2520Foundation%2520Trust&structure=%7B%22areaType%22:%22areaType%22,%22areaName%22:%22areaName%22,%22areaCode%22:%22areaCode%22,%22date%22:%22date%22,%22newAdmissions%22:%22newAdmissions%22,%22cumAdmissions%22:%22cumAdmissions%22%7D&format=csv
      ## https://coronavirus.data.gov.uk/api/v1/data?filters=areaType=nhstrust;areaName=Alder%2520Hey%2520Children%27s%2520NHS%2520Foundation%2520Trust&structure=%7B%22areaType%22:%22areaType%22,%22areaName%22:%22areaName%22,%22areaCode%22:%22areaCode%22,%22date%22:%22date%22,%22hospitalCases%22:%22hospitalCases%22%7D&format=csv
      ## https://coronavirus.data.gov.uk/api/v1/data?filters=areaType=nhstrust;areaName=Alder%2520Hey%2520Children%27s%2520NHS%2520Foundation%2520Trust&structure=%7B%22areaType%22:%22areaType%22,%22areaName%22:%22areaName%22,%22areaCode%22:%22areaCode%22,%22date%22:%22date%22,%22covidOccupiedMVBeds%22:%22covidOccupiedMVBeds%22%7D&format=csv
      
      ts = self$getPHEPaginatedData(filters,structure)
      
      ts = ts %>% pivot_longer(cols=c("case","death","admission","cumAdmission","hospitalCases","icuCases"),names_to = "label",values_to = "value") %>%
        mutate(
          type = case_when(
            label == "cumAdmission" ~ "cumulative",
            label == "hospitalCases" ~ "prevalence",
            label == "icuCases" ~ "prevalence",
            TRUE ~ "incidence"
          ),
          codeType = case_when(
            codeType == "overview" ~ "UK",
            codeType == "nation" ~ "CTRY",
            codeType == "region" ~ "RGN",
            codeType == "utla" ~ "UTLA",
            codeType == "ltla" ~ "LAD",
            codeType == "msoa" ~ "MSOA",
            codeType == "nhsRegion" ~ "NHSER",
            codeType == "nhsTrust" ~ "NHS trust",
            TRUE ~ NA_character_
          ),
          source = "phe api",
          subgroup = NA_character_,
          ageCat = NA_character_,
          gender = NA_character_,
          note = NA_character_,
          statistic = case_when(
            label == "admission" ~ "hospital admission",
            label == "case" ~ "case",
            label == "death" ~ "death",
            label == "cumAdmission" ~ "hospital admission",
            label == "hospitalCases" ~ "hospital admission",
            label == "icuCases" ~ "icu admission",
            TRUE ~ NA_character_
          ),
          date = as.Date(date,"%Y-%m-%d")
        ) %>% select(-label)
      return(covidTimeseriesFormat(ts %>% filter(!is.na(value) & !is.na(statistic)) %>% self$fillAbsent() %>% self$fixDatesAndNames(truncate) %>% self$complete()))
    },
    
    getCLIMB = function(...) {
      self$getDaily("CLIMB", ..., orElse = function (...) {
        tmp = readr::read_csv("https://cog-uk.s3.climb.ac.uk/phylogenetics/latest/cog_metadata.csv")
        return(tmp)
      })
    },
    
    getCOGUK = function(...) {
      self$getDaily("COG", ..., orElse = function (...) {
        cogDate = Sys.Date()
        cogData = NULL
        while (identical(cogData,NULL)) {
          tryCatch({
            #cogData = readr::read_csv(paste0("http://cog-uk-microreact.s3.climb.ac.uk/",cogDate,"/cog_metadata_microreact_public.csv"))
            cogData = readr::read_csv(paste0("http://cog-uk-microreact.s3.climb.ac.uk/",cogDate,"/cog_metadata_microreact_geocodes_only.csv"),col_types = cols(
              .default = col_character(),
              sample_date = col_date(format = "%Y-%m-%d"),
              epi_week = col_double(),
              pillar_2 = col_logical(),
              lineage_support = col_double()
            ))
            message("Most recent COG microreact build: ",cogDate)
          }, error = function(e) {
            #message(cogDate)
            cogDate <<- cogDate-1
          })
        }
        cogData = cogData %>% mutate(publish_date = cogDate)
        return(cogData)
      })
    },
    
    getTiers = function(...) {
      self$getDaily("TIERS", ..., orElse = function (...) {
        # Tier data
        fpath = system.file("data-raw", "NPI_dataset_full_extract_03_11_2020.xlsx", package="ukcovidtools")
        prevTiers = readxl::read_xlsx(fpath)
        
        tidyTiers = prevTiers %>% 
          mutate(local_lockdown = residents_cannot_leave_the_local_area) %>%
          select(date,code = ltla20cd, name = ltla20nm, local_lockdown, tier_1, tier_2, tier_3, national_lockdown) %>% 
          mutate(local_lockdown = ifelse(tier_1+tier_2+tier_3+national_lockdown==0, local_lockdown, 0)) %>%
          pivot_longer(cols=c(local_lockdown,tier_1,tier_2,tier_3,national_lockdown),values_to = "present",names_to = "tier") %>% 
          filter(present==1) %>% 
          mutate(date = as.Date(date), tier = case_when(
            tier=="local_lockdown" ~ "local",
            tier=="tier_1" ~ "one",
            tier=="tier_2" ~ "two",
            tier=="tier_3" ~ "three",
            TRUE ~ "lockdown"
          ), codeType="LAD20") %>%
          select(-present)
        revert19to20 = tidyTiers %>% inner_join(tibble(
          code="E06000060",
          oldCode=c("E07000004","E07000005","E07000006","E07000007"),
          oldName=c("Aylesbury Vale","Chiltern","South Bucks","Wycombe")),by="code") %>% 
          select(-code,-name) %>%
          rename(code = oldCode,name = oldName) %>% mutate(codeType="LAD")
        
        tidyTiers = tidyTiers %>% filter(code != "E06000060") %>% bind_rows(revert19to20) %>% mutate(codeType="LAD")
        
        fromDate = max(tidyTiers$date)+1
        #https://api.coronavirus.data.gov.uk/v2/data?areaType=ltla&metric=alertLevel&format=csv
        decTiers = readr::read_csv("https://api.coronavirus.data.gov.uk/v2/data?areaType=ltla&metric=alertLevel&format=csv")
        
        tidyTiers2 = decTiers %>% select(code = areaCode, name = areaName,FROM = date, alertLevel, alertLevelName) %>% 
          mutate(tier = case_when(
            alertLevel ==1 ~ "one+",
            alertLevel ==2 ~ "one+", #Scot 
            alertLevel ==3 ~ "two+",
            alertLevel ==4 ~ "three+",
            alertLevel ==5 ~ "four+",
            alertLevel ==-99 ~ "lockdown",
          ), codeType="LAD") %>%
          group_by(code,name) %>% arrange(FROM) %>% mutate(TO = lead(FROM,default = NA)) %>%
          mutate(TO = as.Date(ifelse(is.na(TO),Sys.Date(),TO-1),"1970-01-01")) %>% 
          ungroup() %>%
          group_by(across()) 
        
        
        tidyTiers2 = tidyTiers2 %>% group_modify(function(d,g,...) 
          tibble(date = as.Date(g$FROM:g$TO,"1970-01-01"))) %>% ungroup() %>% select(-TO,-FROM)
        
        tidyTiers3 = bind_rows(tidyTiers %>% anti_join(tidyTiers2, by=c("code","date")), tidyTiers2)
        tidyOut = tidyTiers3 %>% tidyr::complete(nesting(code,name,codeType),date=seq(as.Date("2020-01-01"),Sys.Date(),1), fill=list(tier="none"))
        return(tidyOut)
      })
    },
    
    ####TODO: ----
    # BBC contact tracing matrix data:
    # https://www.medrxiv.org/content/10.1101/2020.02.16.20023754v2.supplementary-material
    
    # getPHETests = function() {
    #   # TODO: https://www.gov.uk/guidance/coronavirus-covid-19-information-for-the-public
    # },
    
    # getGoogleTrends = function(...) {
    #   https://cran.r-project.org/web/packages/gtrendsR/gtrendsR.pdf
    # statistic = information seekinh
    # },
    
    # PHE Coronavirus cases by age
    # # url = "https://coronavirus.data.gov.uk/"
    # json_url = "https://c19downloads.azureedge.net/downloads/data/data_latest.json"
    # # "https://coronavirus.data.gov.uk/downloads/json/dated/coronavirus-cases_202004292132.json"
    # # "https://coronavirus.data.gov.uk/downloads/json/dated/data_latest_202004292132.json"
    # tmp = jsonlite::read_json(json_url,simplifyVector = TRUE)
    # 
    # # glimpse(tmp$utlas$E09000002)
    # # tmp3 = enframe(tmp$utlas,name = "UTLA")
    # # tmp3 = tmp3 %>% unnest(cols=value)
    # # tmp3 = tmp3 %>% mutate(name = map(value, ~.x$name$value))
    # 
    # cases = bind_rows(
    #   tmp$countries$E92000001$maleCases %>% mutate(gender="Male"),
    #   tmp$countries$E92000001$femaleCases %>% mutate(gender="Female") 
    # )
    
    # # TODO: This has not yet been integrated
    # ONS age breakdown deaths
    # https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales
    # https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2020/publishedweek172020.xlsx
    # getONSDeaths = function(...) {
    #   ONSurls = c(
    #     "https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2020/publishedweek1620201.xlsx"
    #   )
    #   
    #   filenames = ONSurls %>% stringr::str_extract("/([^/]+)$") %>% stringr::str_remove("/")
    #   filenames = paste0("~/Git/uk-covid-datatools/data-raw/ONS/",filenames)
    #   
    #   missingfileUrls = ONSurls[!file.exists(filenames)]
    #   missingfilenames = filenames[!file.exists(filenames)]
    #   
    #   for(i in seq_along(missingfilenames)) {
    #     download.file(missingfileUrls[i],missingfilenames[i])
    #   }
    #   
    #   loadONS = function(file) {
    #     ONS <- read_excel(file,
    #                       sheet = "Covid-19 - Weekly occurrences",
    #                       range = "B6:BC75")
    #     ONS2 = ONS %>% rename(age=1) %>% mutate(gender = case_when(
    #       stringr::str_detect(age,"Female") ~ "Female",
    #       stringr::str_detect(age,"Male") ~ "Male",
    #       TRUE ~ as.character(NA)
    #     )) %>% fill(gender) %>% mutate(gender = ifelse(is.na(gender),"Both",gender)) %>%
    #       pivot_longer(cols = c(-age,-gender),names_to = "date",values_to = "died") %>%
    #       mutate(age = ifelse(age=="<1","0-1",age)) %>% filter(stringr::str_starts(age,"[0-9]") & stringr::str_starts(date,"[0-9]") & !is.na(died)) %>% mutate(date = as.Date(as.numeric(date),"1970-01-01"))
    #     ONS2 = ONS2 %>% group_by(gender,date) %>% mutate(left = as.numeric(stringr::str_extract(age,"^[0-9]+"))) %>% arrange(left) %>% mutate(right=lead(left,default=120))
    #     ONS3 = ONS2 %>% ungroup() %>% group_by(age,gender) %>% arrange(date) %>% mutate(died = cumsum(died)) %>% mutate(cases=NA,admitted=NA,admittedIcu=NA, country="E&W",filename=file %>% stringr::str_extract("/([^/]+)$") %>% stringr::str_remove("/"))
    #     return(ONS2)
    #   }
    #   
    #   loadONS(filenames[length(filenames)])
    # },
    
    # getUkILIData = function(...) {
    #   urls = c(
    #     "https://www.gov.uk/government/publications/gp-in-hours-weekly-bulletins-for-2020",
    #     "https://www.gov.uk/government/publications/gp-in-hours-weekly-bulletins-for-2019",
    #     "https://www.gov.uk/government/publications/gp-in-hours-weekly-bulletins-for-2018",
    #     "https://www.gov.uk/government/publications/gp-in-hours-bulletin",
    #     "https://www.gov.uk/government/publications/gp-in-hours-weekly-bulletins-for-2016",
    #     "https://www.gov.uk/government/publications/gp-in-hours-weekly-bulletins-for-2015",
    #     "https://www.gov.uk/government/publications/gp-in-hours-weekly-bulletins-for-2014"
    #   )
    #   
    #   fileUrls = NULL
    #   for (url in urls) {
    #     # url = "https://www.gov.uk/government/publications/gp-in-hours-weekly-bulletins-for-2019"
    #     page <- xml2::read_html(url)
    #     
    #     fileUrls = c(fileUrls,page %>%
    #                    html_nodes("a") %>%       # find all links
    #                    html_attr("href") %>%     # get the url
    #                    str_subset("\\.xls|\\.ods"))
    #   }
    #   
    #   fileUrls = unique(fileUrls)
    #   filenames = fileUrls %>% stringr::str_extract("/([^/]+)$") %>% stringr::str_remove("/")
    #   filenames = paste0("~/Git/uk-covid-datatools/data-raw/ILI/",filenames)
    #   
    #   missingfileUrls = fileUrls[!file.exists(filenames)]
    #   missingfilenames = filenames[!file.exists(filenames)]
    #   
    #   for(i in seq_along(missingfilenames)) {
    #     download.file(missingfileUrls[i],missingfilenames[i])
    #   }
    #   
    #   readILI <- function(fname) {
    #     message(fname)
    #     if(str_split(fname,"\\.",simplify = TRUE)[,2]=="xls")
    #     {
    #       #print("XLS")
    #       metadata = readxl::read_xls(path=fname,sheet=1,range="A6:B10", col_names = c("name","val"), col_types = "text")
    #       readxl::read_xls(path=fname,sheet=3,skip = 5, col_names = F)->tmp
    #     } else {
    #       metadata = readODS::read_ods(path=fname,sheet=1,range="A6:B10", col_names = FALSE, col_types = NA) %>% rename(name=A, val=B)
    #       readODS::read_ods(path=fname,sheet=3,skip = 5, col_names = F )->tmp
    #     }
    #     metadata = metadata %>% pivot_wider( names_from = name, values_from = val ) %>% mutate(
    #       DateStarting = tryCatch(
    #         as.Date(`Date starting`,tryFormats=c("%d/%m/%Y","%d/%m/%y")),
    #         error=function(e) as.Date(as.integer(`Date starting`),origin="1900-01-01")),
    #       DateEnding = tryCatch(
    #         as.Date(`Date ending`,tryFormats=c("%d/%m/%Y","%d/%m/%y")),
    #         error=function(e) as.Date(as.integer(`Date ending`),origin="1900-01-01"))
    #     )
    #     tmp %>%
    #       select(
    #         LACode=1,
    #         LAName=2,
    #         PHECentreName=3,
    #         PHECentreCode=4,
    #         PHERegionName =5,
    #         PHERegionCode =6,
    #         Denominator =7,
    #         ILI = 8,
    #         RatePer100000=9) %>%
    #       mutate(ILI = as.integer(ILI),
    #              Denominator = as.integer(Denominator),
    #              RatePer100000 = as.double(RatePer100000),
    #              Week = as.integer(metadata$`Week number`),
    #              Year = lubridate::year(metadata$DateStarting),
    #              DateStarting = as.Date(metadata$DateStarting),
    #              DateEnding = as.Date(metadata$DateEnding)
    #       ) %>%
    #       filter(row_number()<150)->tmp
    #     return(tmp)
    #   }
    #   
    #   
    #   
    #   #filename = dir(path = "~/Git/uk-covid-datatools/data-raw/",recursive = TRUE,pattern = "GP")
    #   #weeks = filename %>% stringr::str_remove_all("_") %>% stringr::str_extract("([0-9]+)\\.") %>% stringr::str_remove("\\.") %>% as.integer()
    #   #years = filename %>% stringr::str_remove_all("_") %>% stringr::str_extract("([0-9]{4})") %>% as.integer()
    #   #years = ifelse(is.na(years),2017,years)
    #   
    #   UKILIdata = readr::read_csv("~/Git/uk-covid-datatools/data-raw/ilidata.csv")
    #   processed = paste0("~/Git/uk-covid-datatools/data-raw/ILI/",unique(UKILIdata$filename))
    #   unprocessed = !(filenames %in% processed)
    #   unprocessedFilenames = filenames[unprocessed]
    #   
    #   UKILIdata_toAdd = tibble(unprocessedFilenames) %>%
    #     mutate(contents = map(unprocessedFilenames,~readILI(.))) %>%
    #     #mutate(Year = years) %>%
    #     #mutate(Week = weeks) %>%
    #     # select(-filename) %>%
    #     mutate(filename=stringr::str_extract(unprocessedFilenames,"([^/]+$)")) %>%
    #     select(-unprocessedFilenames) %>%
    #     unnest(cols=contents)
    #   
    #   if (length(unprocessedFilenames) > 0) {
    #     UKILIdata = bind_rows(UKILIdata,UKILIdata_toAdd)
    #     UKILIdata %>% write_csv("~/Git/uk-covid-datatools/data-raw/ilidata.csv")
    #     usethis::use_data(UKILIdata, overwrite=TRUE)
    #   } else {
    #     message("Nothing to update")
    #   }
    # },
    
    #### Everything ----
    
    getTheFireHose = function(...) {
      self$getDaily("FIRE-HOSE", ..., orElse = function (...) covidTimeseriesFormat({
        bind_rows(
          self$getPublicOneOneOne(...),
          self$getNHSDeaths(...),
          self$getPHEDashboard(...),
          self$getTomWhiteCases(...),
          self$getTomWhiteIndicators(...)
        )
      }))
    }
))
terminological/uk-covid-datatools documentation built on June 24, 2021, 8:16 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
terminological/uk-covid-datatools
Functions for sourcing, cleaning and calculating Rt for uk covid timeseries data

R/NHSDatasetProvider.R
In terminological/uk-covid-datatools: Functions for sourcing, cleaning and calculating Rt for uk covid timeseries data

R Package Documentation

Browse R Packages

We want your feedback!

terminological/uk-covid-datatools Functions for sourcing, cleaning and calculating Rt for uk covid timeseries data

R/NHSDatasetProvider.R In terminological/uk-covid-datatools: Functions for sourcing, cleaning and calculating Rt for uk covid timeseries data

R Package Documentation

Browse R Packages

We want your feedback!

terminological/uk-covid-datatools
Functions for sourcing, cleaning and calculating Rt for uk covid timeseries data

R/NHSDatasetProvider.R
In terminological/uk-covid-datatools: Functions for sourcing, cleaning and calculating Rt for uk covid timeseries data