#' SPIM private data
#' @export
SPIMDatasetProvider = R6::R6Class("SPIMDatasetProvider", inherit=CovidTimeseriesProvider, public = list(
# directory=NULL,
#
# initialize = function(providerController, path, ...) {
# self$directory = path.expand(path)
# super$initialize(providerController, ...)
# },
fileProvider=NULL,
initialize = function(providerController, fileProvider, ...) {
self$fileProvider = fileProvider
super$initialize(providerController, ...)
},
### filters ----
filter=list(
chess="CHESS COVID19", #~/S3/encrypted/5Apr/NHS/CHESS COVID19 CaseReport 20200405.csv",
sari="SARI COVID19 CaseReport",
lineList="Anonymised .*Line List [0-9]{8}",
rcgp="RCGP",
deathsLineList="COVID19 Deaths",
ff100="FF100",
chessSummary="CHESS Aggregate Report",
sariSummaryArchive="SARI Archive Aggregate Report",
sariSummaryCurrent="SARI Aggregate Report",
oneOneOne = "SPIM-111-999",
onsWeekly = "SPIM_ONS",
aeSitrep = "AESitrep",
trust = "SPIM_trust_[0-9]{3}.xlsx",
seroprevalence = "seroprev",
negPillar1 = "Negatives pillar1",
negPillar2 = "Negatives pillar2",
oneOneOneLineList = "111telephony_CLEANSED",
fourNationsCases = "Casedata_AllNations",
sgene = "SGTF_linelist",
immunization = "immunisations SPIM.csv",
voc351 = "VOC202012_02_linelist",
ctasLineList = "CTAS SGTF data.zip",
vamLineList = "VAM line list"
),
#### Get raw file paths ----
#' @description Search a file path for the
#' @param path - path to the line list file
#' @return raw line list data set
getPaths = function(...) {
path = self$directory
return(self$getDaily("DATAFILES", ..., orElse = function (...) {
tmp = self$fileProvider$listAllFiles()
return(tmp %>% filter(!isDir) %>% pull(path))
}))
},
getLatest = function(search) {
tmp2 = self$getPaths() %>% stringr::str_subset(search)
tmp2Date = tmp2 %>% stringr::str_extract_all("20[1-2][0-9]-?[0-1][0-9]-?[0-3][0-9]")
tmp2Date = sapply(tmp2Date, function(x) {
x = x %>% stringr::str_remove_all("-")
y = unique(x[x==max(x)])
return(y)
})
tmp3 = tmp2[tmp2Date == max(tmp2Date)]
if(length(tmp3)==0) {
warning("Missing file: ",search)
return(NA_character_)
}
if(length(tmp3)>1) {
warning("Multiple matches, using first: ",paste0(tmp3,collapse="; "))
tmp3 = tmp3[[1]]
}
return(paste0(self$directory,"/",tmp3))
},
getNewerThan = function(search, date = as.Date("2020-01-01")) {
return(self$getSpecificDates(search,(date+1):Sys.Date()))
},
getSpecificDates = function(search, dates) {
tmp2 = self$getPaths() %>% stringr::str_subset(search)
tmp2Date = tmp2 %>% stringr::str_extract("/([^/]+)\\.") %>% stringr::str_extract("20[1-2][0-9]-?[0-1][0-9]-?[0-3][0-9]") %>% stringr::str_remove_all("-") %>% as.Date("%Y%m%d")
tmp3 = tmp2[tmp2Date %in% dates]
if(length(tmp3)==0) {
warning("Missing file: ",search)
return(NA_character_)
}
return(paste0(self$directory,"/",tmp3))
},
getLatestRawFile = function(filter, to = getwd()) {
path = self$getLatest(filter)
if(!stringr::str_ends(to,"/")) to = paste0(to,"/")
dir.create(to, recursive = TRUE, showWarnings = FALSE)
tmpFile = self$fileProvider$getFile(path)
if (stringr::str_detect(path,"zip")) {
zipPath = fs::path_file(path) %>% stringr::str_replace("\\.zip",".csv")
unzip(tmpFile, files=zipPath,exdir = to, junkpaths = TRUE)
return(paste0(to,zipPath))
} else {
fs::file_copy(path = tmpFile,new_path = paste0(to,fs::path_file(path)))
return(paste0(to,fs::path_file(path)))
}
},
#### One one one ----
#' @description Load 111 data
#'
#' @return a covidTimeseriesFormat dataframe
getOneOneOne = function(...) {
path = self$getLatest(self$filter$oneOneOne)
message("Using: ",path)
tmp = self$getSaved("SPIM-111", params = list(path), ..., orElse = function (...) covidTimeseriesFormat({
oneoneone <- readxl::read_excel(self$fileProvider$getFile(path), sheet = "Extracted Data", col_types = "text")
# make zeros explicit
# for(outcome in colnames(oneoneone)[colnames(oneoneone) %>% stringr::str_starts("111-Outcome")]) {
# outcome = as.symbol(outcome)
# oneoneone = oneoneone %>% mutate(!!outcome := ifelse(!is.na(`111-COVID-TOTAL`) & is.na(!!outcome),0,!!outcome))
# }
#
# for(outcome in colnames(oneoneone)[colnames(oneoneone) %>% stringr::str_starts("999-Outcome")]) {
# outcome = as.symbol(outcome)
# oneoneone = oneoneone %>% mutate(!!outcome := ifelse(!is.na(`999-COVID-TOTAL`) & is.na(!!outcome),0,!!outcome))
# }
#
# for(outcome in colnames(oneoneone)[colnames(oneoneone) %>% stringr::str_starts("111-ONLINE") & colnames(oneoneone) != "111-ONLINE-Total Result"]) {
# outcome = as.symbol(outcome)
# oneoneone = oneoneone %>% mutate(!!outcome := ifelse(!is.na(`111-ONLINE-Total Result`) & is.na(!!outcome),0,!!outcome))
# }
ts111 = oneoneone %>%
dplyr::mutate(Geography = ifelse(Geography=="England: Unknown", "Unknown (England)", Geography)) %>% #TODO fix this ugly hack.
self$codes$findCodesByName(nameVar = Geography,codeTypes = c("CTRY","NHSER","PSEUDO")) %>%
dplyr::select(-ReportLevel) %>%
dplyr::rename(name=Geography) %>%
dplyr::mutate(date = as.Date(paste0(Year,"-",Month,"-",Day),"%Y-%m-%d")) %>%
dplyr::select(-DateVal,-Day,-Month,-Year) %>%
tidyr::pivot_longer(cols = c(-date,-code,-name,-codeType), names_to = "category", values_to = "value") %>%
dplyr::mutate(
value = suppressWarnings(as.numeric(value)),
source = case_when(
category %like% "111-ONLINE%" ~ "online",
category %like% "111-Outcome%" ~ "111",
category %like% "999-Outcome%" ~ "999",
category %like% "All-111-Number_of_calls_where_person_triaged" ~ "111",
TRUE ~ "other 111" # as.character(NA)
),
subgroup = case_when(
category %like% "%Clinical Assessment service 1 hour%" ~ "urgent clinical review",
category %like% "%Clinical Assessment service 2 hour%" ~ "urgent clinical review",
category %like% "%Clinical Assessment%" ~ "clinical review",
category %like% "%Emergency Ambulance%" ~ "emergency ambulance",
(category %like% "%Self Care%" | category %like% "%isolate%") ~ "self care",
TRUE ~ "other" #as.character(NA)
),
statistic = case_when(
category %like% "111-ONLINE%" ~ "triage",
category %like% "111-Outcome%" ~ "triage",
category %like% "999-Outcome%" ~ "triage",
TRUE ~ "information seeking"
),
gender = NA_character_,
ageCat = NA_character_,
type = case_when(
category %like% "All-111-Number_of_calls_where_person_triaged" ~ "background",
TRUE ~ "incidence"
)
) %>%
dplyr::mutate(subgroup = factor(subgroup,levels=c("self care", "clinical review", "urgent clinical review", "emergency ambulance", "other"), ordered = TRUE))
#browser()
ts111 = ts111 %>%
dplyr::filter(!is.na(value)) %>%
dplyr::rename(note = category) %>%
dplyr::group_by(code,codeType,name,source,subgroup,statistic,gender,ageCat,type,date) %>%
dplyr::summarise(
value = sum(value),
note = paste0(note, collapse = "; "))
return(ts111 %>% self$fillAbsent() %>% self$fixDatesAndNames(0) %>% self$complete())
}))
attr(tmp,"paths") = path
return(tmp %>% as_tibble())
},
getOneOneOneLineList = function(dateFrom=Sys.Date()-28, ...) {
paths = self$getNewerThan(search = self$filter$oneOneOneLineList, date = dateFrom)
tmp = self$getSaved(id = "SPIM-111-LINE-LIST",params=list(paths,dateFrom),...,orElse= function(...) {
readCsvAsText = function(conn) {readr::read_csv(conn, col_types = readr::cols(.default = readr::col_character()))}
tmp = self$fileProvider$processFiles(func = readCsvAsText, paths)
tmp2 = tidyr::unnest(tmp) %>% select(-path)
tmp3 = tmp2 %>% mutate(
date = as.Date(DateVal),
name = case_when(
CCGName == "NHS Herefordshire and Worcestershire CCG" ~ "NHS Herefordshire CCG", # this name is not in ONS
TRUE ~ CCGName
),
ageCat = stringr::str_remove(AgeGroup,"age"),
source = "111",
subgroup = case_when(
MetricName %like% "%Clinical Assessment service 1 hour%" ~ "urgent clinical review",
MetricName %like% "PC Speak to 1h/CAS" ~ "urgent clinical review",
MetricName %like% "%Clinical Assessment service 2 hour%" ~ "urgent clinical review",
MetricName %like% "PC Speak to/Contact 2h" ~ "urgent clinical review",
MetricName %like% "%Clinical Assessment%" ~ "clinical review",
MetricName %like% "%Contact%" ~ "clinical review",
MetricName %like% "%Ambulance%" ~ "emergency ambulance",
(MetricName %like% "%Self Care%" | MetricName %like% "%isolate%") ~ "self care",
TRUE ~ "other" #as.character(NA)
),
gender = NA_character_,
note = MetricName
) %>%
select(-MetricCode,-DateVal,-AgeGroup,-CCGName,-MetricName) %>%
dplyr::mutate(subgroup = factor(subgroup,levels=c("self care", "clinical review", "urgent clinical review", "emergency ambulance", "other"), ordered = TRUE)) %>%
self$codes$findCodesByName(codeTypes = c("CCG","CCG20"))
return(tmp3 %>% select(-name.original))
})
#ccgs = self$codes$getCodes() %>% filter(codeType == "CCG" & status == "live")
attr(tmp,"paths") = paths
return(tmp %>% as_tibble())
},
getOneOneOneIncidence = function(dateFrom=Sys.Date()-28, ...) {
tmp3 = self$getOneOneOneLineList(dateFrom,...)
out = self$getSaved("SPIM-111-BREAKDOWN", params=list(tmp3,dateFrom), ..., orElse = function(...) covidTimeseriesFormat({
tmp4 = tmp3 %>%
dplyr::filter(!is.na(code)) %>%
dplyr::group_by(code,codeType,name,date,ageCat, gender,subgroup) %>%
dplyr::summarise(value = n()) %>%
dplyr::mutate(source="111 line list",statistic = "triage", type="incidence") %>%
self$fillAbsentAllRegions() %>%
self$completeAllRegions() #self$fixDatesAndNames(1) %>% self$completeAllRegions()
return(tmp4)
}))
attr(out,"paths") = attr(tmp3,"paths")
return(out)
},
#### Deaths ----
#' @description Load line list
#'
#' @param path - path to the line list file
#' @return raw line list data set
getDeathsLineList = function(...) {
path = self$getLatest(self$filter$deathsLineList)
message("Using: ",path)
tmp = self$getSaved("DEATHS-LINE-LIST", params = list(path), ..., orElse = function (...) {
tmp = readxl::read_excel(self$fileProvider$getFile(path), col_types = "text")
datecols = c(colnames(tmp) %>% stringr::str_subset("date"),"dod")
for(datecol in datecols) {
tmp[[datecol]] = suppressWarnings(as.Date(as.numeric(tmp[[datecol]]),"1899-12-30"))
}
tmp = tmp %>%
dplyr::mutate(
age = as.integer(age),
gender = self$normaliseGender(ifelse(is.na(gender),"unknown",gender))
)
return(tmp %>% dplyr::ungroup())
})
attr(tmp,"paths") = path
return(tmp %>% as_tibble())
},
#' @description Load deaths data from linelist - does not preserve ethnicity
#' @param ageBreaks - a list of ages which form the cut points for breaking continuous ages into ranges (or NULL for a single age category)
#' @return a covidTimeseriesFormat dataframe
getDeathsLineListIncidence = function(ageBreaks = NULL, deathOrReport="death", cutoff=28, subgroup=NULL, gender=FALSE, filterExpr=!(is.na(death_type28) & is.na(death_type60cod) & is.na(covidcod)), codeTypes = c("CTRY","NHSER"), truncate=NULL, ...) {
filterExpr = enexpr(filterExpr)
subgroup = tryCatch(ensym(subgroup), error=function(e) NULL)
tmp = self$getDeathsLineList(...)
output = self$getSaved(id = "DEATHS-LINE-LIST-INCIDENCE", params=list(tmp,ageBreaks, deathOrReport, gender, cutoff,as_label(subgroup),as_label(filterExpr),codeTypes), ..., orElse = function (...) covidTimeseriesFormat({
if (!identical(filterExpr,NULL))
tmp = tmp %>% filter(!!filterExpr)
tmp = tmp %>%
dplyr::filter(is.na(specimen_date) | as.integer(dod-specimen_date)<=cutoff) %>%
dplyr::mutate(
ageCat = age %>% self$cutByAge(ageBreaks)
)
if (!identical(subgroup,NULL)) {
tmp = tmp %>% mutate(subgroup=ifelse(is.na(!!subgroup),"unknown",!!subgroup))
} else {
tmp = tmp %>% mutate(subgroup = NA_character_)
}
if (!gender) {
tmp = tmp %>% dplyr::mutate(gender=NA_character_)
}
if(deathOrReport == "death")
tmp = tmp %>% dplyr::mutate(date = as.Date(dod))
else
tmp = tmp %>% dplyr::mutate(date =
as.Date(pmin(report_date_earliest,NHSdeathreportdate, DBSdeathreportdate, HPTdeathreportdate, ONS_death_registration_date, na.rm = TRUE),"1970-01-01")
) %>% dplyr::filter(!is.na(date))
selectByRegion = function(df, code, codeType, name) {
code = ensym(code)
name = ensym(name)
# check column exists
if(!(as_label(code) %in% colnames(df))) return(tibble())
df = df %>% dplyr::mutate(code = !!code, codeType=codeType, name=!!name) %>%
dplyr::mutate(
code = ifelse(is.na(code),"E99999999",code),
name = ifelse(is.na(code),"Unknown (England)",name)
) %>%
dplyr::group_by( code,codeType,name,date, ageCat, gender,subgroup) %>%
dplyr::summarise(value = n())
return(df)
}
out = NULL
if ("CTRY" %in% codeTypes) {
england = tmp %>%
dplyr::mutate(code = "E92000001", codeType= "CTRY", name="England") %>%
dplyr::group_by(code,codeType,name,date, ageCat, gender,subgroup) %>%
dplyr::summarise(value = n())
out = out %>% bind_rows(england)
}
if ("NHSER" %in% codeTypes) {
nhser = tmp %>% selectByRegion(nhser_code, "NHSER", nhser_name)
isNhser = nhser %>% self$codes$allPresentAndCorrect(codeTypes=c("NHSER","PSEUDO"))
if(!isNhser) {
nhser = tmp %>% selectByRegion(nhser_code, "NHSER19CDH", nhser_name) %>%
dplyr::inner_join(
self$codes$getMappings() %>% dplyr::filter(fromCodeType=="NHSER19CDH" & toCodeType=="NHSER"),
by=c("code"="fromCode")
) %>%
dplyr::ungroup() %>%
dplyr::select(-code,-codeType, -fromCodeType,-rel,-weight) %>%
dplyr::rename(code = toCode, codeType=toCodeType)
}
out = out %>% bind_rows(nhser)
}
if ("PHEC" %in% codeTypes) {out = out %>% bind_rows(tmp %>% selectByRegion(phec_code, "PHEC", phec_name))}
if ("UA" %in% codeTypes) {out = out %>% bind_rows(tmp %>% selectByRegion(utla_code, "UA", utla_name))}
if ("LAD" %in% codeTypes) {out = out %>% bind_rows(tmp %>% selectByRegion(ltla_code, "LAD", ltla_name))}
if ("LSOA" %in% codeTypes) {out = out %>% bind_rows(tmp %>% selectByRegion(lsoa_code, "LSOA", lsoa_name))}
out = out %>% dplyr::mutate(source="deaths line list",statistic = "death", type="incidence")
out = out %>% self$codes$findNamesByCode() %>% select(-ends_with(".original"))
out = out %>% self$fixDatesAndNames(truncate)
out = out %>% self$fillAbsent(completeDates=TRUE)
#out = out %>% self$complete()
out = out %>% dplyr::ungroup()
return(out)
}))
attr(output,"paths") = attr(tmp,"paths")
return(output %>% as_tibble())
},
#### Variants / genomics ----
getVoc351LineList = function(...) {
path = self$getLatest(self$filter$voc351)
message("Using: ",path)
tmp = self$getSaved("VOC351", params = list(path), ..., orElse = function (...) {
tmp = readxl::read_excel(self$fileProvider$getFile(path), col_types = "text", sheet = "Linelist")
datecols = c("earliest_specimen_date")
for(datecol in datecols) {
tmp[[datecol]] = suppressWarnings(as.Date(as.numeric(tmp[[datecol]]),"1899-12-30"))
}
tmp = tmp %>%
dplyr::mutate(
finalid=as.integer(finalid)
)
return(tmp %>% dplyr::ungroup())
})
attr(tmp,"paths") = path
return(tmp %>% as_tibble())
},
getVAMLineList = function(...) {
path = self$getLatest(self$filter$vamLineList)
message("Using: ",path)
tmp = self$getSaved("VAM", params = list(path), ..., orElse = function (...) {
tmp = self$fileProvider$getFile(path)
tmp2 = readr::read_csv(tmp, col_types = readr::cols(.default = readr::col_character()))
# tmp2 = tmp2 %>% mutate(genomic_specimen_date = suppressWarnings(as.Date(genomic_specimen_date,"%Y%m%d")))
datecols = colnames(tmp2)[colnames(tmp2) %>% stringr::str_detect("date|_at")]
for(datecol in datecols) {
tmp2[[datecol]] = suppressWarnings(as.Date(tmp2[[datecol]],"%Y-%m-%d"))
}
idcols = colnames(tmp2)[colnames(tmp2) %>% stringr::str_detect("id")]
for(idcol in idcols) {
tmp2[[idcol]] = suppressWarnings(as.integer(tmp2[[idcol]]))
}
tmp2 = tmp2 %>% mutate(
age = suppressWarnings(as.integer(age))
)
if(file.exists(tmp)) unlink(tmp)
return(tmp2)
})
attr(tmp,"paths") = path
return(tmp %>% as_tibble())
},
#### Test and trace ----
getCTASLineList = function(...) {
#/home/terminological/S3/encrypted/2021-03-29/20210329 CTAS SGTF data.zip
path = self$getLatest(self$filter$ctasLineList)
message("Using: ",path)
tmp = self$getSaved("CTAS", params = list(path), ..., orElse = function (...) {
tmpPath = self$fileProvider$getFile(path)
zipPath = fs::path_file(path) %>% stringr::str_replace("\\.zip",".csv")
tmp2 = readr::read_csv(unz(tmpPath,filename=zipPath), col_types = readr::cols(.default = readr::col_character()))
# tmp2 = tmp2 %>% mutate(genomic_specimen_date = suppressWarnings(as.Date(genomic_specimen_date,"%Y%m%d")))
datecols = colnames(tmp2)[colnames(tmp2) %>% stringr::str_detect("date|_at")]
for(datecol in datecols) {
tmp2[[datecol]] = suppressWarnings(as.Date(tmp2[[datecol]],"%Y-%m-%d"))
}
idcols = colnames(tmp2)[colnames(tmp2) %>% stringr::str_detect("id")]
for(idcol in idcols) {
tmp2[[idcol]] = suppressWarnings(as.integer(tmp2[[idcol]]))
}
tmp2 = tmp2 %>% mutate(
completed = as.logical(completed),
sex = self$normaliseGender(sex),
sgtf = as.integer(sgtf),
sgtf_under30ct = as.integer(sgtf_under30ct),
p2ch1cq = as.double(p2ch1cq),
p2ch2cq = as.double(p2ch2cq),
p2ch3cq = as.double(p2ch3cq),
p2ch4cq = as.double(p2ch4cq),
age = as.integer(age)
)
if(file.exists(tmpPath)) unlink(tmpPath)
return(tmp2)
})
attr(tmp,"paths") = path
return(tmp %>% as_tibble())
},
#### Immunisations ----
#' @description Load immunizations line list
#'
#' @return raw line list data set
getImmunizationLineList = function(...) {
path = self$getLatest(self$filter$immunization)
message("Using: ",path)
out = self$getSaved("IMMUNIZATIONS", params = list(path), ..., orElse = function (...) {
tmp = readr::read_csv(self$fileProvider$getFile(path), col_types = readr::cols(.default = readr::col_character()))
tmp = tmp %>%
as_tibble() %>%
mutate(
patient_pseudo_id = as.numeric(patient_pseudo_id),
age = suppressWarnings(as.numeric(age)),
finalid = as.numeric(finalid),
finalid2 = as.numeric(finalid2),
vaccination_date = as.Date(vaccination_date, "%d%b%Y")
)
})
attr(out,"paths") = path
return(out %>% as_tibble())
},
#' @description Load incidence from line list
#'
#' @param ageBreaks - a list of ages which form the cut points for breaking continuous ages into ranges (or NULL for a single age category)
#' @return a covidTimeseriesFormat dataframe
getImmunizationLineListIncidence = function(ll=NULL, ageBreaks = NULL, filterExpr=NULL, subgroup="string_dose_number", ...) {
filterExpr = enexpr(filterExpr)
subgroup = tryCatch(ensym(subgroup), error = function(e) NULL)
# TODO: do we need the ll option here. It is not cached
tmp = self$getDaily("IMMUNIZATIONS-INCIDENCE", params=list(ageBreaks, as_label(filterExpr), as_label(subgroup)), ..., orElse = function (...) covidTimeseriesFormat({
if(!identical(ll,NULL)) {
tmp = ll
} else {
tmp = self$getImmunizationLineList(...)
}
if(!identical(filterExpr,NULL)) tmp = tmp %>% filter(!!filterExpr)
tmp = tmp %>% dplyr::mutate(ageCat = age %>% self$cutByAge(ageBreaks), gender=self$normaliseGender(gender,na.value="unknown"))
if(!identical(subgroup,NULL)) {
tmp = tmp %>% mutate(subgroup=!!subgroup)
} else {
tmp = tmp %>% mutate(subgroup=NA_character_)
}
tmp = tmp %>% dplyr::mutate(date = as.Date(vaccination_date))
out = tmp %>% dplyr::mutate(code = ltla_code, codeType="LAD", name=ltla_name) %>%
dplyr::mutate(
code = ifelse(is.na(code) | code=="Unknown","E99999999",code),
name = ifelse(is.na(code) | code=="Unknown","Unknown (England)",name)
) %>%
dplyr::group_by(code,codeType,name,date,ageCat,gender,subgroup) %>%
dplyr::summarise(value = n())
out = out %>% dplyr::mutate(source="immunization",statistic = "immunization", type="incidence")
out = out %>% self$fixDatesAndNames(0)
out = out %>% self$fillAbsent(completeDates=TRUE)
out = out %>% dplyr::ungroup()
return(out)
}))
attr(tmp,"paths") = path
return(tmp %>% as_tibble())
},
getImmunizationFraction = function(ageBreaks = NULL,...) {
self$getDaily("IMMUNIZATIONS-FRACTION", params=list(ageBreaks), ..., orElse = function (...) covidTimeseriesFormat({
tmp2 = dpc$spim$getImmunizationLineListIncidence(ageBreaks=ageBreaks)
tmp3 = tmp2 %>% tsp$aggregateGender()
tmp4 = tmp3 %>% tsp$cumulativeFromIncidence()
tmp4 = tmp4 %>% self$demog$findDemographics()
deaths = dpc$spim$getDeathsLineListIncidence(ageBreaks = ageBreaks,codeTypes = "LAD")
deathsCum = deaths %>% tsp$aggregateGender() %>% tsp$cumulativeFromIncidence()
tmp5 = tmp4 %>% inner_join(deathsCum %>% select(code,date,ageCat,cumdeaths = value), by=c("code","date","ageCat"))
tmp5 = tmp5 %>% mutate(vaccPercent = value/(population-cumdeaths)) %>% mutate(vaccPercent = ifelse(vaccPercent>1,1,vaccPercent))
tmp5 = tmp5 %>% mutate(immunized = value, value=vaccPercent, type="fraction") %>% select(-vaccPercent)
return(tmp5)
}))
},
#### S-gene line list ----
#' @description Load line list
#'
#' @return raw line list data set
getSGeneLineList = function(...) {
path = self$getLatest(self$filter$sgene)
message("Using: ",path)
tmp = self$getSaved("SGENE-LINE-LIST", params = list(path), ..., orElse = function (...) {
if (stringr::str_detect(path,"zip")) {
tmpFile = self$fileProvider$getFile(path)
zipPath = fs::path_file(path) %>% stringr::str_replace("\\.zip",".csv")
tmp = readr::read_csv(unz(tmpFile, filename=zipPath))
} else {
tmp = readr::read_csv(self$fileProvider$getFile(path))
}
return(tmp %>% dplyr::ungroup())
})
attr(tmp,"paths") = path
return(tmp %>% as_tibble())
},
#' Interpret S gene status according to various cut off values
#' function to help interpret S gene CT values in context of N gene and ORF gene to give S gene status.
#' With the defaults this produces the same result as the sgtf_30 column in the source SGTF line list
#' Defaults are S:30,ORF:30,N:30,Control:Inf
#'
#' @param sGeneLineList - a dataframe includeing
#' @param S_CT - S gene detected when P2CH3CQ <= this value
#' @param ORF1ab_CT - ORF1ab gene detected when P2CH1CQ <= this value
#' @param N_CT - N gene detected when P2CH2CQ <= this value
#' @param Control_CT - control sample is positive when P2CH4CQ <= this value
#'
#' @return - the same dataframe with additional columns including "sGene" and "result"
#'
#' @examples coxData = coxData %>% interpretSGene()
interpretSGene = function(sGeneLineList, S_CT = 30, ORF1ab_CT = 30, N_CT = 30, Control_CT = Inf, ...) {
sGeneLineList %>%
mutate(
ORF1ab_CT_threshold = ORF1ab_CT,
N_CT_threshold = N_CT,
S_CT_threshold = S_CT,
S_pos = P2CH3CQ > 0 & P2CH3CQ <= S_CT,
S_undetect = P2CH3CQ == 0,
N_pos = P2CH2CQ > 0 & P2CH2CQ <= N_CT,
ORF1ab_pos = P2CH1CQ > 0 & P2CH1CQ <= ORF1ab_CT,
Control_pos = P2CH4CQ > 0 & P2CH4CQ <= Control_CT,
sGene = case_when(
is.na(P2CH1CQ) ~ "Unknown",
S_pos & N_pos & ORF1ab_pos & Control_pos ~ "Positive",
S_undetect & N_pos & ORF1ab_pos & Control_pos ~ "Negative",
TRUE ~ "Equivocal"
),
CT_N = ifelse(P2CH2CQ > 0, P2CH2CQ, 40)
) %>%
mutate(
result = case_when(
is.na(P2CH1CQ) ~ "Unknown",
!Control_pos ~ "No control",
TRUE ~ paste0(ifelse(S_pos,"S+","S-"),ifelse(N_pos,"N+","N-"),ifelse(ORF1ab_pos,"ORF+","ORF-")))
) %>%
mutate(
sGene = sGene %>% ordered(c("Positive","Negative","Equivocal","Unknown")),
relativeCopyNumber = 2^(median(CT_N,na.rm=TRUE)-CT_N)
)
},
getSGeneEras = function(cutoff = 28, S_CT = 30, ORF1ab_CT = 30, N_CT = 30, Control_CT = Inf, ...) {
path = self$getLatest(self$filter$sgene)
tmp = self$getSaved("SGENE-ERAS", params = list(path), ..., orElse = function (...) {
sgll = self$getSGeneLineList() %>% self$interpretSGene(S_CT, ORF1ab_CT, N_CT, Control_CT)
# group by patient and find time delay between tests (where there are more than one)
tmp = sgll %>% arrange(FINALID,specimen_date) %>% mutate(delay = ifelse(FINALID==lag(FINALID), as.numeric(specimen_date - lag(specimen_date)), NA_real_))
# TODO: there is some interesting properties of the delay
# ggplot(tmp, aes(x=delay))+geom_density()+scale_x_continuous(trans="log1p",breaks=c(0,10,20,50,100,200,500,1000))+facet_wrap(vars(sgtf_under30CT))
# ggplot(tmp, aes(x=delay,y=P2CH1CQ))+geom_density_2d()
# apply some heuristics to determine whether a test is part of the same infection or a new one
tmp2 = tmp %>% mutate(era = case_when(
is.na(delay) ~ "new",
is.na(sgtf_under30CT) & delay < cutoff*2 ~ "same", # prolonged recovery
delay < cutoff ~ "same",
TRUE ~ "new"
))
# assign an eraIndex - essentially the count of novel infection episodes
tmp3 = tmp2 %>% group_by(FINALID) %>% arrange(specimen_date) %>% mutate(eraIndex = cumsum(ifelse(era=="new",1,0)))
# summarise sGene data into a single value for each era
# TODO: each era may have multiple positive tests there is an opportunity to look at the CT values over time and
# fit some sort of model here
tmp4 = tmp3 %>% group_by(FINALID,eraIndex) %>%
mutate(
earliest_specimen_date = min(specimen_date,na.rm=TRUE),
latest_specimen_date = max(specimen_date,na.rm=TRUE),
tests=n(),
anyPosSGene = any(sGene == "Positive"),
anyNegSGene = any(sGene == "Negative"),
anyEquivSGene = any(sGene == "Equivocal"),
anyUnknSGene = any(sGene == "Unknown"),
) %>% ungroup() %>% mutate(
sGene=case_when(
anyPosSGene & !anyNegSGene ~ "Positive",
anyNegSGene & !anyPosSGene ~ "Negative",
anyEquivSGene ~ "Equivocal",
TRUE ~ "Unknown"
) %>% ordered(c("Positive","Negative","Equivocal","Unknown"))
)
return(tmp4 %>% select(-anyPosSGene, -anyNegSGene, -anyEquivSGene, -anyUnknSGene))
})
attr(tmp,"paths") = path
return(tmp %>% as_tibble())
},
getSDropoutFreqency = function(codeTypes = c("NHSER"), ageBreaks = NULL, S_CT = 30, ORF1ab_CT = 30, N_CT = 30, equivocal.rm=TRUE, window=7, ll=NULL, sgll=NULL, ...) {
if (identical(ll,NULL)) ll = self$getLineList() %>% ungroup()
if (identical(sgll,NULL)) sgll = self$getSGeneLineList()
path = c(attr(ll,"paths"),attr(sgll,"paths"))
tmp = self$getDaily("SGENE-DROPOUT",params=list(ORF1ab_CT, N_CT, codeTypes,ageBreaks,equivocal.rm,window,path), ..., orElse = function (...) {
groupByVars = ll %>% groups()
tmp2 = sgll %>%
left_join(ll, by="FINALID", suffix=c("",".ll")) # %>%
# group_by(FINALID) %>% arrange(desc(specimen_date)) %>% filter(row_number()==1)
tmp3 = tmp2 %>% ungroup() %>% mutate(
ageCat = age %>% self$cutByAge(ageBreaks)
) %>% self$interpretSGene(S_CT,ORF1ab_CT,N_CT)
fn = function(tmpDf, codeCol = "NHSER_code", nameCol = "NHSER_name", codeType="NHSER") {
codeCol = ensym(codeCol)
nameCol = ensym(nameCol)
grps = tmpDf %>% groups()
joins = unlist(sapply(grps,as_label))
tmpDf = tmpDf %>%
rename(code = !!codeCol,name = !!nameCol,date=specimen_date) %>%
group_by(!!!grps,code,name,ageCat,date,sGene) %>%
summarise(count = n()) %>% ungroup()
tmpDf2 = tmpDf %>% select(code,name) %>% distinct() %>% filter(!is.na(code)) %>%
left_join(
tmpDf %>% select(!!!grps) %>% distinct(), by=character()
) %>% left_join(
tibble(date = as.Date(min(tmpDf$date):max(tmpDf$date),"1970-01-01")), by=character()
) %>% left_join(
tmpDf %>% select(ageCat) %>% distinct(), by=character()
) %>% left_join(
tibble(sGene=c("Positive","Negative","Equivocal")), by=character()
) %>% left_join(tmpDf, by=c(joins,"code","name","ageCat","sGene","date")) %>% mutate(count = ifelse(is.na(count),0,count))
if (equivocal.rm) tmpDf2 = tmpDf2 %>% filter(sGene != "Equivocal")
tmpDf2 = tmpDf2 %>% group_by(!!!grps,code,name,ageCat,sGene) %>% arrange(date) %>%
mutate(Roll.count = stats::filter(count,filter=rep(1,window),sides=1)) %>%
filter(!is.na(Roll.count))
tmpDf2 = tmpDf2 %>%
group_by(!!!grps,code,name,ageCat,date) %>%
mutate(binom::binom.confint(Roll.count, sum(Roll.count), conf.level = 0.95, methods = "wilson")) %>%
rename(Roll.mean = mean,Roll.lower = lower,Roll.upper=upper) %>%
mutate(binom::binom.confint(count, sum(count), conf.level = 0.95, methods = "wilson")) %>%
select(-x,-n,-method) %>%
mutate(codeType = codeType)
return(tmpDf2)
}
tmp3 = tmp3 %>% ungroup() %>% group_by(!!!groupByVars)
out = NULL
if ("NHSER" %in% codeTypes) out = out %>% bind_rows(tmp3 %>% fn(codeCol = NHSER_code, nameCol=NHSER_name, codeType = "NHSER"))
if ("PHEC" %in% codeTypes) out = out %>% bind_rows(tmp3 %>% fn(codeCol = PHEC_code, nameCol=PHEC_name, codeType = "PHEC"))
if ("UA" %in% codeTypes) out = out %>% bind_rows(tmp3 %>% fn(codeCol = UTLA_code, nameCol=UTLA_name, codeType = "UA"))
if ("LAD" %in% codeTypes) out = out %>% bind_rows(tmp3 %>% fn(codeCol = LTLA_code, nameCol=LTLA_name, codeType = "LAD"))
if ("CTRY" %in% codeTypes) out = out %>% bind_rows(tmp3 %>% mutate(code = "E92000001",name="England") %>% fn(codeCol = code, nameCol=name, codeType = "CTRY"))
return(out)
})
attr(tmp,"paths") = path
return(tmp %>% as_tibble())
},
#### Cases ----
#' @description Load line list
#'
#' @return raw line list data set
getLineList = function(...) {
path = self$getLatest(self$filter$lineList)
message("Using: ",path)
out = self$getSaved("LINE-LIST", params = list(path), ..., orElse = function (...) {
if (stringr::str_detect(path,"zip")) {
tmpFile = self$fileProvider$getFile(path)
zipPath = fs::path_file(path) %>% stringr::str_replace("\\.zip",".csv")
tmp = readr::read_csv(unz(tmpFile, filename=zipPath), col_types = readr::cols(.default = readr::col_character()))
tmp = tmp %>%
dplyr::mutate(
Onsetdate = maybeDMYorMDY(Onsetdate),
specimen_date = maybeDMYorMDY(specimen_date),
lab_report_date = maybeDMYorMDY(lab_report_date)
)
} else if (stringr::str_detect(path,"csv")) {
tmp = readr::read_csv(self$fileProvider$getFile(path), col_types = readr::cols(.default = readr::col_character()))
tmp = tmp %>%
dplyr::mutate(
Onsetdate = maybeDMYorMDY(Onsetdate),
specimen_date = maybeDMYorMDY(specimen_date),
lab_report_date = maybeDMYorMDY(lab_report_date)
)
} else {
tmp = readxl::read_excel(path.expand(path),
col_types = "text") #c("numeric", "text", "text", "text", "text", "text", "text", "text", "text", "text", "numeric", "date", "date", "date"))
tmp = tmp %>%
dplyr::mutate(
Onsetdate = suppressWarnings(as.Date(as.numeric(Onsetdate),"1899-12-30")),
specimen_date = suppressWarnings(as.Date(as.numeric(specimen_date),"1899-12-30")),
lab_report_date = suppressWarnings(as.Date(as.numeric(lab_report_date),"1899-12-30"))
)
}
if(any(is.na(tmp$specimen_date))) warning("NA sprecimen dates in cases file")
if ("finalid" %in% colnames(tmp)) tmp = tmp %>% rename(FINALID = finalid)
return(tmp %>% mutate(
pillar_2_testingkit = tolower(pillar_2_testingkit),
age = suppressWarnings(as.numeric(age)),
FINALID = as.numeric(FINALID),
imd_rank = as.integer(imd_rank),
imd_decile = as.integer(imd_decile),
ethnicity_final = case_when(
ethnicity_final %in% c("African (Black or Black British)","Any other Black background","Caribbean (Black or Black British)") ~ "Afro-caribbean",
ethnicity_final %in% c("Any other Asian background","Bangladeshi (Asian or Asian British)","Indian (Asian or Asian British)","Pakistani (Asian or Asian British)") ~ "Asian",
ethnicity_final %in% c("Any other White background","British (White)","Irish (White)") ~ "White",
ethnicity_final %in% c("Any other Mixed background","Any other ethnic group","White and Black Caribbean (Mixed)","White and Black African (Mixed)","Chinese (other ethnic group)") ~ "Other",
TRUE ~ "Unknown"),
residential_category = case_when(
cat == 'Residential dwelling (including houses, flats, sheltered accommodation)' ~ "Residential",
cat == 'Care/Nursing home' ~ "Care home",
cat == 'Undetermined'~"Other/Unknown",
cat == 'Medical facilities (including hospitals and hospices, and mental health)'~"Other/Unknown",
cat == 'Other property classifications'~"Other/Unknown",
cat == 'House in multiple occupancy (HMO)' ~ "Residential",
cat == 'Prisons, detention centres, secure units'~"Other/Unknown",
cat == 'Residential institution (including residential education)'~"Other/Unknown",
cat == 'No fixed abode'~"Other/Unknown",
cat == 'Overseas address'~"Other/Unknown",
TRUE ~ "Other/Unknown"
)
) %>% dplyr::ungroup())
})
attr(out,"paths") = path
return(out %>% as_tibble())
# TODO: https://github.com/sarahhbellum/NobBS
},
augmentLineListWithLSOA = function(ll, ltlaCodeCol = "LTLA_code", imdRankCol="imd_rank") {
ltlaCodeCol = ensym(ltlaCodeCol)
imdRankCol = ensym(imdRankCol)
imd = self$demog$getIMDData() %>% select(
!!ltlaCodeCol := `Local Authority District code (2019)`,
!!imdRankCol := `Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)`,
LSOA_code = `LSOA code (2011)`,
LSOA_name = `LSOA name (2011)`
)
return(ll %>% left_join(imd, by=c(as_label(ltlaCodeCol),as_label(imdRankCol))))
},
#' @description Load incidence from line list
#'
#' @param ageBreaks - a list of ages which form the cut points for breaking continuous ages into ranges (or NULL for a single age category)
#' @return a covidTimeseriesFormat dataframe
getLineListIncidence = function(ll=NULL, ageBreaks = NULL, gender=FALSE, specimenOrReport="specimen", subgroup="pillar", filterExpr=NULL, codeTypes = c("CTRY","NHSER"), truncate=NULL, ...) {
filterExpr = enexpr(filterExpr)
subgroup = tryCatch(ensym(subgroup), error = function(e) NULL)
if(!identical(ll,NULL)) {
tmp = ll
} else {
tmp = self$getLineList(...)
}
path = attr(tmp,"paths")
out2 = self$getSaved("LINE-LIST-INCIDENCE", params=list(tmp, ageBreaks, specimenOrReport,as_label(subgroup), as_label(filterExpr), codeTypes, gender), ..., orElse = function (...) covidTimeseriesFormat({
if(!identical(filterExpr,NULL))
tmp = tmp %>% filter(!!filterExpr)
tmp = tmp %>% dplyr::mutate(ageCat = age %>% self$cutByAge(ageBreaks))
if (gender) {
tmp = tmp %>% dplyr::mutate(gender=self$normaliseGender(sex))
} else {
tmp = tmp %>% dplyr::mutate(gender=NA_character_)
}
if(!identical(subgroup,NULL)) {
tmp = tmp %>% dplyr::mutate(subgroup=!!subgroup)
} else {
tmp = tmp %>% dplyr::mutate(subgroup=NA_character_)
}
if(specimenOrReport == "report")
tmp = tmp %>% dplyr::mutate(date = as.Date(lab_report_date))
else
tmp = tmp %>% dplyr::mutate(date = as.Date(specimen_date))
selectByRegion = function(df, code, codeType, name) {
code = ensym(code)
name = ensym(name)
# check column exists
if(!(as_label(code) %in% colnames(df))) return(tibble())
df = df %>% dplyr::mutate(code = !!code, codeType=codeType, name=!!name) %>%
dplyr::mutate(
code = ifelse(is.na(code),"E99999999",code),
name = ifelse(is.na(code),"Unknown (England)",name)
) %>%
dplyr::group_by( code,codeType,name,date, ageCat, gender,subgroup) %>%
dplyr::summarise(value = n())
return(df)
}
out = NULL
if ("CTRY" %in% codeTypes) {
england = tmp %>% dplyr::mutate(code = "E92000001", codeType= "CTRY", name="England") %>%
dplyr::group_by(code,codeType,name,date, ageCat, gender,subgroup) %>%
dplyr::summarise(value = n())
out = out %>% bind_rows(england)
}
if ("NHSER" %in% codeTypes) {
nhser = tmp %>% selectByRegion(NHSER_code, "NHSER", NHSER_name)
isNhser = nhser %>% self$codes$allPresentAndCorrect(codeTypes=c("NHSER","PSEUDO"))
if(!isNhser) {
nhser = tmp %>% selectByRegion(NHSER_code, "NHSER19CDH", NHSER_name) %>%
dplyr::inner_join(
self$codes$getMappings() %>% dplyr::filter(fromCodeType=="NHSER19CDH" & toCodeType=="NHSER"),
by=c("code"="fromCode")
) %>%
dplyr::ungroup() %>%
dplyr::select(-code,-codeType, -fromCodeType,-rel,-weight) %>%
dplyr::rename(code = toCode, codeType=toCodeType)
}
out = out %>% bind_rows(nhser)
}
if ("PHEC" %in% codeTypes) {out = out %>% bind_rows(tmp %>% selectByRegion(PHEC_code, "PHEC", PHEC_name))}
if ("UA" %in% codeTypes) {out = out %>% bind_rows(tmp %>% selectByRegion(UTLA_code, "UA", UTLA_name))}
if ("LAD" %in% codeTypes) {out = out %>% bind_rows(tmp %>% selectByRegion(LTLA_code, "LAD", LTLA_name))}
if ("LSOA" %in% codeTypes) {out = out %>% bind_rows(tmp %>% selectByRegion(LSOA_code, "LSOA", LSOA_name))}
out = out %>% dplyr::mutate(source="line list",statistic = "case", type="incidence")
out = out %>% self$fixDatesAndNames(truncate)
out = out %>% self$fillAbsent(completeDates=TRUE)
out = out %>% dplyr::ungroup()
return(out)
}))
attr(out2,"paths") = path
return(out2 %>% as_tibble())
},
#### Episodes ----
#' @description Combine line list and S-gene line list to get a list of infection episodes
#'
#' this defines how long between tests before two tests are regarded as a new episode.
#' if the tests are sgtf equivocal double this is allowed.
#' calculate the individual episodes of covid resulting from runs of sequential positive tests <28 days apart.
#'
#' @param cutoff - the time gap between sequential tests after which two tests are said to be from a new episode
#' @param S_CT - S gene detected when P2CH3CQ <= this value
#' @param ORF1ab_CT - ORF1ab gene detected when P2CH1CQ <= this value
#' @param N_CT - N gene detected when P2CH2CQ <= this value
#' @param Control_CT - control sample is positive when P2CH4CQ <= this value
#' @return a covidTimeseriesFormat dataframe
getInfectionEpisodes = function(cutoff=28, S_CT = 30, ORF1ab_CT = 30, N_CT = 30, Control_CT = Inf, ...) {
path1 = self$getLatest(self$filter$sgene)
path2 = self$getLatest(self$filter$lineList)
message("Using: ",path1," and ",path2)
# TODO: this is a bit slow and maybe could be improved
# the dtplyr has some sort of bug in it connected with: https://github.com/tidyverse/dtplyr/issues/164
# which only appears when wrapped in a function so code below works when
# dtplyr does not seem production ready. maybe need to learn data.table.
out = self$getSaved(id = "EPISODES", params = list(cutoff,path1,path2,S_CT,ORF1ab_CT,N_CT), ..., orElse=function(...) {
ll = self$getLineList()
sgll = self$getSGeneLineList()
# Split line list out into normalised parts.
llDemog = ll %>% select(FINALID, NHSER_code, NHSER_name, PHEC_code, PHEC_name, UTLA_code, UTLA_name, LTLA_code, LTLA_name, sex, age, ethnicity_final, imd_decile, imd_rank, residential_category, cat) %>% distinct()
# llEpisode = ll %>% select(FINALID, specimen_date, asymptomatic_indicator, pillar, lab_report_date, pillar_2_testingkit, testcentreid, case_category) %>% mutate(episode_type="first positive",episode=1) %>% distinct()
llTest = ll %>% select(FINALID, specimen_date, case_category,asymptomatic_indicator) %>% mutate(linelist = TRUE)
# Split s gene line list out into normalised parts.
sgllTest = sgll %>%
self$interpretSGene(S_CT, ORF1ab_CT, N_CT, Control_CT) %>%
select(FINALID, specimen_date, sGene, result) %>%
mutate(sglinelist = TRUE)
# combine tests from ll (first only) with tests from S-gene files
# N.B. this will be a bit of a funny mixture - multiple tests but only for cases that went through taqpath assay.
tests = llTest %>%
full_join(
sgllTest,
by = c("FINALID","specimen_date")
) %>%
mutate(
linelist = ifelse(is.na(linelist),FALSE,linelist),
sglinelist = ifelse(is.na(sglinelist),FALSE,sglinelist),
sGene = ifelse(is.na(sGene),"Unknown",as.character(sGene)),
case_category = ifelse(sglinelist,"TaqPath",case_category)
)
tests2 = tests # dtplyr::lazy_dt(tests) weirdly not working inside a function....
# browser()
# look for sequential tests with the same id and calulate time difference between them.
# we do this avoiding group by as it becomes very slow.
tests3 = tests2 %>% arrange(FINALID,specimen_date) %>%
mutate(
delay = ifelse(FINALID == lag(FINALID), as.numeric(specimen_date - lag(specimen_date)), NA_integer_)
) %>%
mutate(era = case_when(
is.na(delay) ~ "new",
sGene == "Equivocal" & delay < cutoff*2 ~ "same", # prolonged recovery allowed for Equivocal test results
delay < cutoff ~ "same",
TRUE ~ "new"
)) %>%
# assign an eraIndex - essentially the count of novel infection episodes
group_by(FINALID) %>% arrange(specimen_date) %>%
mutate(eraIndex = cumsum(ifelse(era=="new",1,0)))
# TODO: each era may have multiple positive tests there is an opportunity to look at the CT values over time and
# fit some sort of model here
tests4 = tests3 %>% group_by(FINALID,eraIndex) %>%
summarise(
earliest_specimen_date = min(specimen_date,na.rm=TRUE),
latest_specimen_date = max(specimen_date,na.rm=TRUE),
tests=n(),
anyPosSGene = any(sGene == "Positive"),
anyNegSGene = any(sGene == "Negative"),
anyEquivSGene = any(sGene == "Equivocal"),
anyUnknSGene = any(sGene == "Unknown"),
asymptomatic_indicator = first(na.omit(asymptomatic_indicator),default="U"),
lft_only = all(na.omit(case_category=="LFT_Only"))
)
tests5 = tests4 %>% mutate(
sGene=
case_when(
anyPosSGene & !anyNegSGene ~ "Positive",
anyNegSGene & !anyPosSGene ~ "Negative",
anyEquivSGene ~ "Equivocal",
TRUE ~ "Unknown"
) %>% ordered(c("Positive","Negative","Equivocal","Unknown")),
) %>% as_tibble()
return(tests5 %>% inner_join(llDemog, by="FINALID") %>% select(-anyPosSGene, -anyNegSGene, -anyEquivSGene, -anyUnknSGene))
})
attr(out,"paths") = c(path1,path2)
return(out %>% as_tibble())
},
#### Negatives ----
getNegatives = function(codeTypes = c("CTRY","NHSER"), truncate=NULL, ...) {
# TODO:
stop("needs updating as formats changed at some point")
path1 = self$getLatest(self$filter$negPillar1)
path2 = self$getLatest(self$filter$negPillar2)
tmp = self$getSaved("NEGATIVES", params=list(codeTypes, path1, path2), ..., orElse = function(...) {
#TODO: extra layer of caching in here?
neg1 = readr::read_csv(self$fileProvider$getFile(path1))
neg2 = readr::read_csv(self$fileProvider$getFile(path2))
neg = bind_rows(neg1 %>% mutate(subgroup="Pillar 1"),neg2 %>% mutate(subgroup="Pillar 2"))
neg = neg %>% mutate(ageCat = case_when(
agegroup == "0 to 4" ~ "<5",
agegroup == "5 to 9" ~ "5-14",
agegroup == "10 to 14" ~ "5-14",
agegroup == "15 to 19" ~ "15-24",
agegroup == "20 to 24" ~ "15-24",
agegroup == "25 to 29" ~ "25-34",
agegroup == "30 to 34" ~ "25-34",
agegroup == "35 to 39" ~ "35-44",
agegroup == "40 to 44" ~ "35-44",
agegroup == "45 to 49" ~ "45-54",
agegroup == "50 to 54" ~ "45-54",
agegroup == "55 to 59" ~ "55-64",
agegroup == "60 to 64" ~ "55-64",
agegroup == "65 to 69" ~ "65-74",
agegroup == "70 to 74" ~ "65-74",
agegroup == "75 to 79" ~ "75-84",
agegroup == "80 to 84" ~ "75-84",
agegroup == "85 to 89" ~ "85+",
agegroup == "90 or older" ~ "85+",
TRUE ~ "unknown"
))
neg = neg %>% mutate(gender = self$normaliseGender(gender))
neg = neg %>% rename(value = negative, date = earliestspecimendate) %>% select(-agegroup)
selectByRegion = function(df, code, codeType, name) {
code = ensym(code)
name = ensym(name)
# check column exists
if(!(as_label(code) %in% colnames(df))) return(tibble())
df = df %>% dplyr::mutate(code = !!code, codeType=codeType, name=!!name) %>%
dplyr::mutate(
code = ifelse(is.na(code),"E99999999",code),
name = ifelse(is.na(code),"Unknown (England)",name)
) %>%
dplyr::group_by( code,codeType,name,date, ageCat, gender,subgroup) %>%
dplyr::summarise(value = n())
return(df)
}
neg = NULL
if ("CTRY" %in% codeTypes) {
england = tmp %>% dplyr::mutate(code = "E92000001", codeType= "CTRY", name="England") %>%
dplyr::group_by(code,codeType,name,date,ageCat,gender,subgroup) %>%
dplyr::summarise(value = sum(value))
neg = neg %>% bind_rows(england)
}
if ("PHEC" %in% codeTypes) {
phec = tmp %>% dplyr::rename(name=phecentre) %>%
dplyr::group_by(name,date,ageCat,gender,subgroup) %>%
dplyr::summarise(value = sum(value)) %>%
dpc$codes$findCodesByName(codeTypes = "PHEC",outputCodeVar = "code",outputCodeTypeVar = "codeType")
neg = neg %>% bind_rows(phec)
}
if ("NHSER" %in% codeTypes) {
nhser = tmp %>% dplyr::rename(name=nhsregion) %>%
dplyr::group_by(name,date,ageCat,gender,subgroup) %>%
dplyr::summarise(value = sum(value)) %>%
dpc$codes$findCodesByName(codeTypes = "NHSER",outputCodeVar = "code",outputCodeTypeVar = "codeType")
neg = neg %>% bind_rows(nhser)
}
if ("LAD" %in% codeTypes) {
ltla = neg %>% dplyr::mutate(code = ltla, codeType= "LAD", name=ltlaname) %>%
dplyr::group_by(code,codeType,name,date,ageCat,gender,subgroup) %>%
dplyr::summarise(value = sum(value))
neg = neg %>% bind_rows(ltla)
}
if ("UA" %in% codeTypes) {
utla = tmp %>% dplyr::mutate(code = utla, codeType= "UA", name=utlaname) %>%
dplyr::group_by(code,codeType,name,date,ageCat,gender,subgroup) %>%
dplyr::summarise(value = sum(value))
neg = neg %>% bind_rows(utla)
}
neg = neg %>%
mutate(source = "negatives", statistic = "negative test",type = "incidence") %>%
select(-any_of("name.original")) %>%
mutate(note=NA_character_) %>%
filter(!is.na(code))
neg = neg %>%
self$fixDatesAndNames(truncate) %>%
self$fillAbsent(completeDates = TRUE) %>%
dplyr::ungroup()
return(neg)
})
attr(tmp,"paths") = c(path1,path2)
return(tmp %>% as_tibble())
},
#### Seroprevalence ----
#' @description Load the seroprevalance file
#'
#' @return raw FF100 data set
getSeroprevalence = function(...) {
path = self$getLatest(self$filter$seroprevalence)
message("Using: ",path)
tmp = self$getSaved("SEROPREV", params = list(path), ..., orElse = function (...) {
# ID Barcode surv age age_m Sex Region Location sample_region SampleDate isoweek_sample EuroImm_outcome EuroImm_Units RBD_outcome RBD_Units
#Barcode Collection Sex Location NHS_Region SampleDate sample_region isoweek_sample age age_m study_id Abbott_outcome Abbott_units EuroImmun_outcome EuroImmun_units RBD_outcome RBD_units RocheN_outcome RocheN_units RocheS_outcome RocheS_units Ethnicity study_visit firstvaccinationdate secondvaccinationdate firstvaccinationbrand secondvaccinationbrand ONS_Region
#xlsCon = self$fileProvider$getFile(path)
for (sheet in readxl::excel_sheets(self$fileProvider$getFile(path))) {
a1 = readxl::read_excel(self$fileProvider$getFile(path), sheet = sheet, range = "A1",col_names = FALSE)
if (a1[[1]]=="Barcode") break
}
data = readxl::read_excel(self$fileProvider$getFile(path), sheet = sheet, col_types = "text") %>% dplyr::mutate(
SampleDate = suppressWarnings(as.Date(as.numeric(SampleDate),"1899-12-30")),
) %>% dplyr::mutate(
age = suppressWarnings((as.numeric(age)))
)
if ("Abbott_units" %in% colnames(data)) data = data %>% dplyr::mutate(Abbott_units = suppressWarnings(as.numeric(Abbott_units)))
if ("EuroImmun_units" %in% colnames(data)) data = data %>% dplyr::mutate(EuroImmun_units = suppressWarnings(as.numeric(EuroImmun_units)))
if ("RBD_units" %in% colnames(data)) data = data %>% dplyr::mutate(RBD_units = suppressWarnings(as.numeric(RBD_units)))
if ("RocheN_units" %in% colnames(data)) data = data %>% dplyr::mutate(RocheN_units = suppressWarnings(as.numeric(RocheN_units)))
if ("RocheS_units" %in% colnames(data)) data = data %>% dplyr::mutate(RocheS_units = suppressWarnings(as.numeric(RocheS_units)))
data2 = data %>%
self$postcodes$lookupWeightedFeatureByOutcode(outcodeVar = Location, onspdVar = ccg) %>%
dplyr::mutate(weight = ifelse(is.na(weight),0,weight)) %>%
dplyr::group_by(Barcode) %>%
dplyr::arrange(desc(weight)) %>%
dplyr::filter(row_number()==1) %>%
dplyr::select(-weight) %>%
self$postcodes$lookupWeightedFeatureByOutcode(outcodeVar = Location, onspdVar = nhser) %>%
dplyr::group_by(Barcode) %>%
dplyr::mutate(weight = ifelse(is.na(weight),0,weight)) %>%
dplyr::arrange(desc(weight)) %>%
dplyr::filter(row_number()==1) %>%
dplyr::select(-weight)
return(data2 %>% dplyr::filter(!is.na(SampleDate)) %>% dplyr::ungroup())
})
attr(tmp,"paths") = path
return(tmp %>% as_tibble())
},
#' @description Load seroprevalence data from linelist
#' @param ageBreaks - a list of ages which form the cut points for breaking continuous ages into ranges (or NULL for a single age category)
#' @return a covidTimeseriesFormat dataframe
getSeroprevalenceTestIncidence = function(ageBreaks = NULL, ...) {
stop("Need to update this for newer seroprevalence data")
data2 = self$getSeroprevalence(...)
self$getSaved("SEROPREVALENCE-INCIDENCE", params=list(data2, ageBreaks), ..., orElse = function (...) covidTimeseriesFormat({
# data2 %>% group_by(EuroImm_outcome) %>% summarise(low_cutoff = min(EuroImm_Units,na.rm=TRUE),high_cutoff = max(EuroImm_Units,na.rm=TRUE))
# cut offs are: Negative > Borderline > Positive; 0.8 -> 1.1
# data2 %>% group_by(RBD_outcome) %>% summarise(low_cutoff = min(RBD_Units,na.rm=TRUE),high_cutoff = max(RBD_Units,na.rm=TRUE))
# cut offs are: Negative > Borderline > Positive; 3.3 -> 4.9
data3 = data2 %>%
dplyr::mutate(ageCat = age %>% self$cutByAge(ageBreaks), gender=self$normaliseGender(Sex), date = SampleDate)
if ("EuroImm_outcome" %in% colnames(data3)) data3 %>% rename(EuroImmun_outcome = EuroImm_outcome)
if ("EuroImm_Units" %in% colnames(data3)) data3 %>% rename(EuroImmun_units = EuroImm_Units)
if ("RBD_Units" %in% colnames(data3)) data3 %>% rename(RBD_units = RBD_Units)
data3 = data3 %>% mutate(
subgroup = case_when(
(!is.na(RBD_outcome) & RBD_outcome=="Failed QC") | (!is.na(EuroImmun_outcome) & EuroImmun_outcome == "Insufficient") ~ "no result", # if either insufficient then insufficient
(is.na(RBD_units) & is.na(EuroImmun_units)) ~ "no result", # if both NA then some problem
(!is.na(RBD_units) & RBD_units > 3.3 & RBD_units <= 4.9) | (!is.na(EuroImmun_units) & EuroImmun_units > 0.8 & EuroImmun_units < 1.1) ~ "borderline", # if either is borderline its borderline
(is.na(RBD_units) | RBD_units > 4.9) & (is.na(EuroImmun_units) | EuroImmun_units > 1.1) ~ "positive", # if both either positive or one NA and one positive its positive
(is.na(RBD_units) | RBD_units <= 3.3) & (is.na(EuroImmun_units) | EuroImmun_units <= 0.8 ) ~ "negative", # if both either negative or one NA and one negative its positive
(RBD_units <= 3.3 & EuroImmun_units > 1.1 | RBD_units > 4.9 & EuroImmun_units <= 0.8) ~ "no result", # if one positive and one negative there is a disagreement
TRUE ~ "no result"
),
Region= ifelse(is.na(Region),sample_region,Region)
) %>%
self$codes$findCodesByName(nameVar = Region, codeTypes = c("CTRY","NHSER")) %>%
self$codes$findNamesByCode(codeVar = ccg, outputNameVar = ccgName, outputCodeTypeVar = ccgCodeType, codeTypes = "CCG")
out = bind_rows(
data3 %>% dplyr::mutate(code = ccg, codeType = ccgCodeType, name = ccgName) %>% dplyr::filter(!is.na(code)) %>% dplyr::group_by( code,codeType,name,date, ageCat, gender,subgroup) %>% dplyr::summarise(value = n()),
data3 %>% dplyr::mutate(code = code, codeType = codeType, name = Region) %>% dplyr::filter(!is.na(code)) %>% dplyr::group_by( code,codeType,name,date, ageCat, gender,subgroup) %>% dplyr::summarise(value = n())
) %>%
dplyr::mutate(statistic = "serology", type = "incidence", source="SPIM seroprevalence") %>%
self$complete() %>%
#tidyr::complete(tidy::nesting(code,codeType,name,source,statistic,type),subgroup,gender,ageCat,date = as.Date(min(date):max(date),"1970-01-01"), fill=list(value=0)) %>%
dplyr::ungroup()
return(out %>% self$fillAbsent() %>% self$fixDatesAndNames(0) %>% self$complete())
}))
},
#### FF 100 ----
#' @description Load ff100 file
#'
#' @return raw FF100 data set
getFF100 = function() {
path = self$getLatest(self$filter$ff100)
message("Using: ",path)
tmp = readr::read_csv(self$fileProvider$getFile(path),
col_types = readr::cols(
FF100_ID = readr::col_integer(),
ContactOf_FF100_ID = readr::col_integer(),
date_reported = readr::col_date(format = "%Y-%m-%d"),
date_labtest = readr::col_date(format = "%Y-%m-%d"),
date_onset = readr::col_date(format = "%Y-%m-%d"),
date_hosp_adm = readr::col_date(format = "%Y-%m-%d"),
date_hosp_dis = readr::col_date(format = "%Y-%m-%d"),
hosp_adm = readr::col_logical(),
date_NHSdirect = readr::col_date(format = "%Y-%m-%d"),
NHSdirect = readr::col_logical(),
date_GP_first = readr::col_date(format = "%Y-%m-%d"),
GP = readr::col_logical(),
date_AEhosp_first = readr::col_date(format = "%Y-%m-%d"),
AEhosp = readr::col_logical(),
age = readr::col_double(),
gender = readr::col_character(),
local_authority = readr::col_character(),
travel_anywhere = readr::col_logical(),
heart_ds = readr::col_logical(),
diabetes = readr::col_logical(),
immunodeficiency = readr::col_logical(),
kidney_ds = readr::col_logical(),
liver_ds = readr::col_logical(),
resp_ds = readr::col_logical(),
asthma = readr::col_logical(),
malignancy = readr::col_logical(),
organ_recipient = readr::col_logical(),
neuro_ds = readr::col_logical(),
pregnant = readr::col_logical(),
fever = readr::col_logical(),
runny_nose = readr::col_logical(),
sneezing = readr::col_logical(),
cough = readr::col_logical(),
short_breath = readr::col_logical(),
sore_throat = readr::col_logical(),
diarrhoea = readr::col_logical(),
nausea = readr::col_logical(),
vomit = readr::col_logical(),
fatigue = readr::col_logical(),
muscle_ache = readr::col_logical(),
joint_ache = readr::col_logical(),
appetite_loss = readr::col_logical(),
headache = readr::col_logical(),
seizure = readr::col_logical(),
alter_consious = readr::col_logical(),
nose_bleed = readr::col_logical(),
rash = readr::col_logical(),
smell_loss = readr::col_logical(),
symptom_other = readr::col_logical(),
any_symptom = readr::col_logical(),
status = readr::col_character(),
case_classification = readr::col_character(),
HCW_exposure = readr::col_logical(),
ARDS = readr::col_logical(),
mech_ventl = readr::col_logical(),
ICU_adm = readr::col_logical(),
date_ICU_adm = readr::col_date(format = "%Y-%m-%d"),
date_recovery = readr::col_date(format = "%Y-%m-%d"),
date_death = readr::col_date(format = "%Y-%m-%d"),
date_exposure_first = readr::col_date(format = "%Y-%m-%d"),
date_exposure_last = readr::col_date(format = "%Y-%m-%d"),
exposure_setting_final = readr::col_character()
))
attr(tmp,"paths") = c(path1,path2)
return(tmp %>% as_tibble())
},
#### CHESS / SARI ----
#' @description Load the CHESS dataset from a path
#'
#' @param path - a path to the chess csv file
#' @return raw CHESS data set
getCHESS = function() {
path = self$getLatest(self$filter$chess)
message("Using: ",path)
out = readr::read_csv(self$fileProvider$getFile(path), col_types = readr::cols(.default = readr::col_character()))
for (col in colnames(out)) {
if (stringr::str_detect(col, "date")) {
out = out %>% mutate(!!col := as.Date(stringr::str_extract(out[[col]], "[0-9]{4}-[0-9]{2}-[0-9]{2}"), format = "%Y-%m-%d"))
} else {
out = out %>% mutate(!!col := type.convert(out[[col]], as.is=TRUE))
}
}
attr(out,"paths") = c(path)
return(out)
},
#' @description Load the CHESS dataset from a path
#'
#' @param path - a path to the chess csv file
#' @return raw CHESS data set
getSARI = function() {
path = self$getLatest(self$filter$sari)
message("Using: ",path)
out = readr::read_csv(self$fileProvider$getFile(path), col_types = readr::cols(.default = readr::col_character()))
for (col in colnames(out)) {
if (stringr::str_detect(col, "date")) {
out = out %>% mutate(!!col := as.Date(stringr::str_extract(out[[col]], "[0-9]{4}-[0-9]{2}-[0-9]{2}"), format = "%Y-%m-%d"))
} else {
out = out %>% mutate(!!col := type.convert(out[[col]], as.is=TRUE))
}
}
attr(out,"paths") = c(path)
return(out)
},
#' @description Load Chess summary file
#'
getCHESSSummary = function() {
path = self$getLatest(self$filter$chessSummary)
message("Using: ",path)
chessSummary = readr::read_csv(self$fileProvider$getFile(path), col_types = readr::cols(
DateRange = readr::col_date("%d-%m-%Y"),
DateOfAdmission = readr::col_date("%d-%m-%Y"),
YearofAdmission = readr::col_integer(),
TrustName = readr::col_character(),
Code = readr::col_character(),
.default = readr::col_integer()))
chessSummary = chessSummary %>% dplyr::select(-X67) %>% tidyr::pivot_longer(cols = c(everything(),-all_of(c("DateRange","DateOfAdmission","YearofAdmission","TrustName","Code","Total"))), names_to = "variable", values_to = "count")
chessSummary = chessSummary %>% dplyr::filter(Code != "Total")
tmp = chessSummary %>% dplyr::mutate(
toAge = stringr::str_replace(variable,"^.*_([^_]+)$","\\1"),
fromAge = stringr::str_replace(variable,"^.*_([^_]+)_[^_]+$","\\1"),
variable = stringr::str_replace(variable,"^(.*)_[^_]+_[^_]+$","\\1")
)
tmp = tmp %>% dplyr::mutate(fromAge = ifelse(fromAge=="GreaterThanEqual", toAge, fromAge))
tmp = tmp %>% dplyr::mutate(fromAge = ifelse(fromAge=="LessThan", 0, fromAge))
chessSummary = tmp %>% dplyr::mutate(toAge = ifelse(fromAge==toAge, 120, toAge))
attr(chessSummary,"paths") = c(path)
return(chessSummary)
},
#' @description Load Sari summary file
#'
getSARISummary = function(truncate = NULL,...) {
path1 = self$getLatest(self$filter$sariSummaryArchive)
path2 = self$getLatest(self$filter$sariSummaryCurrent)
out = self$getSaved(id = "SARI-SUMMARY", params = list(path1,path2), ..., orElse = function (...) covidTimeseriesFormat({
fn = function (path) {
return(readr::read_csv(self$fileProvider$getFile(path), col_types = readr::cols(
DateRange = readr::col_date("%d-%m-%Y"),
DateOfAdmission = readr::col_date("%d-%m-%Y"),
YearofAdmission = readr::col_integer(),
TrustName = readr::col_character(),
Code = readr::col_character(),
.default = readr::col_integer()))
)}
tmp1 = fn(path1)
tmp2 = fn(path2)
sariSummary = bind_rows(tmp1 %>% anti_join(tmp2,by=c("DateOfAdmission","TrustName","Code")),tmp2)
#sariSummary = sariSummary %>% group_by(DateOfAdmission,TrustName,Code) %>% arrange(desc(DateRange)) %>% filter(row_number() == 1) %>% ungroup()
#browser()
sariSummary = sariSummary %>% tidyr::pivot_longer(cols = c(everything(),-all_of(c("DateRange","DateOfAdmission","YearofAdmission","TrustName","Code"))), names_to = "variable", values_to = "count")
sariSummary = sariSummary %>% mutate(variable = stringr::str_to_lower(variable)) %>% group_by(DateOfAdmission,TrustName,Code,variable) %>% arrange(desc(count)) %>% filter(row_number() == 1) %>% ungroup()
sariSummary = sariSummary %>% dplyr::filter(Code != "Total") #%>% mutate(count = ifelse(is.na(count),0,count))
tmp = sariSummary %>% dplyr::mutate(
toAge = stringr::str_replace(variable,"^.*_([^_]+)$","\\1"),
fromAge = stringr::str_replace(variable,"^.*_([^_]+)_[^_]+$","\\1") #,
#variable = str_replace(variable,"^(.*)_[^_]+_[^_]+$","\\1")
)
tmp = tmp %>% dplyr::mutate(ageCat = case_when(
fromAge %>% stringr::str_detect("mos") ~ "<1",
toAge %>% stringr::str_detect("mos") ~ "<1",
fromAge=="greaterthanequal" ~ paste0(toAge,"+"),
fromAge=="lessthan" ~ paste0("<",toAge),
fromAge=="45" ~ "45-54",
toAge=="54" ~ "45-54", # 3 combinations in data 45-49, 45-54, 50-54 - merged into one category
TRUE ~ paste0(fromAge,"-",toAge)
))
tmp = tmp %>% dplyr::select(-fromAge,-toAge)
sariSummary = tmp %>% mutate(
type = case_when(
stringr::str_detect(variable,"newhospitaladmissionswithacuterespiratoryinfection") ~ "background",
stringr::str_detect(variable,"alladmittedpatientstestedforcovid19") ~ "background",
stringr::str_detect(variable,"alladmittedpatientswithnewlabconfirmed") ~ "incidence",
stringr::str_detect(variable,"newicu_hduadmissionswithacuterespiratoryinfection") ~ "background",
stringr::str_detect(variable,"newlabconfirmedcovid19patientsonicu_hdu") ~ "incidence",
stringr::str_detect(variable,"alllabconfirmedcovid19patientscurrentlyonicu_hdu") ~ "prevalence",
TRUE ~ NA_character_
),
statistic = case_when(
stringr::str_detect(variable,"newhospitaladmissionswithacuterespiratoryinfection") ~ "hospital admission",
stringr::str_detect(variable,"alladmittedpatientstestedforcovid19") ~ "test",
stringr::str_detect(variable,"alladmittedpatientswithnewlabconfirmed") ~ "hospital admission",
stringr::str_detect(variable,"newicu_hduadmissionswithacuterespiratoryinfection") ~ "icu admission",
stringr::str_detect(variable,"newlabconfirmedcovid19patientsonicu_hdu") ~ "icu admission",
stringr::str_detect(variable,"alllabconfirmedcovid19patientscurrentlyonicu_hdu") ~ "icu admission",
TRUE ~ NA_character_
),
subgroup = case_when(
stringr::str_detect(variable,"icu") ~ "icu",
TRUE ~ "hospital"
),
note = variable
)
sariSummary = sariSummary %>% select(date = DateOfAdmission, name = TrustName, code = Code, value=count, ageCat,type,statistic,subgroup, note) %>% mutate(codeType = "NHS trust",gender=NA,source = "sari summary")
sariSummary = sariSummary %>% group_by(date,code,name,codeType,ageCat,gender,source,subgroup,type,statistic) %>% summarise(note = paste0(note,collapse = "|"), value=sum(value,na.rm=TRUE), tmpCount = n())
if (any(sariSummary$tmpCount > 1 & (sariSummary$ageCat != "<1" & sariSummary$ageCat != "45-54"))) warning("duplicates present in sari output where none were expected")
sariSummary = sariSummary %>% self$fillAbsent() %>% self$fixDatesAndNames(truncate) # do;nt have any good info for reporting delay
return(sariSummary %>% select(-tmpCount))
}))
attr(out,"paths") = c(path1,path2)
return(out)
},
#### Get processed SPIM data ----
#### DSTL files ----
getFourNationsCases = function(truncate=NULL, ...) {
path = self$getLatest(self$filter$fourNationsCases)
message("Using: ",path)
out = self$getSaved("SPIM-4-NATIONS", params = list(path), ..., orElse = function(...) covidTimeseriesFormat({
tmp = readxl::read_excel(self$fileProvider$getFile(path), sheet = "Extracted Data", col_types = "text", na = c("n/a",""))
tmp2 = tmp %>%
tidyr::pivot_longer(
cols=c(-DateVal,-Day,-Month,-Year,-Geography),
names_to = "variable",
values_to = "value"
) %>% dplyr::mutate(
date = suppressWarnings(as.Date(DateVal)),
value = suppressWarnings(as.numeric(value))
) %>% dplyr::select(-Day,-Month,-Year) %>%
dplyr::mutate(Geography = ifelse(Geography %in% c("England: Unknown","England: Other"), "Unknown (England)", Geography)) %>% #TODO fix this ugly hack.
dplyr::rename(name= Geography) %>%
self$codes$findCodesByName(codeTypes = c("CTRY","PSEUDO")) %>%
dplyr::mutate(
statistic = "case",
type = case_when(
stringr::str_detect(variable,"umula") ~ "cumulative",
TRUE ~ "incidence"
),
source = "casedata allnations",
subgroup = variable
)
tmp3 = tmp2 %>% dplyr::select(-DateVal,-name.original, -variable) %>% mutate(ageCat=NA_character_,gender=NA_character_)
tmp4 = tmp3 %>% filter(!is.na(value)) %>% self$fillAbsentByRegion() %>% self$fixDatesAndNames(truncate) %>% self$complete()
return(tmp4)
# CHESS_LL_lab_date_cases_P1
# CHESS_LL_specimen_date_cases_P1
# CHESS_LL_lab_date_cases_P2
# CHESS_LL_specimen_date_cases_P2
# RCGP_Pos_cases
# RCGP_Neg_cases
# Admitted Patients with Lab Confirmed COVID19
# Dashboard_daily_confirmed - wales
# Dashboard_cumulative_confirmed - wales
# Positives_Spec_Date - scotland
# Positives_Cumulative_Spec_Date - scotland
# SitRep_Daily_Positive_tests - n ireland
# SitRep_Cumulative_Positive_tests - n ireland
}))
attr(out,"paths") = c(path)
return(out)
},
#' @description Load the SPI-M aggregated data spreadsheet
#' @return a covidTimeseriesFormat dataframe
# TODO: fix Couldn't match the following names: England: Unknown, England: Other, Golden Jubilee National Hospital, Velindre University NHS Trust
getSPIMextract = function(truncate=NULL,...) {
path = self$getLatest(self$filter$trust)
message("Using: ",path)
out = self$getSaved("SPIM-TRUST", params = list(path), ..., orElse = function (...) covidTimeseriesFormat({
tmp = readxl::read_excel(self$fileProvider$getFile(path), sheet = "Extracted Data", col_types = "text", na = c("n/a",""))
tmp2 = tmp %>%
tidyr::pivot_longer(
cols=c(-DateVal,-Day,-Month,-Year,-ReportLevel,-Geography,-TrustCode,-TrustName),
names_to = "variable",
values_to = "value"
) %>%
dplyr::filter(!is.na(value)) %>% dplyr::mutate(
variable = variable %>% stringr::str_replace("acute1","acuteOne")
) %>%
dplyr::mutate(
ageCat = stringr::str_extract(variable,"(<|>|Under )?[0-9]?[0-9]-?[0-9]?[0-9]?\\+?( age| year)?$") %>% stringr::str_trim(),
source = stringr::str_remove(variable,"(<|>|Under )?[0-9]?[0-9]-?[0-9]?[0-9]?\\+?( age| year)?$")
) %>%
dplyr::mutate(
ageCat = ifelse(stringr::str_detect(variable,"unknown age"),"unknown",ageCat %>% stringr::str_remove("age|year") %>% stringr::str_trim()),
gender = self$normaliseGender(variable %>% stringr::str_extract("male|female")),
source = source %>% stringr::str_remove("unknown age") %>% stringr::str_remove_all("males?|females?") %>% stringr::str_remove_all("[^a-zA-Z]+$") %>% stringr::str_to_lower()
)
#TODO: fix >84 in ageCat instead of 85+
tmp3 = tmp2 %>% dplyr::mutate(
date = suppressWarnings(as.Date(DateVal)),
value = suppressWarnings(as.numeric(value))
) %>% dplyr::select(-Day,-Month,-Year)
tmp4 = tmp3 %>% dplyr::mutate(
type = case_when(
stringr::str_detect(source,"cum") ~ "cumulative",
stringr::str_detect(source,"total") ~ "cumulative",
stringr::str_detect(source,"inc") ~ "incidence",
stringr::str_detect(source,"prev") ~ "prevalence",
stringr::str_detect(source,"weekly") ~ "incidence",
stringr::str_detect(source,"admissions") ~ "incidence",
stringr::str_detect(source,"daily") ~ "incidence",
stringr::str_detect(source,"test") ~ "incidence",
stringr::str_detect(source,"case") ~ "incidence",
stringr::str_detect(source,"discharges") ~ "incidence",
TRUE ~ NA_character_
),
statistic = case_when(
stringr::str_detect(source,"eath") ~ "death",
stringr::str_detect(source,"icu") ~ "icu admission",
stringr::str_detect(source,"osp") ~ "hospital admission",
stringr::str_detect(source,"test") ~ "test",
stringr::str_detect(source,"case") ~ "case",
stringr::str_detect(source,"carehome") ~ "case",
stringr::str_detect(source,"discharges") ~ "discharge",
source == "positive_admissions_inpatients" ~ "hospital admission",
TRUE ~ NA_character_
),
subgroup=NA_character_,
)
# scotland weekly NRS has age breakdowm, and gender breakdown which causes duplication issues....
# here we exclude them...
tmp4 = tmp4 %>% filter(
!(source == "nrs_weeklydeath" & (!is.na(ageCat) | !is.na(gender)))
) %>% filter(
!is.na(statistic) &
!is.na(type)
)
browser(expr=self$debug)
# Trusts
tmp5 = tmp4 %>%
dplyr::filter(!is.na(TrustCode)) %>%
self$codes$findNamesByCode(TrustCode,outputNameVar = name) %>%
dplyr::mutate(name= ifelse(is.na(name), "Unknown NHS trust", name)) %>%
dplyr::select(-TrustName, -Geography,-ReportLevel,-DateVal) %>%
dplyr::rename(note=variable,code = TrustCode)
tmp6 = tmp4 %>%
dplyr::filter(is.na(TrustCode)) %>%
dplyr::mutate(Geography = ifelse(Geography %in% c("England: Unknown","England: Other"), "Unknown (England)", Geography)) %>% #TODO fix this ugly hack.
dplyr::mutate(name= Geography) %>%
dplyr::select(-TrustName) %>%
self$codes$findCodesByName(codeTypes = c("LHB","HB","NHSER","CTRY","PSEUDO")) %>%
dplyr::select(-name.original, -TrustCode,-Geography,-ReportLevel,-DateVal) %>%
dplyr::rename(note=variable) %>%
dplyr::filter(!is.na(code)) # 2 missing hospital trusts - Velindre and Golden jubilee
#browser()
tmp7 = dplyr::bind_rows(tmp5,tmp6) %>% self$fillAbsentByRegion() %>% self$fixDatesAndNames(truncate) %>% self$complete()
return(tmp7)
}))
attr(out,"paths") = c(path)
return(out)
},
#' @description Load the SPI-M and public data
#' @return a covidTimeseriesFormat dataframe
getTheSPIMFireHose = function(...) {
self$getDaily("SPIM-FIRE-HOSE", ..., orElse = function (...) covidTimeseriesFormat({
bind_rows(
self$datasets$getTheFireHose(),
self$getOneOneOne(),
self$getSPIMextract(),
self$getLineListIncidence(...),
self$getDeathsLineListIncidence(...),
self$getSeroprevalenceTestIncidence(...)
)
}))
}
))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.