#library(dfmip) # for splitter function #www.github.com/akeyel/dfmip
#library(rf1) #www.github.com/akeyel/rf1
#**# Need to re-enable these lines, both packages available on github. # Disabled to avoid having to add them as official dependencies
# when the primary purpose of this repository is the data sets, and not the data processing
# Define a function to run the process, because its a lot of code and it is in 2 places
process.month = function(rda.path, rda.april, rda.processed, weather.files, april.files, breaks, i){
# Initialize weather.data object
load(sprintf("%s/%s", rda.path, weather.files[i]))
# Loads the this.state object
state = substr(weather.files[i], 1, nchar(weather.files[i]) - 6)
# Add location and location_year fields
this.state$location_year = sprintf("%s-%s_%s", state, this.state$district, this.state$year)
this.state$location = sprintf("%s-%s", state, this.state$district)
all.counties = unique(this.state$location)
most.data = this.state
# Drop data from April 2020 to avoid it getting double counted
drop.index = most.data$doy > 91 & most.data$year == 2020
most.data = most.data[!drop.index, ]
rm(this.state)
## PROCESS FOR APRIL 2020
load(sprintf("%s/%s", rda.april, april.files[i]))
state.april = substr(april.files[i], 1, nchar(april.files[i]) - 12)
this.state$location_year = sprintf("%s-%s_%s", state.april, this.state$district, this.state$year)
this.state$location = sprintf("%s-%s", state.april, this.state$district)
all.counties.april = unique(this.state$location)
if (state != state.april){
stop(sprintf("%s and %s, what happened?", state, state.april))
}
# Join April data to Most Data
most.data = rbind(most.data, this.state)
state.quarterly = rf1:::convert.env.data(most.data, all.counties, breaks) #**# Could save this for faster re-use. How should I do that?
# Change _2 to APRIL
colnames(state.quarterly) = gsub("_2", "_APRIL", colnames(state.quarterly))
save(state.quarterly, file = sprintf("%s/%s_gridmet.rda", rda.processed, state))
return(state.quarterly)
}
identify.fips = function(){
fips.lookup = wnvdata::fips.lookup
fips.lookup[grep("Maryland-Baltimore", fips.lookup$location), ] #005 for Baltimore # 510 for Baltimore City
fips.lookup[grep("Missouri-St. Louis", fips.lookup$location), ] # 189 for St. Louis, #510 for St. Louis City
fips.lookup[grep("Virginia-Fairfax", fips.lookup$location), ] # 059 for Fairfax # 600 for Fairfax City
fips.lookup[grep("Virginia-Franklin", fips.lookup$location), ] #067 for Franklin # 620 for Franklin City
fips.lookup[grep("Virginia-Richmond", fips.lookup$location), ] #159 for Richmond # 760 for Richmond City
fips.lookup[grep("Virginia-Roanoke", fips.lookup$location), ] #161 for Roanoke # 770 for Roanoke City
}
patch.name = function(this.state, problem.name, county.fips, city.fips){
problem.id = sprintf("%s_%03d", problem.name, city.fips)
new.name = sprintf("%s City", problem.name)
this.state$district_fips = sprintf("%s_%03d", this.state$district, this.state$COUNTYFP)
this.state$district_fips = gsub(problem.id, new.name, this.state$district_fips)
this.state$district = sapply(this.state$district_fips, splitter, "_", 1, 1)
return(this.state)
}
# Convert .csv to .rda
# Raw files included for NY to show methods, but due to file size, the rest were omitted from the data-raw folder
convert.to.rda = function(csv.path, rda.path, ending){
for (my.file in list.files(csv.path)){
this.file = sprintf("%s/%s", csv.path, my.file)
this.state = read.csv(this.file)
# if this.file is Maryland, Missouri, or Virginia, correct the county names based on the county fips
if (my.file == "Maryland24.csv"){
this.state = patch.name(this.state, "Baltimore", 005, 510)
}
if (my.file == "Missouri29.csv"){
this.state = patch.name(this.state, "St. Louis", 189, 510)
}
if (my.file == "Virginia51.csv"){
this.state = patch.name(this.state, "Fairfax", 059, 600) #059 for Fairfax # 600 for Fairfax City
this.state = patch.name(this.state, "Franklin", 067, 620) #067 for Franklin # 620 for Franklin City
this.state = patch.name(this.state, "Richmond", 159, 760) #159 for Richmond # 760 for Richmond City
this.state = patch.name(this.state, "Roanoke", 161, 770) #161 for Roanoke # 770 for Roanoke City
}
# Drop the countyfp field if it is present
this.state$COUNTYFP = NULL
out.file = sprintf("%s/%s%s.rda", rda.path, substr(my.file, 1, (nchar(my.file) - 4)), ending)
save(this.state, file = out.file)
}
}
standarize.names = function(in.data){
# Standardize names to CDC data'
in.data$location_year = gsub("Illinois-LaSalle", "Illinois-La Salle", in.data$location_year)
in.data$location_year = gsub("Indiana-DeKalb", "Indiana-De Kalb", in.data$location_year)
in.data$location_year = gsub("Indiana-LaPorte", "Indiana-La Porte", in.data$location_year)
in.data$location_year = gsub("Indiana-LaGrange", "Indiana-Lagrange", in.data$location_year)
in.data$location_year = gsub("Louisiana-LaSalle", "Louisiana-La Salle", in.data$location_year)
in.data$location_year = gsub("New Mexico-De Baca", "New Mexico-DeBaca", in.data$location_year)
in.data$location_year = gsub("New Mexico-Doña Ana", "New Mexico-Dona Ana", in.data$location_year)
in.data$location_year = gsub("Virginia-Manassas City Park", "Virginia-Manassas Park City", in.data$location_year)
# Update city mismatches
add.city.vec1 = c("Virginia-Alexandria", "Virginia-Bristol", "Virginia-Buena Vista", "Virginia-Charlottesville", "Virginia-Chesapeake")
add.city.vec2 = c("Virginia-Colonial Heights", "Virginia-Covington", "Virginia-Danville", "Virginia-Emporia")
add.city.vec3 = c("Virginia-Falls Church", "Virginia-Fredericksburg", "Virginia-Galax", "Virginia-Hampton")
add.city.vec4 = c("Virginia-Hopewell", "Virginia-Lexington", "Virginia-Harrisonburg", "Virginia-Lynchburg", "Virginia-Manassas")
add.city.vec5 = c("Virginia-Norton", "Virginia-Winchester")
add.v.city = c("Newport News", "Manassas Park", "Martinsville", "Norfolk", "Petersburg", "Poquoson")
add.v.city2 = c("Portsmouth", "Radford", "Salem", "Staunton", "Suffolk", "Virginia Beach", "Waynesboro", "Williamsburg")
add.v.city.vec = c(add.v.city, add.v.city2)
add.v.city.vec = sprintf("Virginia-%s", add.v.city.vec)
add.city.vec = c(add.city.vec1, add.city.vec2, add.city.vec3, add.city.vec4, add.city.vec5, add.v.city.vec)
for (city in add.city.vec){
with.city = sprintf("%s City", city)
in.data$location_year = gsub(city, with.city, in.data$location_year)
}
# Fix location based on location-year corrections
in.data$location = sapply(in.data$location_year, dfmip::splitter, "_", 1, 1)
return(in.data)
}
april.gridmet = function(){
# Convert GRIDMET data to .rda
# Working directory: "C:\Users\ak697777\University at Albany - SUNY\Elison Timm, Oliver - CCEID\CODE\wnvdata"
climate.path = "C:/Users/ak697777/University at Albany - SUNY/Elison Timm, Oliver - CCEID/DATA/CLIMATE"
gridmet.path = sprintf("%s/GRIDMET", climate.path)
april.path = sprintf("%s/GRIDMET_MONTHLY/APRIL", climate.path)
rda.path = sprintf("%s/GRIDMET_RDA", climate.path)
rda.april = sprintf("%s/APRIL", rda.path)
rda.processed = sprintf("%s/GRIDMET_RDA_processed", climate.path)
# Convert from .csv format downloaded from Google Earth Engine to .rda format to make it easier to process in R
# Also patches the 6 county pairs where the county name is the same for two distinct counties
convert.to.rda(gridmet.path, rda.path, "")
convert.to.rda(april.path, rda.april, "_april")
# Compile data to be monthly
# Check that all files match up
# Pull in weather data from GRIDMET
weather.files = list.files(rda.path)
weather.files = weather.files[weather.files != "APRIL"]
weather.files = weather.files[weather.files != "QUARTER1"]
weather.files = weather.files[weather.files != "APRIL_processed"]
april.files = list.files(rda.april)
breaks = c(90, 120) # Specify Before April and April.
err.vec = c()
is.error = 0
for (i in 1:length(weather.files)){
weather.file = weather.files[i]
state = substr(weather.files[i], 1, nchar(weather.files[i]) - 6)
state.april = substr(april.files[i], 1, nchar(april.files[i]) - 12)
if (state != state.april){
err.vec = c(err.vec, state, state.april)
is.error = 1
}
}
if (is.error == 1){
stop(sprintf("States do not match up. Problem state pairs are %s", paste(err.vec, collapse = ", ")))
}
## PROCESS FOR 2000 - 2019 data
state.quarterly = process.month(rda.path, rda.april, rda.processed, weather.files, april.files, breaks, 1)
# Intialize data frames for entire US
us.quarterly = state.quarterly
# merge in remaining weather files
for (i in 2:length(weather.files)){
state.quarterly = process.month(rda.path, rda.april, rda.processed, weather.files, april.files, breaks, i)
us.quarterly = rbind(us.quarterly, state.quarterly)
}
us.quarterly = standarize.names(us.quarterly)
# Test merge with CDC data
cdc.path = "C:/hri_no_backup/WNV_CHALLENGE"
cdc.file = sprintf("%s/neurownv_by_county_2000-2018_full_working_copy.csv", cdc.path)
cdc.raw = read.csv(cdc.file)
cdc.2019 = read.csv(sprintf("%s/wnv_by_county_2019_provisional_full.csv", cdc.path))
colnames(cdc.2019)[1] = "fips"
cdc.raw$fips = as.character(cdc.raw$fips)
cdc.raw$county = as.character(cdc.raw$county)
cdc.raw$state = as.character(cdc.raw$state)
cdc.raw$location = as.character(cdc.raw$location)
cdc.raw$year = as.numeric(as.character(cdc.raw$year))
cdc.raw$count = as.numeric(as.character(cdc.raw$count))
cdc.raw$location_year = sprintf("%s_%s", cdc.raw$location, cdc.raw$year)
cdc.raw$location = gsub("South Dakota-Oglala Lakota/Shannon", "South Dakota-Oglala Lakota", cdc.raw$location)
cdc.raw$location = gsub("Virginia-Bedford/Bedford City", "Virginia-Bedford", cdc.raw$location)
cdc.raw$location_year = gsub("South Dakota-Oglala Lakota/Shannon", "South Dakota-Oglala Lakota", cdc.raw$location_year)
cdc.raw$location_year = gsub("Virginia-Bedford/Bedford City", "Virginia-Bedford", cdc.raw$location_year)
nrow(cdc.raw) #59052
# Test merge for the main CDC data
us.quarterly.subset = us.quarterly[us.quarterly$year < 2019, ]
us.quarterly.subset = us.quarterly.subset[us.quarterly.subset$year > 1999, ]
nrow(us.quarterly.subset) # 59052
test = merge(cdc.raw, us.quarterly, by = "location_year")
nrow(test) #59033 missing 19 rows #58995. Lost 57 rows!
missing.vec = c()
for (i in 1:nrow(us.quarterly.subset)){
this.locyear = us.quarterly.subset$location_year[i]
test = grep(this.locyear, cdc.raw$location_year)
#if (!this.locyear %in% cdc.raw$location_year){
if (length(test) == 0){
missing.vec = c(missing.vec, this.locyear)
}
}
# This line was in standardize.names, not sure why it did not work there. Running it a second time here solved the issue.
us.quarterly$location_year = gsub("Virginia-Manassas City Park", "Virginia-Manassas Park City", us.quarterly$location_year)
us.quarterly$location = gsub("Virginia-Manassas City Park", "Virginia-Manassas Park City", us.quarterly$location)
test = merge(cdc.raw, us.quarterly, by = "location_year")
nrow(test) #59052
# Add anomaly
analysis.counties = unique(us.quarterly$location)
vars1 = c("TMINC_1", "TMEANC_1", "TMAXC_1", "PR_1", "RMEAN_1", "VPD_1",
"TMINC_APRIL", "TMEANC_APRIL", "TMAXC_APRIL", "PR_APRIL", "RMEAN_APRIL", "VPD_APRIL")
us.quarterly = add.anomaly(us.quarterly, vars1, analysis.counties)
# Check for missing values
missing.rows = us.quarterly[!complete.cases(us.quarterly), ]
if (missing.rows > 0){
stop(sprintf("There were %s rows with incomplete data. Please check these rows and resolve any issues", nrow(missing.rows)))
}
usethis::use_data(us.quarterly, overwrite = TRUE)
}
#' Update with a new month
#'
#'
us.month.update = function(STUFF){
# Read in data for update month
# Compile data for update month
# Merge update month into us.quarterly
# Update quarterly data
usethis::use_data(us.quarterly, overwrite = TRUE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.