knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
Load packages
library(data.table) library(measurements) library(disperseR) library(tidyverse)
In this vignette we show how we prepare the units data attached to this function.
If you have not done so yet please create the project folders as follows.
disperseR::create_dirs()
First, we need to download two files: Power Plant Emissions Data and
Power Plant Emissions Data and Emission Data from The US Environmental Protection Agency.
We start by downloading the first datasets. We will download it to the process folder, unzip it and read the csv: "2014fd_cb6_14j/inputs/ptegu/ptegu_2014NEIv2_POINT_20171103_final_21dec2017_nf_v2.csv"
url <- "ftp://newftp.epa.gov/air/emismod/2014/v2/2014fd/emissions/2014fd_inputs_point.zip" directory <- proc_dir file <- file.path(directory, '2014fd_inputs_point.zip') if (!file.exists(file)) { # if file does not exist, download it download.file(url = url, destfile = file) } unzip(file, exdir = directory) # unzip the file file <- file.path( directory, "2014fd_cb6_14j", "inputs", "ptegu", "ptegu_2014NEIv2_POINT_20171103_final_21dec2017_nf_v2.csv") d_nei <- data.table::fread(file, skip = 18)
The other data set is hosted on "https://dataverse.harvard.edu/api/access/datafile/3086908?gbrecs=true". We specify this location and download it the same way.
url <- "https://dataverse.harvard.edu/api/access/datafile/3086908?gbrecs=true" directory <- proc_dir file <- file.path(directory, 'AMPD_Unit_with_Sulfur_Content_and_Regulations_with_Facility_Attributes.csv' ) # if file does not exist, download it if (!file.exists(file)) { download.file(url = url, destfile = file) print("data downloaded") } d_ampd <- data.table::fread(file) d_ampd2 <- d_ampd
d_nei_unique <- unique(d_nei[, .( facility_name, Facility.ID..ORISPL. = oris_facility_code, Unit.ID = oris_boiler_id, stkhgt, stkdiam, stktemp, stkvel, latitude, longitude )]) d_nei_unique <- d_nei_unique[Facility.ID..ORISPL. != "" & Unit.ID != ""] d_nei_unique <- d_nei_unique[, Facility.ID..ORISPL. := as.numeric(d_nei_unique$Facility.ID..ORISPL.)]
Here we show you how to prepare the 2005 set but you can choose other years.
d_ampd <- d_ampd[Year == 2005][, V1 := NULL]
d_ampd_subset <- d_ampd[Fuel1.IsCoal == 1][, .( Facility.ID..ORISPL., Unit.ID, Year, Month, Initial.Year.of.Operation, Sulfur.Content, Program.s. = Program.s..x, SO2.Phase = SO2.Phase.y, NOx.Phase = NOx.Phase.y, EPA.Region, NERC.Region, Source.Category = Source.Category.y, State = State.x, Facility.Latitude = Facility.Latitude.x, Facility.Longitude = Facility.Longitude.x, Has.SO2.Scrub, SO2..tons., Has.NOx.Scrub, NOx..tons., CO2..short.tons., Heat.Input..MMBtu., Gross.Load..MW.h., Steam.Load..1000lb., Max.Hourly.HI.Rate..MMBtu.hr.)] d_ampd_annual <- d_ampd_subset[, .( Facility.Latitude, Facility.Longitude, SO2..tons. = sum(SO2..tons., na.rm = TRUE), CO2..short.tons. = sum(CO2..short.tons., na.rm = TRUE), NOx..tons. = sum(NOx..tons., na.rm = TRUE) ), by = c("Facility.ID..ORISPL.", "Unit.ID", "Year")] d_ampd_annual <- unique(d_ampd_annual, by = c("Facility.ID..ORISPL.", "Unit.ID")) d_ampd_annual <- d_ampd_annual[, Facility.ID..ORISPL. := as.numeric(d_ampd_annual$Facility.ID..ORISPL.)] head(d_ampd_annual)
ampd <- merge(d_ampd_annual, d_nei_unique, by = c("Facility.ID..ORISPL.", "Unit.ID"), all.x = TRUE) ampd[, ID := paste(Facility.ID..ORISPL., Unit.ID, sep = "-")] ampd <- ampd[, .( ID = ID, Latitude = Facility.Latitude, Longitude = Facility.Longitude, SOx = SO2..tons., CO2 = CO2..short.tons., NOx = NOx..tons., Height = conv_unit(stkhgt, "ft", "m"), Diam = conv_unit(stkdiam, "ft", "m"), Velocity = conv_unit(stktemp, "ft_per_sec", "m_per_sec"), Temp = conv_unit(stkvel, "F", "K") )] names(ampd)
Many of the units in the provided datasets do not have stack height data. In these cases, it is suggested in Henneman et al. (2019) to fill with the average stack height of all units. We do that and create a variable that flags if the height was imputed.
# create flag if Height is missing ampd <- ampd[, inputed := 0] ampd$inputed[is.na(ampd$Height)] <- 1 # impute ampd$Height[is.na(ampd$Height)] <- mean(ampd$Height, na.rm = T)
There are duplicates from the NEI database, where for the same stack (= same facility + same unit) may be reported several times for the same pollutant with different stack info.
ampd <- unique(ampd, by = "ID")
Delete variables we do not need.
ampd <- ampd %>% dplyr::select(-c(Diam, Velocity, Temp))
The ampd
data are present in your environment should be now ready to use. You can rename it to units2005
or whatever you wish.
This works however we wanted to attached all the years in the same file. We write a function to get these data.
get_units_data <- function(year, d_ampd) { d_ampd <- d_ampd[Year == year][, V1 := NULL] d_ampd_subset <- d_ampd[Fuel1.IsCoal == 1][, .( Facility.ID..ORISPL., Unit.ID, Year, Month, Initial.Year.of.Operation, Sulfur.Content, Program.s. = Program.s..x, SO2.Phase = SO2.Phase.y, NOx.Phase = NOx.Phase.y, EPA.Region, NERC.Region, Source.Category = Source.Category.y, State = State.x, Facility.Latitude = Facility.Latitude.x, Facility.Longitude = Facility.Longitude.x, Has.SO2.Scrub, SO2..tons., Has.NOx.Scrub, NOx..tons., CO2..short.tons., Heat.Input..MMBtu., Gross.Load..MW.h., Steam.Load..1000lb., Max.Hourly.HI.Rate..MMBtu.hr.)] d_ampd_annual <- d_ampd_subset[, .( Facility.Latitude, Facility.Longitude, SO2..tons. = sum(SO2..tons., na.rm = TRUE), CO2..short.tons. = sum(CO2..short.tons., na.rm = TRUE), NOx..tons. = sum(NOx..tons., na.rm = TRUE) ), by = c("Facility.ID..ORISPL.", "Unit.ID", "Year")] d_ampd_annual <- unique(d_ampd_annual, by = c("Facility.ID..ORISPL.", "Unit.ID")) d_ampd_annual <- d_ampd_annual[, Facility.ID..ORISPL. := as.numeric(d_ampd_annual$Facility.ID..ORISPL.)] ampd <- merge(d_ampd_annual, d_nei_unique, by = c("Facility.ID..ORISPL.", "Unit.ID"), all.x = TRUE) ampd[, ID := paste(Facility.ID..ORISPL., Unit.ID, sep = "-")] ampd <- ampd[, .( ID = ID, Latitude = Facility.Latitude, Longitude = Facility.Longitude, SOx = SO2..tons., CO2 = CO2..short.tons., NOx = NOx..tons., Height = conv_unit(stkhgt, "ft", "m"), Diam = conv_unit(stkdiam, "ft", "m"), Velocity = conv_unit(stktemp, "ft_per_sec", "m_per_sec"), Temp = conv_unit(stkvel, "F", "K") )] ampd <- ampd[, inputed := 0] ampd$inputed[is.na(ampd$Height)] <- 1 # impute ampd$Height[is.na(ampd$Height)] <- mean(ampd$Height, na.rm = T) ampd <- unique(ampd, by = "ID") # add year variable ampd <- ampd[, year := year] ampd <- ampd %>% dplyr::select(-c(Diam, Velocity, Temp)) }
Vectors of years that we will be adding to our data.
vector_years<-c(1995:2015)
Run the function for all the years and combine results.
units <- data.table::setDF(data.table::rbindlist(lapply(vector_years, get_units_data, d_ampd=d_ampd2))) units <- units %>% mutate(uID=gsub("-", ".", ID)) save(units, file = "units.Rda")
Now the units data should be in your environment. You can go ahead and use it. We have attached it to the disperseR package.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.