ukem: Package Title (one line, title case)

# start constants
# abbrev used in NAEI, in SNAP sector order 1-11
v_sector <- c("energyprod", "domcom", "indcom", "indproc", "offshore",
                "solvents", "roadtrans", "othertrans", "waste", "agric", "nature")
n_sectors <- length(v_sector)
# end constants


#' getEmissionTimeSeriesFromBEIS
#'
#' Get a time series of pollutants from the BEIS web server
#' @param startYear start year of time series
#' @param endYear     end year of time series
#' @param query_id_ghg url to the template repository
#' @param v_pollutant_name_ghg vector of names of pollutants under BEIS GHG tab
#' @param query_id_airpoll destination remote url
#' @param v_pollutant_name_airpoll vector of names of pollutants under BEIS Air Pollution tab
#'
#' @details Need to run two queries first at https://naei.beis.gov.uk/data/data-selector
#' with one vector of GHGs and one vector of air pollutants.
#' These need to match the pollutants in /data/lookups/pollutants.csv.
#' @examples
#' \dontrun{
#' dt <- getEmissionTimeSeriesFromBEIS(
#'   startYear = 1990,
#'     endYear = 2018,
#'   query_id_ghg = 145161,
#'   v_pollutant_name_ghg = c("co2", "ch4", "n2o"),
#'   query_id_airpoll = 145164,
#'   v_pollutant_name_airpoll = c("co", "nox", "nh3", "bz", "so2", "voc"))
#' }
getEmissionTimeSeriesFromBEIS <- function(
    startYear = 1990,
      endYear = 2019, # 2018, updated to latest NAEI year
    query_id_ghg = 151517, # 145161, SAM updated for new query generation on NAEI
    v_pollutant_name_ghg = c("co2", "ch4", "n2o"),
    query_id_airpoll = 151579, # 145164, SAM updated for new query generation on NAEI
    v_pollutant_name_airpoll = c("co", "nox", "nh3", "bz", "so2", "voc"),
    sector_classification = "SNAP",
    lookup_NFR # NFR to SNAP lookup table
    ){

  df_lookup <- read.csv("./data/lookups/pollutants.csv", stringsAsFactors = FALSE)

  # output will be written to a temporary file
  fname <- paste0("./data/NAEI_ts_wide.csv")
  # if necessary, delete any existing copy because we later append to it
  if (file.exists(fname)) file.remove(fname)

  ## SJT: i have changed the script do to AP first
  ##      This is because the AP run from 1970 to 20xx and GHG from 1990
  ##      Doing GHG first means that appended AP are do not align by 20 years and no error is created.
  ## ALSO: change to make lists and then bind together to respect col headers, append does not do this
  ## thought: the id query on NAEI can be changed to run only from 1990, thereby solving append problem, but
  ##          this seems to restrict the data that can be called in from NAEI

  ind <- match(v_pollutant_name_airpoll, df_lookup$pollutant) # no such name `name_pollutant`; changed
  v_pollutant_id <- df_lookup$pollutant_id[ind] # c(2, 3, 5)
  v_url <- paste0("https://naei.beis.gov.uk/data/download?q=", query_id_airpoll, "&section=ukdata&pollutantId=", v_pollutant_id)

  # read from the query URL into a list of data tables
  l_dt_ap <- lapply(v_url, fread, na.strings = "-", header=T)

  ind <- match(v_pollutant_name_ghg, df_lookup$pollutant)
  v_pollutant_id <- df_lookup$pollutant_id[ind] # c(2, 3, 5)
  v_url <- paste0("https://naei.beis.gov.uk/data/download?q=", query_id_ghg, "&section=ukdata&pollutantId=", v_pollutant_id)

  # read from the query URL into a list of data tables
  l_dt_ghg <- lapply(v_url, fread, na.strings = "-", header=T) # SJT: this structure is not the same as AP

  # write all to one csv, because the data type recognition doesnt work from the query
  # data frames in list do not all have same colClasses
  ## THE DATA APPEND DOES NOT WORK BY COL NAME
  #lapply(l_dt, fwrite, file = fname, append = TRUE)

  # bind together the data, using column names (differing years of data) and write.
  dt_ap_ghg <- rbindlist(c(l_dt_ap, l_dt_ghg), use.names = T, fill = T)
  fwrite(dt_ap_ghg, fname, eol = "\n")

  # then read back in from the csv
  dt <- fread(fname, na.strings = c("-"), header=T)

  # change blanks to NA
  dt$Source[dt$Source == ""]     <- NA
  dt$Activity[dt$Activity == ""] <- NA

  # reshape to long format # SJT: i have reservations about using col indices - use names?
  startCol <- 6
  endCol   <- dim(dt)[2]
  dt <- as.data.table(pivot_longer(dt, cols = startCol:endCol,
    names_to = "year", values_to ="emission"))
  # remove / and space characters from names
  names(dt) <- make.names(names(dt))
  # and make consistent
  setnames(dt, c("Gas",     "Units"),
             c("pollutant", "units"))
  #summary(dt)
  #unique(dt$NFR.CRF.Group)

  # remove rows outwith data range
  dt <- dt[year >= startYear & year <= endYear]
  # make a subset containing only totals (where Source is NA) for later comparison
  dt_total <- subset(dt,  is.na(Source))
  # remove rows containing totals (where Source is NA)
  dt       <- subset(dt, !is.na(Source))

  df_lookup <- read.csv("./data/lookups/pollutants.csv", stringsAsFactors = FALSE)
  ind <- match(dt$pollutant, df_lookup$pollutant_longName)
  dt$pollutant <- df_lookup$pollutant[ind]
  # unique(dt$pollutant)
  # any(is.na(dt$pollutant))

  df_lookup <- fread(lookup_NFR, stringsAsFactors = FALSE)
  #df_lookup <- read.csv("./data/lookups/NFR19_to_SNAP.csv", stringsAsFactors = FALSE) # updated to new
  ind <- match(dt$NFR.CRF.Group, df_lookup$NFR19)
  # set snap to matching value in lookup table
  # "avi.cruise" is only non-numeric, and set to NA
  dt$snap_id <- as.numeric(df_lookup$SNAP[ind])
  dt <- dt[!is.na(snap_id)]
  # unique(dt$snap_id)
  # unique(dt$Units)
  if (sector_classification == "SNAP"){
    dt <- dt[, .(emission = sum(emission, na.rm = TRUE)), by = .(pollutant, year, snap_id)]
  } else if (sector_classification == "NFR"){
    dt <- dt[, .(emission = sum(emission, na.rm = TRUE)), by = .(pollutant, year, NFR.CRF.Group)]
  }
  # year returned as character; need to convert back to integers
  dt$year <- as.numeric(dt$year)

  # set units
  # given as kilotonnes = Gg = 1e9 g
  units(dt$emission) <- "Gg /yr"
  # point and diffuse data are in Mg (= tonnes)
  dt$emission <- set_units(dt$emission, Mg /yr)
  # but convert CO2-C to CO2 - only odd one out?
  dt[pollutant == "co2", emission := emission /12*44]

  # # check that the totals of the remaining rows add to the
  # # pre-calculated totals
  # dt$NFR.CRF.Group_1 <- substr(dt$NFR.CRF.Group, 1, 2)
  # head(dt)
  # dt_total_check <- dt[, .(F_sum = sum(emission, na.rm = TRUE)), by = .(Gas, NFR.CRF.Group_1, year)]
  # dft <- inner_join(dt_total_check, dt_total,
    # by = c("Gas", "NFR.CRF.Group_1" = "NFR.CRF.Group", "year"))
  # dim(dt_total)

  # add missing levels
  dt <- dt %>%
    tidyr::complete(snap_id = 1:n_sectors, nesting(pollutant, year),
    fill = list(emission = 0))
  dt <- as.data.table(dt)

  # do we want to write to file?
  fwrite(dt, file = "./data/NAEI_ts_long.csv", eol = "\n")
  #dt <- fread(file = "./data/NAEI_ts_long.csv")

  return(dt)
}


getEmissionTimeSeriesFromUNFCCC <- function(
    country_code = "IE",
    startYear = 1990,
      endYear = 2019,
    v_pollutant_name_ghg = c("co2", "ch4", "n2o"),
    sector_classification = "SNAP",
    lookup_CRF # added here, similar to NFR lookup above.
    ){

  dt <- fread("./data/UNFCCC_GHG_ts.csv")
  #dim(dt)
  #names(dt)
  setnames(dt, c("Pollutant_name","Year","Unit","emissions"),
               c("pollutant","year","Units","emission"))
  #summary(dt)
  # subset to country, without NA values
  dt <- dt[Country_code == country_code & !is.na(emission)]
  dt[, pollutant := tolower(pollutant)]
  dt[, year := as.numeric(year)]
  dt <- dt[pollutant %in% v_pollutant_name_ghg]
  dt <- dt[year >= startYear & year <= endYear]

  # read lookup table for CRF to SNAP sector codes
  # !stringsAsFactors option needed on older R versions
  ## use the new CRF to SNAP lookup. Use UNFCCC data.
  df_lookup <- fread(lookup_CRF, stringsAsFactors = FALSE)

  ind <- match(dt$Sector_code, df_lookup$CRF)
  # set snap_id to matching value in lookup table
  dt[, snap_id := as.numeric(df_lookup$SNAP[ind])]
  # Sector totals are only missing values - check non-matched are just these
  # sum(is.na(dt$snap_id))  # 522
  dt <- dt[!is.na(snap_id)]

  # calculate totals by SNAP sector or other sector_classification
  if (sector_classification == "SNAP"){
    dt <- dt[, .(emission = sum(emission, na.rm = TRUE), units = first(Units)), by = .(pollutant, year, snap_id)]
  } else if (sector_classification == "NFR"){ # would need to look up other codes before this will work
    dt <- dt[, .(emission = sum(emission, na.rm = TRUE), units = first(Units)), by = .(pollutant, year, Parent_sector_code)]
  }

  # add missing levels
  dt <- dt %>%
    tidyr::complete(snap_id = 1:n_sectors, nesting(pollutant, year),
    fill = list(emission = 0))
  dt <- as.data.table(dt)
dt[pollutant=="co2" & year==2019]
    # set units
  # given as Gg = 1e9 g
  units(dt$emission) <- "Gg / yr"
  # point and diffuse data are in Mg (= tonnes)
  dt$emission <- set_units(dt$emission, Mg / yr)

  # write to file
  fwrite(dt, file = "./data/UNFCCC_ts_long.csv", eol = "\n")
  return(dt)
}


#' getPointSourceData
#'
#' Get a time series of pollutant point source data from the NAEI files
#' @param startYear start year of time series
#' @param endYear     end year of time series
#'
#' @details Files have been downloaded from NAEI (https://naei.beis.gov.uk/data/)
#' but old files are no longer directly available from web site.
#' This is straightforward, apart from standardising names over time.
#' @examples
#' \dontrun{
#' dt <- getPointSourceData(startYear = 2010, endYear = 2018)
#' }
getPointSourceData <- function(startYear = 2010, endYear = 2019, lookup_SIC){
  v_year <- startYear:endYear
  v_fname <- paste0("./data-raw/point_sources/NAEIPointsSources_", v_year, ".xlsx")
  l_df <- suppressWarnings(lapply(v_fname, read_excel, sheet = "Data"))
  # l_names <- lapply(l_df, names)
  # lapply(l_names, `[[`, 2)
  # lapply(l_df, head)
  # lapply(l_df, standardiseNames)
  for (i in 1:length(l_df)){
    l_df[[i]] <- standardiseNames(l_df[[i]])
  }
  # lapply(l_df, names)
  # lapply(l_names, `[[`, 2)
  dt <- as.data.table(bind_rows(l_df))

  ##* WIP - the list in pollutants.csv is incomplete
  ##* SJT: something is wrong with the match() functions, creating 'extra' SNAP codes where they shouldnt be
  ##* i think it is the presence of TWO lines for pm25, which causing multiple matches.
  df_lookup <- as.data.table(read.csv("./data/lookups/pollutants.csv", stringsAsFactors = FALSE))
  df_lookup <- df_lookup[pollutant != "pm2_5"]
  ind <- match(dt$pollutant_id, df_lookup$pollutant_id)
  dt$pollutant <- df_lookup$pollutant[ind]

  df_lookup <- fread(lookup_SIC, stringsAsFactors = FALSE)
  ## cannot match like below, as SectorIDs map to different SNAPs dependent on pollutant (only in some instances) - V ANNOYING
  ## there is a problem here as the point source pollutant names are complex (and change) so cant match to the SIC_to_SNAP table. Must have a pollutant name to enable it.
  ## **Need to change SIC-to_SNAP lookup to include pollutant_id **. Done.

  #ind <- match(dt$sic_id, df_lookup$SectorID)
  #dt$snap_id_fromSIC <- df_lookup$SNAP[ind]
  setnames(df_lookup, c("Pollutant","Pollutant_id","SectorID","SNAP"), c("pollutant_name","pollutant_id","sic_id","snap_id_fromSIC"))
  dt <- df_lookup[dt, on =c("sic_id","pollutant_id")]
  # only use this if snap_id is missing
  ## does this prioritise SNAP data originally in point files? is this wise? is it out of date? ##
  dt[is.na(snap_id), snap_id := snap_id_fromSIC]

  return(dt)
}


## ----standardiseNames----
#' Standardises names in a data frame
#'
#' This function standardises names in a data frame.
#' @param df A data frame.
#' @export
#' @examples
#' \dontrun{
#' dt_pts <- standardiseNames(dt_pts)
#' }
standardiseNames <- function(df){
  # common variables
  if (exists("year", where=df)){
    df[["Year"]] <- df[["year"]]
    df[["year"]] <- NULL
  }
  if (exists("SectorID", where=df)){
    df[["sic_id"]] <- df[["SectorID"]]
    df[["SectorID"]] <- NULL
  }
  if (exists("Point_SNAPID", where=df)){
    df[["snap_id"]] <- df[["Point_SNAPID"]]
    df[["Point_SNAPID"]] <- NULL
  }
  if (exists("Point_SNAP Code", where=df)){
    df[["snap_id"]] <- df[["Point_SNAP Code"]]
    df[["Point_SNAP Code"]] <- NULL
  }
  if (exists("SNAP Code", where=df)){
    df[["snap_id"]] <- df[["SNAP Code"]]
    df[["SNAP Code"]] <- NULL
  }
  if (exists("Point_NFR Code", where=df)){
    df[["nfr_id"]] <- df[["Point_NFR Code"]]
    df[["Point_NFR Code"]] <- NULL
  }
  if (exists("Point_NFRCode", where=df)){
    df[["nfr_id"]] <- df[["Point_NFRCode"]]
    df[["Point_NFRCode"]] <- NULL
  }
  if (exists("NFR Code", where=df)){
    df[["nfr_id"]] <- df[["NFR Code"]]
    df[["NFR Code"]] <- NULL
  }
  if (exists("Pollutant Name", where=df)){
    df[["pollutant_name"]] <- df[["Pollutant Name"]]
    df[["Pollutant Name"]] <- NULL
  }
  if (exists("FullPollName", where=df)){
    df[["pollutant_name"]] <- df[["FullPollName"]]
    df[["FullPollName"]] <- NULL
  }
  if (exists("PollCode", where=df)){
    df[["pollutant_id"]] <- df[["PollCode"]]
    df[["PollCode"]] <- NULL
  }
  if (exists("Pollcode", where=df)){
    df[["pollutant_id"]] <- df[["Pollcode"]]
    df[["Pollcode"]] <- NULL
  }
  if (exists("PollutantID", where=df)){
    df[["pollutant_id"]] <- df[["PollutantID"]]
    df[["PollutantID"]] <- NULL
  }
  if (exists("PollutantD", where=df)){
    df[["pollutant_id"]] <- df[["PollutantD"]]
    df[["PollutantD"]] <- NULL
  }
  if (exists("OS_GRE", where=df)){
    df[["x"]] <- df[["OS_GRE"]]
    df[["OS_GRE"]] <- NULL
  }
  if (exists("Easting", where=df)){
    df[["x"]] <- df[["Easting"]]
    df[["Easting"]] <- NULL
  }
  if (exists("OS_GRN", where=df)){
    df[["y"]] <- df[["OS_GRN"]]
    df[["OS_GRN"]] <- NULL
  }
  if (exists("Northing", where=df)){
    df[["y"]] <- df[["Northing"]]
    df[["Northing"]] <- NULL
  }
  if (exists("Operator_Full", where=df)){
    df[["Operator"]] <- df[["Operator_Full"]]
    df[["Operator_Full"]] <- NULL
  }
  if (exists("Sourcecode", where=df)){
    df[["WebSourcecode"]] <- df[["Sourcecode"]]
    df[["Sourcecode"]] <- NULL
  }
  if (exists("Emission", where=df)){
    df[["emission"]] <- df[["Emission"]]
    df[["Emission"]] <- NULL
  }
  if (exists("SumOfEmission", where=df)){
    df[["emission"]] <- df[["SumOfEmission"]]
    df[["SumOfEmission"]] <- NULL
  }
  if (exists("Units", where=df)){
    df[["Unit"]] <- df[["Units"]]
    df[["Units"]] <- NULL
  }
  if (exists("Data Type", where=df)){
    df[["Datatype"]] <- df[["Data Type"]]
    df[["Data Type"]] <- NULL
  }

  return(df)
}

# getAlpha_byYear function to be written to data.table syntax?
getAlpha_byYear <- function(dt, year_ref){
  # calculate the ref emission for each pollutant/sector/year
  # extract the reference year values
  dt_ref <- dt[year == year_ref, emission, by = .(pollutant, snap_id)]
  # add them back in
  dt <- left_join(dt, dt_ref, by = c("pollutant", "snap_id"), suffix = c("", "_ref"))
  # and calculate alpha as ratio
  # data table version produces a warning: "Invalid .internal.selfref detected ..."
  #dt[, alpha_year := emission / emission_ref][]
  # so using standard version instead - SJT removing, not needed in scaling function later?
  dt$alpha_year <- dt$emission / dt$emission_ref
  return(dt)
}

plotAlpha_byYear <- function(dt, v_pollutant_to_plot = c("ch4", "co2", "n2o")){
  # extract the pollutant-specific values
  #v_pollutant_to_plot <- c("co", "nh3", "nox")
  dts <- dt[pollutant %in% v_pollutant_to_plot]
  p <- ggplot(dts, aes(year, alpha_year, colour = pollutant))
  p <- p + geom_line()
  p <- p + facet_wrap(~ snap_id, scale = "free")
  p <- p + ylab(expression(alpha[i*","*year]))
  p <- p + ggtitle(expression(paste("Relative variation in emissions, ", italic(alpha)[i*","*year])))
  p
  ggsave(p, file = "./docs/alpha_vsYear_byGHG.png")
  return(p)
}

# s_uk <- readUKDiffuseData(pollutant = "co2")
readUKDiffuseData <- function(pollutant, year){ # 'year=' causes problems!! Force to 2019?
  # we have no data prior to 2010, so use 2010 data for earlier years
  if (year < 2010){
    year <- 2010
    print("UK diffuse-source mapped data only exist from 2010 - using these data for earlier years")
  }
  dir_data_raw <- paste0("./data-raw/diffuse_uk/", pollutant) # linked to 2019 NAEI maps
  #dir_data_out <- paste0("./data/refYearStacks")

  # read in area sources of GHG emissions
  #v_fname  <- paste(dir_data_raw, "/",
  #  v_sector, pollutant, substr(year, 3, 4), ".tif", sep = "") # changed from .asc to .tif
  # make a stack with a layer for each sector
  #s <- stack(v_fname)

  # have changed the v_fname to loop and stack. While uglier, this allows flexibility for adding in missing SNAPs as and when they occur
  # For e.g. CH4 does not have solvents while SOx does not have agri or natural
  s <- stack()

  r_blank <- raster(list.files(dir_data_raw, pattern = paste0(paste(v_sector,collapse="|"), pollutant, substr(year, 3, 4),".tif$"), full.names = T)[1])
  r_blank[] <- NA
  names(r_blank) <- NA

  for(i in v_sector){

    if(file.exists(paste0(dir_data_raw,"/",i, pollutant, substr(year, 3, 4),".tif"))){
      ri <- raster(paste0(dir_data_raw,"/",i, pollutant, substr(year, 3, 4),".tif"))
      s <- stack(s, ri)
    }else{
      ri <- r_blank
      names(ri) <- paste0(i, pollutant, substr(year, 3, 4))
      s <- stack(s, ri)
    }

  }

  # set projection to be OSGB 1936 in consistent form
  projection(s) <- projOSGB

  # units are Mg / km^2 / yr
  # but convert CO2-C to CO2 - only odd one out?
  if (pollutant == "co2")  s <- s /12*44
  attr(s, "units") <- "Mg / km^2 / yr"
  #writeRaster(s, file = paste0(dir_data_out, "/s_", pollutant, "_uk.tif"), overwrite=TRUE)
  return(s)
}

# s_ie <- readIEDiffuseData(pollutant = "ch4")
readIEDiffuseData <- function(pollutant, year, 
  lookup_GNFR = "./data/lookups/GNFR_to_SNAP.csv"){ # 'year=' causes problems!! Force to 2019
  # we only have data for 2016, so use this for all years (actually 2015 also exist, but not processed)
  if (year != 2019){ # have altered to read 2019 maps
    year <- 2019
    print("Irish diffuse-source mapped data now exist for 2019 - using these data for all years")
  }
  dir_data_raw <- paste0("./data-raw/diffuse_ie/", pollutant) # linked to 2019 MapEire maps
  #dir_data_out <- paste0("./data/refYearStacks")
  v_fname  <- list.files(dir_data_raw, pattern = paste0(year, "_*"))
  v_GNFR_letter  <- toupper(substr(v_fname, 6, 6))

  df_lookup <- read.csv(lookup_GNFR, stringsAsFactors = FALSE)
  ind <- match(v_GNFR_letter, df_lookup$GNFR_letter)
  v_snap_id <- df_lookup$snap_id[ind]
  # remove files without SNAP match
  #v_fname <- v_fname[!is.na(v_snap_id)]

  # make a stack with a layer for each GNFR sector
  s_gnfr <- suppressWarnings(stack(paste(dir_data_raw, v_fname, sep = "/")))

  # make a stack with a layer for each SNAP sector
  s <- stack(s_gnfr[[1:n_sectors]])
  # remove existing data
  s <- setValues(s, NA)

  # the following loop causes warnings - could be avoided with calc?
  for (i in 1:n_sectors){
    if (i %in% v_snap_id){ # sector 4 is missing - all industry allocated to SNAP 3
      s[[i]] <- suppressWarnings(sum(s_gnfr[[which(v_snap_id == i)]], na.rm=T)) # added na.rm=T as rasters not summing correctly. Warnings here are generated when only 1 layer from s_gnfr matches a SNAP. Suppress.
    }
  }

  names(s) <- v_sector
  # set projection to be OSGB 1936 in consistent form
  projection(s) <- projOSGB

  # units seem to be in Mg / km^2 / yr
  attr(s, "units") <- "Mg / km^2 / yr"
  #writeRaster(s, file = paste0(dir_data_out, "/s_", pollutant, "_ie.tif"), overwrite=TRUE)
  return(s)
}

add_points_to_diffuse <- function(year, s, dt_pts, pollutant){
  # subset point source data to just pollutant of interest
  # setkey(dt_pts, pollutant)
  # dt_pts <- dt_pts[pollutant]
  # "Year" in dt_pts is inconsistently with uppercase
  # "get" retrives the variable rather than the column name
  dt_pts <- dt_pts[pollutant == get("pollutant", envir = parent.frame(3)) &
                        Year == get("year",      envir = parent.frame(3))]
  #dim(dt_pts)

  # SJT: add in line to convert CO2-C to CO2, not done elsewhere.
  if(pollutant == "co2") dt_pts[, emission := emission/12*44]

  for (i_sector in 1:n_sectors){
    ##* WIP
    # subset point source data for this sector
    dt_pts_sub <- dt_pts[snap_id == i_sector]

    # rasterise point sources
    # if no values for this sector, no point sources to rasterise
    if (sum(dt_pts_sub$emission) == 0){
      r_pts <- s[[i_sector]]
      r_pts[] <- 0
    } else {
      # make a raster with the sum of point sources in each grid cell
      xy <- cbind(dt_pts_sub$x, dt_pts_sub$y)
      r_pts <- rasterize(xy, s, dt_pts_sub$emission, fun=sum)
      r_pts[is.na(r_pts)] <- 0 # so we can add with missing values - use na.rm = TRUE in sum function below instead?

    }

    # this is not working - changing to calc
    # which means creating a blank raster for no points
    #s[[i_sector]] <- s[[i_sector]] + r_pts
    s[[i_sector]] <- calc(stack(s[[i_sector]], r_pts), sum , na.rm=T)

  } # end sectors loop

  return(s)
}

scale_stack_to_nationalTotal <- function(year, sp, dt_ts, pollutant){
  # subset time series to just pollutant and year of interest
  dt_ts <- dt_ts[pollutant == get("pollutant", envir = parent.frame(3)) &
                      year == get("year",      envir = parent.frame(3))]
  setkey(dt_ts, snap_id) # set row order
  #sum(dt_ts$emission[1:11]) / 1e6 ### needs unit change to CO2 ###
  ind_check <- dt_ts[snap_id %in% c(3,4), sum(emission, na.rm=T)]

  ## sum the industrial sectors here, before national total scaling.
  ## need to think more on whether this is correct/robust. Should we change the SNAP look-ups instead? or add a function argument?
  dt_ts[snap_id==3, emission := dt_ts[snap_id %in% c(3,4), sum(emission, na.rm=T)]]
  dt_ts[snap_id==4, emission := 0]
  if(dt_ts[snap_id==3, emission] != ind_check) print("Extra industrial emissions created. Check")

  sp[[3]] <- calc(stack(sp[[3]],sp[[4]]), sum, na.rm=T)
  sp[[4]] <- setValues(sp[[4]],0)

  for (i_sector in 1:n_sectors){
    # subset time series data for this sector
    dt_ts_sub <- dt_ts[snap_id == i_sector]
    # assumes national and spatial data are in same units, Mg; otherwise unit conversion would be needed here

    national_total <- dt_ts_sub$emission
    spatial_total <- cellStats(sp[[i_sector]], "sum", na.rm = TRUE)
    # calc adjustment multiplier
    scaling_factor <- national_total / spatial_total
    scaling_factor <- set_units(scaling_factor, NULL)
    print(paste0("Sector ", i_sector,": Nat total - ",round(national_total,1)," ; Spatial total - ", round(spatial_total,1)," ; Scale fac - ",round(scaling_factor,2)))
    # apply it
    sp[[i_sector]] <- sp[[i_sector]] * scaling_factor
  } # sectors

  return(sp)
}

## ----unit_conversions, eval=TRUE-----------------------------------------
#' A unit conversion function
#'
#' This function converts from Tg km-2 y-1 to standard SI units in m-2 s-1.
#' @param pollutant Pollutant name: one of "ch4", "co2", "n2o", "c2h6" or "voc". Defaults to "ch4".
#' @param unit_in Units of input data. Defaults to "Mg / km^2 / yr".
#' @param unitType Either molar ("mol") or mass-based ("g").
#' @param unitSIprefix Any standard SI prefix for the output units, from "peta" to "pico".
#' @keywords internal units
#' @export
#' @seealso \code{\link{calcFlux}}, the higher-level function which calls this.
#' @examples
#' unit_conversion("ch4", "mol", "nano")
#' unit_conversion("co2", "mol", "micro")
#' unit_conversion("n2o", "mol", "nano")
#' unit_conversion("ch4", "g", "nano")

unit_conversion <- function(pollutant = c("ch4", "co2", "n2o", "c2h6", "voc", "co"),
                           unitType = c("mol", "g"),
                           unitSIprefix = c("peta", "tera", "giga", "mega", "kilo", "none", "milli", "micro", "nano", "pico"),
                           unit_in = c("Mg / km^2 / yr", "Gg / km^2 / yr", "Tg / km^2 / yr")){
  pollutant <- match.arg(pollutant)
  unitType  <- match.arg(unitType)
  unitSIprefix <- match.arg(unitSIprefix)
  unit_in   <- match.arg(unit_in)

  if (unit_in == "Mg / km^2 / yr") Xg_to_g <- 1e06
  if (unit_in == "Gg / km^2 / yr") Xg_to_g <- 1e09
  if (unit_in == "Tg / km^2 / yr") Xg_to_g <- 1e12
  km2_to_m2 <- 1e6
  secsPerYear <- 365*24*60*60

  # molecular weight of gases
  if (pollutant == "ch4") {
      molWt <- 16
  } else if (pollutant == "co2") {
      molWt <- 44
  } else if (pollutant == "co") {
      molWt <- 28
  } else if (pollutant == "n2o") {
      molWt <- 44
  } else if (pollutant == "c2h6") {
      molWt <- 30       # 12*2 + 6
  } else if (pollutant == "voc") {
      molWt <- 78.9516  # approximate mean from https://www.convertunits.com/molarmass/VOC
  }
  # ugly, but numerically correct
  if (unitType == "g") {
      molWt <- 1
  }

  # unit conversions - this could be outwith the function - stored as data
  #unitSIprefix <- "peta"
  SIprefix <- c("peta", "tera", "giga", "mega", "kilo", "none", "milli", "micro", "nano", "pico")
  SI_multiplier <- c(1e-15, 1e-12, 1e-9, 1e-6, 1e-3, 1, 1e3, 1e6, 1e9, 1e12) # better with a seq function?
  # match prefix with multiplier
  i <- match(unitSIprefix, SIprefix)
  mult <- SI_multiplier[i]

  # gases usually in Tg km-2 y-1, converted to x m-2 s-1
  value <- Xg_to_g *mult /molWt /km2_to_m2 /secsPerYear

  # return name as well
  name <- paste(SIprefix[i], unitType, pollutant, "m^-2_s^-1", sep="_")
  return(list(value = value,  # unit conversion multiplier
               name = name,  # unit conversion name
              molWt = molWt)) # molecular weight
}

getMarineEmissionTimeSeries_annual <- function(pollutant, startYear = 2010, endYear = 2019){
  # annual means
  fname <- paste0("./data-raw/marine/", pollutant, "/b_F", pollutant, "_annual_amm7.tif")
  b <- brick(fname)
  names(b) <- 2005:2015

  b[b < 0] <- NA
  v_emission <- cellStats(b, mean)
  v_emission <- set_units(v_emission, mol / m^2 / s)
  v_year <- 2005:2015

  # make a data frame matching the terrestrial time series
  # snap_id pollutant year   emission     units
  dt_year <- data.table(snap_id = 12, pollutant = pollutant, year = v_year,
    emission = v_emission, units = as.character(units(v_emission)))
  # calculate alpha
  dt_year <- getAlpha_byYear(dt_year, year_ref = 2015)
  ##* WIP what to return?
  # side-effect of resampling / writing maps here too?
  return(dt_year)
}

#r_marine <- getMarineEmissionMap(pollutant = "co2", year = 2015)
getMarineEmissionMap <- function(pollutant, year){
  v_year <- 2005:2015
  # we have no data prior to 2010, so use 2010 data for earlier years
  if (year < 2005){
    year <- 2005
    print("Marine data only exist from 2005 - using 2005 data for earlier years")
  }
  if (year > 2015){
    year <- 2015
    print("Marine data only exist up to 2015 - using 2015 data for later years")
  }

  # read in single file for fluxes
  fname <- paste0("./data-raw/marine/", pollutant, "/b_F", pollutant, "_annual.tif")
  b <- brick(fname)
  names(b) <- v_year
  ind <- match(year, v_year)
  r <- b[[ind]]

  # units are mol / m^2 / s
  attr(r, "units") <- "mol / m^2 / s"
  return(r)
}

#add_marine argument added, see ReadMe
get_maps_static <- function(year, s, r, dt_ts, dt_pts, pollutant, unitType = "mol", unitSIprefix = "none", add_points = TRUE, add_marine = TRUE){

  if (add_points) s <- add_points_to_diffuse(year = year, s, dt_pts, pollutant = pollutant)

  # reproject
  sp <- projectRaster(from = s, to = r, method = "ngb") # rescale to original totals? BELOW

  # as data isn't quite to national totals in maps PLUS reprojection changing values, rescale here
  s <- scale_stack_to_nationalTotal(year = year, sp=sp, dt_ts,  pollutant = pollutant)

  # convert units
  uc <- unit_conversion(pollutant, unitType = unitType, unitSIprefix = unitSIprefix, unit_in = "Mg / km^2 / yr")
  s <- s * uc$value

  # add marine for co2 & n2o - ONLY TO UK, previously was adding twice , SJT edit 21/1/22
  # but add a blank layer to IE to allow dim to remain the same for merge later on get_maps_UKIE().
  ## we might need a blank placeholder for CH4 if that also needs to be 12 layers

  if (pollutant == "co2" | pollutant == "n2o"){

    if(add_marine==T){

      if (unitType != "mol" & unitSIprefix != "none") stop(message = "Units needs to be mol /m2 /s to match marine data")
      r_marine <- getMarineEmissionMap(pollutant = pollutant, year = year)
      s <- addLayer(s, r_marine) # or s[[11]] <- s[[11]] + r with NA=0 ## SJT 09/12/21 KEY CHANGE. PL to double check. currently making marine a 12th sector 21/01/22
    }else{
      r_marine <- r
      r_marine[] <- 0
      s <- addLayer(s, r_marine) # if marine = F, add blank layer to ensure structure is ok.
    }

  # add an else clause if CH4 needs to be same 12 layer structure.
  }else{
   # if not CO2 or N2O, just add blank 12th layer (no marine)
    r_marine <- r
    r_marine[] <- 0
    s <- addLayer(s, r_marine)
   }

  return(s)
}

# writeOutput(v_year = startYear:endYear, l_s, pollutant)
writeOutput <- function(v_year = startYear:endYear, l_s, pollutant){
  v_fname <- paste0("./data/output/s_F_", pollutant, "_", v_year, ".tif")
  for (i in 1:length(l_s)){
    rf <- writeRaster(l_s[[i]], filename = v_fname[i], overwrite = TRUE)
  }
  return(v_fname)
}

# combine all pollutant-specific functions in one
# down-side of this is that all targets need to be rebuilt if anything changes
# add_marine argument added to get_maps_static(), as it was added twice
get_maps_UKIE <- function(year_ref, startYear, endYear, pollutant,
    r, dt_pts, dt_ts_uk, dt_ts_ie, unitSIprefix = "none", 
    lookup_GNFR = "./data/lookups/GNFR_to_SNAP.csv"){
  # get the diffuse source maps
  s_diff_uk <- readUKDiffuseData(pollutant = pollutant, year = year_ref) # possibly problem here - no 'year=' so always defaulting to 2018 as per L854 AND L484 in readUKDiffuseData(). Have set in L484.
  s_diff_ie <- readIEDiffuseData(pollutant = pollutant, year = year_ref, lookup_GNFR)
  # combine diffuse and point sources, rescale and resample
  l_s_uk <- lapply(startYear:endYear, FUN = get_maps_static, s_diff_uk, r=r, dt_ts_uk, dt_pts, pollutant = pollutant, unitSIprefix = unitSIprefix)
  l_s_ie <- lapply(startYear:endYear, FUN = get_maps_static, s_diff_ie, r=r, dt_ts_ie, dt_pts, pollutant = pollutant, unitSIprefix = unitSIprefix, add_points = FALSE, add_marine = FALSE)


  # merge the two together
  #l_s <- lapply(1:length(l_s_uk), function(i) raster::merge(l_s_uk[[i]], l_s_ie[[i]]))
  ## this is the problem, merge isn't working
  # unsure why merge is not working, it's just returning Irish values (it should use UK where present? maybe a bug?)
  # instead, put uk and ireland in stack, then stackApply using indices, i.e. 1 and 12, 2 and 13 etc. Same structure so ok.

  l_s_ukie <- lapply(1:length(l_s_uk), function(i) stack(l_s_uk[[i]], l_s_ie[[i]]))

  l_s <- lapply(1:length(l_s_ukie), function(i) stackApply(l_s_ukie[[i]], indices=rep(1:12, 2), fun=sum, na.rm=T) )

  # at the moment and if clause for ch4 as the 12th layer not added
  #if(pollutant=="ch4"){
  #  l_s <- lapply(1:length(l_s_ukie), function(i) stackApply(l_s_ukie[[i]], indices=rep(1:11, 2), fun=sum, na.rm=T) )
  #}else{
  #  l_s <- lapply(1:length(l_s_ukie), function(i) stackApply(l_s_ukie[[i]], indices=rep(1:12, 2), fun=sum, na.rm=T) )
  #}

  # write final output to annual files
  v_fname <- writeOutput(v_year = startYear:endYear, l_s, pollutant)
  return(v_fname)
}

##* WIP - Ireland has bigger CO2 emission than UK - units diff, CO2 not C?
##* mostly sorted - inclusion of duplicate totals in UNFCCC file - using Iriah EPA data instead
##* years <- 2010:2019
##* l_res <- lapply(X=years, function(x) fun = sapply(c("ch4","n2o","co2"), test_agreement, v_snap_id = 1:n_sectors, year = x))
##* names(l_res) <- as.character(years)

test_agreement <- function(v_snap_id = 1:n_sectors, pollutant, year){
  # call unit conversion to return pollutant-specific molWt
  molWt <- unit_conversion(pollutant, "mol")$molWt
  dt_ts_uk <- tar_read(dt_ts_uk)
  dt_ts_ie <- tar_read(dt_ts_ie)

  F_total_uk <- sum(dt_ts_uk[pollutant == get("pollutant", envir = parent.frame(3)) &
                                  year == get("year",      envir = parent.frame(3)) &
                                  snap_id %in% v_snap_id, emission])
  F_total_ie <- sum(dt_ts_ie[pollutant == get("pollutant", envir = parent.frame(3)) &
                                  year == get("year",      envir = parent.frame(3)) &
                                  snap_id %in% v_snap_id, emission])
  # UK data already so converted from C to CO2 ##
  F_total_uk <- set_units(F_total_uk, Tg / yr)
  F_total_ie <- set_units(F_total_ie, Tg / yr)
  F_total_ts <- F_total_uk + F_total_ie
  # total from time series
  F_total_ts

  # read output file
  fname <- paste0("./data/output/s_F_", pollutant, "_", year, ".tif")
  s <- stack(fname)
  # name layers to 1 to 11, ignoring marine
  names(s)[1:n_sectors] <- v_sector
  # subset to selected sectors
  s <- s[[v_snap_id]]
  # get cell area in m2
  r_area <- terra::area(s, sum=FALSE) * 1e6
  #cellStats(r_area, sum) / 1e6
  # s is flux in mol m-2 s-1 x area in m2 -> mol s-1
  F_total_sp <- sum(cellStats(s * r_area, sum))
  # convert mol to g
  F_total_sp <- set_units(F_total_sp * molWt, g / s)
  F_total_sp <- set_units(F_total_sp, Tg / yr)
  return(F_total_sp / F_total_ts)
}