R/calpost_get_concentrations_from_time_series_file.R

#' Create a data frame of discrete receptor concentrations
#' @description Create a data frame of discrete receptor concentrations using a CALPOST time series outfile file.
#' @param time_series_file a path to a binary time series data file that was generated by CALPOST with time series options set.
#' @param location_name the name of the location in which the receptors reside.
#' @param source_id the ID value for the source emissions.
#' @param pollutant_id the ID value for the emitted pollutant.
#' @param create_hourly_CSV an option to create hourly CSV files describing pollutant concentrations at every receptor.
#' @param create_hourly_rda an option to create hourly R data (.rda) files describing pollutant concentrations at every receptor.
#' @param return_large_df an option to invisibly return a large data frame object.
#' @param resume_from_set_hour an option to resume processing of concentrations from a set hour of the year.
#' @param autoresume_processing
#' @param autoresume_year
#' @export calpost_get_concentrations_from_time_series_file

calpost_get_concentrations_from_time_series_file <- function(time_series_file = NULL,
                                                             location_name,
                                                             source_id,
                                                             pollutant_id,
                                                             create_hourly_CSV = TRUE,
                                                             create_hourly_rda = TRUE,
                                                             return_large_df = FALSE,
                                                             resume_from_set_hour = NULL,
                                                             autoresume_processing = TRUE,
                                                             autoresume_year = NULL){
  
  # Add require statements
  require(stringr)
  require(lubridate)
  
  # Read in all lines from the CALPOST time series output file
  time_series_output <- readLines(time_series_file)
  
  # Get numeric vector of all receptor numbers
  time_series_receptor_numbers <-
    as.numeric(unlist(str_split(gsub("      ix:", "", time_series_output[8]), "          ")))[
      !is.na(as.numeric(unlist(str_split(gsub("      ix:", "", time_series_output[8]), "          "))))]
  
  # Get numeric vector of all x coordinate values (UTM) in units of km
  time_series_x_km <- as.numeric(unlist(str_split(gsub("^.*x......", "", time_series_output[10]), "  ")))
  
  # Get numeric vector of all y coordinate values (UTM) in units of km
  time_series_y_km <- as.numeric(unlist(str_split(gsub("^.*y......", "", time_series_output[11]), "  ")))
  
  # Set a beginning index for the i value of the outer loop data frame
  begin <- 15
  
  # Set an index for autoresuming processing if 'resume_from_hour' is NULL and
  # 'autoresume_processing' is TRUE
  if (is.null(resume_from_set_hour) & autoresume_processing == TRUE &
        !is.null(autoresume_year)){
    pattern <- paste0(location_name, "--",
                      source_id, "--",
                      pollutant_id, "--",
                      autoresume_year,
                      "-[0-9][0-9]-[0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9].csv")
    
    file_list <- list.files(path = ".",
                            pattern = pattern)
    
    # Get the highest index number from matched files
    highest_index <- max(as.numeric(gsub(".*([0-9][0-9][0-9][0-9]).csv", "\\1", file_list)))
    begin <- highest_index + 14
  }
  
  
  # Set a resume hour if the argument 'resume_from_hour' is not NULL and
  # 'autoresume_processing is FALSE
  if (!is.null(resume_from_set_hour) & autoresume_processing == FALSE){
    begin <- resume_from_set_hour + 14
  }
  
  # Generate a long data frame containing concentration data for each cell at every timestep
  for (i in begin:length(time_series_output)){
    
    # Initialize a data frame object for containing concentration, date, location, source,
    # pollutant information
    if (i == begin){
      concentration_df <- as.data.frame(mat.or.vec(nr = 0, nc = 12))
      colnames(concentration_df) <- c("posix_date", "year", "month", "day", "hour",
                                      "recep_number", "recep_x_km", "recep_y_km",
                                      "concentration", "location_name", "source_id",
                                      "pollutant_id")
    }
    
    # Extract a string from the 'time_series_output' object, representing data for an hour
    time_series_row <-
      as.vector(as.numeric(unlist(
        str_split_fixed(time_series_output[i],
                        pattern = "[ ]+",
                        n = length(time_series_receptor_numbers) + 4))))
    
    # Get the POSIXct date from the string
    POSIXdate <- ISOdatetime(year = time_series_row[2],
                             month = 1, day = 1,
                             hour = time_series_row[4] / 100,
                             min = 0, sec = 0, tz = "GMT") + ((time_series_row[3] - 1) * 24 * 3600)
    
    # Get vector object containing concentrations at each receptor for the hour
    time_series_conc <- time_series_row[5:length(time_series_row)]
    
    # Create a for structure an inner loop
    for (j in 1:length(time_series_conc)){
      
      # Initialize a smaller data frame containing data for all information found in the
      # larger data frame, but for a single hour
      if (j == 1){
        concentration_small_df <- as.data.frame(mat.or.vec(nr = length(time_series_conc), nc = 12))
        colnames(concentration_small_df) <- c("posix_date", "year", "month", "day", "hour",
                                              "recep_number", "recep_x_km", "recep_y_km",
                                              "concentration", "location_name", "source_id",
                                              "pollutant_id")
      }
      
      # Populate the rows of the smaller data frame (data for a single hour)
      concentration_small_df[j,1] <- POSIXdate
      concentration_small_df[j,2] <- year(POSIXdate)
      concentration_small_df[j,3] <- month(POSIXdate)
      concentration_small_df[j,4] <- day(POSIXdate)
      concentration_small_df[j,5] <- hour(POSIXdate)
      concentration_small_df[j,6] <- j
      concentration_small_df[j,7] <- time_series_x_km[j]
      concentration_small_df[j,8] <- time_series_y_km[j]
      concentration_small_df[j,9] <- time_series_conc[j]
      concentration_small_df[j,10] <- location_name
      concentration_small_df[j,11] <- source_id
      concentration_small_df[j,12] <- pollutant_id
    }
    
    # Successively bind the smaller data frame to the larger data frame
    if (return_large_df == TRUE){
      concentration_df <- rbind(concentration_df, concentration_small_df)
    }
    
    # Create a CSV file for the hour if it is requested
    if (create_hourly_CSV == TRUE){
      write.table(concentration_small_df,
                  file = paste0(location_name, "--",
                                source_id, "--",
                                pollutant_id, "--",
                                year(POSIXdate), "-",
                                formatC(month(POSIXdate), width = 2, flag = "0"), "-",
                                formatC(day(POSIXdate), width = 2, flag = "0"), "-",
                                formatC(hour(POSIXdate), width = 2, flag = "0"), "-",
                                formatC((i-14), width = 4, flag = "0"), ".csv"),
                  sep = ",",
                  row.names = FALSE)
    }
    
    # Create an .rda file for the hour if it is requested
    if (create_hourly_rda == TRUE){
      save(concentration_small_df,
           file = paste0(location_name, "--",
                         source_id, "--",
                         pollutant_id, "--",
                         year(POSIXdate), "-",
                         formatC(month(POSIXdate), width = 2, flag = "0"), "-",
                         formatC(day(POSIXdate), width = 2, flag = "0"), "-",
                         formatC(hour(POSIXdate), width = 2, flag = "0"), "-",
                         formatC((i-14), width = 4, flag = "0"), ".csv"))
    }
  }
  
  # Return the 'concentration_df' object invisibly
  if (return_large_df == TRUE){
    invisible(concentration_df)
  }
}
rich-iannone/PuffR documentation built on May 27, 2019, 7:46 a.m.