Objective

The ocean heat flux data is essential for Hector calibration. This markdown serves two purposes.

  1. Prepare the ESM data to be used in the calibration.
  2. Quality check the data.

The data being processed by this script is generated by the 1A.tas_nc.R that is set up to run on pic. Because the pic results are rather large the inputs will not be commited the the GitHub Repo.

Set Up

# Load the required libs 
library(dplyr)
library(tidyr)
library(cdoR)
library(ggplot2)

# And define the required directories 
DATARAW_DIR <- here::here('data-raw')
INPUT_DIR   <- file.path(DATARAW_DIR, 'comparison-data', 'cmip6', '1A-out')
OUT_DIR     <- file.path(DATARAW_DIR, 'comparison-data', 'cmip6', '1B-out')
dir.create(OUT_DIR, showWarnings = FALSE)
# Import the raw temperature data.
raw_tas_data <- read.csv(file.path(INPUT_DIR, '1A.tas-output.csv'), stringsAsFactors = FALSE)

Quailty Control

A1. Start out be checking for any unprocessed files.

assertthat::assert_that(!'problem' %in% names(raw_tas_data), msg = 'Some problem occured when processing the netcdf files.')

# Right now because there are no temp netcdf files that could not be processed there this section does not have any 
# code to take a look at what happened. If there problems do arrise the asserthtat will indicate that more code will 
# need to be added to this section. 
data1 <- raw_tas_data

Check for unexpected NA values, this has to be done after checking for the problem files

# Unexpected NA values are the worst and can happen when the weighted mean does not remove NA values. 
assertthat::assert_that(sum(is.na(data1$value)) == 0, msg = 'Unexpected NAs in the value column.')

A2. Check for Mulitple Grids

data1 %>% 
  group_by(year, units, variable, domain, model, experiment) %>% 
  summarise(n_grid = n_distinct(grid)) %>%
  filter(n_grid != 1) -> 
  dupplicate_grids

assertthat::assert_that(nrow(dupplicate_grids) == 0, msg = 'Data reported for mulitple grids.')

# Right now because there are no temp netcdf files that contain the same information but contain results from 
# different grids, so there is no code in this section to hanndle this. If this does occur, which it has for other 
# CMIP6 variables the assertthat error will indicate that code needs to be added here. 
data2 <- dupplicate_grids

A3. Check for Year with Mulitple Observations

Sometimes when the results are written out as seperate netcdf files where the new netcdf file starts midyear means that for a single model / experiment / ensemble there can be multiple results returned for a single year.

# Check to see how many observations there are per model / ensemble / domain / year. 
raw_tas_data %>% 
  group_by(units, variable, model, ensemble, experiment, domain, year) %>% 
  summarise(n_value = n()) %>% 
  ungroup %>%  
  filter(n_value != 1) -> 
  mulitple_years

# Most of the time there are going to be 2 values for a year, because of the situtation described above. If 
# for whatever reason there are more 2 values reported for a single year more code will need to be added. The 
# assert that call checks for that here. 
assertthat::assert_that(all(mulitple_years$n_value == 2), msg = 'More than 2 observations per year.')

# Calculate the annaul average. 
raw_tas_data %>%  
  inner_join(mulitple_years, by = c("year", "units", "variable", "domain", "model", "experiment", "ensemble")) %>%  
  group_by(year, units, variable, domain, model, experiment, ensemble, grid) %>% 
  summarise(value = mean(value)) %>% 
  ungroup() -> 
  new_data

# Replace the mulitple observations with the mean of the multiple observations. 
raw_tas_data %>% 
  anti_join(mulitple_years, by = c("year", "units", "variable", "domain", "model", "experiment", "ensemble")) %>% 
  bind_rows(new_data) -> 
  data3

# A final check to make sure that there are no duplicate years. 
data3 %>% 
  group_by(units, variable, model, ensemble, experiment, domain, year) %>% 
  summarise(n_value = n()) %>% 
  ungroup %>%  
  filter(n_value != 1) -> 
  mulitple_years

assertthat::assert_that(nrow(mulitple_years) == 0 , msg = 'More than 2 observations per year in data3.')

A4. Check for Continuous Observations

Are there any missing years? If so that's unfortunate.

split(data3, 
      interaction(data3$units, data3$variable, data3$model, data3$grid, 
                  data3$ensemble, data3$experiment, data3$domain, drop = TRUE)) %>%
  lapply(function(input){

    input %>% 
      arrange(year) %>% 
      pull(year) %>% 
      diff() %>% 
      unique() -> 
      step_size 

    # If the step size is 1 then ther is 
    if(step_size == 1){
      TRUE
    } else {
        FALSE
      }


  }) ->
  missing_check

assertthat::assert_that(unique(missing_check) == TRUE, msg = 'discontinuous data: data3 is missing some values for some years')

# This is a common issue but is not occuring here! Hooooray! If the check in this section is triggered with an error message then 
# code will need to be added here to handle this issue. 
data4 <- data3

A5. Convert Idealized Runs

The idealized epxeriment results (the DECK experiments minus the historical experiment) are reported in aboslute years when they should be reported in realitve years. This section will convert time to years since the run start date which is how we are going to want to plot the results.

idealized_exps <- c("1pctCO2", "abrupt-4xCO2", "piControl", "abrupt-0p5xCO2")

data4 %>%  
  filter(experiment %in% idealized_exps) -> 
  idealized_results

# Determine the start year for each model / ensemble / idealized start year 
idealized_results %>% 
  group_by(variable, domain, model, experiment, ensemble) %>%  
  summarise(start_year = min(year)) %>% 
  ungroup -> 
  start_years

# Remove the start year from the year column. 
idealized_results %>% 
  left_join(start_years, by = c("variable", "domain", "model", "experiment", "ensemble")) %>%
  mutate(year = year - start_year) %>% 
  select(-start_year) ->
  idealized_results_reatlive_yrs

So it turns out that some of the idealzed runs are rather short. Like only contain 5 years of data short. This does not provide us with enough information to help during the calibration process so they will be discarded.

required_length <- 100

idealized_results_reatlive_yrs %>% 
  group_by(model, experiment, ensemble) %>% 
  summarise(end_year = max(year)) %>%
  ungroup %>%  
  filter(end_year < required_length) %>% 
  select(-end_year) -> 
  to_remove

idealized_results_reatlive_yrs %>%
  anti_join(to_remove, by = c("model", "experiment", "ensemble")) -> 
  good_idealized_results

Idealized runs sanity plots

ggplot(good_idealized_results) + 
  geom_line(aes(year, value, color = model, group = interaction(model, experiment, ensemble))) + 
  facet_wrap('experiment', scales = 'free') + 
  theme(legend.position = 'none') + 
  labs(title = 'Idealized Experiments', 
       y = 'absolute temperature (K)',
       x = 'year')
data4 %>%  
  filter(!experiment %in% idealized_exps) %>% 
  bind_rows(good_idealized_results) ->
  data5

Final Quality Control Data Results

tas_data <- data5 %>% select(-datetime, -month, -grid)

Calculate Tgav

Start by calculating the baseline, this the value should be removed from the temperature to calcualte the temperature anomaly.

tas_data %>%
  filter(year %in% 1850:1860) %>%
  group_by(variable, domain, model, experiment, ensemble) %>% 
  summarise(baseline = mean(value)) %>% 
  ungroup() ->
  baseline_temp

emission_driven_historical      <- filter(baseline_temp, experiment == 'esm-hist') %>% select(-experiment)
concentration_driven_historical <- filter(baseline_temp, experiment == 'historical') %>% select(-experiment)

Calculate temp change for the concentration driven runs.

tas_data %>%  
  filter(!grepl(pattern = 'esm', x = experiment)) %>% 
  left_join(concentration_driven_historical, 
            by = c("variable", "domain", "model", "ensemble")) %>% 
  mutate(value = value - baseline, 
         variable = 'Tgav', 
         units = 'deg C') -> 
  conc_tgav

Calculate temp change for the emission driven runs.

tas_data %>%  
  filter(grepl(pattern = 'esm', x = experiment)) %>% 
  left_join(emission_driven_historical, 
            by = c("variable", "domain", "model", "ensemble")) %>% 
  mutate(value = value - baseline, 
         variable = 'Tgav', 
         units = 'deg C') -> 
  emiss_tgav
tgav_data <- na.omit(bind_rows(conc_tgav, emiss_tgav))

Sanity Plots

Generate all of the tas sanity plots.

tas_data %>% 
  mutate(class = 'Phase CMIP6', 
         class = if_else(experiment %in% idealized_exps, 'DECK Idealized', class)) %>%  
  split(interaction(.$model, .$variable, drop = TRUE)) %>%  
  lapply(function(input){

    model <- unique(input$model)

    input %>% 
      filter(year <= 2150) %>% 
      ggplot() + 
      geom_line(aes(year, value, color = experiment, group = interaction(variable, ensemble, model, experiment))) + 
      facet_wrap('class', scales = 'free') + 
      labs(x = 'Year', 
           y = 'Temp (K)', 
           title = model)
  }) ->
  tas_plots

invisible(lapply(tas_plots, print))

Generate all of the tgav sanity plots.

The temperature anomoly is the ideal way to look at the difference between the models.

tgav_data %>% 
  mutate(class = 'Phase CMIP6', 
         class = if_else(experiment %in% idealized_exps, 'DECK Idealized', class)) %>%  
  split(interaction(.$model, .$variable, drop = TRUE)) %>%  
  lapply(function(input){

    model <- unique(input$model)

    input %>% 
      filter(year <= 2150) %>% 
      ggplot() + 
      geom_line(aes(year, value, color = experiment, group = interaction(variable, ensemble, model, experiment))) + 
      facet_wrap('class', scales = 'free') + 
      labs(x = 'Year', 
           y = 'Tgav (degree C)', 
           title = model)
  }) ->
  tgav_plots

invisible(lapply(tgav_plots, print))

Save Output

Save a copy of the tas results.

write.csv(tas_data, file = file.path(OUT_DIR, '1B.tas_quality.csv'))


ashiklom/hector-rcmip documentation built on Sept. 23, 2020, 11:30 a.m.