inst/original_data/climate/process_data_psl.R

library(dplyr)
library(tidyr)
library(readr)
library(lubridate)
library(stringr)
library(purrr)

# List of URLs
urls <- c(
  "https://psl.noaa.gov/data/correlation/pna.data",
  "https://psl.noaa.gov/data/correlation/epo.data",
  "https://psl.noaa.gov/data/correlation/wp.data",
  "https://psl.noaa.gov/data/correlation/censo.data",
  "https://psl.noaa.gov/data/correlation/whwp.data",
  "https://psl.noaa.gov/data/correlation/oni.data",
  "https://psl.noaa.gov/enso/mei/data/meiv2.data",
  "https://psl.noaa.gov/pdo/data/pdo.timeseries.ersstv5.data",
  "https://psl.noaa.gov/data/timeseries/IPOTPI/ipotpi.hadisst2.data",
  "https://psl.noaa.gov/data/correlation/np.data",
  "https://psl.noaa.gov/data/correlation/pacwarm.data",
  "https://psl.noaa.gov/data/correlation/gmsst.data"
)

url <- urls[5]

# Helper: Get the climate index name from the URL
get_index_name <- function(url) {
  if (str_detect(url, "ipotpi.hadisst2.data")) 
    return("ipotpi")
  if (str_detect(url, "pdo.timeseries.ersstv5")) 
    return("pdo")
  return(str_remove(basename(url), "\\.data$"))
}

# Main loop to read and process
climate_indices <- lapply(urls, function(url) {
  index_name <- get_index_name(url)
  
  # Print progress
  message("Processing: ", index_name, " (", url, ")")
  
  # Read raw lines
  lines <- read_lines(url)
  
  # Always skip the first line (bad header)
  lines <- lines[-1]
  
  # Trim leading/trailing spaces
  lines <- str_trim(lines, side = "both")
  
  # Keep only lines that have exactly 13 fields when splitting on whitespace
  good <- sapply(str_split(lines, "\\s+"), length) == 13 & str_detect(lines, "^\\s*\\d{4}\\s")
  # Keep only rows that start with a 4-digit year
  lines_clean <- lines[good]
  
  # Read into tibble
  df <- read_table2(
    paste(lines_clean, collapse = "\n"),
    col_names = FALSE,
    na = c("-9.90", "-99.90", "-99.99", "-999.00", "-999", "-9999", "-9999.000", "Missing", "-999.000", "9999.00", "-9.99"),
    col_types = cols(.default = col_character())
  )
  
  # Now convert columns to numeric after NAs are correctly parsed
  df <- df %>% mutate(across(everything(), as.numeric))
  
  # Rename columns
  colnames(df) <- c("year", month.abb)
  
  # Make sure all months are numeric
  df <- df %>%
    mutate(across(-year, as.numeric)) %>%
    dplyr::filter(year >= 1800, year <= 2100)
  
  # Pivot longer
  df_long <- df %>%
    pivot_longer(-year, names_to = "month", values_to = index_name) %>%
    mutate(
      month = match(month, month.abb),
      date = as.Date(sprintf("%d-%02d-01", year, month))
    ) %>%
    dplyr::select(date, year, month, all_of(index_name))
  
  return(df_long)
})

# Name the list elements
names(climate_indices) <- sapply(urls, get_index_name)

# Merge all into one wide data frame
climate_all_wide <- climate_indices %>%
  reduce(full_join, by = c("date", "year", "month")) %>%
  arrange(date)

# Preview
np_climate_month <- climate_all_wide

## Now create the seasonal yearly

library(dplyr)
library(tidyr)

# Use your existing data frame
# climate_all_wide has columns: date, year, month, pdo, pna, epo, wp, etc.

# Define seasons
get_season <- function(month) {
  case_when(
    month %in% c(1, 2, 3) ~ "winter",
    month %in% c(4, 5, 6) ~ "spring",
    month %in% c(5, 6, 7, 8) ~ "summer",
    month %in% c(9, 10, 11) ~ "fall",
    TRUE ~ NA_character_
  )
}

# Add 'season' column
climate_with_season <- climate_all_wide %>%
  mutate(season = get_season(month))

# Identify all climate indices
climate_vars <- setdiff(names(climate_with_season), c("date", "year", "month", "season"))

# Seasonal and annual means per index
climate_seasonal_list <- lapply(climate_vars, function(var) {
  
  df <- climate_with_season %>%
    dplyr::select(year, season, month, value = all_of(var)) %>%
    dplyr::filter(!is.na(value))
  
  # Seasonal means
  seasonal <- df %>%
    dplyr::filter(!is.na(season)) %>%
    group_by(year, season) %>%
    summarise(value = mean(value, na.rm = TRUE), .groups = "drop") %>%
    pivot_wider(
      names_from = season,
      values_from = value,
      names_prefix = paste0(var, ".")
    )
  
  # Annual means
  annual <- df %>%
    group_by(year) %>%
    summarise(value = mean(value, na.rm = TRUE), .groups = "drop") %>%
    rename(!!paste0(var, ".annual") := value)
  
  # Combine
  combined <- left_join(seasonal, annual, by = "year") %>%
    arrange(year)
  
  return(combined)
})

# Combine all indices into a single data frame
climate_seasonal <- climate_seasonal_list %>%
  reduce(full_join, by = "year") %>%
  arrange(year)

# Preview
print(head(climate_seasonal))

np_climate_seasonal <- climate_seasonal

save(np_climate_month, np_climate_seasonal, file=here::here("data", "np_climate.RData"))

## Save a copy
# Download each file
# library(fs)
# for (url in urls) {
#   # Get the filename (e.g., pna.data)
#   filename <- basename(url)
#   
#   # Handle special naming case
#   if (filename == "ipotpi.hadisst2.data") {
#     filename <- "ipotpi.data"
#   }
#   
#   # Local path
#   destfile <- path("inst", "original_data", "climate", "raw_data", filename)
#   
#   message("Downloading: ", url)
#   
#   # Download
#   download.file(url, destfile = destfile, mode = "wb")
# }

# Plot all climate indices as ts objects
library(tibble)

# Get the climate variable names (excluding date, year, month)
climate_vars <- setdiff(names(np_climate_month), c("date", "year", "month"))

# Loop through each variable and plot
for (var in climate_vars) {
  ts_data <- ts(np_climate_month[[var]], freq = 12, start = c(np_climate_month$year[1], 1))
  plot(ts_data, main = var)
}
nwfsc-timeseries/atsa-package documentation built on June 9, 2025, 3:49 p.m.