inst/extdata/discrete/Discrete_Occupations_Stations.R

## code to prepare `Discrete_Occupations_Stations` dataset
cat('Sourcing Discrete_Occupations_Stations.R', sep = '\n')

library(dplyr)
library(lubridate)
library(tidyr)
library(readr)
library(usethis)
library(RCurl)

# source custom functions
source("tmp/BC/R/Standard_Depth_Lookup.R")

# vectorize function Standard_Depth_Lookup()
Standard_Depth_Lookup_vec <- Vectorize(Standard_Depth_Lookup)

# load data
# HL2
cat('    reading in station2 biochemical data', sep = '\n')
HL2_env <- new.env()
con <- url("ftp://ftp.dfo-mpo.gc.ca/AZMP_Maritimes/azmpdata/data/biochemical/Halifax-2/ChlNut_Data_Filtered.RData")
load(con, envir=HL2_env)
close(con)
# P5
cat('    reading in prince5 biochemical data', sep = '\n')
P5_env <- new.env()
con <- url("ftp://ftp.dfo-mpo.gc.ca/AZMP_Maritimes/azmpdata/data/biochemical/Prince-5/ChlNut_Data_Filtered.RData")
load(con, envir=P5_env)
close(con)

# load physical data

# 1. read in mission look up tables and combine
cat('    reading in lookup table data', sep = '\n')

url_name <- 'ftp://ftp.dfo-mpo.gc.ca/AZMP_Maritimes/AZMP_Reporting/lookup/' # have to move this to new directory
result <- getURL(url_name,
                 verbose=TRUE,ftp.use.epsv=TRUE, dirlistonly = TRUE)

filenames <- unlist(strsplit(result, "\r\n"))
lookup <- list()
lookup[[1]] <- read.csv(paste0(url_name, filenames[1]))
lookup[[2]] <- read.csv(paste0(url_name, filenames[2]))
# lookupfiles <- list.files(lookupPath, pattern = '^mission.*', full.names = TRUE)
# lookup <- lapply(lookupfiles, read.csv)
missions <- do.call('rbind', lookup)

# 2. read in the data and combine
cat('    reading in station2 and prince5 physical data', sep = '\n')
url_name <- 'ftp://ftp.dfo-mpo.gc.ca/AZMP_Maritimes/azmpdata/data/physical/fixedStations/'
result <- getURL(url_name,
                 verbose=TRUE,ftp.use.epsv=TRUE, dirlistonly = TRUE)

filenames <- unlist(strsplit(result, "\r\n"))

# only pick out relevant files
filenames <- grep(filenames, pattern = '*IntegratedVariables*', value = TRUE, invert = TRUE)

d <- list()
for(i in 1:length(filenames)){
  con <- url(paste0(url_name, filenames[[i]]))
  d[[i]] <- read.csv(con)
}

############################
# this requires to be fixed
############################
# # datafile <- paste(dataPath, c('prince5.csv', 'station2.csv'), sep = '/')
# # d <- lapply(datafile, read.csv)
#  d <- do.call('rbind', d)
#
# # 3. match up cruiseNumber and mission_number
# # note : if anything goes sideways matching up idx, the structure of idx will change
# #        so it could become a list instead of a vector
# idx <- apply(d, 1, function(k) {ok <- which(missions[['mission_name']] == k[['cruiseNumber']]);
#                                 if(length(ok) > 1) {
#                                   ok[1]
#                                 } else if(length(ok) == 0){
#                                   NA
#                                 } else {
#                                   ok
#                                 }})
#
# d <- cbind(d, descriptor = missions[['mission_descriptor']][idx])
#
# fixedStationsPO <- d
fixedStationsPO <- rbind(d[[3]], d[[2]])

# rename variables
fixedStationsPO <- fixedStationsPO %>%
  dplyr::rename(sea_temperature = temperature) %>%
  dplyr::rename(depth = pressure) %>%
  dplyr::filter(year>=1999) %>%
  tidyr::unite(date, year, month, day, sep="-", remove=T) %>%
  dplyr::mutate(date = lubridate::ymd(date))

# assemble metadata
df_metadata <- dplyr::bind_rows(HL2_env$df_metadata,
                                P5_env$df_metadata) %>%
  dplyr::mutate(nominal_depth = Standard_Depth_Lookup_vec(station, depth))


# assemble data
Discrete_Occupations_Stations <- dplyr::bind_rows(HL2_env$df_data_averaged_l,
                                                  P5_env$df_data_averaged_l)

# clean up
rm(list=c("HL2_env", "P5_env"))

# target variables to include
target_var <- c("Chlorophyll A" = "chlorophyll",
                "Nitrate" = "nitrate",
                "Phosphate" = "phosphate",
                "Silicate" = "silicate")

# print order
print_order_station <- c("HL2" = 1,
                         "P5" = 2)

# reformat data
Discrete_Occupations_Stations <- Discrete_Occupations_Stations %>%
  dplyr::filter(parameter_name %in% names(target_var)) %>%
  dplyr::mutate(parameter_name = unname(target_var[parameter_name])) %>%
  tidyr::pivot_wider(names_from=parameter_name, values_from=data_value) %>%
  dplyr::left_join(df_metadata %>%
                     dplyr::select(custom_sample_id, custom_event_id, date, latitude, longitude, depth, nominal_depth, station),
                   by=c("custom_sample_id")) %>%
  dplyr::mutate(order_station = unname(print_order_station[station])) %>%
  dplyr::group_by(custom_event_id) %>%
  dplyr::arrange(depth, .by_group=T) %>%
  dplyr::arrange(order_station, date) %>%
  dplyr::ungroup() %>%
  dplyr::select(station, latitude, longitude, date, depth, nominal_depth, unname(target_var))

# add physical data
Discrete_Occupations_Stations <- Discrete_Occupations_Stations %>%
  dplyr::bind_rows(., fixedStationsPO)

# save as dataframe not tibble
Discrete_Occupations_Stations <- as.data.frame(Discrete_Occupations_Stations)

# save data to csv
readr::write_csv(Discrete_Occupations_Stations, "inst/extdata/csv/Discrete_Occupations_Stations.csv")

# save data to rda
usethis::use_data(Discrete_Occupations_Stations, overwrite = TRUE)
casaultb/azmpdata documentation built on July 4, 2025, 11:04 a.m.