R/selpoint.multi.R
In cmsafops: Tools for CM SAF NetCDF Data

Documented in selpoint.multi

#'Extract data at multiple points.
#'
#'This function extracts all data at given points. The points are given by a
#'pair of vectors with longitude and latitude coordinates. The function will
#'find the closest grid points to the given coordinates and extracts the data
#'for these points. For each point a separate output file is written. The
#'output-files can be optional in NetCDF or csv. Input can be a single NetCDF
#'file (given by the infile attribute) or a bunch of NetCDF files (given by the
#'path and pattern attributes).
#'
#'@param var Name of NetCDF variable (character).
#'@param infile Filename of input NetCDF file. This may include the directory
#'  (character). Infile is not needed if path and pattern are given.
#'@param path Directory of input files (character). Will not be used if infile
#'  is given.
#'@param pattern Pattern that all desired files in the 'path' directory have in
#'  common (character).
#'@param outpath Directory where output files will be stored (character).
#'@param lon1 Longitude vector of desired points (numeric vector). Must
#'  have the same length as \code{lat1}.
#'@param lat1 Latitude vector of desired points (numeric vector). Must have
#'  the same length as \code{lon1}.
#'@param station_names Optional vector of names, which will be used for the
#'  output files (character vector). Must have the same length as
#'  \code{lon1} and \code{lat1}.
#'@param format Intended output format. Options are \code{nc} or \code{csv}. Default is
#'  \code{nc} (character).
#'@param nc34 NetCDF version of output file. If \code{nc34 = 3} the output file will be
#'  in NetCDFv3 format (numeric). Default output is NetCDFv4.
#'@param verbose logical; if TRUE, progress messages are shown
#'@param nc Alternatively to \code{infile} you can specify the input as an
#'  object of class `ncdf4` (as returned from \code{ncdf4::nc_open}).
#'
#'@return For each pair of longitude and latitude coordinates one separate
#'  NetCDF or csv file including the selected data is written. The csv files are
#'  tested for use in Excel and include four columns (Time ; Data ; Longitude ;
#'  Latitude), which are separated by ';'. If station_names are defined, the
#'  output files will be named according to this vector. Otherwise, the output
#'  files will be named as selpoint_longitude_latitude.format. Already existing
#'  files will be overwritten in case that station_names are given or renamed
#'  (e.g., \code{selpoint1_longitude_latitude.nc}) in case that no station_names are
#'  given.
#'@export
#'
#'@family selection and removal functions
#'
#' @examples
#'## Create an example NetCDF file with a similar structure as used by CM
#'## SAF. The file is created with the ncdf4 package.  Alternatively
#'## example data can be freely downloaded here: <https://wui.cmsaf.eu/>
#'
#'library(ncdf4)
#'
#'## create some (non-realistic) example data
#'
#'lon <- seq(5, 15, 0.5)
#'lat <- seq(45, 55, 0.5)
#'time <- seq(as.Date("2000-01-01"), as.Date("2010-12-31"), "month")
#'origin <- as.Date("1983-01-01 00:00:00")
#'time <- as.numeric(difftime(time, origin, units = "hour"))
#'data <- array(250:350, dim = c(21, 21, 132))
#'
#'## create example NetCDF
#'
#'x <- ncdim_def(name = "lon", units = "degrees_east", vals = lon)
#'y <- ncdim_def(name = "lat", units = "degrees_north", vals = lat)
#'t <- ncdim_def(name = "time", units = "hours since 1983-01-01 00:00:00",
#'  vals = time, unlim = TRUE)
#'var1 <- ncvar_def("SIS", "W m-2", list(x, y, t), -1, prec = "short")
#'vars <- list(var1)
#'ncnew <- nc_create(file.path(tempdir(),"CMSAF_example_file.nc"), vars)
#'ncvar_put(ncnew, var1, data)
#'ncatt_put(ncnew, "lon", "standard_name", "longitude", prec = "text")
#'ncatt_put(ncnew, "lat", "standard_name", "latitude", prec = "text")
#'nc_close(ncnew)
#'
#'## Select two points of the example CM SAF NetCDF file and write the
#'## output to a csv-file.
#'selpoint.multi(var = "SIS", infile = file.path(tempdir(),"CMSAF_example_file.nc"),
#'  outpath = tempdir(), lon1 = c(8, 9), lat1 = c(48, 49),
#'  station_names = c("A", "B"), format = "csv")
#'
#'unlink(c(file.path(tempdir(),"CMSAF_example_file.nc"), file.path(tempdir(),"A.csv"), 
#'  file.path(tempdir(),"B.csv")))
selpoint.multi <- function(var, infile, path, pattern, outpath, lon1, lat1,
                           station_names = NULL, format = "nc", nc34 = 4, verbose = FALSE,
                           nc = NULL) {
  check_variable(var)

  if (missing(infile) && missing(nc) && (missing(path) || missing(pattern))) {
    stop(paste0("Missing input: Please provide either infile, nc or path and pattern.",
                collapse = " "))
  }
  if (!missing(infile) || !is.null(nc)) {
    if (is.null(nc)) check_infile(infile)
    case <- 1
    if (!missing(path) || !missing(pattern)) {
      if (verbose) message("infile defined, path and pattern will not be used")
    }
  }else{
    stopifnot(length(list.files(path = path, pattern = pattern)) > 0)
    case <- 2
  }
  stopifnot(length(lon1) == length(lat1))
  check_nc_version(nc34)
  check_format(format)
  calc_time_start <- Sys.time()

  if (case == 1) {
    file_data <- read_file(infile, var, nc = nc)
    if (file_data$time_info$has_time_bnds) {
      time_bnds <- get_time_bounds_from_file(infile, nc = nc)
    }
  }else{
    filelist <- list.files(path = path, pattern = pattern, full.names = TRUE)
    fdim <- length(filelist)

    if (fdim == 0) {
      stop("No files found that match the pattern")
    }

    filelist <- sort(filelist)
    file <- filelist[1]
    file_data <- read_file(file, var)
  }

  if (!(file_data$grid$is_regular || length(file_data$grid$vars))) {
    stop("No lon/lat information found in file, please add by applying add_grid_info")
  }

  # find closest point to target coordinates using fields package
  target_lon  <- NULL
  target_lat  <- NULL
  target_x <- NULL
  target_y <- NULL
  result_data <- NULL

  if (file_data$grid$is_regular) {
    dlon <- abs(file_data$dimension_data$x[1] - file_data$dimension_data$x[2])
    dlat <- abs(file_data$dimension_data$y[1] - file_data$dimension_data$y[2])

    for (n in seq_along(lon1)) {
      lon_limit <- which(file_data$dimension_data$x >= (lon1[n] - dlon) & file_data$dimension_data$x <= (lon1[n] + dlon))
      lat_limit <- which(file_data$dimension_data$y >= (lat1[n] - dlat) & file_data$dimension_data$y <= (lat1[n] + dlat))

      if (!(any(lon_limit) & any(lat_limit))) {
        stop("Coordinates outside of the domain.")
      }

      lon2 <- file_data$dimension_data$x[lon_limit]
      lat2 <- file_data$dimension_data$y[lat_limit]
      
      pos <- cbind(lon1[n], lat1[n])
      dum_dist <- 1000
      for (i in seq_along(lon2)) {
        for (j in seq_along(lat2)) {
          dist <- fields::rdist.earth(x1 = pos, x2 = cbind(lon2[i], lat2[j]),
                                      miles = FALSE)
          if (dist <= dum_dist) {
            dum_dist <- dist
            dumi <- i
            dumj <- j
          }
        }
      }

      lon_limit <- which(file_data$dimension_data$x == lon2[dumi])
      lat_limit <- which(file_data$dimension_data$y == lat2[dumj])


      if (!(any(lon_limit) & any(lat_limit))) {
        stop("Coordinates outside of the domain.")
      }

      target_lon <- append(target_lon, file_data$dimension_data$x[lon_limit])
      target_lat <- append(target_lat, file_data$dimension_data$y[lat_limit])
      target_x <- append(target_x, lon_limit)
      target_y <- append(target_y, lat_limit)

      if (case == 1) {
        if (!is.null(nc)) id <- nc
        else id <- nc_open(infile)
        result <- ncvar_get(id, file_data$variable$name, start = c(lon_limit, lat_limit, 1), count = c(1, 1, -1))
        if (is.null(nc)) nc_close(id)
        result_data <- rbind(result_data, result)
      }

    } # end for
  }else{
    target_x_nearest <- NULL
    target_y_nearest <- NULL

    dlon <- LON_RANGE
    dlat <- LAT_RANGE

    for (n in seq_along(lon1)) {

      lon_limit <- which(file_data$grid$vars_data[[LON_NAMES$DEFAULT]] >= (lon1[n] - dlon) & file_data$grid$vars_data[[LON_NAMES$DEFAULT]] <= (lon1[n] + dlon), arr.ind = TRUE)
      lat_limit <- which(file_data$grid$vars_data[[LAT_NAMES$DEFAULT]] >= (lat1[n] - dlat) & file_data$grid$vars_data[[LAT_NAMES$DEFAULT]] <= (lat1[n] + dlat), arr.ind = TRUE)

      if (!(any(lon_limit) && any(lat_limit))) {
        stop("Coordinates outside of the domain.")
      }

      lonlat_merge <- data.matrix(merge(lon_limit, lat_limit, by.x = c("row", "col"), by.y =  c("row", "col"), out.class = matrix))

      dist <- fields::rdist.earth(cbind(file_data$grid$vars_data[[LON_NAMES$DEFAULT]][lonlat_merge],
                                       file_data$grid$vars_data[[LAT_NAMES$DEFAULT]][lonlat_merge]),
                                 cbind(lon1, lat1), miles = FALSE)
      mini <- which.min(dist)
      nearest <- lonlat_merge[mini, ]

      x_nearest <- nearest[1]
      y_nearest <- nearest[2]

      x_dim <- file_data$dimension_data$x[x_nearest]
      y_dim <- file_data$dimension_data$y[y_nearest]

      target_x_nearest <- append(target_x_nearest, x_nearest)
      target_y_nearest <- append(target_y_nearest, y_nearest)
      target_x <- rbind(target_x, x_dim)
      target_y <- rbind(target_y, y_dim)
      target_lon <- append(target_lon, file_data$grid$vars_data[[LON_NAMES$DEFAULT]][x_nearest, y_nearest])
      target_lat <- append(target_lat, file_data$grid$vars_data[[LAT_NAMES$DEFAULT]][x_nearest, y_nearest])

      if (case == 1) {
        if (!is.null(nc)) id <- nc
        else id <- nc_open(infile)
        result <- ncvar_get(id, file_data$variable$name, start = c(x_nearest, y_nearest, 1), count = c(1, 1, -1))
        if (is.null(nc)) nc_close(id)
        result_data <- rbind(result_data, result)
      }

    } # end for
  }

  # get time reference
    dt_ref <- get_time(file_data$time_info$units, 0)
    unit_ref <- unlist(strsplit(file_data$time_info$units, split = " "))[1]

    # check reference time unit
    unit_ref <- switch(
      substr(toupper(unit_ref), 1, 3),
      "MIN" = "mins",
      "SEC" = "secs",
      "HOU" = "hours",
      "DAY" = "days",
      "WEE" = "weeks",
      "MON" = "months",
      "auto"
    )
	
  if (case == 1) {

    if (file_data$time_info$has_time_bnds) {
      time_bnds <- get_time_bounds_from_file(infile, nc = nc)
    }

  }else{

    time_sorting <- NULL
    time_bnds <- NULL
    for (j in seq_len(fdim)) {
      dummy <- NULL
      file <- filelist[j]
      id <- nc_open(file)
      for (i in seq_along(target_x)) {
        if (file_data$grid$is_regular) {
          dummy <- cbind(dummy, ncvar_get(id, file_data$variable$name, start = c(target_x[i], target_y[i], 1), count = c(1, 1, -1)))
        }else{
          result <- ncvar_get(id, file_data$variable$name, start = c(target_x_nearest[i], target_y_nearest[i], 1), count = c(1, 1, -1))
          dummy <- rbind(dummy, result)
        }
      }
      
	  dum_time <- as.numeric(ncvar_get(id, TIME_NAMES$DEFAULT))
	  dum_t_units <- ncatt_get(id, TIME_NAMES$DEFAULT, ATTR_NAMES$UNITS)$value
    dt_dum <- get_time(dum_t_units, dum_time)
	  dum_time <- as.numeric(difftime(dt_dum, dt_ref, units = c(unit_ref)))
	  
      time_sorting <- append(time_sorting, dum_time)
      result_data <- rbind(result_data, dummy)
      if (file_data$time_info$has_time_bnds) {
        tbnds1 <- ncvar_get(id, TIME_BOUNDS_NAMES$DEFAULT)
        time_bnds <- rbind(time_bnds, tbnds1)
      }
      nc_close(id)
    }

    file_data$dimension_data$t <- time_sorting
    result_data <- aperm(result_data, c(2, 1))
    result_data <- result_data[, order(time_sorting)]
}
    
  target_lon <- round(target_lon, digits = 3)
  target_lat <- round(target_lat, digits = 3)

  if (is.null(result_data)) {
    stop("Some Error occured with your data")
  }
  if (is.null(dim(result_data)[1])) {
    result_data <- array(result_data, dim = c(1, length(result_data)))
  }

  for (i in seq_len(dim(result_data)[1])) {

    format <- ifelse(toupper(format) == "CSV", "csv", "nc")

    # create filename
    index <- 1
   # if (!missing(station_names)) {
     if (!is.null(station_names)) {
      if (length(station_names) == length(lon1)) {
        outfile <- file.path(outpath, paste0(station_names[i], ".", format))
      } else {
        if (verbose) message("station_names not used because length is not equal lon1.")
        outfile <- file.path(outpath, paste0("selpoint_", target_lon[i], "_", target_lat[i], ".", format))
        while (file.exists(outfile)) {
          outfile <- file.path(outpath, paste0("selpoint", index, "_", target_lon[i], "_", target_lat[i], ".", format))
          index <- index + 1
        }
      }
    } else {
      outfile <- file.path(outpath, paste0("selpoint_", target_lon[i], "_", target_lat[i], ".", format))
      while (file.exists(outfile)) {
        outfile <- file.path(outpath, paste0("selpoint", index, "_", target_lon[i], "_", target_lat[i], ".", format))
        index <- index + 1
      }
    }

    if (format == "nc") {
      # create netcdf
      nc_format <- get_nc_version(nc34)
      result <- result_data[i, ]
      result[is.na(result)] <- file_data$variable$attributes$missing_value

      cmsaf_info <- (paste0("cmsafops::selpoint.multi for variable ", file_data$variable$name))

      if (file_data$time_info$has_time_bnds) {
        vars_data <- list(result = result, time_bounds = time_bnds)
      }else{
        vars_data <- list(result = result)
      }

      ##### prepare output #####
      global_att_list <- names(file_data$global_att)
      global_att_list <- global_att_list[toupper(global_att_list) %in% toupper(GLOBAL_ATT_DEFAULT)]
      global_attributes <- file_data$global_att[global_att_list]

      x_data <- ifelse(file_data$grid$is_regular, target_lon[i], target_x[i])
      y_data <- ifelse(file_data$grid$is_regular, target_lat[i], target_y[i])

      dims <- define_dims(file_data$grid$is_regular,
                          x_data,
                          y_data,
                          file_data$dimension_data$t,
                          NB2,
                          file_data$time_info$units,
                          with_time_bnds = file_data$time_info$has_time_bnds)

      vars <- define_vars(file_data$variable, dims, nc_format$compression, with_time_bnds = file_data$time_info$has_time_bnds)

      if (!file_data$grid$is_regular) {
        new_data <- list()
        new_data[[which(names(file_data$grid$vars) %in% LON_NAMES)]] <- target_lon[i]
        new_data[[which(names(file_data$grid$vars) %in% LAT_NAMES)]] <- target_lat[i]
        file_data$grid <- redefine_grid_vars(file_data$grid, dims, nc_format$compression, new_data)
      }

      write_output_file(
        outfile,
        nc_format$force_v4,
        vars,
        vars_data,
        file_data$variable$name,
        file_data$grid$vars, file_data$grid$vars_data,
        cmsaf_info,
        file_data$time_info$calendar,
        file_data$variable$attributes,
        global_attributes,
        with_time_bnds = file_data$time_info$has_time_bnds
      )

    } else {# csv
      result <- result_data[i, ]
      lon <- target_lon[i]
      lat <- target_lat[i]
      lon_rep <- rep(lon, length(file_data$dimension_data$t))
      lat_rep <- rep(lat, length(file_data$dimension_data$t))
      dataframe <- data.frame(get_time(file_data$time_info$units, file_data$dimension_data$t), result, lon_rep, lat_rep)
      dum_fname <- unlist(strsplit(outfile, "\\."))
      if (dum_fname[length(dum_fname)] != "csv") (outfile <- paste0(outfile, ".csv"))
      utils::write.table(dataframe, file = outfile, row.names = FALSE, sep = ";")
    }

  } # end for

  calc_time_end <- Sys.time()
  if (verbose) message(get_processing_time_string(calc_time_start, calc_time_end))
}