R/merge_pos_file.R

#' Merges previously downloaded remote sensing data
#' with its original *.pos position tracking data.
#' Currently this function focuses on vegetation indices
#' derived from MODIS reflectance (MCD43A4) data and matching land
#' cover type data (MCD12Q1). In addition, altitude data are rescaled to
#' relative flight height if a Google Maps API key is provided.
#'
#' @param position_file a *.pos data logger / tracker file location
#' @param tag a tag number (default = NULL)
#' @param window_size number of trailing or leading days over which to
#' integrate remote sensing (vegetation index) data (default = 30)
#' @param stat statistic to apply when aggregating trailing and leading
#' remote sensing data (default = "mean")
#' @param remote_sensing_path location of the remote sensing files
#' @param api_key location of the Google Maps API key file
#' (default = "~/google_elevation_api.txt"); set to NULL to skip the
#' relative altitude calculation
#' @param out_path path where to save the merged data structure as a
#' serialized R file (default = NULL, i.e. return the data instead)
#' @return the merged data structure for the given position file
#' @keywords ancillary data, remote sensing, google earth engine
#' @export
#' @examples
#' \dontrun{
#' merged_pos_data = merge_pos_file(position_file = "test.pos")
#' }
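#'
#' \dontrun{
#' # a sketch only: assumes the remote sensing subsets and the API key
#' # file exist at their default locations; the tag number and output
#' # path are hypothetical
#' merge_pos_file(position_file = "test.pos",
#'                tag = "0000",
#'                stat = "median",
#'                out_path = "~/output")
#' }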

merge_pos_file = function(
  position_file = NULL,
  tag = NULL,
  window_size = 30,
  stat = "mean",
  remote_sensing_path = "~/google_earth_engine_subsets/data",
  api_key = "~/google_elevation_api.txt",
  out_path = NULL
){

  # read original position data, skipping the file header (first 5 lines)
  df = read.table(position_file, skip = 5, sep = ",", stringsAsFactors = FALSE)

  # drop columns without known meaning (to me anyway)
  df = df[,-c(7,8,12,13,14)]

  # give the remaining columns proper headers
  colnames(df) = c("day",
                   "month",
                   "year",
                   "hour",
                   "min",
                   "sec",
                   "latitude",
                   "longitude",
                   "altitude")

  # convert the year to a proper range
  df$year = df$year + 2000

  # create a date column
  df$date = as.Date(sprintf("%s-%s-%s",
                            df$year,
                            df$month,
                            df$day),
                    "%Y-%m-%d")

  # convert the time component to a julian date;
  # no time zone offset is necessary as all
  # times are recorded in the same time frame
  jd  = julianDay(
    year = df$year,
    month = df$month,
    day = df$day,
    hour = df$hour,
    min = df$min,
    sec = df$sec
  )

  # solar elevation / azimuth not taking into account elevation
  # values at sea level (sl)
  solar_pos = solarPosition(jd,
                            lon = df$longitude,
                            lat = df$latitude)

  # first column is solar zenith angle
  # 90 - zenith angle = solar elevation above horizon
  # - values are below the horizon
  df$solar_elevation = 90 - solar_pos[,1]

  # solar azimuth angle is the location of the sun in the sky
  # in the horizontal plane
  # north = 0, east = 90, south = 180, west = 270
  df$solar_azimuth = solar_pos[,2]

  # loop over all locations grab the correct remote
  # sensing data and derive statistics
  cat("Processing remote sensing data for site locations... \n")
  vi_list = lapply(1:nrow(df), function(i, ...){

    # find MCD43A4 file
    mcd43a4_file = sprintf("%s/tag_%s_%04d_MCD43A4_gee_subset.csv",
                           remote_sensing_path,
                           tag,
                           i)

    if(!file.exists(mcd43a4_file)){
      stop("missing remote sensing data, rerun download routine...")
    } else {
      cat(sprintf("\r%s\r",i))
    }

    # read in remote sensing data
    modis_data = read.table(mcd43a4_file, sep = ",", header = TRUE)

    # define the integration window around the position fix date
    middle = df$date[i]
    start = df$date[i] - window_size
    end = df$date[i] + window_size
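    # with the default window_size (30) this covers the 30 days
    # before (trailing) and after (leading) the position fix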

    # extract dates and correct band with multiplier
    date = modis_data$date
    nir = modis_data$Nadir_Reflectance_Band2 * 0.0001
    red = modis_data$Nadir_Reflectance_Band1 * 0.0001
    blue = modis_data$Nadir_Reflectance_Band3 * 0.0001

    # calculate vegetation indices: EVI (standard MODIS coefficients
    # G = 2.5, C1 = 6, C2 = 7.5, L = 1) and NDVI; values outside
    # [-2, 2] are treated as spurious and set to NA
    evi = 2.5 * (nir - red)/(nir + (6 * red) - (7.5 * blue) + 1)
    ndvi = (nir - red)/(nir + red)
    evi[evi > 2 | evi < -2] = NA
    ndvi[ndvi > 2 | ndvi < -2] = NA

    # collapse to a single daily median per acquisition date
    evi = by(evi, INDICES = date, median, na.rm = TRUE)
    ndvi = by(ndvi, INDICES = date, median, na.rm = TRUE)

    # convert to a matrix
    date = as.matrix(names(evi))
    evi = as.matrix(evi)
    ndvi = as.matrix(ndvi)

    # linearly interpolate missing values on a continuous daily time axis
    dates_full = seq(as.Date(min(date)), as.Date(max(date)), by = "day")

    evi_full = rep(NA, length(dates_full))
    evi_full[which(dates_full %in% as.Date(date))] = evi
    evi = try(zoo::na.approx(evi_full, na.rm = FALSE), silent = TRUE)

    # fall back to the uninterpolated values if interpolation fails
    if (inherits(evi, "try-error")){
      evi = evi_full
    }

    ndvi_full = rep(NA, length(dates_full))
    ndvi_full[which(dates_full %in% as.Date(date))] = ndvi
    ndvi = try(zoo::na.approx(ndvi_full, na.rm = FALSE), silent = TRUE)

    if (inherits(ndvi, "try-error")){
      ndvi = ndvi_full
    }

    # rename some variables
    site = rep(i,length(evi_full))
    date = dates_full

    # indices of the trailing (before the fix) and
    # leading (after the fix) parts of the window
    trailing = which(dates_full >= start & dates_full <= middle)
    leading = which(dates_full <= end & dates_full >= middle)

    # the statistic to apply
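    # stat can be any summary function that accepts an na.rm argument
    # (e.g. "mean", "median", "max"), as it is applied via do.call()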
    evi_leading = do.call(stat, list(x = evi[leading], na.rm = TRUE, ...))
    evi_trailing = do.call(stat, list(x = evi[trailing], na.rm = TRUE, ...))

    ndvi_leading = do.call(stat, list(x = ndvi[leading], na.rm = TRUE, ... ))
    ndvi_trailing = do.call(stat, list(x = ndvi[trailing], na.rm = TRUE, ... ))

    # if an API key is provided calculate the altitude relative to the
    # local ground elevation, otherwise return NA
    if(!is.null(api_key)){
      key = readLines(api_key, warn = FALSE)[1]
      url = "https://maps.googleapis.com/maps/api/elevation/json?locations="
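      # the Elevation API request takes "locations=<lat>,<lng>&key=<key>"
      # and returns ground elevation in metres above sea level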
      elev = jsonlite::fromJSON(sprintf("%s%s,%s&key=%s",
                  url,
                  df$latitude[i],
                  df$longitude[i],
                  key))
      relative_altitude = df$altitude[i] - elev$results$elevation
    } else {
      relative_altitude = NA
    }

    # return data
    return(list(data.frame("site" = i,
                           date,
                           ndvi,
                           evi),
                data.frame("site" = i,
                           evi_leading,
                           evi_trailing,
                           ndvi_leading,
                           ndvi_trailing,
                           relative_altitude
                           ))
           )
  })

  cat("Merging datasets... \n")
  # split out two data frames and bind
  raw_modis_data = lapply(vi_list, function(x)x[[1]])
  raw_modis_data = data.frame(do.call("rbind",
                                 raw_modis_data))

  summary_modis_data = lapply(vi_list, function(x)x[[2]])
  summary_modis_data = data.frame(do.call("rbind",
                                          summary_modis_data))
  cat("Done ! \n")

  # return these two data frames
  if (is.null(out_path)){
    return(list("remote_sensing_data" = raw_modis_data,
                "summary_data" = summary_modis_data))
  }else{
    saveRDS(list("remote_sensing_data" = raw_modis_data,
                 "summary_data" = summary_modis_data),
            sprintf("%s/tag_%s_%s.rds",out_path, tag, stat))
  }
}