em38: Process N38 binary files from EM38-MK2 sensors

Documented in n38_chunk n38_decode n38_import n38_to_m38

#' Import EM38 data
#'
#' This function reads N38 binary files into R and does some minimal
#' pre-processing.
#' @param path A file path pointing to a valid *.N38 file, produced by a Geonics
#'   EM38-MK2 conductivity sensor connected to an Allegra CX or Archer
#'   datalogger (and optionally, a GPS device).
#' @return A matrix with n rows and 25 columns, containing raw bytes.
#' @examples
#' n38_mat <-
#'   n38_import(system.file("extdata", "em38_demo.N38", package = "em38"))
#' @export
#'
n38_import <- function(path = NULL) {

  n38_con <- file(path, open = 'rb')
  n38_raw <- readBin(n38_con, what = raw(), n = file.size(path))
  close(n38_con)

  n38_mat <- matrix(n38_raw, ncol = 26, byrow = TRUE)
  # chuck the line end column
  n38_mat[, -26]
}

#' Separate message types
#'
#' This function chunks imported n38 binary matrices into a list, grouping file
#' header and survey line information for further processing.
#' @param n38_mat The binary matrix output by  \code{\link{n38_import}}.
#' @return A nested list containing binary matrices encoding a) file header
#'   information and b) each survey line in the input file. For each survey
#'   line, a list of matrices with survey line data separated by type - header
#'   and calibration data, instrument readings, position information, and
#'   comments.
#' @examples
#' data('n38_demo')
#' n38_chunks <- n38_chunk(n38_demo)
#' @export
#'
n38_chunk <- function(n38_mat = NULL) {

  # pull all the start-of-row characters, they ID what each row is about
  rids <- rawToChar(n38_mat[ , 1], multiple = TRUE)
  # count total number of survey lines in file
  slines <- length(which(rids == 'L'))

  # establish a list with one file header entry and n survey line entries
  out_list <- vector('list', length = 1 + slines)
  names(out_list) <- c('file_header', paste0('survey_line_', 1:slines))

  # populate said list with parts of the input matrix as appropriate

  # file header
  fh_rns <- match('E', rids)
  out_list[['file_header']] <- n38_mat[fh_rns:(fh_rns + 1), ]

  # get each survey line start and end row numbers into two vectors
  sl_starts <- which(rids == 'L')
  sl_ends   <- if(length(sl_starts) == 1) {
    dim(n38_mat)[1]
  } else {
    c(sl_starts[2:length(sl_starts)] - 1, dim(n38_mat)[1])
  }

  out_list[2:length(out_list)] <- lapply(2:length(out_list), function(i) {
    # ugh but w/e
    sl_x <- n38_mat[sl_starts[i - 1]:sl_ends[i - 1], ]

    # separate the record types within each survey line
    sline_list <- vector('list', length = 7)
    names(sline_list) <- c('sl_header', 'cal_data', 'timer_data',
                           'reading_data', 'location_data', 'new_station',
                           'comments')

    # get the row indices for each category in sline_list
    sl_rids <- rids[sl_starts[i - 1]:sl_ends[i - 1]]

    # survey line header
    sline_list[['sl_header']] <-
      sl_x[which(sl_rids %in% c('L', 'B', 'A', 'Z')), , drop = FALSE]

    # calibration data
    sline_list[['cal_data']] <- sl_x[which(sl_rids == 'O'), , drop = FALSE]

    # timer data
    sline_list[['timer_data']]   <- sl_x[which(sl_rids == '*'), ]

    # instrument readings
    sline_list[['reading_data']] <-
      sl_x[which(sl_rids %in% c('T', 't', '2')), , drop = FALSE]

    # location data (note readings that fail the checksum still come through
    # here and are handled later)
    gps <- which(!(sl_rids %in% c('L', 'B', 'A', 'Z','O', '*', 'T', 't', '2',
                                  'C', 'S', 'X')))
    sline_list[['location_data']] <- sl_x[gps, , drop = FALSE]

    # new station (nb often null, for when no GPS in use???)
    sline_list[['new_station']] <- sl_x[which(sl_rids == 'S'), , drop = FALSE]

    # comments (nb often null bc who has the time really)
    sline_list[['comments']] <- sl_x[which(sl_rids == 'C'), , drop = FALSE]

    sline_list
  })

  out_list
}

#' process chunked n38 data
#'
#' This function decodes n38 data according to the specifications provided by
#' the EM38-MKII manual.
#' @param chunks The nested list of raw matrices output by
#'   \code{\link{n38_chunk}}.
#' @return A nested list containing decoded n38 data with the following
#'   structure:
#'   \itemize{
#'   \item file header information (list)
#'   \item survey line
#'   \item header information (list)
#'   \item calibration data (data frame)
#'   \item timer data (list)
#'   \item instrument readings (timestamped data frame, calibrated)
#'   \item location data (timestamped data frame of NMEA-0138 message strings)
#'   \item new station data (timestamped data frame)
#'   \item comments (timestamped data frame)
#'   }
#'
#' @examples
#' data('n38_demo')
#' n38_chunks <- n38_chunk(n38_demo)
#' n38_decoded <- n38_decode(n38_chunks)
#' @importFrom purrr map map_lgl transpose
#' @export
#'
n38_decode <- function(chunks = NULL) {

  chunks[['file_header']] <- process_fheader(chunks[['file_header']])

  chunks[2:length(chunks)] <- lapply(2:length(chunks), function(i) {

    chunks[[i]][['sl_header']] <- process_slheader(chunks[[i]][['sl_header']])

    chunks[[i]][['cal_data']] <-
      apply(chunks[[i]][['cal_data']], MARGIN = 1, FUN = process_cal)
    chunks[[i]][['cal_data']] <- purrr::transpose(chunks[[i]][['cal_data']])
    chunks[[i]][['cal_data']] <-
      as.data.frame(lapply(chunks[[i]][['cal_data']], unlist),
                    stringsAsFactors = FALSE)

    chunks[[i]][['timer_data']] <- process_timer(chunks[[i]][['timer_data']])
    # adjust date to that given by survey line header
    chunks[[i]][['timer_data']][['computer_time']]$mday <-
      chunks[[i]][['sl_header']][['timestamp']]$mday
    chunks[[i]][['timer_data']][['computer_time']]$mon  <-
      chunks[[i]][['sl_header']][['timestamp']]$mon
    chunks[[i]][['timer_data']][['computer_time']]$year <-
      chunks[[i]][['sl_header']][['timestamp']]$year
    chunks[[i]][['timer_data']][['computer_time']]$wday <-
      chunks[[i]][['sl_header']][['timestamp']]$wday
    chunks[[i]][['timer_data']][['computer_time']]$yday <-
      chunks[[i]][['sl_header']][['timestamp']]$yday

    # process instrument readings
    chunks[[i]][['reading_data']] <-
      apply(chunks[[i]][['reading_data']], MARGIN = 1, FUN = process_reading)
    chunks[[i]][['reading_data']] <-
      purrr::transpose(chunks[[i]][['reading_data']])
    chunks[[i]][['reading_data']] <-
      lapply(chunks[[i]][['reading_data']], function(x) {
      unlist(x, recursive = FALSE)
    })
    chunks[[i]][['reading_data']] <-
      as.data.frame(chunks[[i]][['reading_data']],
                    col.names = names(chunks[[i]][['reading_data']]),
                    stringsAsFactors = FALSE)

    # apply calibration factors to instrument readings
    chunks[[i]][['reading_data']]$cond_05 <-
      chunks[[i]][['reading_data']]$cond_05 +
      chunks[[i]][['cal_data']]$cal_current[2]

    chunks[[i]][['reading_data']]$cond_1 <-
      chunks[[i]][['reading_data']]$cond_1 +
      chunks[[i]][['cal_data']]$cal_current[1]

    chunks[[i]][['reading_data']]$IP_05 <- mapply(function(mode, reading) {
      if(mode == 'Vertical') {
        reading - chunks[[i]][['cal_data']]$cal_current[4]
      } else {
        reading - chunks[[i]][['cal_data']]$cal_current[6]
      }
    },
    mode    = chunks[[i]][['reading_data']]$mode,
    reading = chunks[[i]][['reading_data']]$IP_05)

    chunks[[i]][['reading_data']]$IP_1 <- mapply(function(mode, reading) {
      if(mode == 'Vertical') {
        reading - chunks[[i]][['cal_data']]$cal_current[3]
      } else {
        reading - chunks[[i]][['cal_data']]$cal_current[5]
      }
    },
    mode    = chunks[[i]][['reading_data']]$mode,
    reading = chunks[[i]][['reading_data']]$IP_1)

    # location data
    # Each reading starts with @, different types are different
    # lengths, and there are variable stretches of whitespace plus multiline
    # signifiers to deal with, what a fun time this was
    chunks[[i]][['location_data']] <-
      if(dim(chunks[[i]][['location_data']])[1] == 0) {
        NA
        } else {
          loc  <- rawToChar(unlist(t(chunks[[i]][['location_data']])),
                            multiple = TRUE)
          # drop nuls before we get too far
          keep <- which(loc != '\xcc')
          loc <- loc[keep]
          locs <- split(loc, cumsum(loc %in% c('@', '?')))

    # ditch the `#` newline signifiers and whitespace, convert to string
    locs <- lapply(locs, function(x) {
      paste0(x[- which(x %in% c('#', ' '))], collapse = '')
    })

    # The following ditches checksum fails that have already been flagged by GPS
    # software. These start with ? not @ and use " instead of # as an internal
    # newline signifier. Also handled are cases where GPS messages can
    # occasionally get cut off after the message comes through but before the
    # timestamp does (where someone hits pause at the wrong time)
    keep <- purrr::map_lgl(locs, function(x) grepl('^@.+\\!', x))
    locs <- locs[keep]

    # 20230924: Geode-brand devices report battery information using a custom
    # string of the form $PJSI,BAT,a,b,c,d,*CC<CR><LF>
    # see https://junipersys.com/support/article/14754
    # these can also be dropped
    drop <- purrr::map_lgl(locs, function(x) grepl('^@\\$PJSI', x))
    locs <- locs[!drop]

    # split up string into main components (decode happens later)
    # note that only TYPE = 'G*GGA' is used in final output
    bang <- as.numeric(gregexpr('!', locs))
    type <- substr(locs, 3, 7)
    out  <- list('TYPE'         = type,
                 'MESSAGE'      = substr(locs, 9, bang - 1),
                 # NB this only works if a checksum failure does not involve
                 # scrambling/loss of the message type - check is skipped if
                 # G*GGA is scrambled to e.g. GPGA,. Such failures are still
                 # never decoded down the track so not a big deal
                 'CHKSUM'       = ifelse(type %in% c('GPGGA', 'GLGGA', 'GAGGA',
                                                     'GNGGA'),
                                         nmea_check(substr(locs, 2, bang - 1)),
                                         NA),
                 'timestamp_ms' =
                   as.numeric(substr(locs, bang + 1, nchar(locs))))
    as.data.frame(out, col.names = names(out), stringsAsFactors = FALSE)
    }

    chunks[[i]][['new_station']] <-
      if(dim(chunks[[i]][['new_station']])[1] == 0) {
        NA
        } else {
          chunks[[i]][['new_station']] <-
            apply(chunks[[i]][['new_station']], MARGIN = 1, FUN = process_nstat)
          chunks[[i]][['new_station']] <-
            purrr::transpose(chunks[[i]][['new_station']])
          chunks[[i]][['new_station']] <-
            as.data.frame(lapply(chunks[[i]][['new_station']], unlist),
                          stringsAsFactors = FALSE)
        }

    chunks[[i]][['comments']] <- if(dim(chunks[[i]][['comments']])[1] == 0) {
      NA
      } else {
        chunks[[i]][['comments']] <-
          apply(chunks[[i]][['comments']], MARGIN = 1, FUN = process_comment)
        chunks[[i]][['comments']] <- purrr::transpose(chunks[[i]][['comments']])
        chunks[[i]][['comments']] <-
          as.data.frame(lapply(chunks[[i]][['comments']], unlist),
                        stringsAsFactors = FALSE)
      }

    chunks[[i]]
  })

  chunks

}

#' output m38
#'
#' This function processes decoded n38 data into the m38 format used by Geonics.
#' Its a bit useless from an R perspective, but users may wish to compare
#' outputs with Geonics' own conversion software or process it further with the
#' DAT38MK2 application.
#' @param n38_decoded Nested list output by n38_decode
#' @return Large character; combination of location data and calibrated
#'   instrument readings. Note that record timestamps won't always agree with an
#'   'official' m38 file at the microsecond level.
#' @note Write output to file in working directory with e.g. \code{write(m38_example,
#'   paste0('m38_from_R_', Sys.Date(), '.m38'))}
#' @examples
#' data('n38_demo')
#' n38_chunks <- n38_chunk(n38_demo)
#' n38_decoded <- n38_decode(n38_chunks)
#' m38_example <- n38_to_m38(n38_decoded)
#' @importFrom purrr flatten
#' @importFrom stats complete.cases
#' @export
#'
n38_to_m38 <- function(n38_decoded = NULL) {
  options(digits.secs = 3)

  # shrug emoji
  fn <- paste0(c(n38_decoded$file_header$file_name,
          rep.int(' ',
                  times = (8 - length(n38_decoded$file_header$file_name)))),
          collapse = '')
  file_header <- paste0(substr(n38_decoded$file_header$prog_file_id, 1, 4), '-',
                     substr(n38_decoded$file_header$prog_file_id, 5, 7), ' V',
                     n38_decoded$file_header$version_no, ' ', fn)

  survey_lines <- lapply(2:length(n38_decoded), function(i) {

    sl_mode <- if(length(unique(n38_decoded[[i]]$reading_data$mode)) > 1) {
      'B'
    } else if(unique(n38_decoded[[i]]$reading_data$mode) == 'Horizontal') {
      'H'
    } else {
      'V'
    }
    slunits <-
      if (n38_decoded$file_header$unit_type == 'meters') {
        0
      } else {
        1
      }
    slmode <-
      if (n38_decoded$file_header$survey_mode == 'Auto') {
        'A'
      } else {
        'M'
      }
    datetime <- format(n38_decoded[[i]]$sl_header$timestamp,
                       format = '%d/%m/%Y %H:%M:%S')

    sl_header <- paste0('L', paste0(rep.int(' ', times = 3), collapse = ''),
           n38_decoded[[i]]$sl_header$line_name, ' ',
            paste0(rep.int(' ', 8 - nchar(n38_decoded[[i]]$sl_header$line_name)),
                  collapse = ''),
           'B', sl_mode, 10, slunits, ' ', n38_decoded[[i]]$sl_header$direction, ' ',
                  slmode, sprintf('%.3f', n38_decoded$file_header$time_samples),
           ' ', fn, '   ', datetime)

    # add station data to readings
    # increment is -ve when GPS not in use and GRD dir is South or West
    # allegedly
    # must make it easier to recombine back-and-forth tracks
    # this probably isn't quite right but unsure of solution at 20191125
    stn <- rep(seq(n38_decoded[[i]]$sl_header$start_station,
                   dim(n38_decoded[[i]]$reading_data)[1],
                   by = n38_decoded[[i]]$sl_header$station_increment),
               each = n38_decoded[[i]]$sl_header$station_increment)
    stn <- stn[seq(dim(n38_decoded[[i]]$reading_data)[1])]

    n38_decoded[[i]]$reading_data$station <- stn

    reading_split <- split(n38_decoded[[i]]$reading_data,
                           seq.int(dim(n38_decoded[[i]]$reading_data)[1]))
    # slow :/
    readings <- purrr::map_chr(reading_split, function(row) {

      param_1 <- if(grepl('first', row['indicator']))   {'S'} else {'R'}
      param_2 <- if(row['mode'] == 'Vertical')          {'V'} else {'H'}
      param_3 <- if(row['marker'] == 'no marker')       { 0 } else { 1 }
      param_4 <- if(n38_decoded[[1]][[1]] == 'EM38MK2') { 1 } else { 0 }


      paste0(param_1, param_2, param_3, param_4, ',',
             paste0(rep.int(' ', 13 - nchar(sprintf('%.3f', row['station']))),
                    collapse = ''),
             sprintf('%.3f', row['station']), ',',
             # cond 1m, - all numbers are formatted to F11.3
             paste0(rep.int(' ', 11 - nchar(sprintf('%.3f', row['cond_1']))), collapse = ''),
             sprintf('%.3f', row['cond_1']), ',',
             # IP 1m
             paste0(rep.int(' ', 11 - nchar(sprintf('%.3f', row['IP_1']))), collapse = ''),
             sprintf('%.3f', row['IP_1']), ',',
             # cond 0.5m
             paste0(rep.int(' ', 11 - nchar(sprintf('%.3f', row['cond_05']))), collapse = ''),
             sprintf('%.3f', row['cond_05']), ',',
             # IP 0.5m
             paste0(rep.int(' ', 11 - nchar(sprintf('%.3f', row['IP_05']))), collapse = ''),
             sprintf('%.3f', row['IP_05']), ',',
             # temp 1m
             paste0(rep.int(' ', 11 - nchar(sprintf('%.3f', row['temp_1']))), collapse = ''),
             sprintf('%.3f', row['temp_1']), ',',
             # temp 0.5m
             paste0(rep.int(' ', 11 - nchar(sprintf('%.3f', row['temp_05']))), collapse = ''),
             sprintf('%.3f', row['temp_05']), ',',
             # timestamp - should be ok but doesn't quite match offical version -
             # some kind of rounding at 3rd dec pl, difference is never more than a couple of ms
             # main thing is that order of records is not affected so all good
             format(conv_stamp(n38_decoded[[i]]$timer_data$computer_time,
                                     n38_decoded[[i]]$timer_data$timestamp_ms,
                                     row$timestamp_ms),
                          format = '%H:%M:%OS') # note options set above
      )
    })

  new_stations <- if(all(!is.na(n38_decoded[[i]]$new_station))) {
      apply(n38_decoded[[i]]$new_station, MARGIN = 1, FUN = function(row) {
        # haven't seen one of these yet so not sure if this is correct
        sentence <- paste0('S ', row['new_station'])
        timestamp <- as.character(n38_decoded[[i]]$timer_data$computer_time +
                                    (as.numeric(row['timestamp_ms']) -
                                       n38_decoded[[i]]$timer_data$timestamp_ms) / 1000,
                                  format = '%H:%M:%OS')
        padding <-
          paste0(rep.int(' ', times = 103 - (nchar(sentence) + nchar(timestamp))),
                 collapse = '')
        paste0(sentence, padding, timestamp)
      })
    } else { NA }

  comments <- if(all(!is.na(n38_decoded[[i]]$comments))) {
    apply(n38_decoded[[i]]$comments, MARGIN = 1, FUN = function(row) {
      sentence <- paste0('C ', row['comment'])
      timestamp <- as.character(n38_decoded[[i]]$timer_data$computer_time +
                     (as.numeric(row['timestamp_ms']) -
                        n38_decoded[[i]]$timer_data$timestamp_ms) / 1000,
                   format = '%H:%M:%OS')
      padding <-
        paste0(rep.int(' ', times = 103 - (nchar(sentence) + nchar(timestamp))),
               collapse = '')
      paste0(sentence, padding, timestamp)
    })
  } else {
      NA
  }

  # only some location messages are output
  loc_subset <-
    n38_decoded[[i]]$location_data[n38_decoded[[i]]$location_data$TYPE %in%
                                     c('GPGGA', 'GPGSA',
                                       'GLGGA', 'GLGSA',
                                       'GAGGA', 'GAGSA',
                                       'GNGGA', 'GNGSA'), ]

  locations <- apply(loc_subset, MARGIN = 1, FUN = function(row) {
      sentence <- paste0('$', row['TYPE'], ',', row['MESSAGE'], ',')
      timestamp <-
        format(n38_decoded[[i]]$timer_data$computer_time +
                       (as.numeric(row['timestamp_ms']) -
                          n38_decoded[[i]]$timer_data$timestamp_ms) / 1000,
                     format = '%H:%M:%OS')
      padding <-
        paste0(rep.int(' ', times = 103 - (nchar(sentence) + nchar(timestamp))),
               collapse = '')
      paste0(sentence, padding, timestamp)
    })

  readings_done <-
    data.frame('DATA' = readings,
               'TIME' = n38_decoded[[i]]$reading_data$timestamp_ms,
               stringsAsFactors = FALSE)

  nstat_done  <- if(all(!is.na(n38_decoded[[i]]$new_station))) {
    data.frame('DATA' = new_stations,
               'TIME' = n38_decoded[[i]]$new_station$timestamp_ms,
               stringsAsFactors = FALSE)
  } else {
    # ughhhh
    data.frame('DATA' = NA_character_,
               'TIME' = NA_integer_,
               stringsAsFactors = FALSE)
  }

  comments_done  <- if(all(!is.na(n38_decoded[[i]]$comments))) {
    data.frame('DATA' = comments,
               'TIME' = n38_decoded[[i]]$comments$timestamp_ms,
               stringsAsFactors = FALSE)
  } else {
    data.frame('DATA' = NA_character_,
               'TIME' = NA_integer_,
               stringsAsFactors = FALSE)
  }

  locations_done <- data.frame('DATA' = locations,
                               'TIME' = loc_subset$timestamp_ms,
                               stringsAsFactors = FALSE)

  all_sl_readings <-
    do.call('rbind',
            list(readings_done, nstat_done, comments_done, locations_done))
  all_sl_readings <- all_sl_readings[complete.cases(all_sl_readings), ]

  # order by timestamp asc
  all_sl_readings <- all_sl_readings[with(all_sl_readings, order(TIME)), ]

  append(sl_header, all_sl_readings$DATA)
  })

  out <- append(file_header, unlist(survey_lines))
  # write(out, paste0('m38_from_R_', Sys.Date(), '.m38'))
  out

  }