R/elg.R

# Series of functions that work to read in an event data file from SEA's archive

#' Read in SEA data an ELG event file and return a well formatted output
#'
#' SEA event files contain output from a number of instruments
#' including GPS, flow-through, chirp, etc.
#'
#' @param filein .elg file to be read in
#' @param forceGPS option to force the longitude, latitude and time to come from one
#'  or other of the GPS feeds
#' @param preCheck logical to do an initial check of lines to remove any standard issues
#' @param skip number of lines to skip below header
#' @keywords
#' @export
#' @examples
#' read_elg()
#'
read_elg <- function(filein,forceGPS=NULL,preCheck = T,skip=0) {

  # TODO: Optimize code using pmap from purrr Package

  if(skip>0) {
    col_names <- names(readr::read_csv(filein, n_max = 0))
  } else {
    col_names=T
  }

  # Toggle a preCheck of the elg file for clearly bad lines and commas at end
  if(preCheck) {

    # check for bad lines by checking for number of commas
    liner <- readr::read_lines(filein)
    numcom <- stringr::str_count(liner,"\\,")
    liner <- liner[numcom==numcom[1]]

    # clean and process end of line
    liner <- stringr::str_replace(liner,"\\,$","") # remove the trailing comma on many ELG files
    liner <- stringr::str_replace(liner,"$","\\\n")   # add new line to end of each line
    liner <- stringr::str_c(liner,collapse="")           # collapse vector into single line for read_csv

    # Read in lines using readr package (quicker than base read.csv)
    df <- readr::read_csv(liner,col_types = readr::cols(.default=readr::col_character()),skip=skip,col_names = col_names)

  } else {

    # If no precheck then just read in the file as is
    df <- readr::read_csv(filein,col_types = readr::cols(.default=readr::col_character()),skip=skip,col_names = col_names)

  }

  # Reasign names that have dashes in them to be referenced more easily
  names(df) <- stringr::str_replace_all(names(df),"-",".")

  # Construct the
  args <- tibble::tribble(~name,~regex,~parse_fun,
                          "sys_date","date",lubridate::mdy,
                          "sys_time","^time",readr::parse_time,
                          "nav_time","gps.*nav.*time",readr::parse_character,
                          "nav_lon","gps.*nav.*lon",sea::parse_lon,
                          "nav_lat","gps.*nav.*lat",sea::parse_lat,
                          "nav_sog","gps.*nav.*sog",readr::parse_double,
                          "nav_cog","gps.*nav.*cog",readr::parse_double,
                          "nav_quality","gps.*nav.*quality",readr::parse_integer,
                          "lab_time","gps.*lab.*time",readr::parse_character,
                          "lab_lon","gps.*lab.*lon",sea::parse_lon,
                          "lab_lat","gps.*lab.*lat",sea::parse_lat,
                          "lab_sog","gps.*lab.*sog",readr::parse_double,
                          "lab_cog","gps.*lab.*cog",readr::parse_double,
                          "lab_quality","gps.*lab.*quality",readr::parse_integer,
                          "temp","tsal.*temp",readr::parse_double,
                          "sal","tsal.*sal",readr::parse_double,
                          "fluor","^fluo.*invivo",readr::parse_double,
                          "fluor_1min","fluo.*chl.*1.*min",readr::parse_double,
                          "fluor_60min","^fluo.*chl.*60.*min",readr::parse_double,
                          "CDOM","cdom.*raw",readr::parse_double,
                          "CDOM_1min","cdom.*1.*min",readr::parse_double,
                          "CDOM_60min","cdom.*60.*min",readr::parse_double,
                          "xmiss",c("trans.*raw","xmiss.*raw","xmiss.*[^m]"),readr::parse_double,
                          "xmiss_1min",c("trans.*1\\.*min","xmiss.*1\\.*min"),readr::parse_double,
                          "xmiss_60min",c("trans.*60.*min","xmiss.*60.*min"),readr::parse_double,
                          "wind_sp","true.*wind.*sp",readr::parse_double,
                          "wind_dir","true.*wind.*dir",readr::parse_double,
                          "wind_sp_rel","^wind.*sp",readr::parse_double,
                          "wind_dir_rel","^wind.*dir",readr::parse_double,
                          "heading",c("hdg","heading"),readr::parse_double,
                          "pitch","pitch",readr::parse_double,
                          "roll","roll",readr::parse_double,
                          "depth",c("depth","dbt"),readr::parse_double,
                          "wire_payout","lci90.*payout",readr::parse_double,
                          "wire_tension","lci90.*tension",readr::parse_double,
                          "wire_speed","lci90.*spd",readr::parse_double
                          )

  args_in <- tibble::as_tibble(list(df=list(df),regex=args$regex,parse_fun=args$parse_fun))
  namelist <- purrr::as_vector(dplyr::select(args,name))

  # Work out how to pass format arguments or just post-process afterward

  # output <- purrr::pmap(dplyr::select(args,df,regex,parse_fun),parse_field)
  output <- purrr::pmap(args_in,sea::parse_field)

  names(output) <- namelist
  df <- tibble::as.tibble(output)

  # additional parsing for some elements
  df$nav_time <- readr::parse_time(stringr::str_extract(df$nav_time,"[0-9]{6}"),format="%H%M%S")
  df$lab_time <- readr::parse_time(stringr::str_extract(df$lab_time,"[0-9]{6}"),format="%H%M%S")
  df$sys_dttm <- update(df$sys_date, hour = lubridate::hour(df$sys_time),
                        minute=lubridate::minute(df$sys_time),
                        second=lubridate::second(df$sys_time))

  # Make datetimes from GPS using the system datetime
  df <- dplyr::mutate(df,lab_dttm = create_gps_dttm(lab_time,sys_dttm))
  df <- dplyr::mutate(df,nav_dttm = create_gps_dttm(nav_time,sys_dttm))

  # choose master datetime
  # use nav GPS as the default and revert to lab gps and sys time as required
  if(is.null(forceGPS)) {
    lon <- df$nav_lon
    lon[is.na(lon) & !is.na(df$lab_lon)] <- df$lab_lon[is.na(lon) & !is.na(df$lab_lon)]
    lat <- df$nav_lat
    lat[is.na(lat) & !is.na(df$lab_lat)] <- df$lab_lat[is.na(lat) & !is.na(df$lab_lat)]
    dttm <- df$nav_dttm
    dttm[is.na(dttm) & !is.na(df$lab_dttm)] <- df$lab_dttm[is.na(dttm) & !is.na(df$lab_dttm)]
  } else if (forceGPS == 'nav') {
    lon <- df$nav_lon
    lat <- df$nav_lat
    dttm <- df$nav_dttm
  } else if (forceGPS == 'lab') {
    lon <- df$lab_lon
    lat <- df$lab_lat
    dttm <- df$lab_dttm
  }

  # add the chosen, lon, lat and dttm
  df <- dplyr::mutate(df,lon=lon,lat=lat,dttm=dttm)

  # rearrange the columns into correct order
  df <- df[,c(42,40,41,37,1,2,39,3:8,38,9:36)]

}





#' Update ELG data
#'
#' Add to an existing elg tibble from an SEA Event File that is in the process of being recorded.
#'
#' @param df current tibble of ELG data
#' @param filein location of the *.elg file to update from
#'
#' @return
#' @export
#'
#' @examples
update_elg <- function(df,filein,preCheck=T) {

  # Find number of rows to skip
  nskip <- nrow(df)+1

  # read in the lines
  dfadd <- read_elg(filein,skip=nskip,preCheck=preCheck)

  df <- bind_rows(df,dfadd)
}


#' Cut out bad lines of elg file based on bad nav gps signals
#'
#' SEA event files contain output from a number of instruments
#' including GPS, flow-through, chirp, etc.
#'
#' @param filein .elg filepath to be read in
#' @param latstr regexp string to use to search for
#' @param fileout a specified output filepath.
#' If not specified, "_clean" is appended to name of input file and saved in the same directory.
#' @keywords
#' @export
#' @examples
#' clean_bad_elg()
#'
clean_bad_elg <- function(filein,latstr="[0-9]{4}\\.[0-9]{4}[NS]{1}",lonstr="[0-9]{5}\\.[0-9]{4}[EW]{1}",fileout=NULL) {

  # read elg data as charactor tibble
  df <- read_csv(filein,col_types = cols(.default = col_character()))
  names(df) <- gsub("-",".",names(df))

  # find all the correctly and incorrectly formatted latitude nav values using latstr


  keep1 <- 1:length(df$GPS.nav.Lat) %in% grep(latstr,df$GPS.nav.Lat)
  keep2 <- 1:length(df$GPS.nav.Lon) %in% grep(lonstr,df$GPS.nav.Lon)
  keep3 <- 1:length(df$GPS.nav.quality) %in% grep("^1$",df$GPS.nav.quality)

  badi <- which(!keep1|!keep2|!keep3)
  goodi <- which(keep1&keep2&keep3)

  # read in the raw lines from the elg file
  df_raw <- read_lines(filein)

  # only keep those lines which are good (plus the header line)
  df_raw_clean <- df_raw[append(1,goodi+1)]

  # create output filename if not specified in function call
  if(is.null(fileout)) {
    fileout <- str_replace(filein,".elg","_clean.elg")
  }

  # write data to file
  write_lines(df_raw_clean,fileout)

}


#' Create a GPS DateTime field
#'
#' ELG files typically have a GPS time, but no GPS data.
#' This function takes the system datetime field already parsed and returns a well formated GPS datetime.
#'
#' @param gps_time Raw GPS time field
#' @param sys_dttm system datetime
#'
#' @return
#' @export
#'
#' @examples
create_gps_dttm <- function(gps_time,sys_dttm) {
  if(length(which(is.na(gps_time))) < length(gps_time) &
     length(which(!is.na(gps_time))) > 100) {
    sys_time <- readr::parse_time(format(sys_dttm,"%H:%M:%S"))
    difft <- gps_time - sys_time
    goodi <- !is.na(difft)
    dayoffi <- difft < -8000
    x <- 1:length(difft)
    lf <- lsfit(x[goodi & !dayoffi],difft[goodi & !dayoffi])
    difft <- x*lf[[1]][2] + lf[[1]][1]
    gps_dttm = sys_dttm+difft
  } else {
    gps_dttm = readr::parse_datetime(rep(NA,length(gps_time)))
  }
  return(gps_dttm)
}



# ELG Parse functions -----------------------------------------------------

# Could make the following two functions into one




# Following code has now been replaced with parse_field

#' Generic parser for fields contained in araw charactor tibble
#'
#' Data read in to character tibble by read_csv(...,col_types = cols(.default=col_character()))).
#' Takes parsing function as input so can differentiate between different column types.
#' Takes tibble and returns vector
#'
#' @param df character tibble
#' @param field name of field to parse (passed to warning and message functions for readability).
#' @param regex regex for finding the appropriate field. Can be vector of searches with earlier values prioritized.
#' @param parse_fun the parsing function to use in creating the new vector
#' @param ... additional parameters passed to parse_fun (i.e. format = "" for parse_date)
#'
#' @return
#' @export
#'
#' @examples
#' parse_elg_field()
parse_elg_field <- function(df,field,regex,parse_fun,...) {

  df_names <- str_to_lower(names(df))

  # Find and parse field
  for (i in 1:length(regex)) {
    ii <- grep(regex[i],df_names)
    if(length(ii)>0) break
  }

  if (length(ii)>1) {
    warning_mult(field,ii)
    ii <- ii[1]
  }
  if (length(ii)==0) {
    warning_empty(field)
    output <- parse_fun(rep(NA,nrow(df)),...)
    # output <- list(parse_fun(rep(NA,nrow(df)),...))
    # names(output) <- name
    # df <- bind_cols(df,output)
  } else {
    message_read(field,ii)
    # names(df[ii]) <- name
    # df[[ii]] <- parse_fun(df[[ii]],...)
    output <- parse_fun(df[[ii]],...)
  }



}

#' Parse GPS fields from ELG file
#'
#' Takes in a raw character tibble data frame and parses the 6 potential GPS fields
#' Takes tibble, returns vector
#'
#' @param df tibble character data frame input
#' @param source choose which GPS source to parse ("lab","nav")
#'
#' @return
#' @export
#'
#' @examples
#' parse_gps()
parse_elg_gps <- function(df,source){
  df_names <- str_to_lower(names(df))

  output <- NULL

  fieldnames <- c("time","lon","lat","sog","cog","quality")
  for (i in 1:6) {
    field <- paste(str_to_title(source),"GPS",str_to_title(fieldnames[i]))
    regexp <- paste("gps",source,fieldnames[i],sep=".")
    outfield <- paste0(source,"_", fieldnames[i])

    ii <- grep(regexp,df_names)
    if (length(ii)>1) {
      warning_mult(field,ii)
      ii <- ii[1]
    }
    if(length(ii)==0) {
      warning_empty(field)
      # TODO: Ensure that correct class is passed on to each empty field here
      assign(outfield,rep(NA,nrow(df)))
    } else {
      message_read(field,ii)
      if (i == 1) {
        assign(outfield,parse_time(str_extract(df[[ii]],"[0-9]{6}"),format="%H%M%S"))
      } else if (i == 2) {
        assign(outfield,parse_lon(df[[ii]]))
      } else if (i == 3) {
        assign(outfield,parse_lat(df[[ii]]))
      } else if (i == 4 | i == 5) {
        assign(outfield,parse_double(df[[ii]]))
      } else if (i == 6) {
        assign(outfield,parse_integer(df[[ii]]))
      }
    }
    output[[i]] <- get(outfield)
    names(output)[i] <- outfield
  }

  return(output)
}
# TODO: Work out if you can change the function called to parse the data on the fly
# Should be able to generate generic parsers for all fields
# See parse_elg_field - need to find a way to impliment for parse_gps


# warning/message functions -------------------------------------------------------

# NB: All these functions should become obsolete when parsing of elg fields is optimized


#' Produces warning for an empty field
#'
#' Called during parse_elg_* functions if no match to the input regex is found
#'
#' @param field The proper name of the field - gets passed to output string to better read
#'
#' @return
#' @export
#'
#' @examples
#' warning_empty("Temperature")
warning_empty <- function(field) {
  warning(paste(field,"not found in ELG file. Setting all values to NA"))
}

#' Produces warning for multiple name matches
#'
#' Called during parse_elg_* functions if multiple matches to the input regex are found
#' NB: by default the first value is selected.
#'
#' @param field  The proper name of the field - gets passed to output string to better read
#' @param ii matched field names
#'
#' @return
#' @export
#'
#' @examples
#' warning_mult("Temperature,c(2,23))
warning_mult <- function(field,ii) {
  warning(paste(field,"found in multiple slots:", paste0(ii,collapse="; "), ". By default, using data from slot", ii[1]))
}

#' Prints message that states which slot is being read
#'
#' Called during parse_elg_* functions when field is being parsed
#'
#' @param field The proper name of the field - gets passed to output string to better read
#' @param ii slot used for the parsing
#'
#' @return
#' @export
#'
#' @examples
#' message_read("Temperature",13)
message_read <- function(field,ii) {
  message(paste("Reading", field, "data from slot", ii))
}
benharden27/sea documentation built on May 14, 2019, 4:18 p.m.