R/EC_functions.R

Defines functions vpr_img_check vpr_img_copy vpr_img_category vpr_img_depth vpr_img_reclassified vpr_plot_profile vpr_plot_contour vp_plot_unkn vpr_plot_histsize vp_plot_matrix vpr_plot_TScat vpr_plot_TS isopycnal_calculate vpr_plot_sizefreq getRoiMeasurements vpr_autoid_check insertRow vpr_summary vpr_hour vpr_day vpr_category vpr_roi vpr_ctd_files vpr_dayhour ctd_cast bin_calculate vpr_trrois_size normalize_matrix ctd_df_cols px_to_mm vpr_autoid_read vpr_ctdroi_merge vpr_ctd_read vpr_oce_create bin_cast concentration_category vpr_roi_concentration vpr_autoid_copy vpr_ctdroisize_merge vpr_size_bin vpr_ctd_ymd vpr_save

Documented in bin_calculate bin_cast concentration_category ctd_cast ctd_df_cols getRoiMeasurements insertRow isopycnal_calculate normalize_matrix px_to_mm vp_plot_matrix vp_plot_unkn vpr_autoid_check vpr_autoid_copy vpr_autoid_read vpr_category vpr_ctd_files vpr_ctd_read vpr_ctdroi_merge vpr_ctdroisize_merge vpr_ctd_ymd vpr_day vpr_dayhour vpr_hour vpr_img_category vpr_img_check vpr_img_copy vpr_img_depth vpr_img_reclassified vpr_oce_create vpr_plot_contour vpr_plot_histsize vpr_plot_profile vpr_plot_sizefreq vpr_plot_TS vpr_plot_TScat vpr_roi vpr_roi_concentration vpr_save vpr_size_bin vpr_summary vpr_trrois_size

##### Import packages ####
#' Packages
#'  VPR processing functions depend on these packages
#'
#'These packages are needed!
#'
#' @import dplyr ggplot2 oce
#' @importFrom graphics hist par plot.new
#' @importFrom stats aggregate median quantile
#' @importFrom utils menu read.csv read.table write.table
#'
#' @rawNamespace import(gridExtra, except = combine)
#' @rawNamespace import(metR, except = coriolis)
#'
NULL
#### PROCESSING FUNCTIONS ####


#' Save VPR data as an \link[oce]{as.oce} object
#'
#' @details This function will pass a VPR data frame object to an `oce` object.
#'   Using an `oce` object as the default export format for VPR data allows for
#'   metadata and data to be kept in the same, space efficient file, and avoid
#'   redundancy in the data frame. The function check for data parameters that
#'   may actually be metadata parameters (rows which have the same value
#'   repeated for every observation). These parameters will automatically be
#'   copied into the metadata slot of the `oce` object. The function will also
#'   prompt for a variety of required metadata fields. Depending on specific
#'   research / archiving requirements, these metadata parameters could be
#'   updated by providing the argument `metadata`.
#'
#'   Default metadata parameters include 'deploymentType', 'waterDepth',
#'   'serialNumber', 'latitude', 'longitude', 'castDate', 'castStartTime',
#'   'castEndTime', 'processedBy', 'opticalSetting', 'imageVolume', 'comment'.
#'
#'
#' @param data a VPR data frame
#' @param metadata (optional) a named list of character values giving metadata
#'   values. If this argument is not provided user will be prompted for a few
#'   generic metadata requirements.
#'
#'
#' @return an oce CTD object with all VPR data as well as metadata
#' @export
#'
#' @examples
#' data("taxa_conc_n")
#' metadata <- c('deploymentType' = 'towyo', 'waterDepth' =
#' max(ctd_roi_merge$pressure), 'serialNumber' = NA, 'latitude' = 47,
#' 'longitude' = -65, 'castDate' = '2019-08-11', 'castStartTime'= '00:00',
#' 'castEndTime' = '01:00', 'processedBy' = 'E. Chisholm', 'opticalSetting' =
#' 'S2', 'imageVolume' = 83663, 'comment' = 'test data')
#'
#' oce_dat <- vpr_save(taxa_conc_n, metadata)
#' # save(oce_dat, file = vpr_save.RData') # save data
#'
vpr_save <- function(data, metadata){

  # create oce objects

  oce_data <- as.oce(data)

  # check for metadata in dataframe
  rem_list <- list()
  for(i in 1:length(oce_data@data)){
    if(length(unique(oce_data@data[[i]])) == 1){
      print(paste('Metadata parameter found in Data object! ', names(oce_data@data)[[i]], 'value of' , unique(oce_data@data[[i]]), 'moved to metadata slot. '))
      # add as metadtaa parameter
      oce_data <- oceSetMetadata(oce_data, name = names(oce_data@data)[[i]], value = unique(oce_data@data[[i]]))
      rem_list[[i]] <- i
    }

  }
  # remove data lines
  oce_data@data <- oce_data@data[-unlist(rem_list)]

  # check for other metadata and ask user to supply
if(missing(metadata)){
  req_meta <- c('deploymentType', 'waterDepth', 'serialNumber', 'latitude', 'longitude', 'castDate', 'castStartTime', 'castEndTime', 'processedBy', 'opticalSetting', 'imageVolume', 'comment')
# TODO include metadata examples or skip
  # clarify serial number, water depth?

  for(rm in req_meta){
    if(is.null(oce_data@metadata[[rm]])){
      print(paste('Please provide value for Metadata parameter', rm))
      rm_val <- readline(paste('Metadata slot, ', rm, ': '))
      oce_data <- oceSetMetadata(oce_data, name = rm , value = rm_val, note = NULL)
    }
    # TODO : add possibility of value existing as 'unknown or some other placeholder which should be overwritten

  }
}else{
# if metadata names and values are provided as list
  for(rm in names(metadata)){

      rm_val <- metadata[[rm]]
      oce_data <- oceSetMetadata(oce_data, name = rm , value = rm_val, note = NULL)


  }
}

  return(oce_data)
}

#' Add Year/ month/ day hour:minute:second information
#'
#' Calculate and record calendar dates for vpr data from day-of-year, hour, and time (in milliseconds) info.
#' Will also add 'avg_hr' parameter if not already present.
#'
#' @param data VPR data frame from \code{\link{vpr_ctdroi_merge}}
#' @param year Year of data collection
#' @param offset time offset in hours between VPR CPU and processed data times (optional)
#'
#' @return a VPR data frame with complete date/time information in a new row named 'ymdhms'
#'
#' @examples
#' year <- 2019
#' data('ctd_roi_merge')
#' dat <- vpr_ctd_ymd(ctd_roi_merge, year)
#'
#'
#' @export

vpr_ctd_ymd <- function(data, year, offset){
  # avoid CRAN notes
  . <- time_ms <- NA

  d <- grep(names(data), pattern = 'avg_hr')
  if(length(d) == 0){
    data <- data %>%
      dplyr::mutate(., avg_hr = time_ms / 3.6e+06)
  }


  day_num <- substr(data$day, 2, 4)

  hour_num <- substr(data$hour, 2, 3)

  ymdd <- as.Date(as.numeric(day_num), origin = paste0(year,'-01-01'))


  l_per <- round(lubridate::seconds_to_period(data$time_ms/1000),0)


  ymdhms_obj <- as.POSIXct(l_per, origin = ymdd, tz = 'UTC')

  if(!missing(offset)){

    ymdhms_obj <- ymdhms_obj + offset*3600 # convert hour offset to seconds and adjust times

  }

  data <- data %>%
    dplyr::mutate(., ymdhms = ymdhms_obj)

  return(data)

}


#' Bin VPR size data
#'
#' Calculates statistics for VPR measurement data in depth averaged bins for analysis and visualization
#'
#' @param data_all a VPR CTD and measurement dataframe from \code{\link{vpr_ctdroisize_merge}}
#' @param bin_mea Numerical value representing size of depth bins over which data will be combined, unit is metres, typical values range from 1 - 5
#'
#' @return a dataframe of binned VPR size data statistics including number of observations, median, interquartile ranges, salinity and pressure, useful for making boxplots
#'
#' @examples
#'
#' data('size_df_f')
#' vpr_size_bin(size_df_f, bin_mea = 5)
#'
#' @export
#'
#'
#'
vpr_size_bin <- function(data_all, bin_mea){

  #Bin by depth
  p <- data_all$pressure
  max_pressure <- max(p, na.rm = TRUE)
  min_pressure <- min(p, na.rm = TRUE)
  x_breaks <- seq(from = floor(min_pressure), to = ceiling(max_pressure), by = bin_mea)

  #Get variables of interest using oce bin functions

  med <- oce::binApply1D(p, data_all$long_axis_length, xbreaks = x_breaks, median)$result
  iqr3 <- oce::binApply1D(p, data_all$long_axis_length,  xbreaks = x_breaks, quantile, probs = 0.75)$result
  iqr1 <- oce::binApply1D(p, data_all$long_axis_length,  xbreaks = x_breaks, quantile, probs = 0.25)$result
  n_obs <- oce::binApply1D(p, data_all$long_axis_length, xbreaks = x_breaks, length)$result
  temperature <- oce::binApply1D(p, data_all$temperature, xbreaks = x_breaks, mean)$result
  salinity <- oce::binApply1D(p, data$salinity, xbreaks = x_breaks, mean)$result
  pressure <- oce::binApply1D(p, data$salinity, xbreaks = x_breaks, mean)$xmids #Could be any of the variables computed, but I just went with salinity

  if (!(length(pressure) == length(salinity))) {

    salinity_mean <- binMean1D(p, data_all$salinity, xbreaks = x_breaks)$result

    idx_rm <- which(is.na(salinity_mean))

    #informs user where bins were removed due to NAs
    #note if a bin is 'NA' typically because there is no valid data in that depth range,
    #if you have a lot of NA bins, think about increasing your binSize
    print(paste('Removed bins at', pressure[idx_rm]))

    lp <- length(pressure)
    pressure <- pressure[-idx_rm]

  }

  station_id <- unique(data$station)

  dfs <- data.frame('median' = med, 'IQR1' = iqr1,
                    'IQR3' = iqr3, 'n_obs' = n_obs,
                    'temperature' = temperature, 'salinity' = salinity,
                    'pressure' = pressure)
  return(dfs)
}

#' Format CTD and Size data from VPR
#'
#' Format CTD and Meas data frames into combined data frame for analysis and plotting of size data
#'
#' @param data VPR dataframe from \code{\link{vpr_ctdroi_merge}}, with calculated variable sigmaT
#' @param data_mea VPR size data frame from \code{\link{vpr_autoid_read}}
#' @param taxa_of_interest a list of taxa of interest to be included in output dataframe
#'
#' @return A dataframe containing VPR CTD and size data
#'
#' @examples
#'
#' data("ctd_roi_merge")
#' data("roimeas_dat_combine")
#' category_of_interest = 'Calanus'
#'
#'ctd_roi_merge$avg_hr <- ctd_roi_merge$time_ms /3.6e+06
#'
#' size_df_f <- vpr_ctdroisize_merge(ctd_roi_merge, data_mea = roimeas_dat_combine,
#'  taxa_of_interest = category_of_interest)
#'
#' @export
#'
vpr_ctdroisize_merge <- function(data, data_mea, taxa_of_interest){

  # avoid CRAN notes
  . <- time_ms <- day <- hour <- roi_ID <- day_hour <- frame_ID <- pressure <- temperature <- salinity <- sigmaT <- fluorescence_mv <- turbidity_mv <- Perimeter <- Area <- width1 <- width2 <- width3 <- short_axis_length <- long_axis_length <- taxa <- NA

data <- data[!duplicated(data$time_ms),]

#get CTD data
data_ctd <- data %>%
  dplyr::mutate(., roi_ID = as.character(time_ms)) %>%
  dplyr::mutate(., day_hour = paste(day, hour, sep = ".")) %>%
  dplyr::mutate(., frame_ID = paste(roi_ID, day_hour, sep = "_")) %>%
  dplyr::select(., frame_ID, pressure, temperature, salinity, sigmaT, fluorescence_mv, turbidity_mv)

#get measurement data
data_mea <- data_mea %>%
  dplyr::mutate(., roi_ID = as.character(time_ms)) %>%
  dplyr::mutate(., frame_ID = paste(roi_ID, day_hour, sep = "_")) %>%
  dplyr::select(., -Perimeter, -Area, -width1, -width2, -width3, -short_axis_length)

#combine measurement and ctd data
data_all <- right_join(data_ctd, data_mea) %>%
  dplyr::filter(., !(is.na(pressure))) %>% #There are NAs at the beginning of CAP3.1 (i.e. measurements that are not in the ctd data)
  dplyr::mutate(., long_axis_length = as.numeric(long_axis_length)) %>%
  dplyr::filter(., taxa %in% taxa_of_interest)

#cut off data below maximum pressure to maintain consistent analysis between stations with varying depths
#data_all <- data_all %>%
  #dplyr::filter(., pressure <= max_pressure)

return(data_all)

}

#' Copy VPR images into folders
#'
#' Organize VPR images into folders based on classifications provided by visual plankton
#'
#' @param basepath A file path to your autoid folder where VP data is stored eg. "C:\\\\data\\\\cruise_XXXXXXXXXXXXXXX\\\\autoid\\\\"
#' @param day character string representing numeric day of interest
#' @param hour character string representing hour of interest
#' @param classifier_type character string representing the type of classifier (either 'svm', 'nn' or 'dual') from Visual Plankton
#' @param classifier_name character string representing name of Visual Plankton classifier
#' @param taxa optional list of character strings if you wish to only copy images from specific classification groups
#'
#' @return organized file directory where VPR images are contained with folders, organized by day, hour and classification,
#' inside your basepath/autoid folder
#'
#' @export
vpr_autoid_copy <- function(basepath, day, hour, classifier_type, classifier_name, taxa){

  folder_names <- list.files(basepath)

  if(!missing(taxa)){
    folder_names <- folder_names[folder_names %in% taxa]
  }

  #check valid folders
  if(length(folder_names) < 1){
    stop('No valid taxa folders found in basepath!')
  }

  day_hour <- paste0('d', day, '.h', hour)
  type_day_hour <- paste0(classifier_type,'aid.', day_hour)

for (i in folder_names) {

  #Get name of folder containing .txt files with roi paths within a category
  dir_roi <- paste(basepath, i, "/", "aid", sep = "")

  #Get names of text files
  txt_roi <- list.files(dir_roi)

  subtxt <- grep(txt_roi, pattern = type_day_hour, value = TRUE)
  txt_roi <- subtxt

  subtxt2 <- grep(txt_roi, pattern = classifier_name, value = TRUE)
  txt_roi <- subtxt2

  for(ii in txt_roi) {

    setwd(dir_roi)

    roi_path_str <- read.table(ii, stringsAsFactors = FALSE)

    #Create a new folder for autoid rois
    roi_folder <- paste(basepath, i, "\\", ii, "_ROIS", sep = "")
    command1 <- paste('mkdir', roi_folder, sep = " ")
    shell(command1)

    #Copy rois to this directory
    for (iii in 1:nrow(roi_path_str)) {

      dir_tmp <- as.character(roi_path_str[iii,1])
      command2 <- paste("copy", dir_tmp, roi_folder, sep = " ")
      shell(command2)

      print(paste(iii, '/', nrow(roi_path_str),' completed!'))
    }

  }

  print(paste(i, 'completed!'))
}

print(paste('Day ', day, ', Hour ', hour, 'completed!'))
}


#'Calculate VPR concentrations
#'
#' Calculates concentrations for each named taxa in dataframe
#'
#' @param data a VPR dataframe as produced by \code{\link{vpr_ctdroi_merge}}
#' @param taxas_list a list of character strings representing taxa present in the station being processed
#' @param station_of_interest The station being processed
#' @param binSize passed to \code{\link{bin_calculate}}, determines size of depth bins over which data is averaged
#' @param imageVolume the volume of VPR images used for calculating concentrations (mm^3)
#'
#' @examples
#'
#' data('ctd_roi_merge')
#' ctd_roi_merge$avg_hr <- ctd_roi_merge$time_ms /3.6e+06
#'
#' taxas_list <- c('Calanus', 'krill')
#' binSize <- 5
#' station_of_interest <- 'test'
#' imageVolume <- 83663
#'
#' taxa_conc_n <- vpr_roi_concentration(ctd_roi_merge, taxas_list,
#' station_of_interest, binSize, imageVolume)
#'
#'@export
#'
#'
vpr_roi_concentration <- function(data, taxas_list, station_of_interest, binSize, imageVolume){

  # avoid CRAN notes
  . <- NA
  # check that taxa exist for this station

  taxa_in_data <- names(data) %in% taxas_list

  valid_taxa <- names(data)[taxa_in_data == TRUE]

  # calculate concentrations
  conc_dat <- list()
  for ( ii in 1:length(valid_taxa)){
    conc_dat[[ii]] <- concentration_category(data, valid_taxa[ii], binSize, imageVolume) %>%
      dplyr::mutate(., taxa = valid_taxa[ii])
  }

  names(conc_dat) <- valid_taxa

  taxa_conc <- do.call(rbind, conc_dat)

  taxa_conc_n <- taxa_conc %>%
    dplyr::mutate(., station = station_of_interest)

  return(taxa_conc_n)
}

#' Binned concentrations
#'
#' This function produces depth binned concentrations for a specified taxa. Similar to \code{\link{bin_cast}} but calculates concentrations for only one taxa.
#' Used inside \code{\link{vpr_roi_concentration}}
#'
#'
#' @param data dataframe produced by processing internal to vpr_roi_concentration
#' @param taxa name of taxa isolated
#' @param binSize passed to \code{\link{bin_calculate}}, determines size of depth bins over which data is averaged
#' @param imageVolume the volume of VPR images used for calculating concentrations (mm^3)
#' @param rev Logical value defining direction of binning, FALSE - bins will be
#'   calculated from surface to bottom, TRUE- bins will be calculated bottom to
#'   surface
#'
#' @details Image volume calculations can change based on optical setting of VPR as well as autodeck setting used to process images
#' For IML2018051 (S2) image volume was calculated as 108155 mm^3 by seascan (6.6 cubic inches)
#' For COR2019002 S2 image volume was calculated as 83663 mm^3 and S3 image volume was calculated as 366082 mm^3
#'
#'
#' @author E. Chisholm
#'
#' @export
concentration_category <- function(data, taxa, binSize, imageVolume, rev = FALSE){
  . <- NA # avoid CRAN notes

  # remove other data rows #ADDED BY KS, DAY HOUR CHANGED TO DAY, HOUR
  nontaxa <-
    c(
      'time_ms',
      'conductivity',
      'temperature',
      'pressure',
      'salinity',
      'sigmaT',
      'fluor_ref',
      'fluorescence_mv',
      'turbidity_ref',
      'turbidity_mv',
      'altitude_NA',
      'day',
      'hour',
      'station',
      'avg_hr',
      'roi',
      'depth'
    )
  dt <- data %>%
    dplyr::select(., nontaxa, taxa)

  # get n_roi of only one taxa
  names(dt) <-
    gsub(names(dt), pattern = taxa, replacement = 'n_roi')

  # format into oce ctd
  ctd_roi_oce <- vpr_oce_create(dt)

  # bin data
  final <- bin_cast(ctd_roi_oce = ctd_roi_oce, imageVolume = imageVolume, binSize = binSize, rev = rev)

  return(final)
}


#' Bin vpr data
#'
#' Formats \code{oce} style VPR data into depth averaged bins using \code{\link{ctd_cast}} and \code{\link{bin_calculate}}
#' This function is used inside \code{\link{concentration_category}}
#'
#'
#' @param ctd_roi_oce \code{oce} ctd format VPR data from \code{\link{vpr_oce_create}}
#' @param binSize passed to \code{\link{bin_calculate}}, determines size of depth bins over which data is averaged
#' @param imageVolume the volume of VPR images used for calculating concentrations (mm^3)
#' @param rev logical value,passed to \code{\link{bin_calculate}} if TRUE, binning will begin at bottom of each cast,
#'   this controls data loss due to uneven binning over depth. If bins begin at
#'   bottom, small amounts of data may be lost at the surface of each cast, if
#'   binning begins at surface (rev = FALSE), small amounts of data may be lost
#'   at bottom of each cast
#'
#' @details Image volume calculations can change based on optical setting of VPR as well as autodeck setting used to process images
#' For IML2018051 (S2) image volume was calculated as 108155 mm^3 by seascan (6.6 cubic inches)
#' For COR2019002 S2 image volume was calculated as 83663 mm^3 and S3 image volume was calculated as 366082 mm^3
#'
#'
#'@return A dataframe of depth averaged bins of VPR data over an entire cast with calculated concentration values
#' @export
#'
#'
bin_cast <- function(ctd_roi_oce, imageVolume, binSize, rev = FALSE){

. <- avg_hr <- conc_m3 <- NA
  #find upcasts
  upcast <- ctd_cast(data = ctd_roi_oce, cast_direction = 'ascending', data_type = 'df')
  upcast2 <- lapply(X = upcast, FUN = bin_calculate, binSize = binSize, imageVolume = imageVolume, rev = rev)
  upcast_df <- do.call(rbind, upcast2)

  #find downcasts
  downcast <- ctd_cast(ctd_roi_oce, cast_direction = "descending", data_type = "df")
  downcast2 <- lapply(X = downcast, FUN = bin_calculate, binSize = binSize, imageVolume = imageVolume, rev = rev)
  downcast_df <- do.call(rbind, downcast2)

  #combine_data in bins
  vpr_depth_bin <- rbind(upcast_df, downcast_df)
  vpr_depth_bin <- data.frame(vpr_depth_bin)

  #Remove infinite concentrations (why do these occur again?)
  vpr_depth_bin <- vpr_depth_bin %>%
    dplyr::mutate(., avg_hr = avg_hr - min(avg_hr)) %>%
    dplyr::filter(., is.finite(conc_m3))

  return(vpr_depth_bin)
}



#' Create ctd oce object with vpr data
#'
#' Formats VPR data frame into \code{oce} format CTD object
#'
#' @author E. Chisholm
#'
#' @param data data frame of vpr data with variable names \itemize{'time_ms', 'fluorescence_mv', 'turbidity_mv', 'n_roi', 'sigmaT'}
#'
#' @examples
#' data('ctd_roi_merge')
#' oce_dat <- vpr_oce_create(ctd_roi_merge)
#'
#' @export
vpr_oce_create <- function(data){

  # create oce objects
  ctd_roi_oce <- oce::as.ctd(data)
  otherVars<-  c('time_ms', 'fluorescence_mv', 'turbidity_mv', 'n_roi', 'sigmaT', 'depth', 'avg_hr') # TODO edit to avoid hard coding variable names
  for ( o in otherVars){
    eval(parse(text = paste0("ctd_roi_oce <- oce::oceSetData(ctd_roi_oce, name = '",o,"', value = data$",o,")")))
  }

  return(ctd_roi_oce)
}

#' Read and format CTD VPR data
#'
#' Acts as a wrapper for \code{\link{ctd_df_cols}}
#'
#' Reads CTD data and adds day, hour, and station information.
#' Calculates sigma T and depth variables from existing CTD data to supplement raw data.
#' If there are multiple hours of CTD data, combines them into single dataframe.
#'
#' **WARNING** \code{\link{ctd_df_cols}} is hard coded to accept a specific
#' order of CTD data columns. The names and values in these columns can change
#' based on the specific instrument and should be updated/confirmed before processing data
#' from a new VPR.
#'
#' @author E. Chisholm & K. Sorochan
#'
#'
#' @param ctd_files full file paths to vpr ctd \code{.dat} files
#' @param station_of_interest VPR station name
#' @param day Day of interest, if not provided will be pulled from file path
#' @param hour Hour of interest, if not provided will be pulled from file path
#' @param col_list Optional list of CTD data column names
#'
#' @examples
#'
#' station_of_interest <- 'test'
#'
#' ctd_files <- system.file("extdata/COR2019002/rois/vpr5/d222", "h03ctd.dat",
#' package = "vprr", mustWork = TRUE)
#'
#' ctd_dat_combine <- vpr_ctd_read(ctd_files, station_of_interest)
#'
#' @export

vpr_ctd_read <- function(ctd_files, station_of_interest, day, hour, col_list){

  # avoid CRAN notes
  . <- NA

if(length(ctd_files) == 0){
  stop('No CTD files provided!')
}
  ctd_dat <- list()
  for (i in 1:length(ctd_files)){

    if(missing(day)){
    day_id <- unlist(vpr_day(ctd_files[i]))
    }else{
      day_id <- day
    }

    if(missing(hour)){
    hour_id <- unlist(vpr_hour(ctd_files[i]))
    }else{
      hour_id <- hour
    }


    station_id <- station_of_interest

    if(missing(col_list)){
      ctd_dat_tmp <- ctd_df_cols(ctd_files[i])
    }else{
        ctd_dat_tmp <- ctd_df_cols(ctd_files[i], col_list)
    }

    ctd_dat[[i]] <- data.frame(ctd_dat_tmp,
                               day = day_id,
                               hour = hour_id,
                               station = station_id,
                               stringsAsFactors = FALSE)
  }


  # combine ctd dat

  ctd_dat_combine <- do.call(rbind, ctd_dat)

  # add calculated vars as default
  # sigma t, depth

  ctd_dat_combine <- ctd_dat_combine %>%
    dplyr::mutate(., sigmaT = oce::swSigmaT(
      ctd_dat_combine$salinity,
      ctd_dat_combine$temperature,
      ctd_dat_combine$pressure
    )) %>%
  dplyr::mutate(., depth = oce::swDepth(ctd_dat_combine$pressure)) # note that default latitude is used (45)


  return(ctd_dat_combine)
}


#'Merge CTD and ROI data from VPR
#'
#'Combines CTD data (time, hydrographic parameters), with ROI information
#'(identification number) into single dataframe, aligning ROI identification
#'numbers and taxa classifications with time and hydrographic parameters
#'
#'@author E. Chisholm & K. Sorochan
#'
#'@param ctd_dat_combine a CTD dataframe from VPR processing from \code{\link{vpr_ctd_read}}
#'@param roi_dat_combine a data frame of roi aid data from \code{\link{vpr_autoid_read}}
#'
#'
#' @examples
#' data('ctd_dat_combine')
#' data('roi_dat_combine')
#'
#' ctd_roi_merge <- vpr_ctdroi_merge(ctd_dat_combine, roi_dat_combine)
#'@export
#'
vpr_ctdroi_merge <- function(ctd_dat_combine, roi_dat_combine){

  # avoid CRAN notes
  . <- roi <- NA
  # First subset ctd data by roi id
  ctd_time <- ctd_dat_combine$time_ms
  roi_time <- as.numeric(roi_dat_combine$time_ms)

  roi_index <- which(ctd_time %in% roi_time)
  ctd_subset <- data.frame(ctd_dat_combine[roi_index, ])


  # Get total number of rois per frame
  taxas <- colnames(roi_dat_combine)[!(colnames(roi_dat_combine) %in% c('time_ms', 'roi'))]
  taxa_col_id <- which(colnames(roi_dat_combine) %in% taxas)
  taxa_subset <- roi_dat_combine[,taxa_col_id]
  n_roi_total <- base::rowSums(taxa_subset)
  roi_dat_2 <- data.frame(roi_dat_combine, n_roi_total)

  # Combine subsetted CTD and roi data
  ctd_subset_roi <- full_join(ctd_subset, roi_dat_2)

  # combine subsetted roi data and all CTD data such that frames with zero rois are included
  ctd_roi_merge <- ctd_subset_roi %>%
    dplyr::right_join(., ctd_dat_combine)

  ctd_roi_merge[is.na(ctd_roi_merge)] <- 0

  ctd_roi_merge <- ctd_roi_merge %>%
    dplyr::mutate(., roi = ifelse(roi == 0, NA, roi))

  return (ctd_roi_merge)
}


#'Read VPR aid files
#'
#'Read aid text files containing ROI string information or measurement data and output as a dataframe
#'
#'Only outputs either ROI string information OR measurement data but both file types must be provided
#'
#'
#' @author E. Chisholm & K. Sorochan
#'
#'@param  file_list_aid a list object of aid text files, containing roi strings. Output from matlab Visual Plankton software.
#'@param file_list_aidmeas  a list object of aidmea text files, containing ROI measurements. Output from matlab Visual Plankton software.
#'@param export a character string specifying which type of data to output, either 'aid' (roi strings) or 'aidmeas' (measurement data)
#'@param station_of_interest Station information to be added to ROI data output, use NA if irrelevant
#'@param opticalSetting Optional argument specifying VPR optical setting. If provided will be used to convert size data into mm from pixels, if missing size data will be output in pixels
#'@param warn Logical, FALSE silences size data unit warnings
#'
#'@note Full paths to each file should be specified
#'
#' @examples
#'
#' station_of_interest <- 'test'
#' dayhour <- c('d222.h03', 'd222.h04')
#'
#' #' #VPR OPTICAL SETTING (S0, S1, S2 OR S3)
#' opticalSetting <- "S2"
#' imageVolume <- 83663 #mm^3
#'
#' auto_id_folder <- system.file('extdata/COR2019002/autoid/', package = 'vprr', mustWork = TRUE)
#' auto_id_path <- list.files(paste0(auto_id_folder, "/"), full.names = TRUE)
#'
#' #'   # Path to aid for each taxa
#' aid_path <- paste0(auto_id_path, '/aid/')
#' # Path to mea for each taxa
#' aidmea_path <- paste0(auto_id_path, '/aidmea/')
#'
#' # AUTO ID FILES
#' aid_file_list <- list()
#' aidmea_file_list <- list()
#' for (i in 1:length(dayhour)) {
#'   aid_file_list[[i]] <-
#'     list.files(aid_path, pattern = dayhour[[i]], full.names = TRUE)
#'   # SIZE DATA FILES
#'   aidmea_file_list[[i]] <-
#'     list.files(aidmea_path, pattern = dayhour[[i]], full.names = TRUE)
#' }
#'
#' aid_file_list_all <- unlist(aid_file_list)
#' aidmea_file_list_all <- unlist(aidmea_file_list)
#'
#'  # ROIs
#' roi_dat_combine <-
#'   vpr_autoid_read(
#'     file_list_aid = aid_file_list_all,
#'     file_list_aidmeas = aidmea_file_list_all,
#'     export = 'aid',
#'     station_of_interest = station_of_interest,
#'     opticalSetting = opticalSetting,
#'     warn = FALSE
#'   )
#'
#' # MEASUREMENTS
#' roimeas_dat_combine <-
#'   vpr_autoid_read(
#'     file_list_aid = aid_file_list_all,
#'     file_list_aidmeas = aidmea_file_list_all,
#'     export = 'aidmeas',
#'     station_of_interest = station_of_interest,
#'     opticalSetting = opticalSetting,
#'     warn = FALSE
#'  )
#'
#' @export
vpr_autoid_read <- function(file_list_aid, file_list_aidmeas, export, station_of_interest, opticalSetting, warn = TRUE){

  # avoid CRAN notes
  . <- roi <- taxa <- n_roi <- day_hour <- Perimeter <- Area <- width1 <- width2 <- width3 <- short_axis_length <- long_axis_length <- NA
if( export == 'aidmeas'){
  if (missing(opticalSetting)){
    opticalSetting <- NA
    if(warn != FALSE){
    warning('No optical setting provided, size data output in pixels!!!')
    }
  }
}
# aid

  col_names <- c("roi")
  dat <- list()
  for(i in 1:length(file_list_aid)) {

    data_tmp <- read.table(file = file_list_aid[i], stringsAsFactors = FALSE, col.names = col_names)

    data_tmp$roi <- unlist(vpr_roi(data_tmp$roi))




    data_tmp$taxa <- unlist(vpr_category(file_list_aid[i]))
    day <- unlist(vpr_day(file_list_aid[i]))
    hour <- unlist(vpr_hour(file_list_aid[i]))
    data_tmp$day_hour <- paste(day, hour, sep = ".")
    dat[[i]]<- data_tmp

  }


  dat_combine_aid <- do.call(rbind, dat)

  #browser()
  remove(dat, data_tmp, day, hour)

# aidmeas


  dat <- list()
  col_names <- c('Perimeter','Area','width1','width2','width3','short_axis_length','long_axis_length')
  for(i in 1:length(file_list_aidmeas)) {

    data_tmp <- read.table(file_list_aidmeas[i], stringsAsFactors = FALSE, col.names = col_names)


if(!is.na(opticalSetting)){
      data_tmp <- px_to_mm(data_tmp, opticalSetting)
}




  data_tmp$taxa <- unlist(vpr_category(file_list_aidmeas[i]))
  day <- unlist(vpr_day(file_list_aidmeas[i]))
  hour <- unlist(vpr_hour(file_list_aidmeas[i]))
  data_tmp$day_hour <- paste(day, hour, sep = ".")
  dat[[i]]<- data_tmp

  }

  dat_combine_aidmeas <- do.call(rbind, dat)

  # remove(dat, data_tmp, day, hour)


# format
  dat_combine_aid$id <- row.names(dat_combine_aid)
  dat_combine_aidmeas$id <- row.names(dat_combine_aidmeas)


    # Get tabulated rois per time by taxa
    roi_df <- dat_combine_aid %>%
      dplyr::mutate(., roi = substr(roi, 1, 8)) %>%
      dplyr::group_by(., taxa, roi) %>%
      dplyr::summarise(., n_roi = n()) %>%
      tidyr::spread(., taxa, n_roi) %>%
      dplyr::mutate(., time_ms = as.numeric(roi))

    roi_dat <- data.frame(roi_df)
    roi_dat[is.na(roi_dat)] <- 0




    # Get roi measurement data frame
    dat_combine_selected <- dat_combine_aidmeas %>%
      dplyr::select(., taxa, day_hour, id, Perimeter, Area, width1, width2, width3, short_axis_length, long_axis_length) #added all measurement columns EC Jan 28 2020

    roimeas_dat_combine <- right_join(dat_combine_aid, dat_combine_selected, by = c('taxa', 'day_hour', 'id') ) %>%
      dplyr::select(., - id) %>%
      dplyr::mutate(., station = station_of_interest) %>%
      dplyr::mutate(., long_axis_length = as.numeric(long_axis_length)) %>%
      dplyr::mutate(., time_ms = as.numeric(substr(roi, 1, 8)))

  #  browser()
# export
  if (export == 'aid'){
    return(roi_dat)
  }

  if (export == 'aidmeas'){
     return(roimeas_dat_combine)

  }



}



#'Get conversion factor for pixels to mm for roi measurements
#'
#'Used internally
#'
#' @details converts pixels to mm using conversion factor specific to optical setting
#'
#' @param x an aidmea data frame (standard) to be converted into mm from pixels
#' @param opticalSetting the VPR setting determining the field of view and conversion factor between mm and pixels
#'
#' @details Options for opticalSetting are 'S0', 'S1', 'S2', or 'S3'
#'
#' @export
px_to_mm <- function(x, opticalSetting) {



  #find correct conversion factor based on VPR optical setting
  if (opticalSetting == 'S0'){
    #px to mm conversion factor
    frame_mm <- 7
    mm_px <- frame_mm/1024 #1024 is resolution of VPR images (p.4 DAVPR manual)
  }
  if (opticalSetting == 'S1'){
    #px to mm conversion factor
    frame_mm <- 14
    mm_px <- frame_mm/1024 #1024 is resolution of VPR images (p.4 DAVPR manual)
  }
  if (opticalSetting == 'S2'){
    #px to mm conversion factor
    frame_mm <- 24
    mm_px <- frame_mm/1024 #1024 is resolution of VPR images (p.4 DAVPR manual)
  }
  if (opticalSetting == 'S3'){
    #px to mm conversion factor
    frame_mm <- 48
    mm_px <- frame_mm/1024 #1024 is resolution of VPR images (p.4 DAVPR manual)
  }
  #original default to S2 setting
  #mm_px <- 24/1024 #mm/pixel

  mm2_px2 <- (mm_px)^2

  x[, c(1, 3:7)] <- x[, c(1, 3:7)]*mm_px
  x[,2] <- x[,2]*mm2_px2

  return(x)

}


#'Read CTD data (SBE49) and Fluorometer data from CTD- VPR package
#'
#'Internal use \code{\link{vpr_ctd_read}}
#'
#'**WARNING** This is hard coded to accept a specific
#' order of CTD data columns. The names and values in these columns can change
#' based on the specific instrument and should be updated before processing data
#' from a new VPR.
#'
#'Text file format .dat file
#'Outputs ctd dataframe with variables time_ms, conductivity, temperature,
#'pressure, salinity, fluor_ref, fluorescence_mv, turbidity_ref,
#'turbidity_mv, altitude_NA
#' @author K. Sorochan, E. Chisholm
#'
#'
#'
#'@param x full filename (ctd .dat file)
#' @param col_list list of CTD data column names
#'
#'@export
ctd_df_cols <- function(x, col_list) {

  if(missing(col_list)){
    col_list <- c("time_ms", "conductivity", "temperature", "pressure", "salinity", "fluor_ref", "fluorescence_mv",
      "turbidity_ref", "turbidity_mv", "altitude_NA")
    warning('CTD data columns named based on 2019 defaults!')
  }


  data <- read.table(textConnection(gsub(":", ",", readLines(x))), sep = ",")
  time <- data[,1]
  time <- as.numeric(gsub("[^[:digit:]]", "", time))


  data2 <- cbind(time, data[,-c(1)])
  colnames(data2) <- col_list
  data2 <- data2[!duplicated(data2), ]
  data2
}



#' Normalize a matrix
#'
#' take each element of matrix dived by column total
#'
#' Make sure to remove total rows before using with VP data
#'
#' @note used internally for visualization of confusion matrices
#'
#' @param mat a matrix to normalize
#'
#'
normalize_matrix <- function(mat){


  nm <- matrix(nrow = dim(mat)[1], ncol = dim(mat)[2])
  for(i in 1:length(nm[,1])){
    for (j in 1:length(nm[1,])){
      nm[i,j] <- mat[i,j]/colSums(mat)[j]
    }
  }
  return(nm)

}

#' Get size data from idsize files
#'
#'
#' useful for getting size distribution of known rois from each taxa. gathers
#' size information from idsize text files produced when training a new
#' classifier in VP (Visual Plankton)
#'
#'
#'@param directory cruise directory eg. 'C:/data/IML2018051/'
#'@param taxa list of character elements containing taxa of interest
#'@param opticalSetting VPR optical setting determining conversion between pixels and millimetres (options are 'S0', 'S1', 'S2', or 'S3')
#'
#' @export


# @examples
# \dontrun{
#
# }

vpr_trrois_size <- function(directory, taxa, opticalSetting){

  #loop for each taxa of interest
  for (t in taxa){
    #check
    # g <- grep(taxa_names, pattern = t)
    # if (length(g) == 0){
    #   stop(paste('Taxa of interest, ', t, 'not found in data provided!'))
    # }
    #
    size_file <- list.files(path = paste0(directory,'/idsize'), pattern = paste0('mea.', t))
    #roi_file <- list.files(path = paste0(directory,'/idsize'), pattern = paste0('hid.v0.',t))

    #Get info
    #roi_ID <- read.table(paste0(directory,'/idsize/', roi_file), stringsAsFactors = FALSE)
    auto_measure_px <- read.table(paste0(directory, '/idsize/', size_file), stringsAsFactors = FALSE, col.names = c('Perimeter','Area','width1','width2','width3','short_axis_length','long_axis_length'))

    eval(parse(text = paste0('auto_measure_', t,'_mm <- px_to_mm(auto_measure_px, opticalSetting )'))) #Convert to mm


  }
  #returns a data frame with size information and named columns
  eval(parse(text = paste0('return(auto_measure_', t,'_mm)')))
}



#' Get bin averages for VPR and CTD data
#'
#' Bins CTD data for an individual cast to avoid depth averaging across tow-yo's
#'
#' @author E. Chisholm, K. Sorochan
#'
#'

#' @param data ctd data frame object including scan, salinity, temperature,
#'   depth, conductivity, time, fluor_ref, turbidity_ref, turbidity_mv,
#'   altitude, cast_id, n_roi
#' @param binSize the height of bins over which to average, default is 1 metre
#' @param imageVolume the volume of VPR images used for calculating concentrations (mm^3)
#' @param rev logical value, if TRUE, binning will begin at bottom of each cast,
#'   this controls data loss due to uneven binning over depth. If bins begin at
#'   bottom, small amounts of data may be lost at the surface of each cast, if
#'   binning begins at surface (rev = FALSE), small amounts of data may be lost
#'   at bottom of each cast
#'
#'
#' @details Image volume calculations can change based on optical setting of VPR as well as autodeck setting used to process images
#' For IML2018051 (S2) image volume was calculated as 108155 mm^3 by seascan (6.6 cubic inches)
#' For COR2019002 S2 image volume was calculated as 83663 mm^3 and S3 image volume was calculated as 366082 mm^3.
#' Used internally ( \code{\link{bin_cast}} ) after \code{\link{ctd_cast}} on a single ascending or descending section of VPR cast
#'
#'
#'
#' @note binSize should be carefully considered for best results
#' @note Depth is used for calculations! Please ensure depth is included in data frame using \link[oce]{swDepth}
#'
#' @export
#'
bin_calculate <- function(data, binSize = 1, imageVolume, rev = FALSE){

# browser()
  cast_id <-unique(data$cast_id)




  cast_id <-unique(data$cast_id)
  max_cast_depth <- max(data$depth) # ADDED BY KS TO IDENTIFY EACH TOWYO CHUNK

  p <- data$depth
  max_depth <- max(p, na.rm = TRUE)
  min_depth <- min(p, na.rm = TRUE)
  x_breaks <- seq(from = floor(min_depth), to = ceiling(max_depth), by = binSize)
  if (rev == TRUE){
    x_breaks <- seq(from = ceiling(max_depth), to = floor(min_depth), by = - binSize) #reversed by KS
  }

  # error when cast is too small
  if(max_depth - min_depth < binSize){
    warning(paste('Cast', cast_id, 'is too small to calculate information for bins of size', binSize))
    data.frame(NULL)
  }else{


  # Get variables of interest using oce bin functions

  min_time_s <- oce::binApply1D(p, data$time/1000, xbreaks = x_breaks, min)$result
  max_time_s <- oce::binApply1D(p, data$time/1000, xbreaks = x_breaks, max)$result
  min_depth <- oce::binApply1D(p, data$depth, xbreaks = x_breaks, min)$result
  max_depth <- oce::binApply1D(p, data$depth, xbreaks = x_breaks, max)$result
  n_roi_bin <- oce::binApply1D(p, data$n_roi, xbreaks = x_breaks, sum)$result
  temperature <- oce::binApply1D(p, data$temperature, xbreaks = x_breaks, mean)$result
  salinity <- oce::binApply1D(p, data$salinity, xbreaks = x_breaks, mean)$result
  density <- oce::binApply1D(p, data$sigmaT, xbreaks = x_breaks, mean)$result
  fluorescence <- oce::binApply1D(p, data$fluorescence_mv, xbreaks = x_breaks, mean)$result
  turbidity <- oce::binApply1D(p, data$turbidity_mv, xbreaks = x_breaks, mean)$result
  avg_hr <- oce::binApply1D(p, data$time/(1000*3600), xbreaks = x_breaks, mean)$result
if (rev == TRUE){

  depth <- rev(oce::binApply1D(p, data$depth, xbreaks = x_breaks, mean)$xmids)

}else{ # simplify?

    depth <- oce::binApply1D(p, data$salinity, xbreaks = x_breaks, mean)$xmids

  }
  # calculates number of frames captured per depth bin by counting number of pressure observations per bin
  n_frames <- oce::binApply1D(p, data$depth, xbreaks = x_breaks, length)$result # KS edit 10/9/19

  # WARNING
  # binApply1D does not calculate NAs, if there is binned depth range that does
  # not contain any data, the binApply function will not create an empty or NA
  # placeholder bin in that case the result length will be different than the
  # length of midpoints since the variable "pressure" is a mid point calculation it is used to
  # test for non existent empty bins. If there are non existant empty bins,
  # binMean1D will calculate them as NA, this loop fin where the bins would
  # have been located and removes those indexes from the pressure vector so the
  # length of variables is all identical

  if (!(length(depth) == length(salinity))) {

    salinity_mean <- binMean1D(p, data$salinity, xbreaks = x_breaks)$result

    idx_rm <- which(is.na(salinity_mean))

    # informs user where bins were removed due to NAs
    # note if a bin is 'NA' typically because there is no valid data in that depth range,
    # if you have a lot of NA bins, think about increasing your binSize
    message(paste('Removed bins at', depth[idx_rm]))

    lp <- length(depth)
    depth <- depth[-idx_rm]
    if (length(n_frames) == lp){
      n_frames <- n_frames[-idx_rm]
    }

  }
  # make sure n_frames matches the length of other data frame rows
  if (length(n_frames) > length(depth)){
    n_frames <- n_frames[-length(n_frames)]
  }
  if( length(n_frames) < length(depth)){
    n_frames <- c(n_frames, 0)
  }
  if (length(n_frames) != length(depth)){
    length(n_frames) <- length(depth)
  }
  # Get derived variables

  time_diff_s <- max_time_s - min_time_s

  # calculate concentration based on opticalSetting

  # "Old way" of calculating concentration assuming constant frame rate of 15 fps
  # conc_m3 <- n_roi_bin/((imageVolume/1e09)*(15)*(time_diff_s)) #

  # "New way" of calculating concentration by summing volume associated with frames over depth bin
  vol_sampled_bin_m3 <- (imageVolume/1e09)*n_frames
  conc_m3 <- n_roi_bin/(vol_sampled_bin_m3) # KS edit 10/9/19

  depth_diff <- max_depth - min_depth

  # Output
  data.frame(depth, min_depth, max_depth, depth_diff, min_time_s, max_time_s, time_diff_s,
             n_roi_bin, conc_m3,
             temperature, salinity, density, fluorescence, turbidity, avg_hr, n_frames, vol_sampled_bin_m3,
             towyo = cast_id, max_cast_depth) # MAX CAST PRESSURE ADDED BY KS
} # end else loop for size error
}


#' Isolate ascending or descending section of ctd cast
#'
#' This is an internal step required to bin data
#'
#'
#' @author  K Sorochan, E Chisholm
#'
#' @param data an \code{oce} ctd object
#' @param cast_direction 'ascending' or 'descending' depending on desired section
#' @param data_type specify 'oce' or 'df' depending on class of desired output
#' @param cutoff Argument passed to \link[oce]{ctdFindProfiles}
#' @param breaks Argument passed to \link[oce]{ctdFindProfiles}
#' @return Outputs either data frame or oce ctd object
#'
#'
#'
#' @note \code{\link{ctdFindProfiles}} arguments for \code{minLength} and \code{cutOff} were updated to
#' prevent losing data (EC 2019/07/23)
#'
#'
#' @export
#'
ctd_cast <- function(data, cast_direction = 'ascending', data_type, cutoff = 0.1, breaks = NULL) {

  cast_updated <- list()


  if (is.null(breaks)){
    cast <- oce::ctdFindProfiles(data, direction = cast_direction, minLength = 0, cutoff = cutoff)
  }else{
    cast <- oce::ctdFindProfiles(data, breaks = breaks, direction = cast_direction)

  }



  # append data with 'cast_id' to be able to identify/ combine data frames
  for(i in 1:length(cast)) {

    data <- cast[[i]]

    n_obs <- length(data@data$pressure)
    cast_number <- i
    cast_id <- paste(cast_direction, i, sep = "_")
    cast_id_vec <- rep(cast_id, n_obs)

    cast_updated[[i]] <- oce::oceSetData(data, "cast_id", cast_id_vec, "no_unit")

  }

  # output in oce format
  if(data_type == "oce") {

    cast_updated

  }

  # output in dataframe
  if(data_type == "df") {

    getDf <- function(x) {

      data.frame(x@data, stringsAsFactors = FALSE)

    }

    lapply(cast_updated, getDf)

  }

}

#' Find day & hour info to match each station of interest for processing
#'
#'
#'  @author E. Chisholm and K. Sorochan
#'
#' @param stations a vector of character values naming stations of interest
#' @param file CSV file containing 'day', 'hour', 'station', and 'day_hour' columns
#'
#' @return Vector of day-hour combinations corresponding to stations of interest
#'
#' @export
vpr_dayhour <- function(stations, file) {

  # avoid CRAN notes
  . <- station <- NA

  #####DEFINE FOR MULTIPLE STATIONS
  stations_of_interest <- stations

  #USE STATION LIST WITH CORRESPONDING DAY AND HOUR TO MATCH AREA OF INTEREST
  #match hour and day to station
  station_info <- read.csv(file, stringsAsFactors = FALSE)

  soi_info <- station_info %>%
    dplyr::filter(., station %in% stations_of_interest)

  dayhour <- unique(soi_info$day_hour)

  return(dayhour)

}



#' Create a list of ctd files to be read
#'
#' Searches through typical VP directory structure
#'
#' Use with caution
#'
#'
#' @param castdir root directory for ctd cast files
#' @param cruise cruise name (exactly as in directory structure)
#' @param day_hour vector of day-hour combinations (e.g, dXXX.hXX)
#' @author E. Chisholm and K. Sorochan
#'
#' @return vector of ctd file paths matching days-hour combinations provided
#'
#' @export
vpr_ctd_files <- function(castdir, cruise, day_hour) {


  # ADDED BY KS
  vpr_cast_folders <- list.files(castdir, pattern = '')

  # find right vpr cast -- subset by tow number #
  # folder <- grep(vpr_cast_folders, pattern = paste0('AD_', vprnum,'.VPR.', cruise,'*'), value = T) #ADDED BY KS, AD PATTERN IS NOT CONSISTENT?

  # not subset by tow number
  folder <- grep(vpr_cast_folders, pattern = paste0('VPR.', cruise,'*'), value = TRUE) # removed leading period before VPR to fit file naming scheme in COR2019002

  if (length(folder) == 0){stop("No CTD files found!")}
  folder_path <- paste0(castdir, folder)

  # grab all days
  full_path <- list.files(folder_path, full.names = TRUE)

  # extract for only specific days
  ctd_files_all <- list.files(full_path, pattern = '*ctd*', full.names = TRUE)

  day_id <- vpr_day(ctd_files_all)
  hour_id <- vpr_hour(ctd_files_all)
  day_hour_id <- paste(day_id, hour_id, sep = ".")

  ctd_files_idx <- which(day_hour_id %in% day_hour)

  ctd_files <- ctd_files_all[ctd_files_idx]

  return(ctd_files)

}


#' Get roi ids from string
#'
#' @author K Sorochan
#' @param x A string specifying directory and file name of roi
#'
#' @return A string of only the 10 digit roi identifier
#'
#' @examples
#'
#' roi_string <- 'roi.0100000000.tif'
#' vpr_roi(roi_string)
#'
#' @seealso \code{\link{vpr_hour}}, \code{\link{vpr_day}}, \code{\link{vpr_category}}
#' @export
#'
#'
vpr_roi <- function(x) {

  m <- gregexpr("\\d{10}", x)

  y <- regmatches(x, m)

  return(y)

}


#' Get taxa ids from string
#'
#' @author K Sorochan
#'
#' @param x A string specifying the directory of the "taxafolder", containing the taxa id
#'
#' @return A string of only the taxa id
#'
#' @examples
#' taxa_string <- 'C:/data/cruise/autoid/Calanus/d000/h00'
#' vpr_category(taxa_string)
#'
#' @seealso \code{\link{vpr_hour}}, \code{\link{vpr_day}}, \code{\link{vpr_roi}}
#' @export
#'
#'
vpr_category <- function(x) {
# TODO if x is a list
  taxa_ids <- c(
    "bad_image_blurry",
    "bad_image_malfunction",
    "bad_image_strobe",
    "Calanus",
    "chaetognaths",
    "ctenophores",
    "Echinoderm_larvae",
    "krill",
    "marine_snow",
    "Other",
    "small_copepod",
    "stick",
    "larval_fish",
    'other_copepods',
    'larval_crab',
    'amphipod',
    'Metridia',
    'Paraeuchaeta',
    'cnidarians'
  )

  for(i in 1:length(taxa_ids)) {

    taxa_id <- taxa_ids[i]

    m_tmp <- gregexpr(taxa_id, x)

    if (m_tmp[[1]][1] > 0) {

      m <- m_tmp

    } else{
      # stop('Taxa ID not found! Check internal list of taxa options!')
    }

  }

  y <- regmatches(x, m)

  return(y)

}



#' Get day identifier
#'
#' @author K Sorochan
#' @param x A string specifying the directory and file name of the size file
#'
#' @return A string of only the day identifier (i.e., "dXXX")
#'
#' @examples
#' day_string <- 'C:/data/cruise/autoid/Calanus/d000/h00'
#' vpr_day(day_string)
#'
#' @seealso \code{\link{vpr_hour}}, \code{\link{vpr_roi}}, \code{\link{vpr_category}}
#' @export
#'
#'
vpr_day <- function(x) {

  m <- gregexpr("[d]+\\d{3}", x)

  y <- regmatches(x, m)

  return(y)

}



vpr_hour <- function(x) {

  #' Get hour identifier
  #'
  #' @author K Sorochan
  #' @param x A string specifying the directory and file name of the size file
  #'
  #' @return A string of only the hour identifier (i.e., "hXX")
  #' @seealso \code{\link{vpr_day}}, \code{\link{vpr_roi}}, \code{\link{vpr_category}}
  #'
  #' @examples
  #' hour_string <- 'C:/data/cruise/autoid/Calanus/d000/h00'
  #' vpr_hour(hour_string)
  #'
  #' @export
  #'
  #'

  m <- gregexpr("[h]+\\d{2}", x)

  y <- regmatches(x, m)

  return(y)

}




vpr_summary <- function(all_dat, fn, tow = tow, day = day, hour = hour){
  #'  Data Summary Report
  #'
  #'  Part of VP easy plot processing, prints data summary report to give quantitative, exploratory analysis of data
  #'
  #' @author E Chisholm
  #' @param all_dat data frame containing VPR and CTD data including time_ms,
  #'   avg_hr, conductivity, temperature, pressure, salinity, fluorescence_mv,
  #'   turbidity_mv, sigmaT
  #' @param fn file name to save data summary, if not provided, summary will print to console
  #' @param tow VPR tow number
  #' @param day julian day
  #' @param hour two digit hour (24 hr clock)
  #'
  #'
  #' @export

  #prints a data summary report, part of VP easyPlot
  if(!missing(fn)){sink(fn)}

  cat('                  Data Summary Report \n')
  cat('Report processed:', as.character(Sys.time()), '\n')
  cat('Cast: ', tow, '   Day: ', day, '   Hour: ', hour, '\n')
  cat('\n')
  cat('\n')
  cat(' >>>>  Time \n')
  cat('Data points: ', length(all_dat$time_ms),'\n')
  cat('Range: ', min(all_dat$time_ms),' - ', max(all_dat$time_ms), ' (ms) \n')
  cat('Range: ', min(all_dat$avg_hr),' - ', max(all_dat$avg_hr), ' (hr) \n')
  cat('\n')
  cat('\n')
  cat(' >>>>  Conductivity \n')
  cat('Data points: ', length(all_dat$conductivity),'\n')
  cat('Range: ', min(all_dat$conductivity),' - ', max(all_dat$conductivity), '  \n')
  cat('\n')
  cat('\n')
  cat(' >>>>  Temperature \n')
  cat('Data points: ', length(all_dat$temperature),'\n')
  cat('Range: ', min(all_dat$temperature),' - ', max(all_dat$temperature), ' (c) \n')
  cat('QC: ', length(all_dat[all_dat$temperature < 0 ]), 'points below zero deg c \n')
  cat('QC: ', length(all_dat[all_dat$temperature > 10]), 'points above ten deg c \n')
  cat('\n')
  cat('\n')
  cat(' >>>>  Pressure \n')
  cat('Data points: ', length(all_dat$pressure),'\n')
  cat('Range: ', min(all_dat$pressure),' - ', max(all_dat$pressure), ' (db) \n')
  cat('QC: ', length(all_dat[all_dat$pressure < 0 ]), 'below zero db \n')
  cat('\n')
  cat('\n')
  cat(' >>>>  Salinity \n')
  cat('Data points: ', length(all_dat$salinity),'\n')
  cat('Range: ', min(all_dat$salinity),' - ', max(all_dat$salinity), ' (PSU) \n')
  cat('QC: ', length(all_dat[all_dat$salinity < 28 ]), 'points below twenty-eight PSU \n')
  cat('QC: ', length(all_dat[all_dat$salinity > 34 ]), 'points above thirty-four PSU \n')
  cat('\n')
  cat('\n')
  cat(' >>>>  Fluorescence \n')
  cat('Data points: ', length(all_dat$fluorescence_mv),'\n')
  cat('Range: ', min(all_dat$fluorescence_mv),' - ', max(all_dat$fluorescence_mv), ' (mv) \n')
  cat('\n')
  cat('\n')
  cat(' >>>>  Turbidity \n')
  cat('Data points: ', length(all_dat$turbidity_mv),'\n')
  cat('Range: ', min(all_dat$turbidity_mv),' - ', max(all_dat$turbidity_mv), ' (mv) \n')
  cat('\n')
  cat('\n')
  cat(' >>>>  ROI count \n')
  cat('Data points: ', length(all_dat$n_roi),'\n')
  cat('Range: ', min(all_dat$n_roi),' - ', max(all_dat$n_roi), ' (counts) \n')
  cat('\n')
  cat('\n')
  cat(' >>>>  Sigma T \n')
  cat('Data points: ', length(all_dat$sigmaT),'\n')
  cat('Range: ', min(all_dat$sigmaT),' - ', max(all_dat$sigmaT), '  \n')
  cat('QC: ', length(all_dat[all_dat$sigmaT < 22 ]), 'points below twenty-two  \n')
  cat('QC: ', length(all_dat[all_dat$sigmaT > 28 ]), 'points above twenty-eight  \n')

  if(!missing(fn)){sink()}

}



#' INTERNAL USE ONLY
#' quick data frame function from github to insert row inside dat frame
#'
#'
#' @param existingDF data frame
#' @param newrow new row of data
#' @param r index of new row
#'
insertRow <- function(existingDF, newrow, r) {


  existingDF[seq(r+1,nrow(existingDF)+1),] <- existingDF[seq(r,nrow(existingDF)),]
  existingDF[r,] <- newrow
  existingDF
}




##check through new aid files and fix errors

#outputs vpr_autoid_check report with any errors


#' Checks manually created aid files for errors
#'
#' Removes any empty aid files after manual reclassification, checks for tow
#' numbers and other metadata to match. Performs check to ensure measurement and
#' ROI files are the same length
#'
#' @author E Chisholm
#'
#' @param basepath basepath to autoid folder eg. C:/data/CRUISENAME/autoid/
#' @param cruise name of cruise which is being checked
#'
#' @return text file (saved in working directory) named CRUISENAME_aid_file_check.txt
#'
#'
#' @export
#'
#'
vpr_autoid_check <- function(basepath, cruise){



  taxa_folders <- list.files(basepath, full.names = TRUE)

  sink(paste0(cruise,'_aid_file_check.txt'))
  # loop through each taxa

  for (i in 1:length(taxa_folders)){
    path <- taxa_folders[i]

    # get all files (aid )
    aid_fns <- list.files(file.path(path, 'aid'), full.names = TRUE)

    #### EMPTY FILE CHECK
    # check for empty files
    empty_ind <- list()
    for (ii in 1:length(aid_fns)){
      fn <- readLines(aid_fns[ii])
      if(length(fn) == 0){
        cat('\n')
        cat(aid_fns[ii], '\n')
        cat('File is empty, please delete! \n')
        cat('\n')

        empty_ind[ii] <- TRUE
      }else{
        empty_ind[ii] <- FALSE
      }
    }
    cat('Empty file check complete for', taxa_folders[i], '\n')

    # automated deleteion of empty files

    empty_aids <- aid_fns[empty_ind == TRUE]

    # find corresponding aid meas files
    # get all files (aidmea )
    aidmea_fns <- list.files(file.path(path, 'aidmea'), full.names = TRUE)

    empty_aidmeas <- aidmea_fns[empty_ind == TRUE]

    # double check that files are empty
    empty_files <- c(empty_aids, empty_aidmeas)

    if (length(empty_files) != 0){ # only if empty files exist
      for (ii in 1:length(empty_files)){

        check <- readLines(empty_files[ii])

        if (length(check) != 0){

          stop('Attempting to delete file which is not empty!')
        }else{
          cat('\n')
          cat('Deleteing empty aid and aidmea files! \n')
          cat(empty_files[ii], 'deleted! \n')

          unlink(empty_files[ii])

        }
      }
    }

    # remove any empty files from data frame before running next check

    aid_fns <- aid_fns[empty_ind == FALSE]

    if(length(aid_fns) != 0){
      #### VPR TOW NUMBER CHECK
      # check that all vpr two numbers are the same


      # read each aid file
      for (ii in 1:length(aid_fns)){
        fn <- readLines(aid_fns[ii])
        # check vpr tow number

        v_loc <- stringr::str_locate(fn, 'vpr')

        vpr_num <- list()
        for (iii in 1:length(v_loc[,1])){
          fn_str <- fn[iii]
          fn_str_split <- stringr::str_split(fn_str, pattern = '\\\\')
          vpr_num[iii] <- fn_str_split[[1]][5]

        }

        # test that they are all the same

        un_num_vpr <- unique(vpr_num)
        if(length(un_num_vpr) > 1){
          cat('\n')
          cat('Warning, multiple vpr tow numbers present in', aid_fns[ii], '\n')
          cat(unlist(un_num_vpr), '\n')

          # get numeric tow numbers
          tow_num <- as.numeric(substr(un_num_vpr,4,  nchar(un_num_vpr)))

          # should be less than 14 if duplicated
          tow_num_final <- tow_num[tow_num <14]

          final_vpr <- paste0('vpr', tow_num_final)
          cat('Changing', aid_fns[ii], '\n')
          cat(final_vpr, '\n')
          cat('\n')
          # put strings back together and save file

          for(iii in 1:length(fn)){
            fn_str <- fn[iii]
            fn_str_split <- stringr::str_split(fn_str, pattern = '\\\\')
            fn_str_split[[1]][5] <- final_vpr
            # paste string back together
            s_str <- paste(fn_str_split[[1]][1], fn_str_split[[1]][2], fn_str_split[[1]][3], fn_str_split[[1]][4], fn_str_split[[1]][5], fn_str_split[[1]][6], fn_str_split[[1]][7], fn_str_split[[1]][8], sep = '\\')

            fn[iii] <- s_str

          }
          write.table(file = aid_fns[ii], fn, quote = FALSE, col.names = FALSE, row.names = FALSE)


        }


      }
      cat('VPR tow number check complete, ', taxa_folders[i], '\n')


      #### SIZE / ROI FILE LENGTH CHECK

      # check that aid and aid mea files are same length
      # find files
      sizefiles <- list.files(paste(taxa_folders[i],'aidmea',sep='\\'), full.names = TRUE)
      roifiles <- list.files(paste(taxa_folders[i],'aid',sep='\\'), full.names=TRUE)

      if(length(sizefiles) != length(roifiles)){
        cat('Mismatched number of size and roi files! \n')
      }

      for (ii in 1:length(sizefiles)){
        s_fn <- readLines(sizefiles[[ii]])
        r_fn <- readLines(roifiles[[ii]])
        if (length(s_fn > 0)){

          if (length(s_fn[[1]]) != length(r_fn[[1]])){
            cat('Warning mismatched file lengths! \n')
            cat(sizefiles[[ii]], ',', roifiles[[ii]], '\n')
            cat(length(s_fn[[1]]), ',', length(r_fn[[1]]), '\n')
          }
        }
      }

      cat('File size check complete, ', taxa_folders[i], '\n')
      cat('\n')
      cat('------------ \n')
      cat('\n')

    }else{
      cat('All files are empty, ', taxa_folders[i], '\n')
      cat('No other checks completed \n')
      cat('\n')
      cat('------------ \n')
      cat('\n')


    } # end if all files are empty for taxa

  }


  sink()

}



#deprecated ----------------------------------------------------------------------------------------------------------------------
getRoiMeasurements <- function(taxafolder, nchar_folder, unit = 'mm', opticalSetting) {

  #' THIS FUNCTION HAS BEEN DEPRECATED
  #'
  #' pull roi measurements from all taxa, all files
  #'
  #' @param taxafolder path to taxa folder (base -- autoid folder)
  #' @param nchar_folder number of characters in basepath
  #' @param unit unit data will be output in, 'mm' (default -- millimetres) or 'px' (pixels)
  #' @param opticalSetting VPR optical setting determining conversion between pixels and millimetres (options are 'S0', 'S1', 'S2', or 'S3')
  #'
  #' @note This function is very finicky, easily broken because it relies on character string splitting.
  #' taxaFolder argument should not end in a backslash, please check output carefully to
  #' ensure taxa names or ROI numbers have been properly sub string'd
  #' @export
  #browser()

  # avoid CRAN notes
  . <- day <- hour <- NA

  .Deprecated('vpr_autoid_read')

  auto_measure_mm_alltaxa_ls <- list()
  # browser()
  for (i in 1:length(taxafolder)) {
    # print(paste( 'i = ',i))
    #find files
    sizefiles <- list.files(paste(taxafolder[i],'aidmea',sep='\\'), full.names = TRUE)
    roifiles <- list.files(paste(taxafolder[i],'aid',sep='\\'), full.names=TRUE)

    #remove dummy files for vpr_manual_classification
    #check for dummy files
    sfd <-  grep(sizefiles, pattern = 'dummy')
    rfd <-  grep(roifiles, pattern = 'dummy')

    if (length(rfd) != 0){
      # print('dummy files removed')
      #remove dummy files from meas consideration to avoid error
      sizefiles <- sizefiles[-sfd]
      roifiles <- roifiles[-rfd]

    }
    #skip for blank taxa
    #browser()
    if(length(roifiles) == 0){
      SKIP = TRUE
      # print(paste(i , ': SKIP == TRUE'))
      #browser()
      #i = i+1
      #find files
      # sizefiles <- list.files(paste(taxafolder[i],'aidmea',sep='\\'), full.names = T)
      # roifiles <- list.files(paste(taxafolder[i],'aid',sep='\\'), full.names=T)

    } else{
      SKIP = FALSE
      # print(paste(i, 'SKIP == FALSE'))

      #prevent mixing of taxa in same list where some hours were not properly being overwirtten
      auto_measure_mm_ls <- list() #moved from before i loop, attempt to correct bug


      for(j in 1:length(sizefiles)) {
        #print(paste('j = ', j))
        sizefile <- sizefiles[j]
        roifile <- roifiles[j]

        ##make sure file will not produce error
        mtry <- try(read.table(sizefile, sep = ",", header = TRUE),
                    silent = TRUE)

        if (class(mtry) != "try-error") {
          # print('try error == FALSE')
          #Get info
          roi_ID <- read.table(roifile, stringsAsFactors = FALSE)
          auto_measure_px <- read.table(sizefile, stringsAsFactors = FALSE, col.names = c('Perimeter','Area','width1','width2','width3','short_axis_length','long_axis_length'))

        } else {
          # print(paste('cannot open roi file from ', taxafolder[i]))
          #      print(roifiles)
          stop(paste("File [", roifile, "] doesn't exist or is empty, please check!"))
        }

        #convert to mm
        if (unit == 'mm'){
          auto_measure_mm_tmp <- px_to_mm(auto_measure_px, opticalSetting) #Convert to mm
        }else{
          #or leave in pixels
          auto_measure_mm_tmp <- auto_measure_px
        }

        #auto_measure_mm$roi_ID <- (roi_ID$V1) #Get roi ids

        #!!!!!!!!!!!!!!!!!!!!!!!!!#
        #!! WARNING HARD CODING !!#
        #!!!!!!!!!!!!!!!!!!!!!!!!!#

        #auto_measure_mm$roi_ID <- substr(auto_measure_mm$roi_ID, nchar(auto_measure_mm$roi_ID)-13, nchar(auto_measure_mm$roi_ID)-4) #Remove path information for rois
        #auto_measure_mm$roi_ID <- lapply(auto_measure_mm$roi_ID, vpr_roi)

        #taxa <- substr(taxafolder[i], nchar_folder + 2, nchar(taxafolder[i])) #Get taxa label
        #taxa <- substr(taxafolder[i], nchar_folder + 1, nchar(taxafolder[i])) #Get taxa label

        #auto_measure_mm$taxa <- rep(taxa, nrow(auto_measure_mm)) #add taxa to dataset


        #day_hour <- substr(sizefile, nchar(sizefile) - 7, nchar(sizefile))
        #auto_measure_mm$day_hour <- rep(day_hour, nrow(auto_measure_mm))
        #saveRDS(auto_measure_mm, paste(taxafolder, "/", "measurements_mm_", taxa[i], ".RDS", sep=""))

        taxafolder_tmp <- taxafolder[i]
        # browser()
        auto_measure_mm <- auto_measure_mm_tmp %>%
          dplyr::mutate(., roi_ID = unlist(lapply(roi_ID$V1, vpr_roi))) %>%
          dplyr::mutate(., taxa = unlist(lapply(taxafolder_tmp, vpr_category))) %>%
          dplyr::mutate(., day = unlist(lapply(sizefile, vpr_day))) %>%
          dplyr::mutate(., hour = unlist(lapply(sizefile, vpr_hour))) %>%
          dplyr::mutate(., day_hour = paste(as.character(day), as.character(hour), sep = ".")) %>%
          dplyr::select(., -day, -hour)

        auto_measure_mm_ls[[j]] <- auto_measure_mm

        if(auto_measure_mm$day[1] == 'd240.h09' ){
          # browser()
          #cat('d240.h09')
          # cat(taxafolder[i], '\n')
          #cat('number of ROIs: ', length(auto_measure_mm$roi_ID), '\n')
          #cat('number of unique ROIs: ', length(unique(auto_measure_mm$roi_ID)), '\n')

        }
        #print(paste('completed', roifiles))

      }

      auto_measure_mm_alltaxa_ls[[i]] <- do.call(rbind, auto_measure_mm_ls)
      #browser()

      # print(paste('completed', taxafolder[i]))
    }
  }



  auto_measure_mm_alltaxa_df <- do.call(rbind, auto_measure_mm_alltaxa_ls)
  #browser()
  return(auto_measure_mm_alltaxa_df)



}

# deprecated ? ----------------------------------------------------------------------------------------------------------


#####PLOTTING FUNCTIONS#####


#' Size Frequency plots for VPR data
#'
#' This uses the \code{\link{hist}} plot function in base R to give a histogram of size (long axis length) frequency within a taxa.
#' \strong{!!WARNING:} this function uses hard coded plot attributes
#'
#' @author K. Sorochan
#'
#' @param x a data frame with columns 'taxa', 'long_axis_length'
#' @param number_of_classes numeric value passed to nclass argument in hist()
#' @param colour_of_bar character value defining colour of plotted bars
#'
#'
#' @export
#'
#'
vpr_plot_sizefreq <- function(x, number_of_classes, colour_of_bar) {

  oldpar <- par(no.readonly = TRUE)
  on.exit(par(oldpar))

  # avoid CRAN notes
  . <- NA
  data <- x
  taxa <- unique(data$taxa)

  for(i in 1:length(taxa)) {

    par(mfrow = c(1,2))

    taxa_id <- taxa[i]

    data_hist <- data %>%
      dplyr::filter(., taxa == taxa_id)

    data_hist2 <- data_hist$long_axis_length

    hist(data_hist2, nclass = number_of_classes, col = colour_of_bar, xlab = "Long axis of bug (mm)", main = taxa_id) #Eventually you will want to loop through taxa

    if(length(taxa) == 1) {

      plot.new()

    }

  }

}


#balloon plot with isopycnals final

#create TS data frame
isopycnal_calculate<- function(sal, pot.temp, reference.p = 0){
  #' Get vector to draw isopycnal lines on TS plot
  #' Used internally to create TS plots
  #' @author E. Chisholm
  #'
  #' @param sal salinity vector
  #' @param pot.temp temperature vector in deg C
  #' @param reference.p reference pressure for calculation, set to 0
  #'
  #'
  #' @note: modified from source:\url{https://github.com/Davidatlarge/ggTS/blob/master/ggTS_DK.R}
  #' @export

  # avoid CRAN notes
  density <- NA

  TS <- expand.grid(
    sal = seq(floor(min(sal, na.rm = TRUE)), ceiling(max(sal, na.rm = TRUE)), length.out = 100),
    pot.temp = seq(floor(min(pot.temp, na.rm = TRUE)), ceiling(max(pot.temp, na.rm = TRUE)), length.out = 100)
  )
  TS$density <- gsw::gsw_rho_t_exact(SA = TS$sal, t = TS$pot.temp, p = reference.p) - 1000 # the function calculates in-situ density, but because potential temperature and a single reference pressure is used the result equals potential density at reference pressure

  # isopycnal labels
  # +- horizontal isopycnals
  h.isopycnals <- subset(TS,
                         sal == ceiling(max(TS$sal)) & # selects all rows where "sal" is the max limit of the x axis
                           round(density,1) %in% seq(min(round(TS$density*2)/2, na.rm = TRUE),
                                                     max(round(TS$density*2)/2, na.rm = TRUE),
                                                     by = .5)) # selects any line where the rounded denisty is equal to density represented by any isopycnal in the plot
  if(nrow(h.isopycnals)>0){
    h.isopycnals$density <- round(h.isopycnals$density, 1) # rounds the density
    h.isopycnals <- aggregate(pot.temp~density, h.isopycnals, mean) # reduces number of "pot.temp" values to 1 per each unique "density" value
  }

  # +- vertical isopycnals
  if(nrow(h.isopycnals)==0){ # if the isopycnals are not +- horizontal then the df will have no rows
    rm(h.isopycnals) # remove the no-line df

    v.isopycnals <- subset(TS, # make a df for labeling vertical isopycnals
                           pot.temp == ceiling(max(TS$pot.temp)) & # selects all rows where "sal" is the max limit of the x axis
                             round(density,1) %in% seq(min(round(TS$density*2)/2),
                                                       max(round(TS$density*2)/2),
                                                       by = .5)) # selects any line where the rounded denisty is equal to density represented by any isopycnal in the plot
    v.isopycnals$density <- round(v.isopycnals$density, 1) # rounds the density
    v.isopycnals <- aggregate(sal~density, v.isopycnals, mean) # reduces number of "pot.temp" values to 1 per each unique "density" value
  }


  return(TS)
}

vpr_plot_TS <- function(x, reference.p = 0, var){

  #' Make a balloon plot against a TS plot
  #'
  #' TS balloon plot with ROI concentration, sorted by taxa
  #' includes isopycnal line calculations
  #'
  #' @author E. Chisholm
  #'
  #' @param x dataframe with temperature, salinity, number of rois (n_roi_bin)
  #' @param reference.p reference pressure (default at 0 for surface)- used to calculate isopycnals
  #' @param var variable on which size of points will be based, eg conc_m3 or n_roi_bin
  #'
  #'
  #'
  #' @note modified from source: \url{https://github.com/Davidatlarge/ggTS/blob/master/ggTS_DK.R}
  #'
  #'@export

# avoid CRAN notes
  p <- NA

  #get isopycnal lines
  sal <-  x$salinity
  pot.temp <-  x$temperature

  # make TS long table
  TS <- expand.grid(
    sal = seq(floor(min(sal, na.rm = TRUE)), ceiling(max(sal, na.rm = TRUE)), length.out = 100),
    pot.temp = seq(floor(min(pot.temp, na.rm = TRUE)), ceiling(max(pot.temp, na.rm = TRUE)), length.out = 100)
  )
  TS$density <- gsw::gsw_rho_t_exact(SA = TS$sal, t = TS$pot.temp, p = reference.p) - 1000 # the function calculates in-situ density, but because potential temperature and a single reference pressure is used the result equals potential density at reference pressure

  #removed isopycnal line labelling scheme so that every isopycnal line could be labelled using different method
  # isopycnal labels for plotting
  # +- horizontal isopycnals
  # h.isopycnals <- subset(TS,
  #                        sal == ceiling(max(TS$sal)) & # selects all rows where "sal" is the max limit of the x axis
  #                          round(density,2) %in% seq(min(round(TS$density*2)/2, na.rm = TRUE),
  #                                                    max(round(TS$density*2)/2, na.rm = TRUE),
  #                                                    by = .5)) # selects any line where the rounded denisty is equal to density represented by any isopycnal in the plot
  # if(nrow(h.isopycnals)>0){
  #   h.isopycnals$density <- round(h.isopycnals$density, 2) # rounds the density
  #   h.isopycnals <- aggregate(pot.temp~density, h.isopycnals, mean) # reduces number of "pot.temp" values to 1 per each unique "density" value
  # }
  #
  # #removing this calculates more isopycnals to be labelled
  # #otherwsie only labels one isopycnal line on plot
  # # +- vertical isopycnals (labels for plotting)
  # #if(nrow(h.isopycnals)==0){ # if the isopycnals are not +- horizontal then the df will have no rows
  #  # rm(h.isopycnals) # remove the no-line df
  #
  #   v.isopycnals <- subset(TS, # make a df for labeling vertical isopycnals
  #                          pot.temp == ceiling(max(TS$pot.temp)) & # selects all rows where "sal" is the max limit of the x axis
  #                            round(density,2) %in% seq(min(round(TS$density*2)/2),
  #                                                      max(round(TS$density*2)/2),
  #                                                      by = .5)) # selects any line where the rounded denisty is equal to density represented by any isopycnal in the plot
  #   v.isopycnals$density <- round(v.isopycnals$density, 2) # rounds the density
  #   v.isopycnals <- aggregate(sal~density, v.isopycnals, mean) # reduces number of "pot.temp" values to 1 per each unique "density" value
  # #}

  #plot
  eval(parse(text = paste0("p <- ggplot()+
                           #isopycnal lines
                           geom_contour(data = TS, aes(x = sal, y = pot.temp, z = density), col = 'grey', linetype = 'solid',
                           breaks = seq(min(round(TS$density*2)/2, na.rm = TRUE), # taking density times 2, rounding and dividing by 2 rounds it to the neares 0.5
                           max(round(TS$density*2)/2, na.rm = TRUE),
                           by = .5)) +
                           geom_text_contour(data = TS, aes(x = sal, y =pot.temp, z= density),binwidth = 0.5, col = 'grey', nudge_x = 0.1)+ #CONTOUR LABELS
                           #roi data sorted by number of rois and taxa
                           geom_point(data = x, aes(x = salinity, y = temperature, size = ", var, "), shape = 21) +
                           scale_size_area(max_size=10)+ #make balloons bigger
                           #label legends
                           labs(size = expression('Concentration /m'^3)) +
                           labs(col = 'Taxa')+
                           #set x axis (ensure scaling to data)
                           scale_x_continuous(name = 'Salinity [PSU]', expand = c(0,0),
                           limits = c(floor(min(x$salinity, na.rm = TRUE)), ceiling(max(x$salinity, na.rm = TRUE)))) + # use range of 'sal' for x axis
                           #set y axis (esure scaled to data)
                           scale_y_continuous(name = expression(paste('potential temperature [ ',degree,' C]')),
                           limits = c(floor(min(x$temperature, na.rm = TRUE)), ceiling(max(x$temperature, na.rm = TRUE)))) +
                           #get rid of grid lines and text formatting
                           theme_classic() + theme(text = element_text(size=14)) "
  )))

  #using geom_text_contour to label isopycnals instead
  # add isopycnal labels if isopycnals run +- horizontal
  # if(exists("h.isopycnals")){
  #   p <- p + geom_text(data = h.isopycnals,
  #                      aes(x = ceiling(max(TS$sal)), y = pot.temp, label = density),
  #                      hjust = "inward", vjust = 0, col = "grey")
  # }
  #
  # # add isopycnal labels if isopycnals run +- vertical
  # if(exists("v.isopycnals")){
  #   p <- p + geom_text(data = v.isopycnals,
  #                      aes(x = sal, y = ceiling(max(TS$pot.temp)), label = density),
  #                      vjust = "inward", hjust = 0, col = "grey")
  # }

  return(p)
}


vpr_plot_TScat <- function(x, reference.p = 0){

  #' Make a balloon plot
  #'
  #' Balloon plot against a TS plot with ROI concentration and sorted by taxa
  #' includes isopycnal line calculations. Version of \code{\link{vpr_plot_TS}}, with only relevant* taxa specified.
  #' *to current analysis and research objectives (See note).
  #'
  #'
  #' @note \strong{WARNING HARD CODED} FOR  5 TAXA, CALANUS, KRILL, ECHINODERM LARVAE, SMALL COPEPOD, CHAETOGNATHS
  #' !! Uses isopycnal labelling method which does not label every contour
  #'
  #' @param x dataframe with temperature, salinity, number of rois named by taxa
  #' @param reference.p reference pressure (default at 0 for surface)- used to calculate isopycnals
  #'
  #'
  #'
  #' @note modified from source: \url{https://github.com/Davidatlarge/ggTS/blob/master/ggTS_DK.R}
  #'
  #'@export

# avoid CRAN notes
  density <- salinity <- temperature <- calanus <- chaetognaths <- small_copepod <- krill <- echinoderm_larvae <- NA

  #get isopycnal lines
  sal <-  x$salinity
  pot.temp <-  x$temperature

  # make TS long table
  TS <- expand.grid(
    sal = seq(floor(min(sal, na.rm = TRUE)), ceiling(max(sal, na.rm = TRUE)), length.out = 100),
    pot.temp = seq(floor(min(pot.temp, na.rm = TRUE)), ceiling(max(pot.temp, na.rm = TRUE)), length.out = 100)
  )
  TS$density <- gsw::gsw_rho_t_exact(SA = TS$sal, t = TS$pot.temp, p = reference.p) - 1000 # the function calculates in-situ density, but because potential temperature and a single reference pressure is used the result equals potential density at reference pressure

  # isopycnal labels for plotting
  # +- horizontal isopycnals
  h.isopycnals <- subset(TS,
                         sal == ceiling(max(TS$sal)) & # selects all rows where "sal" is the max limit of the x axis
                           round(density,1) %in% seq(min(round(TS$density*2)/2, na.rm = TRUE),
                                                     max(round(TS$density*2)/2, na.rm = TRUE),
                                                     by = .5)) # selects any line where the rounded denisty is equal to density represented by any isopycnal in the plot
  if(nrow(h.isopycnals)>0){
    h.isopycnals$density <- round(h.isopycnals$density, 1) # rounds the density
    h.isopycnals <- aggregate(pot.temp~density, h.isopycnals, mean) # reduces number of "pot.temp" values to 1 per each unique "density" value
  }

  # +- vertical isopycnals (labels for plotting)
  if(nrow(h.isopycnals)==0){ # if the isopycnals are not +- horizontal then the df will have no rows
    rm(h.isopycnals) # remove the no-line df

    v.isopycnals <- subset(TS, # make a df for labeling vertical isopycnals
                           pot.temp == ceiling(max(TS$pot.temp)) & # selects all rows where "sal" is the max limit of the x axis
                             round(density,1) %in% seq(min(round(TS$density*2)/2),
                                                       max(round(TS$density*2)/2),
                                                       by = .5)) # selects any line where the rounded denisty is equal to density represented by any isopycnal in the plot
    v.isopycnals$density <- round(v.isopycnals$density, 1) # rounds the density
    v.isopycnals <- aggregate(sal~density, v.isopycnals, mean) # reduces number of "pot.temp" values to 1 per each unique "density" value
  }

  #initialize taxa
  #WARNING HARD CODING, 5 TAXA
  cols <- c('t1' = 'darkorchid3', 't2' = 'deeppink3', 't3' = 'dodgerblue3', 't4' = 'tomato3', 't5' = 'gold3')
  taxas <- c('calanus', 'chaetognaths', 'small_copepod', 'krill', 'echinoderm_larvae')
  #plot
  p <- ggplot()+
    #isopycnal lines
    geom_contour(data = TS, aes(x = sal, y = pot.temp, z = density), col = "grey", linetype = "solid",
                 breaks = seq(min(round(TS$density*2)/2, na.rm = TRUE), # taking density times 2, rounding and dividing by 2 rounds it to the neares 0.5
                              max(round(TS$density*2)/2, na.rm = TRUE),
                              by = .5)) +
    #roi data sorted by number of rois and taxa
    geom_point(data = x, aes(x = salinity, y = temperature, size = calanus, col = 't1' ), shape = 21) +
    #geom_point(data = x, aes(x = salinity, y = temperature, size = marine_snow), shape = 21) +
    #geom_point(data = x, aes(x = salinity, y = temperature, size = stick), shape = 21) +
    geom_point(data = x, aes(x = salinity, y = temperature, size = chaetognaths, col = 't2'), shape = 21) +
    geom_point(data = x, aes(x = salinity, y = temperature, size = small_copepod, col = 't3'), shape = 21) +
    geom_point(data = x, aes(x = salinity, y = temperature, size = krill, col = 't4'), shape = 21) +
    geom_point(data = x, aes(x = salinity, y = temperature, size = echinoderm_larvae, col = 't5'), shape = 21) +
    scale_colour_manual(name = 'Taxas', values = cols, guide = guide_legend(), labels = taxas) +
    scale_size_area(max_size=10)+ #make balloons bigger
    #label legends
    labs(size = 'Number of \n ROIs') +
    #labs(col = 'Taxa')+
    #set x axis (ensure scaling to data)
    scale_x_continuous(name = "salinity", expand = c(0,0),
                       limits = c(floor(min(x$salinity, na.rm = TRUE)), ceiling(max(x$salinity, na.rm = TRUE)))) + # use range of "sal" for x axis
    #set y axis (esure scaled to data)
    scale_y_continuous(name = "potential temperature [C]",
                       limits = c(floor(min(x$temperature, na.rm = TRUE)), ceiling(max(x$temperature, na.rm = TRUE)))) +
    #get rid of grid lines and text formatting
    theme_classic() + theme(text = element_text(size=14))

  # add isopycnal labels if isopycnals run +- horizontal
  if(exists("h.isopycnals")){
    p <- p + geom_text(data = h.isopycnals,
                       aes(x = ceiling(max(TS$sal)), y = pot.temp, label = density),
                       hjust = "inward", vjust = 0, col = "grey")
  }

  # add isopycnal labels if isopycnals run +- vertical
  if(exists("v.isopycnals")){
    p <- p + geom_text(data = v.isopycnals,
                       aes(x = sal, y = ceiling(max(TS$pot.temp)), label = density),
                       vjust = "inward", hjust = 0, col = "grey")
  }

  return(p)
}



vp_plot_matrix <- function(cm, classes, type, addLabels = TRUE, threshold = NULL){
  #' Plots normalized confusion matrix
  #'
  #' @author E. Chisholm
  #'
  #' @param cm Confusion matrix (numeric)
  #' @param classes character list of classes present in confusion matrix
  #'   (ordered)
  #' @param type character value 'NN', 'SVM' or 'Dual', appended to 'Confusion
  #'   Matrix' to create title
  #' @param addLabels logical value whether to add percentage accuracy labels to
  #'   plot (defaults to TRUE)
  #' @param threshold numeric value which determines the minimum value of
  #'   frequency labelled on the plot on a normalized scale of 0-1 (useful for highlighting significant
  #'   disagreement)
  #'
  #' @return  a visualization of the confusion matrix, normalized
  #'
  #' @export

  #check dimensions

  # avoid CRAN notes
  Var1 <- Var2 <- Freq <- NA

  # TODO check that this function runs , removed require(stringr), cant find stringr function

  dimcm <- dim(cm)
  if (dimcm[1] != length(classes) +1){
    stop(' Incorrect dimensions, matrix does not match classes given!')
  }



  #remove total columns
  conf <- cm[1:dimcm[1]-1,1:dimcm[2]-1]
  #create matrix and normalize
  input.matrix.normalized <- data.matrix(normalize_matrix(conf))


  #add labels
  colnames(input.matrix.normalized) = classes
  rownames(input.matrix.normalized) = classes

  #build conf mat
  confusion <- as.data.frame(as.table(input.matrix.normalized))

  #basic plot
  plot <- ggplot(confusion)

  #add data
  p <- plot + geom_tile(aes(x=Var1, y=Var2, fill=Freq)) +  #adds data fill
    theme(axis.text.x=element_text(angle=45, hjust=1)) + #fixes x axis labels
    scale_x_discrete(name="Actual Class") + #names x axis
    scale_y_discrete(name="Predicted Class") + #names y axis
    scale_fill_gradient(breaks=seq(from=-.5, to=4, by=.2)) + #creates colour scale
    labs(fill="Normalized\nFrequency") + #legend title
    #theme(legend.position = "none" ) +  #removes legend
    ggtitle(label = paste(type, 'Confusion Matrix'))

  #accuracy labels along diagonal

  if (addLabels == TRUE){
    #find diagonal values
    acc <- confusion$Freq[confusion$Var1 == confusion$Var2]
    #for each taxa
    for (i in 1:length(unique(confusion$Var1))){
      #add text label
      p <- p + annotate('text', x = i, y = i, #position on diagonal
                        #label with frequency as percent rounded to 2 digits
                        label = paste0(round(acc[i], digits = 2)*100, '%'),
                        #text formatting
                        colour = 'white',
                        size = 3)
    }
  }

  #threshold labels

  if ( !is.null(threshold)){
    for (i in 1:length(confusion$Var1)){
      #if frequency is above threshold
      if (confusion$Freq[i] > threshold ){
        #find x and y locations to plot
        x <- grep(levels(confusion$Var1), pattern = as.character(confusion$Var1[i]) )
        y <- grep(levels(confusion$Var2), pattern = as.character(confusion$Var2[i]) )
        #not already labelled on diagonal
        if( x != y){
          #add text
          p <- p + annotate('text', x = x, y = y,
                            #label - frequency as percent, rounded
                            label = paste0(round(confusion$Freq[i], digits = 2)*100, '%'),
                            #text formatting
                            colour = 'white',
                            size = 3)
        }
      }
    }
  }

  return(p)


  #end of function
}




vpr_plot_histsize <- function(data, param, title = NULL , bw = 0.1, xlim = NULL){
  #' Plot size frequency histogram
  #'
  #'@author E. Chisholm
  #' @param param size parameter of interest (corresponds to sub lists within data argument)
  #' @param data  size data from auto_measure_mm subset into taxas
  #' @param title main title for plot, if left null will default based on parameter and taxa
  #' @param bw bin width, defines width of bars on histogram, defaults to 0.1, decrease for more detail
  #' @param xlim plot xlimit, defaults to min max of data if not provided
  #'
  #'
  #' @note  param options are typically 'Perimeter', 'Area', 'width1','width2',
  #'   'width3', 'short_axis_length', 'long_axis_length'
  #'
  #' @export
  #'



  if (is.null(title)){
    title <- paste(param , ':', data$taxa[1])
  }
  if(is.null(xlim)){
    xlim <- c(min(data[[param]]), max(data[[param]]))
  }
  qplot(data[[param]], geom = 'histogram',
        binwidth = bw,
        main = title,
        xlab = 'length (mm)',
        ylab = 'Frequency',
        fill = I('green'),
        col = I('green'),
        alpha = I(.2),
        xlim = xlim)
}


vp_plot_unkn <- function(cm, classes, threshold = 0, summary = TRUE, sample_size = NULL){

  #' Function to visualize losses to unknown category due to disagreement in Dual classifier
  #'
  #' Makes confusion matrix like plot, where x axis represent SVM classification, y axis represent NN classification
  #' Allows visual summary of data lost to unknown category
  #'
  #'
  #' @param cm dual unknown confusion matrix from VP
  #' @param classes taxa groups in order, from VP
  #' @param threshold minimum value which will be labelled in plot
  #' @param sample_size character string describes the sample size used to train the model being plotted (optional)
  #' @param summary logical to add text summary to plot
  #' E. Chisholm May 2019
  #'
  #'
  #' @export

  # avoid CRAN notes
  Var1 <- Var2 <- Freq <- taxas <- NA
  dimcm <- dim(cm)
  #remove total columns
  conf <- cm[1:dimcm[1]-1,1:dimcm[2]-1]
  #create matrix and normalize
  input.matrix<- data.matrix(conf)


  #add labels
  colnames(input.matrix) = classes
  rownames(input.matrix) = classes

  #build conf mat
  confusion <- as.data.frame(as.table(input.matrix))

  #basic plot
  plot <- ggplot(confusion)

  #add data
  p<- plot + geom_tile(aes(x=Var1, y=Var2, fill=Freq, col = 'black')) +  #adds data fill
    theme(axis.text.x=element_text(angle=90, hjust=1)) + #fixes x axis labels
    scale_x_discrete(name="SVM Class") + #names x axis
    scale_y_discrete(name="NN Class") + #names y axis
    scale_fill_gradient(low = 'orchid', high = 'darkorchid4',breaks=seq(from=-.5, to=4, by=.2)) + #creates colour scale
    #labs(fill="Normalized\nFrequency") + #legend title
    theme(legend.position = "none" ) +  #removes legend
    ggtitle(label = 'Disagreement in Dual Classifier')


  threshold<- 0
  #labels
  for (i in 1:length(confusion$Var1)){
    #if frequency is above threshold
    if (confusion$Freq[i] > threshold ){
      #find x and y locations to plot
      x <- grep(levels(confusion$Var1), pattern = as.character(confusion$Var1[i]) )
      y <- grep(levels(confusion$Var2), pattern = as.character(confusion$Var2[i]) )
      #not already labelled on diagonal

      #add text
      p <- p + annotate('text', x = x, y = y,
                        #label - frequency as percent, rounded
                        label = round(confusion$Freq[i], digits = 2),
                        #text formatting
                        colour = 'white',
                        size = 3)

    }
  }

  #add summary text
  if (summary == TRUE){
    tab <- as.data.frame(
      c(
        'Sample Size' = sample_size , #update for different sizes
        'Total Disagreement' = sum(confusion$Freq),
        'Average loss per taxa' = round(sum(confusion$Freq)/length(taxas), digits = 0)
      )
    )

    # using gridExtra

    p_tab <- tableGrob(unname(tab))
    grid.arrange(p, p_tab, heights = c(1, 0.2))
  }
  return(p)

}




#contour plot with interpolation

vpr_plot_contour <- function(data, var, dup= 'mean', method = 'interp', labels = TRUE, bw = 1){

  #' Interpolated contour plot of particular variable
  #'
  #' Creates interpolated contour plot, can be used as a background for ROI or tow yo information
  #'
  #' @author E. Chisholm
  #'
  #' @param data data frame needs to include avg_hr, depth, and variable of
  #'   choice (var)
  #' @param var variable in dataframe which will be interpolated and plotted
  #' @param dup if method == 'interp'. Method of handling duplicates in interpolation, passed to interp function (options: 'mean', 'strip', 'error')
  #' @param method Specifies interpolation method, options are 'akima', 'interp'
  #'   or 'oce', akima and interp produce identical interpolations, oce uses
  #'   slightly different method (oce is least error prone)
  #' @param labels logical value indicating whether or not to plot contour labels
  #' @param bw bin width defining interval at which contours are labelled
  #'
  #' @export


 # avoid CRAN notes
  x <- y <- z <- NA
  #interpolate
  #use interp package rather than akima to avoid breaking R
  #ref: https://www.user2017.brussels/uploads/bivand_gebhardt_user17_a0.pdf
  if(method == 'akima'){
    interpdf <- akima::interp(x = data$avg_hr, y = data$depth, z = data[[var]], duplicate = dup ,linear = TRUE  )
  }
  if(method == 'interp'){
    interpdf <- interp::interp(x = data$avg_hr, y = data$depth, z = data[[var]], duplicate = dup ,linear = TRUE  )
  }
  if(method == 'oce'){
    interpdf_oce <- oce::interpBarnes(x = data$avg_hr, y = data$depth, z = data[[var]] )
    interpdf <- NULL
    interpdf$x <- interpdf_oce$xg
    interpdf$y <- interpdf_oce$yg
    interpdf$z <- interpdf_oce$zg
  }
  #convert to dataframe
  df <- akima::interp2xyz(interpdf, data.frame = TRUE)

  #zero time values
  df$x <- df$x - min(df$x)

  if(labels == TRUE){
    p <- ggplot(df) +
      geom_tile(aes(x = x, y = y, fill = z)) +
      labs(fill = var) +
      scale_y_reverse(name = 'Depth [m]') +
      scale_x_continuous(name = 'Time [h]') +
      theme_classic() +
      geom_contour(aes(x = x, y = y, z= z), col = 'black') +
      geom_text_contour(aes(x = x, y = y, z= z),binwidth = bw, col = 'white', check_overlap = TRUE, size = 8)+ #CONTOUR LABELS
      scale_fill_continuous(na.value = 'white')
  }else{
    p <- ggplot(df) +
      geom_tile(aes(x = x, y = y, fill = z)) +
      labs(fill = var) +
      scale_y_reverse(name = 'Depth [m]') +
      scale_x_continuous(name = 'Time [h]') +
      theme_classic() +
      geom_contour(aes(x = x, y = y, z= z), col = 'black') +
      scale_fill_continuous(na.value = 'white')
  }
  return(p)
}





# profile plotting

#' Plots VPR profiles of temperature, salinity, density, fluorescence and concentration (by classification group)
#'
#'
#' This plot allows a good overview of vertical distribution of individual classification groups along with reference to hydrographic parameters.
#' Facet wrap is used to create distinct panels for each taxa provided
#'
#' @param taxa_conc_n A VPR data frame with hydrographic and concentration data separated by taxa (from \code{\link{vpr_roi_concentration}})
#' @param taxa_to_plot The specific classification groups which will be plotted, if NULL, will plot all taxa combined
#' @param plot_conc Logical value whether or not to include a concentration plot (FALSE just shows CTD data)
#'
#' @return A gridded object of at least 3 ggplot objects
#' @export

vpr_plot_profile <- function(taxa_conc_n, taxa_to_plot, plot_conc){

  # avoid CRAN notes
  temperature <- depth <- salinity <- fluorescence <- density <- conc_m3 <- pressure <- NA
# plot temp
p <- ggplot(taxa_conc_n) +
  geom_point(aes(x = temperature, y = depth), col = 'red') +
  scale_y_reverse(name = 'Depth [m]')
# plot salinity
p_TS <- p + geom_point(aes(x = (salinity -25), y = depth), col = 'blue') +
  scale_x_continuous(name = expression(paste("Temperature [",degree,"C]")),sec.axis = sec_axis(~ . +25, name = 'Salinity [PSU]')) +
  theme(axis.line.x.bottom = element_line(colour = 'red'),
        axis.ticks.x.bottom = element_line(colour = 'red'),
        panel.background = element_blank(),
        panel.grid = element_blank(),
        axis.line.y = element_line(linetype = 'solid'),
        axis.line.x.top = element_line(colour = 'blue'),
        axis.ticks.x.top = element_line(colour = 'blue'),
        axis.title = element_text(size = 20)
  )
# have to force rescale for multi axes ggplot

# plot fluorescence
p <- ggplot(taxa_conc_n) +
  geom_point(aes(x = fluorescence, y = depth), col = 'green') +
  scale_y_reverse(name = 'Depth [m]')

# plot density
p_FD <- p + geom_point(aes(x = (density  -20) *20, y = depth)) +
  scale_x_continuous(name = 'Fluorescence [mv]',sec.axis = sec_axis(~. /20  +20, name = 'Density')) +
  theme(axis.line.x.bottom = element_line(colour = 'green'),
        axis.ticks.x.bottom = element_line(colour = 'green'),
        panel.background = element_blank(),
        panel.grid = element_blank(),
        axis.line.y = element_line(linetype = 'solid'),
        axis.line.x.top = element_line(colour = 'black'),
        axis.title = element_text(size = 20)
  )
# manual rescale


if(is.null(taxa_to_plot)){
  pp <- ggplot(taxa_conc_n) +
    geom_point(aes(x = depth, y = conc_m3/1000)) + #conversion of m3 to L using default density
    stat_summary_bin(aes(x = depth, y = conc_m3/1000), fun = 'mean', col = 'red', geom = 'line', size = 3)  +
    scale_x_reverse(name = 'Depth [m]') +
    scale_y_continuous(name = expression('ROI L'^'-1')) +
    # ggtitle('Concentrations') +
    theme_classic() +
    theme(strip.text = element_text(size = 18),
          plot.title = element_text(size = 25),
          axis.title = element_text(size = 20))+
    coord_flip()
}else{
# facet wrap plot all taxa, if only one taxa, comment out facet wrap
pp <- ggplot(taxa_conc_n[taxa_conc_n$taxa %in% c(taxa_to_plot),]) +
  geom_point(aes(x = pressure, y = conc_m3/1000)) + #conversion of m3 to L using default density
  stat_summary_bin(aes(x = pressure, y = conc_m3/1000), fun = 'mean', col = 'red', geom = 'line', size = 3)  +
  scale_x_reverse(name = 'Pressure [db]') +
  scale_y_continuous(name = expression('ROI L'^'-1')) +
  # ggtitle('Concentrations') +
  facet_wrap(~taxa, nrow = 1, ncol = length(unique(taxa_conc_n$taxa)), scales = 'free_x')+
  theme_classic() +
  theme(strip.text = element_text(size = 18),
        plot.title = element_text(size = 25),
        axis.title = element_text(size = 20))+
  coord_flip()
}

if(plot_conc == TRUE){
p <- grid.arrange(p_TS, p_FD, pp , widths = c(1, 1, 2), heights = c(2), nrow = 1, ncol = 3)
}else{
  p <- grid.arrange(p_TS, p_FD, widths = c(1, 1), heights = c(2), nrow = 1, ncol = 2)

}
return(p)
}




#### IMAGE ANALYSIS FUNCTIONS ####

#' Explore reclassified images
#'
#' Pull image from reclassified or misclassified files produced during \code{\link{vpr_manual_classification}}
#'
#' @param day Character string, 3 digit day of interest of VPR data
#' @param hour Character string, 2 digit hour of interest of VPR data
#' @param base_dir directory path to folder containing day/hour folders in which misclassified and reclassified files are organized (eg.'C:/VPR_PROJECT/r_project_data_vis/classification files/') which would contain 'd123.h01/reclassified_krill.txt' )
#' @param taxa_of_interest Classification group from which to pull images
#' @param image_dir directory path to ROI images, eg. "E:\\\\data\\\\cruise_IML2018051\\\\", file separator MUST BE "\\\\" in order to be recognized
#'
#' @return folders of misclassified or reclassified images inside image_dir
#' @export
#'
#'
vpr_img_reclassified <- function(day, hour, base_dir, taxa_of_interest, image_dir){

  ####directory where misclassified/reclassified files
  ####base_dir <- 'C:/VPR_PROJECT/r_project_data_vis/classification files/'


  #basepath where ROI images are
  ##image_dir <- 'E:\\data\\cruise_IML2018051\\'
  ##setwd(image_dir)

  #get misclassified/ reclassified images

  day_hour <- paste0('d', day, '.h', hour)


  folder <- list.files(base_dir, pattern = day_hour, full.names = TRUE)

  files <- list.files(folder, pattern = taxa_of_interest)

  #pulls out missclassified files
  #(you only want to look at images
  #you have reclassified as specific taxa - select "n" if pop up appears)

  #only exception (reason to select "y",
  #would be to look specifically at images that you pulled out of a taxa,
  #for example to check the accuracy of an automatic ID scheme)
  tt <- grep(files, pattern = 'misclassified')
  if (length(tt) > 0 ){
    warning(paste('misclassified files found for ', taxa_of_interest))
    ans <- readline('Would you like to include these files? (y/n)
                    *NOTE, looking at misclassified images will show you images that were removed from you taxa of interest
                    during reclassification, this may be useful to get an idea of the accuracy of your automatic
                    classification or which images are confusing your automatic classification.')
    if(ans == 'n'){
      files <- files[-tt]
      folder_name <- "reclassified_ROIS"
    }else{
      files <- files[tt]
      folder_name <- "misclassified_ROIS"
    }
  }

  message(paste('>>>>>', files, 'found for', taxa_of_interest, ' in ', day_hour, '!'))
  message('>>>>> Copying images now!')



  #runs through reclassified files (should only be one)
  for(ii in files) {


    #reads in roi strings
    roi_path_str <- read.table(paste0(base_dir, '/', day_hour, '/', ii), stringsAsFactors = FALSE)

    #sub out for new basepath where rois are located
    #note this is an extra step because I moved the "data" folder from my C drive
    tt<- stringr::str_locate(string = roi_path_str$V1[1], pattern = 'rois')
    sub_roi_path <- substr(roi_path_str$V1, tt[1], nchar(roi_path_str))
    new_roi_path <- file.path(image_dir, sub_roi_path, fsep = "\\")

    #Create a new folder for autoid rois (where images will be stored)
    roi_folder <- file.path(image_dir, taxa_of_interest, folder_name, fsep = "\\")
    command1 <- paste('mkdir', roi_folder, sep = " ")
    shell(command1)

    #Copy rois to this directory
    for (iii in 1:length(new_roi_path)) {

      dir_tmp <- as.character(new_roi_path[iii])
      command2 <- paste("copy", dir_tmp, roi_folder, sep = " ")
      shell(command2)

      print(paste(iii, '/', length(new_roi_path),' completed!'))
    }
message(paste('Images saved to ', roi_folder))
  }






}


vpr_img_depth <- function(data, min.depth , max.depth, roiFolder , format = 'list'){

  #' Explore VPR images by depth bin
  #'
  #' Allows user to pull VPR images from specific depth ranges, to investigate
  #' trends before classification of images into taxa groups
  #'
  #'
  #'
  #' @param data data frame containing CTD and ROI data from
  #'   \code{\link{vpr_ctdroi_merge}}, which also contains calculated variables
  #'   sigmaT and avg_hr
  #' @param min.depth minimum depth of ROIs you are interested in looking at
  #' @param max.depth maximum depth of ROIs you are interested in exploring
  #' @param roiFolder directory that ROIs are within (can be very general eg.
  #'   C:/data, but will be quicker to process with more specific file path)
  #' @param format option of how images will be output, either as 'list' a list
  #'   of file names or 'image' where images will be displayed
  #'
  #' @export
  #'


  #      #### examples
  # #determine range of interest
  # mid <- as.numeric(readline('Minimum depth of interest? '))
  # mad <- as.numeric(readline('Maximum depth of interest? '))
  # #run image exploration
  # roi_files <- vpr_img_depth(all_dat, min.depth = mid, max.depth = mad,
  # roiFolder = paste0('E:/data/IML2018051/rois/vpr', tow ), format = 'list')
  #
  # #copy image files into new directory to be browsed
  # roi_file_unlist <- unlist(roi_files)
  # newdir <- file.path(plotdir, paste0('vpr', tow, 'images_', mid, '_', mad, ''))
  # dir.create(newdir)
  # file.copy(roi_file_unlist, newdir)
  #

  # avoid CRAN notes
  . <- pressure <- roi <- NA

  data_filtered <- data %>%
    dplyr::filter(., pressure >= min.depth) %>%
    dplyr::filter(., pressure <= max.depth)

  if(length(data_filtered$roi) < 1){
    stop('No data exists within this depth range!')
  }


  #search for ROI files based on data
  roi_files <- paste0('roi.', sprintf('%08d', data_filtered$roi), '*')
  roi_file_list <- list()
  options(warn = 1)
  for (i in 1:length(roi_files)){
    roi_file_list[[i]] <- list.files(roiFolder, pattern = roi_files[i], recursive = TRUE, full.names = TRUE)
    if (length(roi_file_list[[i]]) >= 1){
      message(paste('Found', length(roi_file_list[[i]]),' files for ',roi_files[i] ))
    }else{
      warning('No file found in directory (', roiFolder, ')  matching ', roi_files[i])
    }
  }

  if( format == 'list'){
    return(roi_file_list)
  }
  if (format == 'image'){
    for(i in 1:length(roi_file_list)){
      for(ii in 1:length(roi_file_list[[i]])){
        data_roi <- data_filtered %>%
          dplyr::filter(., roi == roi[i])
        meta_str <- paste0('time (hr): ', data_roi$avg_hr[1], '\n temperature: ', data_roi$temperature[1], '\n pressure: ', data_roi$pressure[1], '\n salinity: ', data_roi$salinity[1], '\n')
        pp <-  magick::image_read(roi_file_list[[i]][ii]) %>%
          #print metadata on image
          #image_annotate(text = roi_files[i], color = 'white', size = 10) %>%
          #image_annotate(text = meta_str, color = 'white', location = '-100') %>%
          magick::image_scale(geometry = 'x300')

        print(pp)
        #print metadata
        #cat(paste0(roi_files[i], '\n'))
        #cat( paste0('time (hr): ', data_roi$avg_hr, '\n temperature: ', data_roi$temperature, '\n pressure: ', data_roi$pressure, '\n salinity: ', data_roi$salinity, '\n'))

      }
    }
  }




  #
}


vpr_img_category <- function(data, min.depth , max.depth, roiFolder , format = 'list', taxa_of_interest){

  #' Explore images by depth and classification
  #'
  #' Pulls images from specific depth ranges in specific classification group
  #'
  #'
  #'
  #'
  #'
  #' @param data data frame containing CTD and ROI data from
  #'   \code{\link{vpr_ctdroi_merge}}, which also contains calculated variables
  #'   sigmaT and avg_hr
  #' @param min.depth minimum depth of ROIs you are interested in looking at
  #' @param max.depth maximum depth of ROIs you are interested in exploring
  #' @param roiFolder directory that ROIs are within (can be very general eg.
  #'   C:/data, but will be quicker to process with more specific file path)
  #' @param format option of how images will be output, either as 'list' a list
  #'   of file names or 'image' where images will be displayed
  #' @param taxa_of_interest character string of classification group from which
  #'   to pull images
  #'
  #' @export
  #'
  #  ### examples
  # #determine range of interest
  # mid <- as.numeric(readline('Minimum depth of interest? '))
  # mad <- as.numeric(readline('Maximum depth of interest? '))
  # #run image exploration
  # roi_files <- vpr_img_category(all_dat, min.depth = mid, max.depth = mad,
  # roiFolder = paste0('E:/data/IML2018051/rois/vpr', tow ), format = 'list',
  # taxa = 'Calanus')
  #
  # #copy image files into new directory to be browsed
  # roi_file_unlist <- unlist(roi_files)
  # newdir <- file.path(plotdir, paste0('vpr', tow, 'images_', mid, '_', mad, '_', taxa))
  # dir.create(newdir)
  # file.copy(roi_file_unlist, newdir)
  #
  #

  # avoid CRAN notes
  . <- pressure <- taxa <- roi <- NA
  if(length(taxa_of_interest) > 1){
    stop("Only explore one taxa at a time!")
  }

  data_filtered <- data %>%
    dplyr::filter(., pressure >= min.depth) %>%
    dplyr::filter(., pressure <= max.depth) %>%
    dplyr::filter(., taxa %in% taxa_of_interest)

  if(length(data_filtered$roi) < 1){
    stop('No data exists within this depth and taxa range!')
  }


  #search for ROI files based on data
  roi_files <- paste0('roi.', sprintf('%08d', data_filtered$roi), '*')
  roi_file_list <- list()
  options(warn = 1)
  for (i in 1:length(roi_files)){
    roi_file_list[[i]] <- list.files(roiFolder, pattern = roi_files[i], recursive = TRUE, full.names = TRUE)
    if (length(roi_file_list[[i]]) >= 1){
      print(paste('Found', length(roi_file_list[[i]]),' files for ',roi_files[i] ))
    }else{
      warning('No file found in directory (', roiFolder, ')  matching ', roi_files[i])
    }
  }

  if( format == 'list'){
    return(roi_file_list)
  }
  if (format == 'image'){
    for(i in 1:length(roi_file_list)){
      for(ii in 1:length(roi_file_list[[i]])){
        data_roi <- data_filtered %>%
          dplyr::filter(., roi == roi[i])
        meta_str <- paste0('time (hr): ', data_roi$avg_hr[1], '\n temperature: ', data_roi$temperature[1], '\n pressure: ', data_roi$pressure[1], '\n salinity: ', data_roi$salinity[1], '\n')
        pp <-  magick::image_read(roi_file_list[[i]][ii]) %>%
          magick::image_scale(geometry = 'x300')

        print(pp)
        #print metadata
        cat(paste0(roi_files[i], '\n'))
        cat( paste0('time (hr): ', data_roi$avg_hr[1], '\n temperature: ', data_roi$temperature[1], '\n pressure: ', data_roi$pressure[1], '\n salinity: ', data_roi$salinity[1], '\n'))

      }
    }
  }




  #
}


vpr_img_copy <- function(auto_id_folder, taxas.of.interest, day, hour){
  #' Image copying function for specific taxa of interest
  #'
  #' This function can be used to copy images from a particular taxa, day and hour into distinct folders within the auto id directory
  #' This is useful for visualizing the ROIs of a particular classification group or for performing manual tertiary checks to remove
  #' images not matching classification group descriptions.
  #'
  #'
  #'
  #' @param auto_id_folder eg "D:/VP_data/IML2018051/autoid"
  #' @param taxas.of.interest eg. taxas.of.interest <- c('Calanus')
  #' @param day character, day of interest
  #' @param hour character, hour of interest
  #'
  #' @export

  #This code extracts ROIs from the VPR cast folder into folders corresponding to their autoID (from Visual Plankton)


  #modified for pulling reclassigfied images
  folder_names <- list.files(auto_id_folder)


  folder_names <- folder_names[folder_names %in% taxas.of.interest]



  day_hour <- paste0('d', day, '.h', hour)
  aid_day_hour <- paste0('aid.', day_hour)

  #read all days and hours

  # st_dat <- read.csv('C:/VPR_PROJECT/vp_info/station_names_IML2018051.csv')
  #
  # day_list <- st_dat$day
  # hour_list <- st_dat$hour
  #

  #
  # for(j in 1:length(day_list)){
  #
  #   day <- day_list[[j]]
  #   hour <- hour_list[[j]]
  #
  #   day_hour <- paste0('d', day, '.h', hour)
  #   aid_day_hour <- paste0('aid.', day_hour)


  for (i in folder_names) {

    #Get name of folder containing .txt files with roi paths within a category

    dir_roi <- file.path(auto_id_folder, i, "aid", fsep = "\\")

    #Get names of text files
    txt_roi <- list.files(dir_roi)

    subtxt <- grep(txt_roi, pattern = aid_day_hour, value = TRUE)
    txt_roi <- subtxt

    #subtxt2 <- grep(txt_roi, pattern = clf_name, value = TRUE)
    #txt_roi <- subtxt2

    for(ii in txt_roi) {

      setwd(dir_roi)

      roi_path_str <- read.table(ii, stringsAsFactors = FALSE)

      path_parts <- stringr::str_split(auto_id_folder, pattern = '/')


      auto_ind <- grep(path_parts[[1]], pattern = 'autoid')

      base_path_parts <- path_parts[[1]][-auto_ind]

      basepath <- stringr::str_c(base_path_parts,  collapse = '\\')

      auto_path <- stringr::str_c(path_parts[[1]], collapse = '\\')

      tt<- stringr::str_locate(string = roi_path_str$V1[1], pattern = 'rois')
      sub_roi_path <- substr(roi_path_str$V1, tt[1], nchar(roi_path_str))
      new_roi_path <- file.path(basepath, sub_roi_path, fsep = '\\')

      #Create a new folder for autoid rois
      roi_folder <- file.path(auto_path, i, paste0(ii, "_ROIS"), fsep = "\\")
      command1 <- paste('mkdir', roi_folder, sep = " ")
      shell(command1)

      #Copy rois to this directory
      for (iii in 1:length(new_roi_path)) {

        dir_tmp <- as.character(new_roi_path[iii])
        command2 <- paste("copy", dir_tmp, roi_folder, sep = " ")
        shell(command2)

        print(paste(iii, '/', length(new_roi_path),' completed!'))
      }

    }

    print(paste(i, 'completed!'))
  }

  print(paste('Day ', day, ', Hour ', hour, 'completed!'))
  # }

}

vpr_img_check <- function(folder_dir, basepath){

  #' Remove ROI strings from aid and aidmeas files based on a manually organized folder of images
  #'
  #' Should be used after \code{\link{vpr_img_copy}}, and manual image removal from created folders
  #'
  #'
  #'
  #' @param folder_dir directory path to day hour folders containing manually
  #'   reorganized images of a specific taxa eg.
  #'   'C:/data/cruise_IML2018051/krill/images/' where that folder contains
  #'   '......d123.h01/' which contains manually sorted images of krill
  #' @param basepath directory path to original Visual Plankton files, specified
  #'   down to the classification group. eg.
  #'   'C:/data/cruise_IML2018051/autoid/krill'
  #'@export
  #'
# this function can be used to edit aid and aidmeas files based on the images contained in a folder
  #useful if images were reorganized into classifciation groups manually in file explorer and then
  #user wants to translate this reorganization into data files


##part two of krill check to remove erroneus images

#after having copied krill images into folder using get_autoid_mod.R

#then manually removing any images which were not Krill

#once this is run, processing and plotting can be done

#E. Chisholm Sept 2019

  #if not supplied, assume folders are the same
  if(missing(basepath)){basepath <- folder_dir}

stfolders <- list.files(folder_dir, full.names = TRUE)

for (i in 1:length(stfolders)){ #for each day/hour loop

  krill_ver <- list.files(stfolders[i])


  #find matching original files
  #dh_ind <- stringr::str_locate(stfolders[i], pattern = 'aid.d')

  #dayhr <- substr(stfolders[i], dh_ind[2], nchar(stfolders[i])-5 )

  day <- vpr_day(stfolders[i])
  hour <- vpr_hour(stfolders[i])

  dayhr <- paste0('d', day, '.h', hour)

  #aid and aidmea
  aid_fns <- list.files(paste0(basepath, '/aid'), pattern = dayhr, full.names = TRUE)

  aidmeas_fns <- list.files(paste0(basepath, '/aidmea'), pattern =  dayhr, full.names = TRUE)

  #get indexes of aids which are not matched in krill_check and remove

  #read in aid file

  aid <- read.table(aid_fns ,stringsAsFactors = FALSE)

  #get roi substring
  #aid_old_gen <- substr(aid$V1, nchar(aid$V1) - 17, nchar(aid$V1))
  #aid_old_gen <- trimws(aid_old_gen, which = 'both')

  aid_old_gen <- vpr_roi(aid$V1)

  #find index of images which are in maually verified folder

  ver_ind <- aid_old_gen %in% krill_ver


  ver_ind_unl <- grep(ver_ind, pattern = 'FALSE')

  #remove any ROI number which do not match verified index
  aid_new <- aid$V1[ -ver_ind_unl]



  #read in aidmea file
  aidMea_old <- read.table(aidmeas_fns)

  #use same verified index to subset aidmea file
  aidMea_new <- aidMea_old[-ver_ind_unl, ]


  #check that files match

  if( length(aidMea_new$V1) != length(aid_new)){
    stop('roi and size files do not match!')
  }

  #print new aid and aidmea files

  write.table(file = aidmeas_fns, aidMea_new, sep = "    ", quote = FALSE, col.names = FALSE, row.names = FALSE)

  write.table(file = aid_fns, aid_new, quote = FALSE, col.names = FALSE, row.names = FALSE)

  cat(' \n')
  cat(aidmeas_fns, 'updated! \n')
  cat(aid_fns, 'updated! \n')
  cat('\n')

}
}

Try the vprr package in your browser

Any scripts or data that you put into this service are public.

vprr documentation built on Nov. 2, 2020, 5:07 p.m.