R/preprocessing.R

Defines functions preprocessing

Documented in preprocessing

#' Preprocessing VMS data
#'
#' Calculates required variables to calculate the efficiency indicators and make plots.
#'
#' @param vmsdata VMS dataset.
#' @param catchdata Dataset with landings.
#' @param vesseldata Vessel information.
#' @param cutoff_dc Minimum distance to the shoreline (in nautical miles) 
#' to identify a fishing trip.
#' @param cutoff_time Maximum time interval (in hours) allowed between 
#' arrival time and landing time.
#' @param min_trip_time Minimum trip time allowed (in hours). 
#' @return The VMS database with quantities calculated.
#' @export
preprocessing = function(vmsdata, catchdata, vesseldata, cutoff_dc = 5, cutoff_time = 8, min_trip_time = 5) {

  # Check column names:
  vmscols = c('EMPRESA', 'NAVE', 'CAPBODEGA', 'MATRICULA', 'LON', 'LAT', 'FECHA', 'HORA', 'DIST_COAST')
  catchcols = c('EMPRESA', 'EMBARCACION', 'CAPBODEGA', 'Fecha_Arribo', 'Hora_Arribo', 'Pesca_Descargada')
  vmstest = vmscols %in% colnames(vmsdata)
  catchtest = catchcols %in% colnames(catchdata)
  if(!(all(vmstest))) {
    stop(paste0('Column(s) ', paste(vmscols[!vmstest], collapse = ', '), ' in vmsdata are missing.'))
  }
  if(!(all(catchtest))) {
    stop(paste0('Column(s) ', paste(catchcols[!catchtest], collapse = ', '), ' in catchdata are missing.'))
  }

  # Create time column:
  vmsdata$TIME = paste(vmsdata$FECHA, vmsdata$HORA)
  vmsdata$TIME = lubridate::parse_date_time(vmsdata$TIME, c("%d/%m/%Y %H:%M:%S", "%d/%m/%y %H:%M"))
  vmsdata$EMB_NOMBRE = gsub(pattern = '[[:space:]]', replacement = '_', x = vmsdata$NAVE)

  catchdata$TIME2 = paste(catchdata$Fecha_Arribo, catchdata$Hora_Arribo) # intentar con fecha de zarpe luego
  catchdata$TIME = lubridate::parse_date_time(catchdata$TIME2, c("%m/%d/%Y %H:%M:%S"))
  catchdata$EMB_NOMBRE = gsub(pattern = '[[:space:]]', replacement = '_', x = catchdata$EMBARCACION)

  #Vessel info:
  listVessel = vesseldata$MATRICULA

  indData = list()
  list_ind = 1
  for(i in seq_along(listVessel)) {
    
    # select vessel i
    tmpData = dplyr::filter(vmsdata, MATRICULA == listVessel[i])
    name_vessel = unique(tmpData$EMB_NOMBRE)[1]
    catch_tmpData = dplyr::filter(catchdata, EMB_NOMBRE == name_vessel)

    if(nrow(catch_tmpData) == 0) {
      warning(paste0("No hay informacion de desembarques para la embarcacion ", name_vessel))
    }

    # order by time:
    tmpData = tmpData[order(tmpData$TIME), ]
    catch_tmpData = catch_tmpData[order(catch_tmpData$TIME), ]
    
    # find trip index
    tmpData$TRIP_IND = find_trip(ind_vec = tmpData$DIST_COAST, cutoff = cutoff_dc)
    
    # Find distances per trip:
    nTrips = max(tmpData$TRIP_IND)
    tmpData$DISTANCE_I_I1 = NA
    tmpData$TRIP_DISTANCE = NA
    tmpData$LANDING = NA
    tmpData$DISTANCE_FST_ALL = NA
    tmpData$DISTANCE_LST_ALL = NA
    tmpData$TRIP_TIME = NA
    tmpData$DIFF_IND_TRIP = NA
    tmpData$DISTANCE_RECT_LINE = NA
    tmpData$VELOCITY = NA
    tmpData$CG_LON = NA
    tmpData$CG_LAT = NA
    tmpData$CG_DC = NA
    tmpData$MIN_TIME = NA
    
    for(j in 1:nTrips) {
      
      tmpData_2 = dplyr::filter(tmpData, TRIP_IND == j)
      
      # only for trip with more than 1 row:
      if(nrow(tmpData_2) > 1) {
        
        # tiempo de viaje:
        tmpData_2$TRIP_TIME = difftime(max(tmpData_2$TIME), 
                                       min(tmpData_2$TIME), units = 'hours')
        tmpData_2$TRIP_TIME = as.numeric(tmpData_2$TRIP_TIME, units = 'hours')

        # find landing value:
        diffTimes = difftime(max(tmpData_2$TIME), catch_tmpData$TIME, units = 'hours')
        diffTimes = as.numeric(diffTimes, units = 'hours')
        posSel = which(diffTimes < 0)[1]
        tmpData_2$DIFF_IND_TRIP = diffTimes[posSel] # time difference selected btw landing and vms data
        tmpData_2$LANDING = catch_tmpData$Pesca_Descargada[posSel] # hacerlo con pesca descargada
        
        lonlatPoints = tmpData_2[,c('LON', 'LAT')]
        distMat = raster::pointDistance(lonlatPoints, lonlat=TRUE)
        distMat = (distMat/111000)*60 # units: nm. assuming 1 degree is 111 km
        dind = row(distMat) - col(distMat)
        
        tmpData_2$DISTANCE_I_I1 = c(split(distMat, dind)$`1`, NA)
        tmpData_2$DISTANCE_FST_ALL = distMat[,1]
        tmpData_2$DISTANCE_LST_ALL = distMat[nrow(distMat),]
        tmpData_2$DISTANCE_RECT_LINE = mean(max(tmpData_2$DISTANCE_FST_ALL, na.rm = TRUE),
                                            max(tmpData_2$DISTANCE_LST_ALL, na.rm = TRUE))
        tmpData_2$TRIP_DISTANCE = sum(tmpData_2$DISTANCE_I_I1, na.rm = TRUE)
        
        point_time_diff = difftime(tmpData_2$TIME[2:nrow(tmpData_2)], 
                                   tmpData_2$TIME[1:(nrow(tmpData_2)-1)], units = 'hours')
        point_time_diff = as.numeric(point_time_diff, units = 'hours')
        tmpData_2$VELOCITY = tmpData_2$DISTANCE_I_I1/c(as.numeric(point_time_diff), NA)
        cent_grav = 1/tmpData_2$VELOCITY
        cent_grav[is.infinite(cent_grav)] = 10 # max weight, assuming velocity = 0.1
        cent_grav = cent_grav/sum(cent_grav,na.rm = TRUE)
        tmpData_2$CG_LON = sum(tmpData_2$LON*cent_grav, na.rm = TRUE)
        tmpData_2$CG_LAT = sum(tmpData_2$LAT*cent_grav, na.rm = TRUE)
        tmpData_2$CG_DC = sum(tmpData_2$DIST_COAST*cent_grav, na.rm = TRUE)
        tmpData_2$MIN_TIME = min(tmpData_2$TIME)

        # save data:
        indData[[list_ind]] = tmpData_2
        list_ind = list_ind + 1
        
      }
      
    }
    
    print(paste0(i, ": Analizando datos para embarcacion ", name_vessel))
    
  }

  indData2 = data.table::rbindlist(indData)

  # Limpieza de datos: Eliminar los que no tienen datos de 
  outData = indData2[(!is.na(indData2$LANDING) & indData2$DIFF_IND_TRIP <= cutoff_time & indData2$TRIP_TIME >= min_trip_time & 
                            indData2$TRIP_DISTANCE <= 11*indData2$TRIP_TIME), ]

  veriData = indData2[indData2$TRIP_DISTANCE > 11*indData2$TRIP_TIME, ]

  outList = list(checked = outData, to_verify = veriData)

  return(outList)

}
gmoroncorrea/VMSIndicators documentation built on Dec. 20, 2021, 11:50 a.m.