R/marfissci.process.data.R

marfissci.process.data <- function(df, agg.minutes=2,
                                   agg.field = "RND_WEIGHT_KGS",
                                   agg.by = "SPECIES_CODE",
                                   agg.by.year =T,
                                   save.RDS = T,
                                   save.CSV = F,
                                   save.SHP = F,
                                   name.det = NULL,
                                   out.folder = "marfissci",
                                   output = "RDS"){
  #' MMM - Apr 6, 2016
  #' This function takes the input raw MARFIS data and summarizes it at
  #' "agg.minutes" of a degree.  The default output is a (spatially-enabled) rds file
  #' which can be plotted in R without any additional processing.  By default,
  #' the rds file is saved locally, and csv and shapefiles can also be generated
  #' if desired.  Alternatively, a plain (non-spatial) data frame can be returned instead.
  #'
  #' Because of the aggregation that occurs in this function, much detail will
  #' be lost - the original coordinates are rounded, and the data contained in
  #' the column "agg.field" is grouped by the field specified in "agg.by".
  #' The mean, count and sum are calculated for all "agg.field" values that
  #' contributed to each aggregated position.
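  #'
  #' Note: this function assumes the spatial packages it relies on are already
  #' available in the calling environment - sp (coordinates, CRS, spTransform,
  #' over), rgeos (gBuffer) and rgdal (writeOGR) - along with aegis
  #' (coastline.db) and whichever companion package supplies
  #' project.datadirectory.  A minimal (untested) sketch for interactive use:
  # library(sp)
  # library(rgeos)
  # library(rgdal)
  # library(aegis)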

  agg = agg.minutes/60
  writeLines(paste0("Aggregating data to ",agg.minutes," minutes of a degree...
                    "))
  #SPATIAL IDENTIFICATION OF IMPROPERLY POSITIONED CATCH (i.e. on land)
  #use coastline to find those points that are inland
  #Anything other than NA is inland
  proj.metric = paste('+proj=aea +lat_1=20 +lat_2=60 +lat_0=23 +lon_0=-96',
                      '+x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m')
  if (!exists("coast.aea")) {
    
    writeLines("Building the coastline...
               ")
    coast.aea = aegis::coastline.db( DS="gshhg coastline highres",
                            crs=proj.metric,
                            p=NULL, level=1, xlim=NULL, ylim=NULL )
    #coast.aea<<-spTransform(coast,CRS(proj.metric))
  }
  buff = agg*78847  #meters in 1 degree of longitude @45N
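  #78847 m matches the length of one degree of longitude at 45N on the WGS84
  #ellipsoid: (pi/180) * 6378137 * cos(45*pi/180) / sqrt(1 - 0.00669438 * sin(45*pi/180)^2)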
  if (!exists("coast.aea.buff")) {
    writeLines(paste0("With the selected aggregation level (",agg.minutes," minutes), the coastline is being
buffered internally by ",round(buff/1000,1), " km.  Data for this project is centered around ~45N,
and at this latitude, the aggregate values for any points within this buffer
will still overlay the ocean.  Beyond this distance, aggregate points will lie
entirely on the land.
"))
    coast.aea.buff <<- gBuffer(coast.aea, byid=TRUE, width=-1*buff)
  }


  df$VALIDITY = "VALID"  #default validity assumed good - overwrite if otherwise
  #flag data with unknown positions (assigned 0N,0W by marfissci.get.data)
  noNA = c("LAT", "LON", agg.field) # fields where NA will be replaced with 0
  df[noNA][is.na(df[noNA])] <- 0

  df$VALIDITY[which(df$LAT == 0 | df$LON == 0)] = "NO COORDS"
  df$VALIDITY[which(df$LAT > 90 | df$LAT < -90)] = "BAD COORDS"
  df$VALIDITY[which(df$LON > 180 | df$LON < -180)] = "BAD COORDS"
  if (NROW(df[df$VALIDITY=='VALID',])==0) return(NULL) #can't position anything!
  #create temporary spatial df for terrestrial check - this loses the lat and
  #lon fields, so we add them back in for later convenience
  df.sp = df[which(df$VALIDITY=='VALID'),]
  coordinates(df.sp) = c("LON", "LAT")
  proj4string(df.sp) = CRS("+proj=longlat +datum=WGS84")
  df.sp@data$LON = coordinates(df.sp)[,1]
  df.sp@data$LAT = coordinates(df.sp)[,2]
  df.sp.proj.metric= spTransform(df.sp,CRS(proj.metric))
  writeLines(paste0("Finding points that exceed the allowable distance from the sea for this
aggregation level (",round(buff/1000,1), " km), and marking them as 'INLAND'.
"))

   df.sp.proj.metric@data$INLAND =over(df.sp.proj.metric,coast.aea.buff)
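  #(over() returns, for each point, the index of the inwardly-buffered coastline
  #polygon containing it, or NA for points at sea or within 'buff' of the coast)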
#   df = cbind(df,INLAND = df.sp.proj.metric@data$INLAND)

  df.sp.proj.metric@data$VALIDITY[which(!is.na(df.sp.proj.metric@data$INLAND))] = "INLAND"
  df.sp.proj.metric@data$INLAND = NULL
  #reorder columns for the imminent rbind - the expected column order depends on
  #which dataset (out.folder) is being processed
  if (out.folder=="marfissci"){
    df.sp.proj.metric@data = df.sp.proj.metric@data[,c(1:12,16,15,13,14)]
  }else if (out.folder=="cl") {
    df.sp.proj.metric@data = df.sp.proj.metric@data[,c(8,7,1:6)]
  }else if (out.folder=="whalesitings") {
    df.sp.proj.metric@data = df.sp.proj.metric@data[,c(1,2,6,5,3,4)]
  }else{
    writeLines(paste0("Unrecognized out.folder ('",out.folder,"') - can't reorder the columns for merging"))
    return(NULL)
  }
  df = rbind(df[!df$VALIDITY=='VALID',],df.sp.proj.metric@data)
  #  round the coordinates to the desired aggregation so we can group by the new
  # 'points', giving a lot less data to process
  df$LATAGG = (round(df$LAT/agg)*agg)-0.25*agg
  df$LONAGG = (round(df$LON/agg)*agg)+0.25*agg
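  #e.g. with the default agg.minutes=2 (agg ~ 0.0333 degrees), a latitude of
  #44.6512 becomes round(44.6512/0.0333)*0.0333 - 0.25*0.0333 ~ 44.658, so every
  #record falling within the same 2-minute cell collapses onto one coordinate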
  if(NROW(df[df$VALIDITY!="VALID",])>0) {
    writeLines(paste0(
      NROW(df[df$VALIDITY!="VALID",]), " of the ", NROW(df)," total records (summing to ",sum(df[df$VALIDITY!="VALID",][,c(agg.field)])," of the
aggregated field) were poorly positioned.  These either had no positions, or were
determined to be terrestrial.  This total amount will be applied proportionately
across all other validly positioned data in a new field called 'WEIGHTED'.
"))
  }else{
    writeLines(paste0("No invalid data was found (i.e. all positions were reasonably close to
the ocean). Despite this, a new field called 'WEIGHTED' will be output, but will
simply be populated with identical values as ",agg.field))
  }
  #Beyond this step, we will lose all initial details - e.g. catch usages,
  #reported units, gear types etc will be combined
  if (agg.by.year){
    df.agg = as.data.frame(as.list(aggregate(list("AGG_FIELD"=df[,c(agg.field)]),
                                             by=list(
                                               YEAR_FISHED = df$YEAR_FISHED,
                                               AGG_BY_FIELD = df[,c(agg.by)],
                                               VALIDITY = df$VALIDITY,
                                               LAT = df$LATAGG,
                                               LON = df$LONAGG),
                                             FUN=function(x) c(MEAN =round(mean(x),4),
                                                               CNT=round(length(x),4),
                                                               SUM = round(sum(x),4))
    )))
  }else{
    df.agg = as.data.frame(as.list(aggregate(list("AGG_FIELD"=df[,c(agg.field)]),
                                             by=list(
                                               AGG_BY_FIELD = df[,c(agg.by)],
                                               VALIDITY = df$VALIDITY,
                                               LAT = df$LATAGG,
                                               LON = df$LONAGG),
                                             FUN=function(x) c(MEAN =round(mean(x),4),
                                                               CNT=round(length(x),4),
                                                               SUM = round(sum(x),4))
    )))
  }
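  #aggregate() returns the MEAN/CNT/SUM triplet as a single matrix column named
  #AGG_FIELD; wrapping the result in as.data.frame(as.list(...)) flattens it
  #into separate columns (AGG_FIELD.MEAN, AGG_FIELD.CNT, AGG_FIELD.SUM), which
  #are renamed further below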
  #rm(df)


  if(NROW(df[df$VALIDITY!="VALID",])>0) {
    valid.data=df.agg[df.agg$VALIDITY=="VALID",]
    #get the proportion of the total valid sum attributable to each (valid) location
    valid.data$PROP = valid.data$AGG_FIELD.SUM / sum(valid.data$AGG_FIELD.SUM)
    invalid.data = df.agg[!df.agg$VALIDITY=="VALID",]
    invalid.data$PROP = 0
    invalid.total = sum(invalid.data$AGG_FIELD.SUM)
    valid.data$WEIGHTED = (valid.data$PROP * invalid.total) + valid.data$AGG_FIELD.SUM
    invalid.data$WEIGHTED = 0
    df.agg = rbind(valid.data,invalid.data)
    df.agg$PROP = NULL
    rm(valid.data)
    rm(invalid.data)
  } else {
    #no invalid records, so 'WEIGHTED' is simply a copy of the aggregated sum
    df.agg$WEIGHTED = df.agg$AGG_FIELD.SUM
  }
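  #worked example of the weighting above (hypothetical numbers): two valid
  #locations with AGG_FIELD.SUMs of 10 and 30 have PROPs of 0.25 and 0.75; with
  #an invalid.total of 20, their WEIGHTED values are 10 + 0.25*20 = 15 and
  #30 + 0.75*20 = 45, so the overall total (60) is preserved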


  #make a descriptive name so we know what we've got
  if (!is.null(name.det)){
    name.detail=name.det
  }else{
    name.detail=""
  }
  agg.type=paste0(substr(agg.by, 1, 4),"_")
  if (range(df.agg$AGG_BY_FIELD)[1] == range(df.agg$AGG_BY_FIELD)[2]) {
    agg.file = paste0(range(df.agg$AGG_BY_FIELD)[1],"_")
  }else{
    agg.file = paste0(paste(range(df.agg$AGG_BY_FIELD),collapse = "_"),"_")
  }
  if (agg.by.year){
    if (range(df.agg$YEAR_FISHED)[1] == range(df.agg$YEAR_FISHED)[2]) {
      years.file = paste0(range(df.agg$YEAR_FISHED)[1],"_")
    }else{
      years.file = paste0(paste(range(df.agg$YEAR_FISHED),collapse = "_"),"_")
    }
  }else{
    years.file =""
  }
  agg.amt=paste0(agg.minutes,"min")
  the.filename = paste0(name.detail,agg.type,agg.file,years.file,agg.amt)
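  #e.g. with the defaults (agg.by="SPECIES_CODE", agg.by.year=T, agg.minutes=2,
  #name.det=NULL), data for a single species (612) and year (2010) would produce
  #a filename like "SPEC_612_2010_2min"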

  #do any field renaming before generating files
  colnames(df.agg)[colnames(df.agg) == 'AGG_BY_FIELD'] <- agg.by
  colnames(df.agg) <- gsub("AGG_FIELD\\.", "", colnames(df.agg))
  colnames(df.agg)[colnames(df.agg) == 'MEAN'] <- paste0("MEAN_",agg.field)
  colnames(df.agg)[colnames(df.agg) == 'CNT'] <- paste0("CNT_",agg.field)
  colnames(df.agg)[colnames(df.agg) == 'SUM'] <- paste0("SUM_",agg.field)

  df.agg.sp = df.agg[which(df.agg$VALIDITY!="BAD COORDS"),]
  coordinates(df.agg.sp) = c("LON", "LAT")
  proj4string(df.agg.sp) = CRS("+proj=longlat +datum=WGS84")

  ldir = project.datadirectory("aegis", "data", out.folder)
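  #NOTE: the "rds", "csv" and "shapes" subfolders of ldir are assumed to exist
  #already; saveRDS/write.csv/writeOGR below will fail if they do not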

  if (save.RDS) {
    fn = file.path( ldir, "rds", paste(the.filename,"rds", sep="." ) )
    saveRDS(df.agg.sp, file=fn )
    writeLines(paste0("rds file written to ", fn))
  }

  if (save.CSV) {
    fn = file.path( ldir, "csv", paste(the.filename, "csv", sep="." ) )
    write.csv(df.agg, file=fn )
    writeLines(paste0("csv file written to ", fn) )
  }

  if(save.SHP) {
    fnloc = file.path( ldir, "shapes" )
    writeOGR(df.agg.sp, dsn=fnloc, layer=the.filename, driver='ESRI Shapefile', overwrite_layer=TRUE)
    writeLines(paste0("shp file written to ", fnloc, the.filename, ".*"))
  }
  if (output == "RDS"){
    the.output = df.agg.sp
  } else{
    the.output = df.agg
  }
    return(the.output)
}
#df = read.csv(paste0(project.datadirectory("mpa"),"/csv/raw_2010_612.csv")) #gets raw data
#test = marfissci.process.data(df, agg.minutes=2, agg.by="SPECIES_CODE", save.RDS=F) #aggregates it
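#a hypothetical variant of the call above (untested sketch) that skips the rds,
#writes a csv, and returns the plain (non-spatial) aggregated data frame:
#test.csv = marfissci.process.data(df, agg.minutes=5, save.RDS=F, save.CSV=T, output="CSV")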