R/autoValidateWQPsites.r

Defines functions autoValidateWQPsites

Documented in autoValidateWQPsites

#' Auto-validate USEPA WQP stations
#'
#' Performs auto-validation on previously queried WQP stations combined with the existing master site list.
#' Auto-validates any new sites (i.e. those not currently in the master site list) and appends them into existing master site list flagged for acceptance, rejection, or review.
#' Checks for any new site types in new data. A warning message and a list of new site types is printed if new site types are encountered.
#' Also re-auto-validates any sites in the master site list that have previously only undergone auto-validation (to account for any changes in auto validation process) and checks existing master site list for changes in AUs, selected site types, or property boundaries.
#' 
#' @param sites_object Sites object queried from WQP to be reviewed.
#' @param master_site_file Full path and filename of master site list as generated by autoValidateWQPsites function and manual site review application. Also contains a sheet of waterbody types and their associated IR_FLAGs (.xlsx).
#' @return Exports a new, undated master site list to the location & filename provided by the user.

#' @import sp
#' @import lwgeom
#' @import sf
#' @import wqTools
#' @importFrom plyr rbind.fill

#' @export
autoValidateWQPsites=function(sites_object,master_site_file,correct_longitude=FALSE){

#########
###TESTING SETUP
#library(sp)
#library(sf)
#sites_object=read.csv("C:/Users/jvander/Documents/R/irTools-test-16/01-raw-data/sites-2019-04-04.csv")
#master_site_file="P:/WQ/Integrated Report/IR_2020/2020-IR-assessments/spatial_files/IR_master_site_file-including-manual-edits.xlsx"
#waterbody_type_file = "C:/Users/jvander/Documents/R/irTools-test-16/00-lookup-tables/waterbody_type_domain_table.csv"
#correct_longitude=FALSE
########


# Polygon intersection function
  intpoly <- function(polygon, sites_object, sites){
    isect=suppressMessages({suppressWarnings({st_intersection(sites, st_difference(st_make_valid(polygon)))})})
    st_geometry(isect)=NULL
    check=dim(sites_object)[1]
    sites_object=merge(sites_object,isect,all.x=TRUE)
    if(dim(sites_object)[1]!=check){
      stop("Spatial join and merge causing duplicated values.")
    }
    return(sites_object)
  }

print("Reading in sites_file and master_sites_file and checking for new waterbody types...")

# Read in master site table and waterbody type domain tables
waterbody_table = as.data.frame(readxl::read_xlsx(master_site_file, sheet = "waterbodyTypeTable"))
master_site = as.data.frame(readxl::read_xlsx(master_site_file, sheet = "masterSiteTable"))
master_site = dplyr::na_if(master_site, "NA")

# Read in WQP station and results data
stn = sites_object
stn[stn==""]=NA #Make sure all blanks are NA
stn=unique(stn)

# Turn any factors to character vectors for merging w/ master sites
i <- sapply(stn, is.factor)
stn[i] <- lapply(stn[i], as.character)

# Ensure waterbody type table does not contain any unreviewed waterbody types
if(any(is.na(waterbody_table$MonitoringLocationTypeName)|waterbody_table$MonitoringLocationTypeName=="")){
  stop("Monitoring location type table missing IR_FLAG data. Fill out table before proceeding.")
}
# Remove past InData column
waterbody_table=waterbody_table[,!names(waterbody_table)%in%"InData"]

# Add waterbody types from sites file to domain table
stn_site_types=data.frame(unique(stn$MonitoringLocationTypeName))
colnames(stn_site_types)="MonitoringLocationTypeName"
stn_site_types$InData="Y"
waterbody_merge <- merge(waterbody_table,stn_site_types, all=TRUE)

# Prompt user to fill out domain table before moving on to autovalidation.
if(dim(waterbody_merge)[1]>dim(waterbody_table)[1]){
  write.csv(waterbody_merge,"waterbody_type_table_new.csv",row.names = FALSE)
  new <- dim(waterbody_merge)[1]-dim(waterbody_table)[1]
  stop(paste(new,"new monitoring location types detected and added to domain table. Update IR_FLAG column in domain table before proceeding with autovalidation tool."))}else{print("No new monitoring location types detected.")
 }

site_type_keep <- subset(waterbody_merge$MonitoringLocationTypeName, waterbody_merge$IR_FLAG=="ACCEPT")

# Ensure no duplicates in master site file which could cause erroneous duplication during merges.
master_site=unique(master_site)
orig_master <- dim(master_site) # original count of the number of records in master_site
print(paste(orig_master[1],"total master site records in file."))

ms_dim=dim(master_site)[1]
if(dim(master_site)[1]>0){
  master_site[master_site==""]=NA
  master_site[master_site=="NA"]=NA} #Make sure all blanks are NA

if(length(master_site$ValidationType[is.na(master_site$ValidationType)==TRUE])>0){
  stop("NA's detected in ValidationType column of master_site_file. Correct NA's before re-running autovalidateWQPsites.")
}
suppressWarnings({class(stn$HorizontalAccuracyMeasure.MeasureValue)="numeric"}) #Non-numeric values introduced to this column were causing appearance of duplicates in master_site_file
suppressWarnings({class(master_site$HorizontalAccuracyMeasure.MeasureValue)="numeric"})

# Identify any new sites (all review columns == NA) and move to new data frame (stn_new)
stn2=merge(stn, master_site[,c('UID','OrganizationIdentifier','OrganizationFormalName','ProviderName','MonitoringLocationIdentifier','MonitoringLocationName','MonitoringLocationTypeName')], all.x=TRUE)
stn_new=stn2[is.na(stn2$UID),]
dim(stn_new)
print(paste(dim(stn_new)[1],"sites found in sites_file not present in master_site_file."))
if(dim(stn_new)[1]==0){
  readline(prompt="Press [enter] to continue with master_site autovalidation or [esc] to end function.")
}else{readline(prompt="Press [enter] to continue.")}

rm(stn2)


#Assign UID to new sites
if(dim(master_site)[1]>0&dim(stn_new)[1]>0){
	stn_new$UID=seq(1:dim(stn_new)[1])+max(master_site$UID)
}else{
	if(dim(stn_new)[1]>0){
		stn_new$UID=seq(1:dim(stn_new)[1])}
	}

# Read in polygons
au_poly=wqTools::au_poly
au_poly=au_poly[,c("ASSESS_ID","AU_NAME","AU_Type")]
bu_poly=wqTools::bu_poly
bu_poly=bu_poly[,!names(bu_poly) %in% 'Status']
names(bu_poly)[names(bu_poly)=='bu_class']='BEN_CLASS'
ss_poly=wqTools::ss_poly
ss_poly=ss_poly[,"ss_descrp"]
names(ss_poly)[names(ss_poly)=='ss_descrp']='ss_R317Descrp'
ut_poly=wqTools::ut_poly
ut_poly=ut_poly[,"jurisdict"]
#suppressWarnings({ut_poly=st_buffer(ut_poly, 0)})
gsl_poly=subset(au_poly, AU_NAME=='Gilbert Bay' | AU_NAME=='Gunnison Bay' | AU_NAME=='Farmington Bay' | AU_NAME=='Bear River Bay')
gsl_poly$gsl='y'
wmu_poly=wqTools::wmu_poly

##################################
####Master site reviews
if(dim(master_site)[1]>0){
	#Kick master sites that were previously accepted or merged, but are now at a non-assessed site type etc. to AUTO validation (convert review type to AUTO)
	#(where they will be rejected, doing this here keeps them out of later master site spatial count based checks)
		
  print("Performing master site reviews...")
  
	#Site type
	table(master_site$ValidationType)
	master_site$ValidationType[!master_site$MonitoringLocationTypeName %in% site_type_keep]="AUTO"
	table(master_site$ValidationType)

	# ID master sites where fundamental attributes have changed (MonitoringLocationTypeName, Lat/Long has been updated in WQP, send them to auto review
	stn2=stn[,c('OrganizationIdentifier','OrganizationFormalName','ProviderName','MonitoringLocationIdentifier','MonitoringLocationName','MonitoringLocationTypeName', 'LatitudeMeasure', 'LongitudeMeasure')]
	names(stn2)=c('OrganizationIdentifier','OrganizationFormalName','ProviderName','MonitoringLocationIdentifier','MonitoringLocationName','type', 'lat', 'long')
	mst2=merge(master_site, stn2, all.x=T)
	mst2=subset(mst2,
		(!is.na(type) & MonitoringLocationTypeName!=type) |
		(!is.na(LatitudeMeasure) & round(LatitudeMeasure, 5)!=round(lat, 5)) |
		(!is.na(LongitudeMeasure) & round(LongitudeMeasure, 5)!=round(long, 5))
	)
	master_site$ValidationType[master_site$UID %in% mst2$UID]="AUTO"
	rm(mst2, stn2)
	table(master_site$ValidationType)
		
	
	#Remove ASSESS_ID, & AU_NAME cols from master_site before re-assigning (this updates master list in case polygons change)
	master_site=master_site[,!names(master_site)%in%c("ASSESS_ID","AU_NAME","AU_Type","BEN_CLASS","R317Descrp","ss_R317Descrp","Water_Type", "Mgmt_Unit")]
	class(master_site$IR_FLAG)
	dim(master_site)
	
	#Generate spatial sites object
	sites=master_site
	sites$lat=ifelse(is.na(sites$IR_Lat), sites$LatitudeMeasure, sites$IR_Lat)
	sites$long=ifelse(is.na(sites$IR_Long), sites$LongitudeMeasure, sites$IR_Long)
	sites=sites[,c('UID','lat','long')]
	coordinates(sites)=c("long","lat")
	proj4string(sites)=CRS("+init=epsg:4326")
	sites=st_as_sf(sites, remove = FALSE)
	
	#Intersect sites w/ Utah poly, AU poly, BU poly, and SS poly 
	master_site <- intpoly(ut_poly,master_site, sites)
	master_site <- intpoly(au_poly,master_site, sites)
	master_site <- intpoly(bu_poly,master_site, sites)
	master_site <- intpoly(ss_poly,master_site, sites)
	master_site <- intpoly(gsl_poly,master_site, sites)
	master_site <- intpoly(wmu_poly,master_site, sites)
	
	rm(sites)

	#Send to AUTO review by is.na(STATE_NAME)
	table(master_site$ValidationType)
	master_site$ValidationType[is.na(master_site$jurisdict)]="AUTO"
	table(master_site$ValidationType)
	master_site=master_site[,!names(master_site) %in% "jurisdict"]
	
	#Send to AUTO review by is.na(AU)
	table(master_site$ValidationType)
	master_site$ValidationType[is.na(master_site$ASSESS_ID)]="AUTO"
	table(master_site$ValidationType)

	#Send to AUTO review if GSL
	table(master_site$ValidationType)
	master_site$ValidationType[master_site$gsl=='y']="AUTO"
	table(master_site$ValidationType)
	master_site=master_site[,!names(master_site) %in% "gsl"]

	#Send to AUTO where 	MonitoringLocationTypeName is a canal type & AU_Type!="Canal"
	master_site$ValidationType[
		(master_site$MonitoringLocationTypeName=="Stream: Canal" & master_site$AU_Type != "Canal")|
		(master_site$MonitoringLocationTypeName=="Stream: Ditch" & master_site$AU_Type != "Canal")|
		(master_site$MonitoringLocationTypeName=="Canal Transport" & master_site$AU_Type != "Canal")|
		(master_site$MonitoringLocationTypeName=="Canal Drainage" & master_site$AU_Type != "Canal")|
		(master_site$MonitoringLocationTypeName=="Canal Irrigation" & master_site$AU_Type != "Canal")
		]="AUTO"
	table(master_site$ValidationType)

	#Send to AUTO where MonitoringLocationTypeName is a stream or spring type & AU_Type!="River/Stream" or "Canal"
	master_site$ValidationType[
		(master_site$MonitoringLocationTypeName=="Stream" & master_site$AU_Type != "River/Stream")&
		(master_site$MonitoringLocationTypeName=="Stream" & master_site$AU_Type != "Canal") |
		(master_site$MonitoringLocationTypeName=="River/Stream" & master_site$AU_Type != "River/Stream")&
		(master_site$MonitoringLocationTypeName=="River/Stream" & master_site$AU_Type != "Canal")|
		(master_site$MonitoringLocationTypeName=="River/Stream Intermittent" & master_site$AU_Type != "River/Stream")&
		(master_site$MonitoringLocationTypeName=="River/Stream Intermittent" & master_site$AU_Type != "Canal")|
		(master_site$MonitoringLocationTypeName=="River/Stream Perennial" & master_site$AU_Type != "River/Stream")&
		(master_site$MonitoringLocationTypeName=="River/Stream Perennial" & master_site$AU_Type != "Canal")|
		(master_site$MonitoringLocationTypeName=="Spring" & master_site$AU_Type != "River/Stream")&
		(master_site$MonitoringLocationTypeName=="Spring" & master_site$AU_Type != "Canal")
		]="AUTO"
	table(master_site$ValidationType)

	#Send to AUTO where MonitoringLocationTypeName is a lake type & AU_Type!="Reservoir/Lake"
	master_site$ValidationType[
		(master_site$MonitoringLocationTypeName=="Lake, Reservoir, Impoundment" & master_site$AU_Type != "Reservoir/Lake")|
		(master_site$MonitoringLocationTypeName=="Lake" & master_site$AU_Type != "Reservoir/Lake")|
		(master_site$MonitoringLocationTypeName=="Reservoir" & master_site$AU_Type != "Reservoir/Lake")
		]="AUTO"
	table(master_site$ValidationType)

	mast_autval <- length(master_site$ValidationType[master_site$ValidationType=="AUTO"])
	mast_man <- length(master_site$ValidationType[master_site$ValidationType=="MANUAL"])
	new_autval <- length(stn_new[,1])
	if(mast_autval+mast_man!=orig_master[1]){
	  stop("Dimensions of master_site have changed following master site reviews. Function/file check needed.")
	}
	#Send master sites that have only undergone AUTO validation or were flagged for AUTO review above to stn_new (i.e. re-auto review those sites to account for any changes in automated review process)
	stn_new=plyr::rbind.fill(stn_new,master_site[master_site$ValidationType=="AUTO",c(names(stn_new), 'IR_Lat', 'IR_Long', 'Note')])
	dim(stn_new)
	table(stn_new$HorizontalCoordinateReferenceSystemDatumName)
	
	#Remove stn_new sites from master_site
	dim(master_site)
	master_site=master_site[!master_site$UID%in%stn_new$UID,]
	dim(master_site)
	dim(stn_new)
	print(paste(mast_autval,"master site(s) and", new_autval,"new site(s) sent to AUTO review.",mast_man,"master site(s) have undergone previous manual review and have not been flagged for further review."))
	readline(prompt="Press [enter] to continue.")
}else{
  mast_autval <- length(master_site$ValidationType[master_site$ValidationType=="AUTO"])
  mast_man <- length(master_site$ValidationType[master_site$ValidationType=="MANUAL"])
  new_autval <- dim(stn_new)[1]
}



######################################
######################################

print("Performing attribute based site checks...")

#Stop execution if there are no new sites
if(dim(stn_new)[1]==0){stop("No new sites identified. This is suspicious. Double check inputs. Coding modification may be required.",call.=FALSE)}

#Correct positive longitudes (if correct_longitude==TRUE) (JV note, moved to apply to all stations that will undergo auto review - stn_new)
if(correct_longitude==TRUE){
  stn_new$LongitudeMeasure[stn_new$LongitudeMeasure>0]<- -stn_new$LongitudeMeasure[stn_new$LongitudeMeasure>0]
}

# Create IR specific columns, all values filled w/ "REVIEW"
stn_new[,c("IR_MLID","IR_MLNAME","IR_FLAG","IR_COMMENT")] = "REVIEW"


##Auto review new sites & master sites re-flagged to AUTO...
rej_reasons_att=data.frame(matrix(nrow=0,ncol=2))

# If [MonitoringLocationDescriptionText] contains "Duplicate","Replicate","Dummy","replaced","Blank","QA", or "QC", reject as QAQC
reason_n = ifelse(grepl("Duplicate",stn_new$MonitoringLocationDescriptionText) | grepl("Replicate",stn_new$MonitoringLocationDescriptionText) | grepl("Dummy",stn_new$MonitoringLocationDescriptionText) | 
                        grepl("replaced",stn_new$MonitoringLocationDescriptionText) | grepl("Blank",stn_new$MonitoringLocationDescriptionText) | grepl("QA",stn_new$MonitoringLocationDescriptionText) | 
                        grepl("QC",stn_new$MonitoringLocationDescriptionText),"Attributes indicate dup, rep, blank, dummy, or QAQC site",NA)

if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

# If [OrganizationIdentifier] is test or demo, reject site.
reason_n=ifelse(stn_new$OrganizationIdentifier%in%c("OST_SHPD_TEST","DEMOTEST"),"Organization identifier indicates test/demo",NA)
if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

# # Reject datums that != NAD27, NAD83, or WGS84 or is UNKWN
# reason_n=ifelse(!stn_new$HorizontalCoordinateReferenceSystemDatumName %in% c('NAD27','NAD83','WGS84'),"Horizontal datum not NAD27, NAD83, or WGS84 and/or is unknown",NA)
# if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject sites with ConstructionDateText populated
reason_n=ifelse(!is.na(stn_new$ConstructionDateText),"Construction date text populated",NA)
if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject sites with WellDepthMeasure.MeasureValue populated
reason_n=ifelse(!is.na(stn_new$WellDepthMeasure.MeasureValue),"Well depth measure populated",NA)
if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject sites with WellDepthMeasure.MeasureUnitCode populated
reason_n=ifelse(!is.na(stn_new$WellDepthMeasure.MeasureUnitCode),"Well depth measure unit code populated",NA)
if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject sites with WellHoleDepthMeasure.MeasureValue populated
reason_n=ifelse(!is.na(stn_new$WellHoleDepthMeasure.MeasureValue),"Well hole depth measure populated",NA)
if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject sites with WellHoleDepthMeasure.MeasureUnitCode populated
reason_n=ifelse(!is.na(stn_new$WellHoleDepthMeasure.MeasureUnitCode),"Well hole depth measure unit code populated",NA)
if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject sites with AquiferName populated
reason_n=ifelse(!is.na(stn_new$AquiferName),"Aquifer name populated: associated with unassessed wells",NA)
if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject sites with FormationTypeText populated
reason_n=ifelse(!is.na(stn_new$FormationTypeText),"Formation type populated: associated with unassessed wells",NA)
if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject sites with AquiferTypeName populated
reason_n=ifelse(!is.na(stn_new$AquiferTypeName),"Aquifer type name populated: associated with unassessed wells",NA)
if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject sites where MonitoringLocationTypeName !%in% site_type_keep argument
reason_n=ifelse(!stn_new$MonitoringLocationTypeName%in%site_type_keep,"Non-assessed site type",NA)
if(length(reason_n)>0){rej_reasons_att=rbind(rej_reasons_att,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

names(rej_reasons_att)=c("MonitoringLocationIdentifier","Reason")
rej_reasons_att$ReasonType="Attribute based"
rej_reasons_att$FLAG="REJECT"
head(rej_reasons_att)

print("Attribute based site rejection reason count:")
print(table(rej_reasons_att$Reason))
readline(prompt="Press [enter] to continue.")

#Set stn_new IR_FLAG and reason for attribute based site rejections
stn_new$IR_FLAG[stn_new$MonitoringLocationIdentifier %in% rej_reasons_att$MonitoringLocationIdentifier]="REJECT"
table(stn_new$IR_FLAG)



####################
###Spatial checks###
####################

print("Performing spatial site checks...")

#Generate spatial sites object
sites=stn_new
sites$lat=ifelse(is.na(sites$IR_Lat), sites$LatitudeMeasure, sites$IR_Lat)
sites$long=ifelse(is.na(sites$IR_Long), sites$LongitudeMeasure, sites$IR_Long)
sites=sites[,c('OrganizationIdentifier','OrganizationFormalName','MonitoringLocationIdentifier','MonitoringLocationName','MonitoringLocationTypeName','ProviderName',"LatitudeMeasure","LongitudeMeasure",'lat','long')]
coordinates(sites)=c("long","lat")
proj4string(sites)=CRS("+init=epsg:4326")
sites=st_as_sf(sites, remove = FALSE)


#Intersect sites w/ Utah poly, AU poly, BU poly, SS poly, and GSL poly 

stn_new <- intpoly(ut_poly,stn_new, sites)
stn_new <- intpoly(au_poly,stn_new, sites)
stn_new <- intpoly(bu_poly,stn_new, sites)
stn_new <- intpoly(ss_poly,stn_new, sites)
stn_new <- intpoly(gsl_poly,stn_new, sites)
stn_new <- intpoly(wmu_poly,stn_new, sites)

rm(sites)

###Spatial rejections
rej_reasons_spat=data.frame(matrix(nrow=0,ncol=2))

#Reject by is.na(AU)
reason_n=ifelse(is.na(stn_new$ASSESS_ID),"Undefined AU",NA)
if(length(reason_n)>0){rej_reasons_spat=rbind(rej_reasons_spat,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject by is.na(STATE_NAME)
reason_n=ifelse(is.na(stn_new$jurisdict),"Non-jurisdictional: out of state or within tribal boundaries",NA)
if(length(reason_n)>0){rej_reasons_spat=rbind(rej_reasons_spat,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Reject by GSL poly
reason_n=ifelse(stn_new$gsl=='y',"GSL assessed through separate program",NA)
if(length(reason_n)>0){rej_reasons_spat=rbind(rej_reasons_spat,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#Remove unneeded spatial join columns
stn_new=stn_new[,!names(stn_new)%in%c("jurisdict","gsl")]

#Reject where 	MonitoringLocationTypeName is a canal type & AU_Type!="Canal"
reason_n=ifelse(
				(stn_new$MonitoringLocationTypeName=="Stream: Canal" & stn_new$AU_Type != "Canal")|
				(stn_new$MonitoringLocationTypeName=="Stream: Ditch" & stn_new$AU_Type != "Canal")|
				(stn_new$MonitoringLocationTypeName=="Canal Transport" & stn_new$AU_Type != "Canal")|
				(stn_new$MonitoringLocationTypeName=="Canal Drainage" & stn_new$AU_Type != "Canal")|
				(stn_new$MonitoringLocationTypeName=="Canal Irrigation" & stn_new$AU_Type != "Canal")
		,"Non-assessed canal or ditch",NA)
if(length(reason_n)>0){rej_reasons_spat=rbind(rej_reasons_spat,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}


#Reject where 	MonitoringLocationTypeName is a stream or spring type & AU_Type!="River/Stream"
reason_n=ifelse(
		(stn_new$MonitoringLocationTypeName=="Stream" & stn_new$AU_Type != "River/Stream")|
		(stn_new$MonitoringLocationTypeName=="River/Stream" & stn_new$AU_Type != "River/Stream")|
		(stn_new$MonitoringLocationTypeName=="River/Stream Intermittent" & stn_new$AU_Type != "River/Stream")|
		(stn_new$MonitoringLocationTypeName=="River/Stream Perennial" & stn_new$AU_Type != "River/Stream")|
		(stn_new$MonitoringLocationTypeName=="Spring" & stn_new$AU_Type != "River/Stream")
	,"Stream or spring site type in non-River/Stream AU",NA)

rej_reasons_spat=rbind(rej_reasons_spat,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))

names(rej_reasons_spat)=c("MonitoringLocationIdentifier","Reason")
rej_reasons_spat$ReasonType="Spatial"
rej_reasons_spat$FLAG="REJECT"
head(rej_reasons_spat)

print("Spatial site rejection reason count:")
print(table(rej_reasons_spat$Reason))
readline(prompt="Press [enter] to continue.")

#Set stn_new IR_FLAG and reason for spatial site rejections
stn_new$IR_FLAG[stn_new$MonitoringLocationIdentifier %in% rej_reasons_spat$MonitoringLocationIdentifier]="REJECT"
table(stn_new$IR_FLAG)


##########
##Calculate full distance matrix, lat/long, mlid, and site 100m counts.
##For new sites and master flagged for re-AUTO (stn_new), apply lat/long, 100 m site count logic to determine review status
##For previously manually reviewed master sites, check if MLID_Count, Lat_Count, Long_Count, or sites100m_count have increased, flag these for review

print("Performing 100m/duplicate MLID/duplicate lat-long checks...")

#rbind new auto-validated sites back to master file and calculate distances on all non-rejected sites in master file (this way if polygons change, it is automatically accounted for in master file)
spatial_check_data=rbind(master_site[,names(stn_new)],stn_new)
dim(spatial_check_data)

#Splitting off rejected sites prior to other spatial analyses (including all previously accepted/merged sites allows calc of distances including previously reviewed sites.)

accept_review=spatial_check_data[spatial_check_data$IR_FLAG!="REJECT",]
rejected=spatial_check_data[spatial_check_data$IR_FLAG=="REJECT",]
class(accept_review$IR_FLAG)

table(accept_review$IR_FLAG)
sum(table(accept_review$IR_FLAG))

#Count MLIDs, add as column to accept_review, MLID_Count>1 means duplicated MLID
MLID_Count=as.vector(table(accept_review$MonitoringLocationIdentifier)[accept_review$MonitoringLocationIdentifier])
accept_review$MLID_Count=MLID_Count

#Count Latitudes, add as column to accept_review, Lat_Count>1 means duplicated lat
Lat_Count=as.vector(table(accept_review$LatitudeMeasure))[as.factor(accept_review$LatitudeMeasure)]
accept_review$Lat_Count=Lat_Count

#Count Longitudes, add as column to accept_review, Long_Count>1 means duplicated long
Long_Count=as.vector(table(accept_review$LongitudeMeasure))[as.factor(accept_review$LongitudeMeasure)]
accept_review$Long_Count=Long_Count

#Identify non-rejected sites w/in 100 m (0.1 km)
accept_review$lat=ifelse(is.na(accept_review$IR_Lat), accept_review$LatitudeMeasure, accept_review$IR_Lat)
accept_review$long=ifelse(is.na(accept_review$IR_Long), accept_review$LongitudeMeasure, accept_review$IR_Long)
distmat=spDists(cbind(accept_review$long,accept_review$lat),longlat=TRUE)
row.names(distmat)=accept_review$MonitoringLocationIdentifier
colnames(distmat)=accept_review$MonitoringLocationIdentifier
accept_review=accept_review[,!names(accept_review) %in% c('lat','long')]

sum(table(accept_review$IR_FLAG))

countSites100m=function(data){
	count=sum(data>0&data<=0.1)
	#names(data[data>0&data<=0.1])
	return(count)
	}

sites100m_count=apply(distmat,1,FUN='countSites100m')
accept_review$sites100m_count=sites100m_count

rejected$sites100m_count=NA
rejected$MLID_Count=NA
rejected$Lat_Count=NA
rejected$Long_Count=NA

#Re-appending rejected data
spatial_check_data=rbind(accept_review,rejected)
table(spatial_check_data$IR_FLAG)
sum(table(spatial_check_data$IR_FLAG))
rm(accept_review)
rm(rejected)


#Join spatial checks to stn_new (drop spatial count columns then merge)
dim(stn_new)
stn_new=stn_new[,!names(stn_new)%in%c("ASSESS_ID","AU_NAME","AU_Type","BEN_CLASS","R317Descrp","ss_R317Descrp","OWNERSHIP","ValidationType","MLID_Count","Lat_Count","Long_Count","sites100m_count")]
stn_new=merge(stn_new,spatial_check_data,all.x=T)
dim(stn_new)
stn_new$ValidationType="AUTO"


#Spatial review flags & reasons (Apply to stn_new only)
#Populate stn_new$MLID & lat/long for new sites w/ no duplicate MLIDS, lats, longs, and 0 other sites w/in 100m (IR_FLAG=="REVIEW" for all non-rejected new sites at this point)
stn_new$IR_MLID = ifelse(stn_new$IR_FLAG=="REVIEW"&stn_new$MLID_Count==1&stn_new$Lat_Count==1&stn_new$Long_Count==1&stn_new$sites100m_count==0,as.vector(stn_new$MonitoringLocationIdentifier),"REVIEW")
stn_new$IR_MLNAME = ifelse(stn_new$IR_FLAG=="REVIEW"&stn_new$MLID_Count==1&stn_new$Lat_Count==1&stn_new$Long_Count==1&stn_new$sites100m_count==0,as.vector(stn_new$MonitoringLocationName),NA)
stn_new$IR_Lat = ifelse(stn_new$IR_FLAG=="REVIEW"&stn_new$MLID_Count==1&stn_new$Lat_Count==1&stn_new$Long_Count==1&stn_new$sites100m_count==0,stn_new$LatitudeMeasure,NA)
stn_new$IR_Long = ifelse(stn_new$IR_FLAG=="REVIEW"&stn_new$MLID_Count==1&stn_new$Lat_Count==1&stn_new$Long_Count==1&stn_new$sites100m_count==0,stn_new$LongitudeMeasure,NA)

#Populate rejected MLID, lat, and long w/ REJECT
stn_new$IR_MLID = ifelse(stn_new$IR_FLAG=="REJECT","REJECT",as.vector(stn_new$IR_MLID))
stn_new$IR_MLNAME = ifelse(stn_new$IR_FLAG=="REJECT","REJECT",as.vector(stn_new$IR_MLNAME))
stn_new$IR_Lat = ifelse(stn_new$IR_FLAG=="REJECT",NA,stn_new$IR_Lat)
stn_new$IR_Long = ifelse(stn_new$IR_FLAG=="REJECT",NA,stn_new$IR_Long)


#Review reasons
review_reasons=data.frame(matrix(nrow=0,ncol=2))

#MLID, lat/long, and site 100 m counts
reason_n=ifelse(stn_new$MLID_Count>1,"Duplicated MLID",NA)
if(length(reason_n)>0){review_reasons=rbind(review_reasons,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

reason_n=ifelse(stn_new$Lat_Count>1 | stn_new$Long_Count>1,"Duplicated lat or long",NA)
if(length(reason_n)>0){review_reasons=rbind(review_reasons,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

reason_n=ifelse(stn_new$sites100m_count>=1,"One or more sites w/in 100 m",NA)
if(length(reason_n)>0){review_reasons=rbind(review_reasons,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}

#MonitoringLocationTypeName is a stream or spring type & AU_Type=="Canal"
reason_n=ifelse(
			(stn_new$MonitoringLocationTypeName=="Stream" & stn_new$AU_Type == "Canal")|
			(stn_new$MonitoringLocationTypeName=="River/Stream" & stn_new$AU_Type == "Canal")|
			(stn_new$MonitoringLocationTypeName=="River/Stream Intermittent" & stn_new$AU_Type == "Canal")|
			(stn_new$MonitoringLocationTypeName=="River/Stream Perennial" & stn_new$AU_Type == "Canal")|
			(stn_new$MonitoringLocationTypeName=="Spring" & stn_new$AU_Type == "Canal")
		,"Stream or spring site type in canal AU type",NA)
if(length(reason_n)>0){review_reasons=rbind(review_reasons,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}
table(review_reasons$reason_n)

#MonitoringLocationTypeName is a lake type & AU_Type!="Reservoir/Lake"
reason_n=ifelse(
	(stn_new$MonitoringLocationTypeName=="Lake, Reservoir, Impoundment" & stn_new$AU_Type != "Reservoir/Lake")|
	(stn_new$MonitoringLocationTypeName=="Lake" & stn_new$AU_Type != "Reservoir/Lake")|
	(stn_new$MonitoringLocationTypeName=="Reservoir" & stn_new$AU_Type != "Reservoir/Lake")
	,"MLID type is lake/reservoir, but AU_Type is not - potential new AU needed",NA)
if(length(reason_n)>0){review_reasons=rbind(review_reasons,na.omit(cbind(stn_new$MonitoringLocationIdentifier,reason_n)))}
table(review_reasons$reason_n)


#Join spatial checks to master_site
spatial_check_data=spatial_check_data[,names(spatial_check_data) %in% c("UID","MLID_Count","Lat_Count","Long_Count","sites100m_count")]
names(spatial_check_data)=c("UID","MLID_Count2","Lat_Count2","Long_Count2","sites100m_count2")
master_site=merge(master_site,spatial_check_data,all.x=T)

#Check if MLID_Count, Lat_Count, Long_Count, or sites100m_count have increased, flag these for review
reason_n=ifelse(master_site$MLID_Count2>master_site$MLID_Count,"Master site MLID count has increased", NA)
if(length(reason_n)>0){review_reasons=rbind(review_reasons,na.omit(cbind(master_site$MonitoringLocationIdentifier,reason_n)))}
table(review_reasons$reason_n)

reason_n=ifelse(master_site$Lat_Count2>master_site$Lat_Count,"Master site lat count has increased", NA)
if(length(reason_n)>0){review_reasons=rbind(review_reasons,na.omit(cbind(master_site$MonitoringLocationIdentifier,reason_n)))}
table(review_reasons$reason_n)

reason_n=ifelse(master_site$Long_Count2>master_site$Long_Count,"Master site long count has increased", NA)
if(length(reason_n)>0){review_reasons=rbind(review_reasons,na.omit(cbind(master_site$MonitoringLocationIdentifier,reason_n)))}
table(review_reasons$reason_n)

reason_n=ifelse(master_site$sites100m_count2>master_site$sites100m_count,"Master site sites w/in 100 m count has increased", NA)
if(length(reason_n)>0){review_reasons=rbind(review_reasons,na.omit(cbind(master_site$MonitoringLocationIdentifier,reason_n)))}
table(review_reasons$reason_n)


#Rename review reason columns
names(review_reasons)=c("MonitoringLocationIdentifier","Reason")
review_reasons$ReasonType="Spatial"
review_reasons$FLAG="REVIEW"


print("Spatial site review reason count:")
print(table(review_reasons$Reason))


#rbind reasons together
reject_reasons=rbind(rej_reasons_att,rej_reasons_spat)
reasons_all=rbind(reject_reasons, review_reasons)


#Update master_site MLID_Count, Lat_Count, Long_Count, or sites100m
master_site$MLID_Count=master_site$MLID_Count2
master_site$Lat_Count=master_site$Lat_Count2
master_site$Long_Count=master_site$Long_Count2
master_site$sites100m_count=master_site$sites100m_count2

#Drop master_site MLID_Count2, Lat_Count2, Long_Count2, and sites100m2
master_site=master_site[,!names(master_site) %in% c("MLID_Count2","Lat_Count2","Long_Count2","sites100m_count2")]


#Populate ACCEPT for new sites w/ no duplicate MLIDS, lats, longs, and 0 other sites w/in 100m (IR_FLAG=="REVIEW" for all non-rejected new sites at this point)
stn_new=within(stn_new,{
	IR_FLAG[!MonitoringLocationIdentifier %in% reasons_all$MonitoringLocationIdentifier &IR_FLAG!="REJECT" & MLID_Count==1 & Lat_Count==1 & Long_Count==1 & sites100m_count==0]<-"ACCEPT"
})

print(paste(" Site validation complete and autovalidation resulted in", table(stn_new$IR_FLAG)[1],"accepted sites,",table(stn_new$IR_FLAG)[2],"rejected sites, and",table(stn_new$IR_FLAG)[3],"sites in need of review."))


#rbind master_site & stn_new to make full list of all sites (master_new)
master_site$IR_COMMENT=as.factor(master_site$IR_COMMENT)
stn_new$IR_COMMENT=as.factor(stn_new$IR_COMMENT)
master_new=plyr::rbind.fill(master_site, stn_new)

if(any(master_new$MonitoringLocationIdentifier[master_new$IR_FLAG=="ACCEPT" & master_new$ValidationType=="AUTO"] %in% reasons_all$MonitoringLocationIdentifier)){
  stop("Accepted site represented in reject/review site log. Check function code rules for unintended exceptions.")
}


###Set IR_FLAG for REJECT & REVIEW 
master_new=within(master_new,{
	IR_FLAG[MonitoringLocationIdentifier %in% review_reasons$MonitoringLocationIdentifier]="REVIEW"
	IR_FLAG[MonitoringLocationIdentifier %in% reject_reasons$MonitoringLocationIdentifier]="REJECT"
})


#Set IR_COMMENT
master_new$IR_COMMENT=as.character(master_new$IR_COMMENT)
master_new=within(master_new,{
	IR_COMMENT[IR_FLAG=="REJECT" & ValidationType=="AUTO"]="Automatically flagged for rejection"
	IR_COMMENT[IR_FLAG=="REVIEW" & ValidationType=="AUTO"]="Automatically flagged for review"
	IR_COMMENT[IR_FLAG=="ACCEPT" & ValidationType=="AUTO"]="Automatically accepted"
})

#Set ValidationType to AUTO for any MLID in review reasons (this is for previously manually reviewed sites that now have a review reason e.g. MLID count has increased)
master_new=within(master_new,{
	ValidationType[MonitoringLocationIdentifier %in% reasons_all$MonitoringLocationIdentifier]="AUTO"
})


####Sort by UID and re-order columns before writing
master_new=master_new[order(master_new$UID),]
master_new=master_new[,names(master_site)]

levels(master_new$AU_Type)=c(levels(master_new$AU_Type),"Undefined")
master_new$AU_Type[is.na(master_new$AU_Type)]="Undefined"

newsitesadded <- dim(master_new)[1]-orig_master[1]
if(newsitesadded!=new_autval){
  print("WARNING: discrepancy in number of sites added to master site list and number of new sites detected. Code/data review needed.")
}
if(dim(master_new)[2]!=orig_master[2]){
  print("WARNING: discrepancy in number of columns in old and updated master site list. Code/data review needed.")
}
print(paste("Updated master site list has",dim(master_new)[1],"sites, with", newsitesadded,"new sites added to original",orig_master[1],"sites in master list."))


# Export master site file
writexl::write_xlsx(list(sites=master_new, reasons=reasons_all),
		path = choose.files(caption="Save As...", filters = c("Excel Workbook (.xlsx)","*.xlsx"), multi=F), format_headers=F, col_names=T)
print("Master site file updated and review/rejection reasons file created.")

}
ut-ir-tools/irTools documentation built on Jan. 19, 2024, 6:55 p.m.