Nothing
#### FUNCTIONS FOR ANALYZING DATA IN THE WORKFLOW
# filterMissing ====
#' @title Check for missing coordinates
#' @description checks for missing coordinates in the occurrence dataframe
#' @param df dat.frame of species occurrences
#' @param xf character. The field in the data.frame containing the x coordinates
#' @param yf character. The field in the data.frame containing the y coordinates
#' @param verbose logical. Print messages? Default FALSE
#' @return List with two dataframes: stay = coordinates missing and continue = occurrence that you can retain for further analysis
#' @keywords internal
#' @author Josep M Serra-Diaz (pep.serradiaz@@agroparistech.fr)
#' @examples
#' k <- data.frame (x=c(runif (n = 100),NA),y=c(runif (n = 100),1000))
#' filterMissing(k,xf='x',yf='y')
#' @export
filterMissing <- function (df, xf , yf , verbose=FALSE){
coordIssues_coordMissing_value = !stats::complete.cases (df[,c(xf,yf)])
df$coordIssues_coordMissing_value = coordIssues_coordMissing_value
df$coordIssues_coordMissing_test = as.logical (coordIssues_coordMissing_value)
df.out <- df [coordIssues_coordMissing_value,]
df.continue <- df [!coordIssues_coordMissing_value,]
return (list (stay=df.out,continue=df.continue))
}
# duplicatesexcludeAnalysis ====
#' @title Duplicated records
#' @descriptions checks for duplicated coordinates in the occurrence data frame
#' @details it differentiates the exact duplicates and the duplicates for a occurrences falling in the same pixel
#' @param df Data.frame of species occurrences
#' @param xf the field in the data frame containing the x coordinators
#' @param yf the field in the data frame containing the y coordinates
#' @param resolution.in.minutes the resolution of environmental data used, specified in minutes
#' @param raster.grid An optional raster grid
#' @param verbose logical. Print messages? Default FALSE
#' @return list of three components: Dups.Exact = exact duplicate records, Dups.Grid= duplicates within environmental gridcell, continue = dataframe with good records
#' @keywords internal
#' @author Josep M Serra-Diaz (pep.serradiaz@@agroparistech.fr)
#' @family analysis
#' @examples
#' k <- data.frame (x=c(runif (n = 100),1000),y=c(runif (n = 100),1000),Reason=NA)
#' duplicatesexcludeAnalysis(k,xf='x',yf='y',resolution.in.minutes=60)
#' @export
duplicatesexcludeAnalysis <- function (df, xf, yf,
resolution.in.minutes,
raster.grid=NULL, verbose=FALSE){
#exact duplicates
duplicates_dupExact_value <- duplicated (x = df[,c(xf,yf)]) *1
duplicates_dupExact_test <- as.logical (duplicates_dupExact_value)
exact.dups = duplicates_dupExact_value
df$Exclude <- exact.dups
df$Reason [which (exact.dups==1)] <-'Exact Duplicated coordinates'
df$duplicates_dupExact_value = duplicates_dupExact_value
df$duplicates_dupExact_test = duplicates_dupExact_test
df.exact.dups <- df[which(df$Exclude==1),]
#update df
df <- df[which(df$Exclude==0),]
#build the grid for the duplicates
if(!is.null(raster.grid)){
ext <- raster::extent (raster.grid)
resolution.raster <- raster::res (raster.grid)[1]
rst <- raster::raster(xmn = ext@xmin, xmx = ext@xmax, ymn = ext@ymin, ymx = ext@ymax,
res = resolution.raster)
}
if(is.null(raster.grid)) {
rst <- raster::raster(xmn = -180, xmx = 180, ymn = -90, ymx = 90,
res = resolution.in.minutes/60)
}
#get dups in gridcell
rst[] <- 1:raster::ncell(rst)
spp <- as.character(unique(df$Species))
xy <- data.frame(biogeo::coord2numeric(df[,xf]),
biogeo::coord2numeric(df[,yf]))
cid <- raster::cellFromXY(rst, xy)
dups <- (duplicated(cid)) * 1
df$duplicates_dupGrid_value = dups
df$duplicates_dupGrid_test = as.logical (dups)
f1 <- which (dups==1)
df$Exclude <- dups
df$Reason[f1] <- "Duplicated--GridCell"
#indicate duplicates by grid cell (these needs updateing for efficiency)
df.grid.dups <- df[which(df$Exclude==1),]
#update df
df <- df[which(df$Exclude==0),]
#return
return (list (Dups.Exact=df.exact.dups, Dups.Grid= df.grid.dups, continue = df))
}
# SeaLand reassignment ====
#' @title SeaLand reassignment
#' @descriptions Reassign coastal coordinates as needed to be in
#' @details (function inspired in nearestcell in biogeo but modified
#' @param dat Data.frame of species occurrences
#' @param rst raster class object.
#' @param xf the field in the data frame containing the x coordinators
#' @param yf the field in the data frame containing the y coordinators
#' @param verbose logical. Print messages? Default FALSE
#' @return list
#' @keywords internal
#' @author Mark Robertson and Veron Visser (original \link[biogeo]{biogeo-package}), Josep M Serra-Diaz (modifs)
#' @family analysis
#' @seealso original code at \link[biogeo]{nearestcell}
.nearestcell3 <- function (dat,
rst,
xf,
yf,
verbose=FALSE) {
dat$irow <- 1:nrow (dat)
dat$Correction <- as.character(dat$Correction)
fx <- which(dat$Exclude == 0)
x1 <- biogeo::coord2numeric(dat[,xf][fx])
y1 <- biogeo::coord2numeric(dat[,yf][fx])
datid <- dat$irow[fx]
dd <- data.frame(x1, y1)
ce0 <- raster::cellFromXY(rst, dd)
vals <- raster::extract(rst, dd)
#if the input raster is a dataframe
if ('matrix' %in% class(vals)){
vals <- apply(vals,1,FUN = sum)
f <- which(is.na(vals))
} else {
f <- which(is.na(vals))
}
if (length(f) == 0){ if(verbose) print ("There are no missing values") ; return (dat)}
if (length (f) != 0){
id <- datid[f]
ce1 <- raster::cellFromXY(rst, dd[f, ])
if (any (is.na(ce1))) {
if (length(ce1) == 1) {if(verbose) print("Coordinates in sea are out of environmental extent"); return(dat)}
if(verbose) print("Some coordinates out of raster extent")
}
ff <- which(!is.na(ce1))
ce3 <- ce1[ff]
dd2 <- dd[ff, ]
#to eliminiate
id2 <- id[ff]
bb <- {}
for (i in 1:length(ce3)) {
a <- raster::adjacent(rst, ce3[i], directions = 8, pairs = FALSE,
target = NULL, sorted = FALSE, include = FALSE, id = FALSE)
idx <- id2[i]
# idx <- ce3[i]
b <- data.frame(i, a, id2 = idx)
bb <- rbind(bb, b)
}
xy <- raster::xyFromCell(rst, bb$a, spatial = FALSE)
vals <- raster::extract(rst, xy)
g <- data.frame(bb, xy, vals)
if (any(!is.na(vals))) {
fv <- which(!is.na(vals))
id <- bb$i[fv]
uid <- unique(id)
g1 <- stats::na.omit(g)
near <- {
}
for (j in 1:length(uid)) {
uj <- uid[j]
fx <- which(g1$i == uj)
nce <- g1[fx, ]
nce2 <- as.matrix(cbind(nce$x, nce$y))
pts <- dd2[uj, ]
dst <- raster::pointDistance(c(pts$x1, pts$y1), nce2, longlat = FALSE)
fm <- which.min(dst)
nr <- nce[fm, ]
near <- rbind(near, nr)
}
mod <- format(Sys.time(), "%d-%m-%Y %H:%M:%S")
dat$Modified <- as.character(dat$Modified)
for (i in 1:nrow(near)) {
f <- which(dat$irow == near$id2[i])
dat$x_original[f] <- dat[,xf][f]
dat$y_original[f] <- dat[,yf][f]
dat[,xf][f] <- near$x[i]
dat[,yf][f] <- near$y[i]
dc <- dat$Correction[f]
dc <- stringr::str_replace(dc, "[.]", "7")
dat$Correction[f] <- dc
dat$Modified[f] <- mod
}
moved <- data.frame(ID = near$id2, x = near$x, y = near$y)
names(moved) <- c('ID',xf,yf)
#remove irow column
columnOut= which (names(dat)=='irow')* (-1)
dat <- dat[,columnOut]
datx <- list(dat = dat, moved = moved)
return(datx)
} else {
warning("There are no records close enough to the nearest land/sea cells")
return(dat)
}
}
}
# countryStatusRange analysis ====
#' @title Range analysis
#' @description Identify and filter species records based on countries where species is considered native or alien.
#' @details It returns a list with two elements: stay (records that are excluded/filtered), continue (records that are not excluded from the filtering process)
#' @param df data.frame of species occurrences
#' @param xf character. The field in the data.frame containing the x coordinates
#' @param yf character. The field in the data.frame containing the y coordinates
#' @param .ntv.ctry character. Native country in ISO3 coding
#' @param .inv.ctry character. Invasive country in ISO3 coding
#' @param .c.field character. Country field in the species data.frame (df)
#' @param .points.proj4string Proj4string for the occurrence data
#' @param .countries.shapefile spatialPolygonDataFrame of political divisions
#' @param cfsf character. Column name of the .country spatialPolygonDataFrame indicaing the country in ISO3 codeing.
#' @param excludeUnknownRanges logical. Should records be filtered if located in countries outside .ntv.ctry or .inv.ctry? Defalut FALSE
#' @param excludeNotmatchCountry Should records be if the reported country is different than the locatoin country? . Default FALSE
#' @param doRangeAnalysis logical. Should range analysis be performed?
#' @param verbose logical. Print messages? Default FALSE
#' @return list
#' @keywords internal
#' @author Josep M Serra-Diaz (pep.serradiaz@@agroparistech.fr)
#' @family analysis
#' @export
countryStatusRangeAnalysis=function(df,
xf,
yf,
.ntv.ctry,
.inv.ctry,
.c.field,
.points.proj4string,
.countries.shapefile,
cfsf,
excludeUnknownRanges = FALSE,
excludeNotmatchCountry = FALSE,
doRangeAnalysis=TRUE,
verbose=FALSE) {
if (!doRangeAnalysis) {
df$countryStatusRange_wrongNTV_value <- NA
df$countryStatusRange_wrongNTV_test <- NA
df$countryStatusRange_wrongCTRY_value <- NA
df$countryStatusRange_wrongCTRY_test <- NA
df$countryStatusRange_wrongINV_value <- NA
df$countryStatusRange_wrongINV_test <- NA
df$countryStatusRange_score <-NA
stay = df[-1*1:nrow(df),]
out <- list (stay=stay,continue = df)
return (out)
}
#initial user information printing
if (is.null (.ntv.ctry) ) {if(verbose) print ('INFO: Species with no associated country. We assume all locations are native range')}
if (is.null (.inv.ctry) ) {if(verbose) print ('INFO: No invasive country provided. Analysis of invasive ranges not performed')}
if (is.null (.c.field) ) {if(verbose) print ('INFO: No info on table of country of registration. Analysis of country recorded vs. coordinates not performed')}
xydat <- df[,c(xf,yf)]
#check country of coordinates
#extract countries of the points with a given spdf object
if (!is.null(.countries.shapefile)){
if (is.null(.points.proj4string)) {
.points.proj4string <- .countries.shapefile@proj4string
if(verbose) print (paste ('ASSUMING points in projection',.countries.shapefile@proj4string))
}
sp.xydat <- sp::SpatialPoints(xydat,proj4string = .points.proj4string)
overlay.sp.xydat <- sp::over(sp.xydat, .countries.shapefile)
fieldname <- match(cfsf, names(overlay.sp.xydat))
country_ext <- overlay.sp.xydat[, fieldname]
}
#extract countries of the points with when a country spdf is not provided
if (is.null(.countries.shapefile)){
country_ext <- .coords2country (xydat)
}
#inform for whether reported and country in coordinates (extracted country) differ
if (is.null (.c.field)) {
if(verbose) print ("No country reported in occurrence database")
wrong.ctry.reported <- rep (NA,length(country_ext))
}
if (!is.null (.c.field)) {
#check if ISO3 format in country reported [FUTURE: use GNRS to automatically correct]
ctry.reported = as.character (df[,.c.field])
if (all (nchar (ctry.reported)!=3,na.rm=TRUE) | all (is.na(ctry.reported)) ) {
warning ("the countries in your database do not match ISO3 codes, test of reported vs. match dropped")
wrong.ctry.reported <- rep (NA,length(country_ext))
} else {
#match reported country with extracted country
wrong.ctry.reported <- (! as.character (df[,.c.field]) %in% as.character(country_ext)) * 1
nonMissingVals = ifelse (test = is.na (as.character (df[,.c.field])) | as.character (df[,.c.field]) == '',yes = NA,no = TRUE)
wrong.ctry.reported <- wrong.ctry.reported * nonMissingVals
}
}
#inform for which locations are not in the native range
if(is.null (.ntv.ctry)) {
if(verbose) print ('No info of native country. Assuming all records are native')
wrong.ntv.ctry.xy <- rep (NA,length(country_ext))
}
if(!is.null(.ntv.ctry) ) {
wrong.ntv.ctry.xy <- (! country_ext %in% .ntv.ctry) * 1
}
#inform for which locations are not in the invasive range
if (is.null (.inv.ctry)) {
if(verbose) print ('No info of invasive countries')
wrong.inv.ctry.xy <- rep (NA,length(country_ext))
}
if (!is.null (.inv.ctry)) {wrong.inv.ctry.xy <- (! country_ext %in% .inv.ctry) * 1}
#add to df
df$countryStatusRange_wrongNTV_value <- wrong.ntv.ctry.xy
df$countryStatusRange_wrongNTV_test <- as.logical (wrong.ntv.ctry.xy)
df$countryStatusRange_wrongCTRY_value <- wrong.ctry.reported
df$countryStatusRange_wrongCTRY_test <- as.logical (wrong.ctry.reported)
df$countryStatusRange_wrongINV_value <- wrong.inv.ctry.xy
df$countryStatusRange_wrongINV_test <- as.logical (wrong.inv.ctry.xy)
#Exclusion from subsequent analysis for wrong native and invasive ranges
# (depending how much data has been provided)
if (excludeUnknownRanges == TRUE & !is.null(.ntv.ctry) & !is.null(.inv.ctry) ){
total.data <- (!is.na (wrong.ntv.ctry.xy) *1) + (!is.na(wrong.inv.ctry.xy)*1)
sum.wrong.extracted.ranges <- mapply (function (x,y) (sum (x,y,na.rm = TRUE)),wrong.ntv.ctry.xy, wrong.inv.ctry.xy)
exclusion.based.on.wrong <- sum.wrong.extracted.ranges / total.data
df$Exclude <- (exclusion.based.on.wrong == 1)*1
exclude.rows <- which (df$Exclude == 1)
#add the reason of exclusion
if (length(exclude.rows) > 0) {df$Reason [exclude.rows] <-"XY in not in invasive or native ranges"}
}
if (excludeUnknownRanges == FALSE) {df$Exclude <- 0}
# Exclusion based on wrong country reported
if (excludeNotmatchCountry == TRUE & !is.null (.c.field)) {
exclusion.based.on.wrong <- df$Exclude + (10*wrong.ctry.reported)
only.wrong.reported.range <- which (exclusion.based.on.wrong == 10 )
wrong.reported.range.and.informedrange <- which (exclusion.based.on.wrong == 11 )
df$Reason [only.wrong.reported.range] <- "Incongruence reported country and coordinates"
df$Exclude [only.wrong.reported.range] <- 1
df$Reason [wrong.reported.range.and.informedrange] <-"XY in not in invasive or native ranges;Incongruence reported country and coordinates"
}
#output
df$countryStatusRange_score = .gimme.score (df %>% dplyr::select(dplyr::starts_with('countryStatusRange')))
out <- list (stay = df[which(df$Exclude==1),], continue = df[which(df$Exclude!=1),])
return (out)
}
# Centroid detection ====
#' @title Centroid detection function
#' @description Identify occurrence records located near centroids.
#' @details
#' @param df data.frame of species occurrences
#' @param xf character. column name in df containing the x coordinates
#' @param yf character. column name in df containing the y coordinates
#' @param .c.field character. Country field in the species data.frame (df)
#' @param .ntv.ctry character. ISO3 country codes where species are considered native
#' @param idf character. Column with the taxon observation ID.
#' @param .inv.ctry character. ISO3 country codes where species are considered alien
#' @param .points.proj4string Proj4string argument. Coordinate reference system
#' @param .r.env raster. Raster of environmental variables considered in the analysis
#' @param .countries.shapefile spatialPolygonDataFrame of political divisions
#' @param cfsf character. Column name of the .country spatialPolygonDataFrame indicating the country in ISO3 coding.
#' @param method character. Vector with the methods to detect centroids
#' @param do logical. Should range analysis be performed? Default TRUE
#' @param verbose logical. Print messages? Default FALSE
#' @return data.frame
#' @keywords internal
#' @family analysis
#' @details Current methods implemented for centroid detection are 'BIEN' (uses iterative procedure with threshold selection of distance to centroid)\cr
#' , and method 'CoordinateCleaner' implementing methods used in \link[CoordinateCleaner]{CoordinateCleaner-package} package.
#' @author Brian Maitner, Josep M Serra-Diaz, Cory Merow. Functions from CoordinateCleaner implemented by A Zizka (CoordinateCleaner)
#' @seealso \link[CoordinateCleaner]{cc_cap} \link[CoordinateCleaner]{cc_cen}
#' @export
centroidDetection <- function (df,
xf,
yf,
cf,
idf,
.ntv.ctry,
.inv.ctry,
.points.proj4string,
.r.env,
.countries.shapefile,
cfsf,
method='all',
do = TRUE, verbose=FALSE){
#table of outputs
out = data.frame (centroidDetection_BIEN_value=NA,
centroidDetection_BIEN_test=NA,
centroidDetection_BIEN_comments =NA,
centroidDetection_CoordinateCleaner_value=NA,
centroidDetection_CoordinateCleaner_test=NA,
centroidDetection_CoordinateCleaner_comment=NA,
centroidDetection_score=NA
)[1:nrow (df),]
row.names(out) <- NULL
if(!do) { return (out)}
#Method BIEN
if (any(method %in% c('BIEN','all'))){
#load centroid data
dest_url = 'https://github.com/pepbioalerts/vignetteXTRA-occTest/raw/main/ext/centroids.rds'
outFile = paste0(tempdir(),'/centroids.rds')
if (!file.exists(outFile)) utils::download.file(url=dest_url,destfile = outFile)
centroid_data = readRDS (outFile)
#reformat dataframe of occurrences to be the same as in centroid Maitner methods
if (is.null(idf)) { toid <- 1:nrow (df) } else {toid <- df[,idf]}
occurrences.df = data.frame (taxonobservation_id=toid,
country ='',
state_province = '',
county = '',
latitude = df[,yf],
longitude = df[,xf])
#library(GNRS)
#cleaned_countries<-GNRS_super_simple(country = df[,cf])
#if (!is.null(cf)) {occurrences.df[,'country'] = as.character(df[,cf])}
#if ('state_province' %in% names (df)) {occurrences.df$state_province <- df$state_province}
#if ('county' %in% names (df)) {occurrences.df$county <- df$county}
#occurrences.df$cleaned_country<-NA
#get admin unites from coordinates
#### TO REPLACE BY A FUNCTION .coords2country
xydat <- df[,c(xf,yf)]
if (!is.null(.countries.shapefile)){
if (is.null(.points.proj4string)) {.points.proj4string <- .countries.shapefile@proj4string; if(verbose) print (paste ('ASSUMING points in projection',.countries.shapefile@proj4string))}
sp.xydat <- sp::SpatialPoints(xydat,proj4string = .points.proj4string)
overlay.sp.xydat <- sp::over(sp.xydat, .countries.shapefile)
country_ext <- overlay.sp.xydat[,cfsf]
occurrences.df$country <- country_ext
}
#extract countries of the points with when a country shapefile is not provided
if (is.null(.countries.shapefile)){
country_ext <- .coords2country (xydat)
occurrences.df$country <- country_ext
}
cleaned_countries<-try ({GNRS::GNRS_super_simple(country = country_ext)},silent=TRUE)
if (inherits(cleaned_countries,'error') | is.null(cleaned_countries)){
out$centroidDetection_BIEN_value <- NA
out$centroidDetection_BIEN_test <- NA
out$centroidDetection_BIEN_comments <- 'Not performed. GNRS error'
} else{
matchedCtry <- cleaned_countries$match_status=='full match'
cleaned_countries <- cleaned_countries [matchedCtry,]
maxValLoop = nrow(cleaned_countries)
for(i in 1:nrow(cleaned_countries)){
svMisc::progress(i,max.value = maxValLoop)
occurrences.df$country <- as.character(occurrences.df$country)
occurrences.df$country[which(occurrences.df$country ==cleaned_countries$country_verbatim[i])] <- unlist(cleaned_countries$country[i])
}
for(i in 1:nrow(cleaned_countries)){
svMisc::progress(i,max.value = maxValLoop)
occurrences.df$state_province <- as.character(occurrences.df$state_province)
occurrences.df$state_province[which(occurrences.df$country ==cleaned_countries$country[i] &
occurrences.df$state_province ==cleaned_countries$state_province_verbatim[i])] <- unlist(cleaned_countries$state_province[i])
}
for(i in 1:nrow(cleaned_countries)){
svMisc::progress(i,max.value = maxValLoop)
occurrences.df$county <- as.character(occurrences.df$county)
occurrences.df$county[which(occurrences.df$country ==cleaned_countries$country[i] &
occurrences.df$state_province ==cleaned_countries$state_province[i]&
occurrences.df$county==cleaned_countries$county_parish_verbatim[i])] <- unlist(cleaned_countries$county_parish[i])
}
#Get relative distances
k <- centroid_assessment(occurrences = occurrences.df, centroid_data = centroid_data)
#convert relative distances to binary using threshold
centroid_threshold <- 0.002 #smaller values require points to be closer to a centroid in order to be flagged
k$is_centroid <- apply(X = k,MARGIN = 1,FUN = function(x){
y<-as.vector(stats::na.omit(unlist(c(x['country_cent_dist_relative'],x['state_cent_dist_relative'],x['county_cent_dist_relative']))))
#print(y)
logiCentroid = any(as.numeric(y)<centroid_threshold)
logiCentroid
})
k$is_centroid_val <- apply(X = k,MARGIN = 1,FUN = function(x){
#print(x)
if(all(is.na(c(x['country_cent_dist_relative'],x['state_cent_dist_relative'],x['county_cent_dist_relative'])))){
return(NA)
}
valOut = as.numeric(unlist(c(x['country_cent_dist_relative'],x['state_cent_dist_relative'],x['county_cent_dist_relative'])))[which.min(as.numeric(unlist(c(x['country_cent_dist_relative'],x['state_cent_dist_relative'],x['county_cent_dist_relative']))))]
valOut
})
#construct comments
k$comments <-apply(X = k,MARGIN = 1,FUN = function(x){
#print(x)
if(x['is_centroid']==1){ineq<-"<="}else{ineq<-">"}
if(all(is.na(c(x['country_cent_dist_relative'],x['state_cent_dist_relative'],x['county_cent_dist_relative'])))){
return("Centroid determination not conducted due to missing/unmatched political divison data" )
}
paste("Centroid determination based on ",
c('country_cent_dist_relative','state_cent_dist_relative','county_cent_dist_relative')[which.min(as.numeric(unlist(c(x['country_cent_dist_relative'],x['state_cent_dist_relative'],x['county_cent_dist_relative']))))],
": ",
as.numeric(unlist(c(x['country_cent_dist_relative'],x['state_cent_dist_relative'],x['county_cent_dist_relative'])))[which.min(as.numeric(unlist(c(x['country_cent_dist_relative'],x['state_cent_dist_relative'],x['county_cent_dist_relative']))))],
ineq,
"threshold value of",
centroid_threshold
)#paste
} )#apply
colnames(k)[which(colnames(k)=="is_centroid")]<-"centroidDetection_BIEN_test"
colnames(k)[which(colnames(k)=="comments" )]<-"centroidDetection_BIEN_comments"
out$centroidDetection_BIEN_value <- k$is_centroid_val
out$centroidDetection_BIEN_test <- as.logical(k$centroidDetection_BIEN_test)
out$centroidDetection_BIEN_comments <- k$centroidDetection_BIEN_comments
}
}
#Method CoordinateCleaner
if (any(method %in% c('CoordinateCleaner','all'))){
cc_cap_test = CoordinateCleaner::cc_cap(x = df,lon =xf,lat=yf,value='flagged', verbose = FALSE)
cc_cap_test = (!cc_cap_test) *1
cc_cen_test <- CoordinateCleaner::cc_cen (x = df, lon =xf,lat=yf ,value='flagged', verbose = FALSE)
cc_cen_test <- (!cc_cen_test) * 1
out$centroidDetection_CoordinateCleaner_value = cc_cen_test+cc_cen_test
out$centroidDetection_CoordinateCleaner_test = ifelse ((cc_cap_test==1 | cc_cen_test==1),TRUE,FALSE)
out$centroidDetection_CoordinateCleaner_comment = ''
out$centroidDetection_CoordinateCleaner_comment [cc_cap_test==1] = 'capital centroid'
out$centroidDetection_CoordinateCleaner_comment [cc_cen_test==1] = 'ctry/prov centroid'
}
out$centroidDetection_score = .gimme.score (x = out)
return (out)
}
# human detection ====
#' @title Hyper Human Influence
#' @description Detect occurrences in heavily human-impacted environments
#' @details It uses several methods to detect records in high human influence records.\cr
#' Current implemented methods are: \cr
#' 'hii' using a raster and a threhold of human influence
#' 'urban' using a layer of urban areas. Method implemented in CoordinateCleaner package.
#' @param df Data.frame of species occurrences
#' @param xf the field in the data frame containing the x coordinates
#' @param yf the field in the data frame containing the y coordinates
#' @param ras.hii Raster of human influence index
#' @param .th.human.influence threshold of human influence index
#' @param method character. Indicate which tests to use. Default 'all'. See Details
#' @param .points.proj4string character. Points coordinate projection.
#' @param ras.hii raster. Raster map of human influence index use
#' @param .th.human.influence numeric. Threhold to identify places of high human influence
#' @param .points.proj4string proj4string argument for df
#' @param do logical. Should range analysis be performed? Default TRUE
#' @param verbose logical. Print messages? Default FALSE
#' @param output.dir character. Output directory
#' @return data.frame
#' @keywords internal
#' @author Josep M Serra-Diaz (pep.serradiaz@@agroparistech.fr). A Zizka (CoordinateCleaner functions)
#' @seealso \link[CoordinateCleaner]{cc_urb} for CoordinateCleaner functions
#' @family analysis
#' @export
humanDetection <- function (df,
xf,
yf,
method='all',
ras.hii,
.th.human.influence,
.points.proj4string,
do=TRUE, verbose=FALSE,output.dir){
out <- data.frame (HumanDetection_HumanInfluence_value=NA,
HumanDetection_HumanInfluence_test=NA,
HumanDetection_HumanInfluence_comments= NA,
HumanDetection_UrbanAreas_value=NA,
HumanDetection_UrbanAreas_test=NA,
HumanDetection_UrbanAreas_comments=NA,
HumanDetection_score=NA
)[1:nrow (df),]
row.names(out) <- NULL
if (!do) {return (out)}
#start human influence analysis
xydat <- df[,c(xf,yf)]
#get species presence
data.sp.pres <- as.data.frame (xydat)
sp::coordinates (data.sp.pres) <- stats::as.formula (paste0('~',xf,'+',yf))
#start human influence index test
if (any (method %in% c('hii','all'))) {
class(ras.hii)=='RasterLayer'
#accomodate projections
if (is.null (.points.proj4string)){sp::proj4string(data.sp.pres) <- raster::projection(ras.hii); warning ('ASSUMING Points and HumanRaster data have the same projection')}
if (!is.null (.points.proj4string)) {sp::proj4string(data.sp.pres) <- .points.proj4string}
if (sp::proj4string(data.sp.pres) != raster::projection (ras.hii)){
if (is.na (raster::projection (ras.hii)) ) {sp::proj4string(data.sp.pres) <- NA ; warning ('ASSUMING Points and HumanRaster data have the same projection')}
if (!is.na (raster::projection (ras.hii)) ) {data.sp.pres <- sp::spTransform(data.sp.pres,CRSobj = raster::projection(ras.hii) )}
}
hii.sp.pres <-raster::extract(ras.hii, y = data.sp.pres, cellnumbers=FALSE, df=TRUE)
row.id.hii.NA <- which(is.na(hii.sp.pres[,2]))
if (length (row.id.hii.NA) != 0) {
output.comments <- rep (NA,length =nrow (xydat))
output.comments [row.id.hii.NA] <- paste0('No human filter available for this record;' )
}
hii.sp.value = hii.sp.pres[,2]
hii.sp.pres <-( hii.sp.value >= .th.human.influence)
out$HumanDetection_HumanInfluence_value=hii.sp.value
out$HumanDetection_HumanInfluence_test=hii.sp.pres
out$HumanDetection_HumanInfluence_comments= paste('Threshold influence=',.th.human.influence)
}
#start urban areas
if (any (method %in% c('urban','all'))) {
#check ref exists
newoutdir = paste0(tempdir(),'/spatialData')
alreadyDownloaded = file.exists (x = paste0(newoutdir,'/NE_urbanareas.shp'))
if (alreadyDownloaded) myRef = rgdal::readOGR(dsn = newoutdir,layer = 'NE_urbanareas',verbose = FALSE)
if (!alreadyDownloaded) myRef= NULL
cc_urb_test = .cc_urb_occTest(x = df, lon =xf,lat=yf ,value='flagged', verbose = FALSE,ref = myRef,outdir = output.dir )
cc_urb_test <- (! cc_urb_test) * 1
out$HumanDetection_UrbanAreas_value <- cc_urb_test
out$HumanDetection_UrbanAreas_test <- as.logical(cc_urb_test)
out$HumanDetection_UrbanAreas_comments <- c('Urban areas from rnaturalearth')
}
#compute score
out$HumanDetection_score <- .gimme.score (out)
return (out)
}
# Biodiversity institutions ====
#' @title Detect biodiversity institutions
#' @description Detect occurrences potentially in biodiversity institutions using different methods
#' @param df data.frame of species occurrences
#' @param xf character. column name in df containing the x coordinates
#' @param yf character. column name in df containing the y coordinates
#' @param lf The locality field in df.
#' @param method charcter. Vector of methods to be used. See details. Default 'all'
#' @param .points.proj4string proj4string argument for df
#' @param do logical. Should range analysis be performed?
#' @param verbose logical. Print messages? Default FALSE
#' @return data.frame
#' @details current implemented methods are : \cr
#' "fromCoordinates" (use information on localities of institutions, from \link[CoordinateCleaner]{CoordinateCleaner-package}) \cr
#' "fromBotanicLocalityName" (using information on locality names that include keywords of institutions)
#' @keywords internal
#' @author Josep M Serra-Diaz (pep.serradiaz@@agroparistech.fr)
#' @seealso \link[CoordinateCleaner]{cc_inst} \link[CoordinateCleaner]{cc_gbif}
#' @export
institutionLocality <- function (df,
xf,
yf,
lf,
method='all',
.points.proj4string,
do=TRUE, verbose=FALSE
){
#output table
out = data.frame (institutionLocality_fromBotanicLocalityName_value=NA,
institutionLocality_fromBotanicLocalityName_test=NA,
institutionLocality_fromBotanicLocalityName_comments=NA,
institutionLocality_fromCoordinates_value=NA,
institutionLocality_fromCoordinates_test=NA,
institutionLocality_fromCoordinates_comments=NA,
institutionLocality_score=NA
)[1:nrow (df),]
row.names (out) <- NULL
if (!do) {return (out)}
#method fromBotanicLocalityName
if (any (method %in% c('fromBotanicLocalityName','all'))) {
#if no locality data is provided
if(is.null (lf)) {
if(verbose) print ('No locality in input data')
}
if(!is.null (lf)) {
#if locality data is provided
loc=df[,lf]
potential.names <- c('botanic', 'botanische', 'botanico', 'jardin', 'garden', 'botanical')
localities <- as.character (loc)
bot.garden <-sapply (localities, function (x) {
if (is.na(x)){return (NA)}
if (x==''){return (0)}
splits = c(' ',',',';',':','/','\\\\','[.]')
if (x %in% splits ) {return (0)}
m <- tolower(.multiple.strsplit(x,multiple.splits = splits))
potential.bot.garden <- sum ((m %in% potential.names) *1)
potential.bot.garden <- ifelse(potential.bot.garden>0, 1, 0)
return (potential.bot.garden)
})
bot.garden <- as.numeric (bot.garden)
out$institutionLocality_fromBotanicLocalityName_value = bot.garden
out$institutionLocality_fromBotanicLocalityName_test = as.logical (bot.garden)
out$institutionLocality_fromBotanicLocalityName_comments = 'likely botanical garden'
}
}
#method from Coordinates
if (any (method %in% c('fromCoordinates','all'))) {
cc_inst_test = CoordinateCleaner::cc_inst(x = df, lon =xf,lat=yf ,value='flagged', verbose = FALSE )
cc_inst_test <- (! cc_inst_test) * 1
cc_gbif_test = CoordinateCleaner::cc_gbif(x = df, lon =xf,lat=yf ,value='flagged', verbose = FALSE )
cc_gbif_test <- (! cc_gbif_test) * 1
cc_gbif_comments = ifelse(cc_gbif_test==1,'GBIFheadquarters',NA)
cc_inst_comments = ifelse(cc_inst_test==1,'BiodivInst',NA)
out$institutionLocality_fromCoordinates_value = ifelse ((cc_gbif_test==1 | cc_inst_test==1),1,0)
out$institutionLocality_fromCoordinates_test = as.logical (ifelse ((cc_gbif_test==1 | cc_inst_test==1),1,0))
out$institutionLocality_fromCoordinates_comments = .paste3(cc_gbif_comments,cc_inst_comments)
}
out$institutionLocality_score <- .gimme.score (out)
return (out)
}
# geoOutliers ====
#' @title Detect geographic outliers
#' @descripion Detect geographical outliers using several tests
#' @param df data.frame of species occurrences
#' @param xf character. column name in df containing the x coordinates
#' @param yf character. column name in df containing the y coordinates
#' @param .alpha.parameter parameter setting for alphahull
#' @param .distance.parameter numeric. Maximum distance allowed. Default to 1000
#' @param .medianDeviation.parameter numeric. Deviation parameter to . Default to 0.1
#' @param method charcter. Vector of methods to be used. See details. Default 'all'
#' @param .projString proj4string character. Indicate coordinate reference system
#' @param do logical. Should tests be performed? Default TRUE
#' @param verbose logical. Print messages? Default FALSE
#' @details Methods implented are 'alphaHull', detecting species outside an alphaHull level (default 2), \cr
#' 'alphaHull' \cr
#' 'distance' corresponds to 'distance' method in CoordinateCleaner::cc_outl(method='distance'). See ?CoordinateCleaner::cc_outl \cr
#' 'median' corresponds to 'mad' method in CoordinateCleaner::cc_outl(method='mad'). See ?CoordinateCleaner::cc_outl \cr
#' 'quantSamplingCorrected' corresponds to 'mad' method in CoordinateCleaner::cc_outl(method='quantile'). See ?CoordinateCleaner::cc_outl \cr
#' 'grubbs' implements Grubbs test to find spatial outliers. See ?findSpatialOutliers for details \cr
#' @return data.frame
#' @keywords internal
#' @author Josep M Serra-Diaz (pep.serradiaz@@agroparistech.fr), Cory Merow (cmerow@@gmail.com)
#' @seealso getPointsOutAlphaHull, \link[CoordinateCleaner]{cc_outl}, findSpatialOutliers
#' @export
geoOutliers <- function (df,
xf,
yf,
.alpha.parameter,
.distance.parameter=1000,
.medianDeviation.parameter=5,
.samplingIntensThreshold.parameter=0.1,
method = 'all',
.projString ,
do=TRUE, verbose=FALSE){
#build dataframe of all potential results
out = data.frame (geoOutliers_alphaHull_value=NA,
geoOutliers_alphaHull_test=NA,
geoOutliers_alphaHull_comments = NA,
geoOutliers_distance_value=NA,
geoOutliers_distance_test=NA,
geoOutliers_distance_comments = NA,
geoOutliers_median_value=NA,
geoOutliers_median_test=NA,
geoOutliers_median_comments=NA,
geoOutliers_quantileSamplingCorr_value=NA,
geoOutliers_quantileSamplingCorr_test=NA,
geoOutliers_quantileSamplingCorr_comments=NA,
geoOutliers_Grubbs_value=NA,
geoOutliers_Grubbs_test=NA,
geoOutliers_Grubbs_comments=NA,
geoOutliers_score=NA
)[1:nrow (df),]
row.names (out) <- NULL
if (!do) {return (out)}
#prepare df for different methods of outliers in different packages
df$species <- 'MyFakeSp'
rownames(df) <-1:nrow(df)
xydat <- df[,c(xf,yf)]
if (nrow (xydat)<5) {warning('geoOutliers not perfomred. N<5'); return(out)}
if (any (method %in% c('alphaHull','all'))){
#We typically will consider by default an alpha 2 -like ALA
if (nrow (df) > 5 & !is.na(.alpha.parameter)) {
points.outside.alphahull <- try (getPointsOutAlphaHull (xydat,alpha = .alpha.parameter), silent=TRUE)
if (inherits(points.outside.alphahull, 'try-error')) points.outside.alphahull <- rep (NA,length.out=nrow (xydat))
out.comments <- paste0('GeoIndicator alphaHull (Alpha=',.alpha.parameter,')')
}
if (nrow (df) <=5 ) {
points.outside.alphahull <- rep (NA, nrow(df) )
out.comments <- paste0('N<=5. Analysis not run')
}
#write results into the results dataframe
out$geoOutliers_alphaHull_value <- points.outside.alphahull
out$geoOutliers_alphaHull_test <- as.logical (points.outside.alphahull)
out$geoOutliers_alphaHull_comments <- out.comments
### NOT IMPLEMENTED YET
#This is a bit experimental, when the user does not provide a predetermined value of alpha
# we perform some analysis to get an "optimal" alpha parameter.
# not implemented yet
# if (nrow (df) >5 & is.na(.alpha.parameter) ) {
#
# #build alpha hulls and determine alpha parameter for each from 1 to 10
# ah.hulls.list <- lapply (1:10, function (a){
#
# try(ah <- alphahull::ahull (x = df[,xf], y= df[,yf], alpha=a), silent = TRUE)
# if (exists("ah")){return (ah)}
#
# })
#
# ah.hulls.thresholds <- lapply (1:10, function (i) {
# if((is.null(ah.hulls.list[[i]]) == FALSE) & (is.list (ah.hulls.list[[i]]) == TRUE) ) {inahull(ah.hulls.list[[i]] , as.matrix(xydat) ) * i}
# })
#
# is.not.null <- lapply (ah.hulls.thresholds, function (x) !is.null(x))
# ah.hulls.thresholds <- ah.hulls.thresholds[unlist(is.not.null)]
# ah.hulls.thresholds <- Reduce (cbind, ah.hulls.thresholds)
#
# #check if error in alphahulls, if not continue
# if ( is.null (ah.hulls.thresholds)) {
# ah.hulls.level <- rep (NA, nrow(df) )
# for (i in 1:nrow(df)) {df$comments [i] <- paste0(df$comments [i],'Error calculating AlphaHull;' )}
#
# }
# if (!is.null (ah.hulls.thresholds)) {
# ah.hulls.level <- apply (ah.hulls.thresholds, 1, function (x){
#
# x [which (x==0)] <- NA
# m <- min (x, na.rm=TRUE)
# out <- ifelse (m==Inf, NA, m)
# return (out)
#
#
# })
# ah.hulls.level[which(is.na(ah.hulls.level))] <- 9999
#
# #build dataframe for optimal alpha value choice
# ah.dataframe <- lapply (1:length(ah.hulls.list), function (i){
#
# ah <- ah.hulls.list[[i]]
# if (exists("ah") & is.null(ah) == FALSE & is.list(ah)==TRUE ){
# perc.points.in.hull <- round ((sum (alphahull::inahull(ah,as.matrix(xydat)))/nrow (xydat))*100,digits=2)
# area.hull <-alphahull::areaahull(ah)
# return (data.frame (perc.points.in.hull, area.hull))
#
# } else {
# perc.points.in.hull <- NA
# area.hull <- NA
# return (data.frame (perc.points.in.hull, area.hull))
#
# }
#
#
#
#
# })
# ah.dataframe <- Reduce (rbind,ah.dataframe)
# ah.dataframe$alpha <- 1:10
# ah.dataframe$optim.alpha.values <- ah.dataframe$perc.points.in.hull/ah.dataframe$area.hull
# dat.ts <- stats::ts(ah.dataframe$area.hull, frequency=1)
# ah.dataframe$A.increase <- c (dat.ts/stats::lag(x = dat.ts, 1) - 1,NA)
#
# #choose what I would call the optimal alpha based on two criteria
#
# alpha.choice <- subset (ah.dataframe, subset=perc.points.in.hull > 90)
# if (nrow (alpha.choice) == 0) {print('No optimal alpha choice with the criteria used. Defaulting to alpha = 2')
# .alpha.parameter <- 2
#
# }
#
# if (nrow (alpha.choice) >= 0){
# alpha.choice <- alpha.choice[order(alpha.choice$optim.alpha.values,decreasing = TRUE,na.last = TRUE),]
# alpha.choice <- alpha.choice[1,'alpha']
# if(verbose) print (paste("Optimal alpha selected =", alpha.choice, "for alpha geoindicator of outliers"))
#
# .alpha.parameter <- alpha.choice
#
# }
#
#
# }
#
#
#
#
# }
#
#
}
if (any (method %in% c('distance','all'))){
#create fake species list for Coordinate cleaner
df$species <- 'MyFakeSp'
cc_dist_test = .cc_outl_occTest(x = df,lon = xf,lat = yf,method = 'distance',value = 'flagged',tdi = .distance.parameter, verbose=FALSE)
cc_dist_test <- (! cc_dist_test)
out$geoOutliers_distance_value = cc_dist_test * 1
out$geoOutliers_distance_test = cc_dist_test
out$geoOutliers_distance_comments = rep(paste('dist.threshold=',.distance.parameter,'m'),times=nrow(out))
if (nrow (df)<7) {
out$geoOutliers_distance_value = rep(NA,times=nrow(out))
out$geoOutliers_distance_test = rep(NA,times=nrow(out))
out$geoOutliers_distance_comments = rep(paste('N<7. Analysis not run'),times=nrow(out))
}
}
if (any (method %in% c('median','all'))){
cc_med_test = .cc_outl_occTest(x = df,lon = xf,lat = yf,method = 'mad',value = 'flagged',mltpl = .medianDeviation.parameter , verbose=FALSE)
cc_med_test <- (! cc_med_test) * 1
out$geoOutliers_median_value = cc_med_test
out$geoOutliers_median_test = as.logical(cc_med_test)
out$geoOutliers_median_comments = rep(paste('medDeviation=', .medianDeviation.parameter),times=nrow(out))
if (nrow (df)<7) {
out$geoOutliers_median_value = rep(NA,times=nrow(out))
out$geoOutliers_median_test = rep(NA,times=nrow(out))
out$geoOutliers_median_comments = rep(paste('N<7. Analysis not run'),times=nrow(out))
}
}
if (any (method %in% c('quantSamplingCorrected','quantileSamplingCorr','all'))){
cc_qsc_tst = .cc_outl_occTest(x = df,lon = xf,lat = yf,method='quantile',value = 'flagged',
mltpl = .medianDeviation.parameter ,
sampling_thresh = .samplingIntensThreshold.parameter ,
verbose=FALSE)
cc_qsc_tst <- (! cc_qsc_tst) * 1
out$geoOutliers_quantileSamplingCorr_value = cc_qsc_tst
out$geoOutliers_quantileSamplingCorr_test = as.logical(cc_qsc_tst)
out$geoOutliers_quantileSamplingCorr_comments = rep(paste('IQRmultiplier=', .medianDeviation.parameter,';SamplIntensityThres=',.samplingIntensThreshold.parameter),times=nrow(out))
if (nrow (df)<7) {
out$geoOutliers_quantileSamplingCorr_value = rep(NA,times=nrow(out))
out$geoOutliers_quantileSamplingCorr_test = rep(NA,times=nrow(out))
out$geoOutliers_quantileSamplingCorr_comments = rep(paste('N<7. Analysis not run'),times=nrow(out))
}
}
if (any (method %in% c('grubbs','all','Grubbs'))) {
spdf = sp::SpatialPoints(coords = xydat,proj4string =.projString )
if (length (spdf)<=5) {
out$geoOutliers_Grubbs_value = rep(NA,times=nrow(out))
out$geoOutliers_Grubbs_test = rep(NA,times=nrow(out))
out$geoOutliers_Grubbs_comments = rep(paste('N<5. Analysis not run'),times=nrow(out))
}
if (length (spdf)>5){
grubbs_tst = rep(0,times=nrow(out))
grubbs.outl = try(occTest::findSpatialOutliers(myPres = spdf,verbose = FALSE),silent=TRUE)
if (inherits(grubbs.outl,what = 'try-error')) {
grubbs_tst = rep(NA,times=nrow(out))
grubbs_comment <- rep (paste('Error in findSpatialOutliers'),times=nrow(out))
}
if (!inherits(grubbs.outl,what = 'try-error')) {
grubbs_tst [grubbs.outl] = 1
grubbs_comment <- rep (paste('Pval= 1e-5'),times=nrow(out))
}
out$geoOutliers_Grubbs_value = grubbs_tst
out$geoOutliers_Grubbs_test = as.logical(grubbs_tst)
out$geoOutliers_Grubbs_comments = grubbs_comment
}
}
out$geoOutliers_score <- .gimme.score (out)
return (out)
}
# env outliers ====
#' @title Detect environmental outliers
#' @description Detect environmental outliers using different methods
#' @param df data.frame of species occurrences
#' @param xf character. column name in df containing the x coordinates
#' @param yf character. column name in df containing the y coordinates
#' @param .r.env raster. Raster with environmental data
#' @param .th.perc.outenv numeric. Value from 0 to 1 to identify the rate of variables not passing the test to consider the record an environmental outlier.
#' @param .sp.name Species name
#' @param method charcter. Vector of methods to be used. See details. Default 'all'
#' @param .projString proj4string character. Indicate coordinate reference system
#' @param do logical. Should tests be performed? Default TRUE
#' @param verbose logical. Print messages? Default FALSE
#' @return data.frame
#' @details Implemented methods are :\cr
#' 'bxp' based on boxplot distribution (1.5 Interquartile range) in a variable to identify the record as outlier (\link[biogeo]{outliers}) \cr
#' 'Grubbs' based on Grubbs test. See ?findEnvOutliers \cr
#' Regardless of the method, the function already tests for missing environmental variables and it outputs the result in the output data.frame
#' @keywords internal
#' @author Josep M Serra-Diaz (pep.serradiaz@@agroparistech.fr), Cory Merow (cmerow@@gmail.com). Mark Robertson (biogeo package)
#' @seealso findEnvOutliers \link[biogeo]{outliers}
#' @export
envOutliers <- function (
df,
xf,
yf,
.r.env,
.th.perc.outenv ,
.sp.name ,
method='all',
.projString ,
do=TRUE, verbose=FALSE){
#output results
out = data.frame (envOutliers_missingEnv_value = NA,
envOutliers_missingEnv_test = NA,
envOutliers_bxp_value = NA,
envOutliers_bxp_test = NA,
envOutliers_bxp_comments = NA,
envOutliers_Grubbs_value=NA,
envOutliers_Grubbs_test=NA,
envOutliers_Grubbs_comments=NA,
envOutliers_score=NA
)[1:nrow (df),]
if (!do) {return (out)}
#prepare df for different methods of outliers in different packages
df$species <- 'MyFakeSp'
rownames(df) <-1:nrow(df)
xydat <- df[,c(xf,yf)]
if (nrow (xydat)<5) {
warning('envOutliers not perfomred. N<5')
return(out)}
dat.environment <- raster::extract(x = .r.env, y= df[,c(xf,yf)], df = TRUE)
missingEnvironment <- (!stats::complete.cases(dat.environment))
out$envOutliers_missingEnv_value <- missingEnvironment * 1
out$envOutliers_missingEnv_test <- missingEnvironment
out$envOutliers_missingEnv_comment <- ''
if (any (missingEnvironment)){
ids = which (missingEnvironment==TRUE)
for (i in ids){
de = dat.environment[i,]
layersWithNA = which (is.na(de))
out$envOutliers_missingEnv_comment[i] <- paste ("layers with NA:",paste(names (de)[layersWithNA],collapse =','))
}
}
if (any (method %in% c('bxp','all'))) {
outlier.method.biogeo = '-bxp'
# dat.environment <- extract(x = .r.env, y= df[,c(xf,yf)], df = TRUE)
# missingEnvironment <- (!stats::complete.cases(dat.environment))*1
species <- rep(.sp.name, nrow(dat.environment))
dups <- rep(0, nrow(dat.environment))
environmental.columns <- 2:ncol(dat.environment)
a.df <- lapply (environmental.columns, function (i){
ev<-dat.environment[,i]
try (a<-biogeo::outliers(rid=1:nrow(dat.environment), species, dups, ev), silent=TRUE)
if (exists ('a') ) {
a<- as.data.frame (a)
names (a) <- paste0(names(dat.environment)[i],c('-bxp','-rjk') )
return (a)}
if (exists ('a') == FALSE ) {
#in case there is no variation across the variable and then outliers function throws an error
a <- data.frame (col1=rep (0,nrow(dat.environment)), col2 = rep (0,nrow(dat.environment) ))
names (a) <- paste0(names(dat.environment)[i],c('-bxp','-rjk') )
return (a)}
})
a.df <- do.call (cbind, a.df)
choice.of.method.df <- a.df [,grep (names(a.df),pattern = outlier.method.biogeo)]
if (inherits(.r.env,"RasterLayer")) {outlier.level <- choice.of.method.df}
if (inherits(.r.env,"RasterStack")) {outlier.level <- round (rowMeans(choice.of.method.df) ,digits=2)}
if (inherits(.r.env,"RasterBrick")) {outlier.level <- round (rowMeans(choice.of.method.df) ,digits=2)}
outlier.env <- (outlier.level >= .th.perc.outenv)*1
comments.outlier.env <- paste (paste("Outlier",outlier.method.biogeo,".level="), outlier.level)
out$envOutliers_bxp_value = outlier.env
out$envOutliers_bxp_test = as.logical(outlier.env)
out$envOutliers_bxp_comments = comments.outlier.env
}
if (any (method %in% c('grubbs','all'))) {
#spdf = sp::SpatialPointsDataFrame(data = dat.environment[,-1],coords = df[,c(xf,yf)] ,proj4string =.projString )
env.grubbs_tst = rep(0,times=nrow(out))
env.grubbs.outl = occTest::findEnvOutliers(myPres = dat.environment[,-1],myEnv = NULL,verbose = FALSE)
env.grubbs_tst [env.grubbs.outl] = 1
env.grubbs_comment <- rep (paste('Pval= 1e-5'),times=nrow(out))
out$envOutliers_Grubbs_value = env.grubbs_tst
out$envOutliers_Grubbs_test = as.logical(env.grubbs_tst)
out$envOutliers_Grubbs_comments = env.grubbs_comment
}
out$envOutliers_score <- .gimme.score (out)
return (out)
}
# geoEnvAccuracy ====
#' @title Coordinate accuracy
#' @description Detect records with low accuracy in space and time
#' @param df data.frame of species occurrences
#' @param xf character. column name in df containing the x coordinates
#' @param yf character. column name in df containing the y coordinates
#' @param af character. column name in df containing the coordinate uncertainty value (in the same)
#' @param dsf character. column name in df containing the dataset to which the record belongs to (e.g. Forest Inventory of Spain)
#' @param ef character. column name in df containing the registered elevation for the record.
#' @param tf character. column name in df containing the dataset with the date/time where the species is recorded
#' @param method character. Vector of methods to be used. See details. Default 'all'
#' @param r.env raster. Raster with environmental data
#' @param accept.threshold.cell numeric. Acceptance threshold for how much percentage of the Area of uncertainty in the cell we want to accept. Default to 0.5
#' @param accept.threshold.env numeric. Default 0.5
#' @param bearing.classes numeric. Default to 10.
#' @param distance.classes integer. Default to 5.
#' @param env.quantiles numeric. Default to c(0.3,0.7)
#' @param elev.threshold numeric. Default to 100
#' @param raster.elevation numeric. Default to 100
#' @param do logical. Should tests be performed? Default TRUE
#' @param verbose logical. Print messages? Default FALSE
#' @param doParallel logical. Should computation use parallel functions? Default FALSE
#' @param mc.cores numeric. How many cores to use? (used when doParallel = TRUE). Default 2
#' @return data.frame
#' @keywords Analysis
#' @author Josep M Serra-Diaz (pep.serradiaz@@agroparistech.fr), A Zizka (CoordinateCleaner package)
#' @details Geoenvironmental accuracy function will implement differnt methods to assess occurrence accuracy in environmnental and geographic space.\cr
#' Current implmented methods are:
#' 'lattice' : tests for lattice arrangement in occurrence datasets. Borrowed from \link[CoordinateCleaner]{cd_round} . \cr
#' 'elevDiff' : assess the elevation difference between a given raster (or automatically downloaded fro SRTM), and the elevation recorded. If differences >elev.threshold then the record is considered as a low accuracy threshold\cr
#' 'noDate' : assess whether there is a date or timestamp information in the record. \cr
#' 'noDateFormatKnown' : assess whether the information in the timestamp agrees with different formatting of Dates. \cr
#' 'outDateRange' : (not implemented) assess whether the record is within a user specified time frame. \cr
#' 'percDiffCell' : assess whether the record may be falling in a different raster cell given an information of coordinate accuracy. \cr
#' 'envDeviation' : assess whether the climate in a given record may be outside of the interval 30th-70th (default values) for a given variable due to coordinate uncertainty. \cr
#' @seealso \link[CoordinateCleaner]{cd_round}
#' @family analysis
#' @examples
#' #see examples in vignetteXtra-occTest
#' @export
geoEnvAccuracy <- function (df,
xf,
yf,
af,dsf,ef,tf,
method='all',
r.env,
accept.threshold.cell =0.5,
accept.threshold.env = 0.5,
bearing.classes=10,
distance.classes=5,
env.quantiles=c(0.3,0.7),
elev.threshold = 100,
raster.elevation=NULL,
verbose=FALSE,
do=TRUE,
doParallel=FALSE,
mc.cores= 2
){
#output results
out = data.frame (geoenvLowAccuracy_lattice_value = NA,
geoenvLowAccuracy_lattice_test = NA,
geoenvLowAccuracy_lattice_comments = NA,
geoenvLowAccuracy_percDiffCell_value = NA,
geoenvLowAccuracy_percDiffCell_test = NA,
geoenvLowAccuracy_percDiffCell_comments = NA,
geoenvLowAccuracy_envDiff_value =NA,
geoenvLowAccuracy_envDiff_test =NA,
geoenvLowAccuracy_envDiff_comments =NA,
geoenvLowAccuracy_elevDiff_value =NA,
geoenvLowAccuracy_elevDiff_test =NA,
geoenvLowAccuracy_elevDiff_comments =NA,
geoenvLowAccuracy_noDate_value =NA,
geoenvLowAccuracy_noDate_test =NA,
geoenvLowAccuracy_noDate_comments =NA,
geoenvLowAccuracy_noDateFormatKnown_value =NA,
geoenvLowAccuracy_noDateFormatKnown_test =NA,
geoenvLowAccuracy_noDateFormatKnown_comments =NA,
geoenvLowAccuracy_outDateRange_value =NA,
geoenvLowAccuracy_outDateRange_test =NA,
geoenvLowAccuracy_outDateRange_comments =NA,
geoenvLowAccuracy_score=NA
)[1:nrow (df),]
row.names(x = out) <- NULL
if (!do) {return (out)}
#check if need parallel
os = .get_os()
# if (doParallel==TRUE) {mymclapply <- .hijack (parallel::mclapply,mc.cores=mc.cores)}
# if (doParallel==TRUE & grepl(pattern = 'win',x = os)) {mymclapply <- .hijack (parallelsugar::mclapply,mc.cores=mc.cores)}
if (doParallel == TRUE){
mymclapply <- switch( Sys.info()[['sysname']],
Windows = {parallelsugar::mclapply},
Linux = {parallel::mclapply},
Darwin = {parallel::mclapply})
}
if (doParallel==FALSE) {mymclapply <- lapply}
if (nrow(df)<100) {mymclapply <- lapply}
#start method lattice
if (any(method %in% c('lattice','all'))){
if(!is.null(dsf)) {
xydat = df[,c(xf,yf,dsf)]
}
if(is.null(dsf)) {
print ('No dataset field provided. Assuming all records are from the same dataset')
xydat = df[,c(xf,yf)]
xydat$dataset = 'TemporaryDatasetName'
dsf <- 'dataset'
}
#there are a lot of problems with this function
cd_round_test = try({.cc_round_occTest(x = xydat,lon = xf,lat = yf,ds=dsf,value='flagged2',verbose=FALSE,graphs=FALSE)},silent=TRUE)
#cd_round_test = CoordinateCleaner::cd_round(x = xydat,lon = xf,lat = yf,ds=dsf,value='flagged',verbose=FALSE,graphs=FALSE)
out$geoenvLowAccuracy_lattice_value = (!cd_round_test) * 1
out$geoenvLowAccuracy_lattice_test = (!cd_round_test)
out$geoenvLowAccuracy_lattice_comments = ('Based on default values of CoordinateCleaner::cd_round')
}
#start methods requiring elevation
#method elevation
if (any(method %in% c('elevDiff','all'))) {
if (!is.null(ef)){
if (is.null (raster.elevation)) {
if (verbose) print ('elevation raster not provided, downloading from SRTM')
do.srtm.download = TRUE
} else {do.srtm.download=FALSE}
#if no elevation raster provided, download it
if (do.srtm.download){
raster.elevation = try (expr = {.getSRTM (xydat=df,download=TRUE, verbose=FALSE)},silent = TRUE)
}
#determining elevation test
if (inherits(raster.elevation,"RasterLayer") ) {
elev.xy <- raster::extract (raster.elevation, df[,c(xf,yf)])
elev.difference <- abs (df[,ef] - elev.xy)
out$geoenvLowAccuracy_elevDiff_value <- elev.difference
out$geoenvLowAccuracy_elevDiff_test <- as.logical (ifelse (elev.difference >= elev.threshold, 1, 0))
out$geoenvLowAccuracy_elevDiff_comments <- paste ('Elev difference:', elev.difference)
}
#skipping elevation test
if (!inherits(raster.elevation,"RasterLayer") ) {
if(verbose) warning ("raster.elevation is not a raster; skipping elevDiff method")
}
}
}
#has time stamp?
if (any(method %in% c('noDate','all'))) {
if (!is.null(tf)){
vecTime = df[,tf]
dfEmptyVals = data.frame (na=is.na(vecTime),
#null=is.null(as.character(vecTime)==NULL),
isEmpty = (as.character(vecTime)==''),
isEmptySpace = (as.character(vecTime)== ' '))
vecTimeLogi = apply(dfEmptyVals,MARGIN = 1,FUN = function (x){any (x==TRUE)})
out$geoenvLowAccuracy_noDate_value = vecTimeLogi * 1
out$geoenvLowAccuracy_noDate_test = vecTimeLogi
out$geoenvLowAccuracy_noDate_comments = 'checked for NA,NULL, blankSpace and tab'
}
}
#has format stamp?
if (any(method %in% c('noDateFormatKnown','all'))) {
if (!is.null(tf)){
warning ('method noDateFormatKnown under development. Sorry')
# vecTime = df[,tf]
#
# dfVectime = data.frame(vecTime[1])
#
# a = identify_dates(dfVectime)
# dfVectime = data.table::as.data.table(dfVectime)
# k = find_and_transform_dates(dfVectime)
#
#
# out$geoenvLowAccuracy_noDate_value = vecTimeLogi * 1
# out$geoenvLowAccuracy_noDate_test = vecTimeLogi
# out$geoenvLowAccuracy_noDate_comments = 'checked for NA,NULL, blankSpace and tab'
}
}
#is within time range ?
if (any(method %in% c('outDateRange','all'))) {
if (!is.null(tf)){
warning ('method outDateRange under development. Sorry')
}
}
#Need coordinate uncertainty analysis ? If not gimme score and leave
if ( ! any (method %in% c('percDiffCell','envDeviation','all')) ) {
#write final score
out$geoenvLowAccuracy_score <- .gimme.score (out)
return (out)}
#Need coordinate uncertainty analysis ? Can you do it?
if (is.null(af) | all(is.na(df[,af])) ) {
print ('no coordinate accuracy/uncertainty field provided')
#write final score
out$geoenvLowAccuracy_score <- .gimme.score (out)
return (out)
}
#Start methods requiring coordinate uncertainty
if(length(af)>1) {df$new_accuracy = pmax(df[,af[1]],df[,af[2]],na.rm=TRUE); af = 'new_accuracy'}
xydat = df[,c(xf,yf,af)]
xydat$occCellids = raster::cellFromXY(r.env,as.data.frame (df[,c(xf,yf)]))
#get cell IDs of buffer points
cellIds.directions = mymclapply (1:nrow (xydat), function (i){
distance.accuracy = as.numeric (xydat[i,c(af)])
if (is.na (distance.accuracy) |is.na (xydat$occCellids[i]) ) {return (NA)}
if (distance.accuracy>1){
sequence.pts = seq(from=0, to=as.numeric(distance.accuracy) ,length.out = distance.classes +1)
sequence.pts = sequence.pts [-1]
} else {sequence.pts <- distance.accuracy}
myAngles = seq(from=1,to=360,length.out=bearing.classes)
out.bearing = sapply(myAngles, function (bearing){
points.buffer.id = sapply(sequence.pts, function (dpoints){
dest.point = geosphere::destPoint(p = xydat[i,c(xf,yf)],b=bearing,d=dpoints)
dest.point.cellID = raster::cellFromXY(object = r.env, dest.point)
#diff.2.target.cell = (xydat$occCellids[i] != dest.point.cellID)*1
#diff.2.target.cell
dest.point.cellID
})
#perc.points.buffer = mean (points.buffer,na.rm=TRUE)
points.buffer.id
})
as.vector (out.bearing)
})
#start method percentage of Different cells around uncertainty
if (any(method %in% c('percDiffCell','all'))){
perc.diff <- sapply (1:nrow(xydat), function (i){
pdiff <- mean ((cellIds.directions[[i]]==xydat$occCellids[i]) * 1, na.rm = TRUE)
pdiff
})
out$geoenvLowAccuracy_percDiffCell_value = (perc.diff)
out$geoenvLowAccuracy_percDiffCell_test = (perc.diff>accept.threshold.cell)
out$geoenvLowAccuracy_percDiffCell_comments = paste('Nbufferlocations=',bearing.classes*distance.classes)
}
#start method percentage of environmental differences around uncertainty
if (any(method %in% c('envDeviation','all'))) {
uniqueCellBuff <- unique (unlist (cellIds.directions))
if (raster::nlayers(r.env)>1) df.CellBuff = raster::extract(r.env,uniqueCellBuff,df=TRUE)
if (raster::nlayers(r.env)==1) {
df.CellBuff = raster::extract(r.env,uniqueCellBuff)
df.CellBuff = data.frame (ID=1:length(df.CellBuff),LAYER = df.CellBuff)
names (df.CellBuff) = c('ID',names(r.env))
}
df.CellBuff$cellID = uniqueCellBuff
targetCellUnique = xydat$occCellids [!xydat$occCellids %in% uniqueCellBuff]
if (raster::nlayers(r.env)>1) df.CellTarget = raster::extract(r.env,targetCellUnique,df=TRUE)
if (raster::nlayers(r.env)==1) {
df.CellTarget = raster::extract(r.env,targetCellUnique)
df.CellTarget = data.frame (ID=1:length(df.CellTarget),LAYER = df.CellTarget)
names (df.CellTarget) = c('ID',names(r.env))
}
df.CellTarget$cellID = targetCellUnique
df.cells = rbind (df.CellBuff,df.CellTarget)
env.test = mymclapply (1:nrow(xydat), function (i){
if (all(is.na(cellIds.directions[[i]]))) (return(NA))
cellIds.directions[[i]] = stats::na.omit (cellIds.directions[[i]])
NcellReps = table (cellIds.directions[[i]])
df.env.cellBuffs = lapply (1:length(NcellReps), function (m){
cellID= as.numeric (names (NcellReps[m]))
times = as.numeric (NcellReps[m])
df.env.var = df.cells[which(df.cells$cellID ==cellID ), ]
do.call("rbind", replicate(times, df.env.var, simplify = FALSE))
})
df.env.cellBuffs = do.call(rbind,df.env.cellBuffs)
df.env.cellBuffs = df.env.cellBuffs[stats::complete.cases(df.env.cellBuffs),]
df.env.targetcell = df.cells[which(df.cells$cellID == xydat$occCellids[i] ), ]
if (nrow (df.env.cellBuffs)==0) {
vars.outlier = rep(NA,length(names (r.env))+1 )
names (vars.outlier) = c(names (r.env),'all')
vars.outlier = as.data.frame (t(vars.outlier))
return (vars.outlier)
}
if (any ( is.na (df.env.targetcell [,names (r.env)]) )) {
vars.outlier = rep(NA,length(names (r.env))+1 )
names (vars.outlier) = c(names (r.env),'all')
vars.outlier = as.data.frame (t(vars.outlier))
return (vars.outlier)
}
vars.outlier <- sapply (names (r.env), function (nvar){
qprobs = quantile (df.env.cellBuffs[,nvar],probs=env.quantiles)
if (df.env.targetcell[,nvar] < qprobs[1]) {(return(1))}
if (df.env.targetcell[,nvar] > qprobs[2]) {(return(1))}
return (0)
})
vars.outlier = as.data.frame (t(vars.outlier))
vars.outlier$all <- mean (as.numeric(vars.outlier[1,]),na.rm=TRUE)
vars.outlier
})
env.test = do.call (rbind, env.test)
out$geoenvLowAccuracy_envDiff_value = env.test$all
out$geoenvLowAccuracy_envDiff_test = (env.test$all > accept.threshold.env)
out$geoenvLowAccuracy_envDiff_comments = paste('envQuantiles=',paste(env.quantiles,collapse = ','),';PercOutVariablesTh=',accept.threshold.env)
}
#write final score
out$geoenvLowAccuracy_score <- .gimme.score (out)
return (out)
}
# landUseSelect ====
#' @title Selection of records within a specified land use
#' @descriptions Selection of records within a specified land use
#' @param df data.frame of species occurrences
#' @param xf character. column name in df containing the x coordinates
#' @param yf character. column name in df containing the y coordinates
#' @param method character. Select 'in' (default) when records are selected when inside specified land use types (type 'out' otherwise)
#' @param .points.proj4string proj4string character. Indicate coordinate reference system
#' @param ras.landUse raster. Land use raster with integer codes for land use classes.
#' @param .landUseCodes numeric. Vector of specified land use codes to either select records (method 'in') or to exclude records (method 'out')
#' @param do logical. Should range analysis be performed?
#' @param verbose logical. Print messages? Default FALSE
#' @param output.dir character. Output directory
#' @return data.frame
#' @keywords internal
#' @author Josep M Serra-Diaz (pep.serradiaz@@agroparistech.fr)
#' @export
landUseSelect <- function (df,
xf,
yf,
method='in', #out would be the other one
.points.proj4string,
ras.landUse,
.landUseCodes,
do=TRUE, verbose=FALSE,output.dir){
out <- data.frame (landUse_wrongLU_value=NA,
landUse_wrongLU_test=NA,
landUse_wrongLU_comments= NA,
landUse_score = NA)[1:nrow (df),]
row.names(out) <- NULL
if (!do) {return (out)}
#start human influence analysis
xydat <- df[,c(xf,yf)]
#get species presence
data.sp.pres <- as.data.frame (xydat)
sp::coordinates (data.sp.pres) <- stats::as.formula (paste0('~',xf,'+',yf))
#start with land use raster
if(! inherits(ras.landUse,'RasterLayer')) {warning ('no raster of land use provided. Test not performed'); return (out)}
#accommodate projections
if (is.null (.points.proj4string)){sp::proj4string(data.sp.pres) <- raster::projection(ras.landUse); warning ('ASSUMING Points and HumanRaster data have the same projection')}
if (!is.null (.points.proj4string)) {sp::proj4string(data.sp.pres) <- .points.proj4string}
if (sp::proj4string(data.sp.pres) != raster::projection (ras.landUse)){
if (is.na (raster::projection (ras.landUse)) ) {sp::proj4string(data.sp.pres) <- NA ; warning ('ASSUMING Points and HumanRaster data have the same projection')}
if (!is.na (raster::projection (ras.landUse)) ) {data.sp.pres <- sp::spTransform(data.sp.pres,CRSobj =raster::projection(ras.landUse) )}
}
lu.sp.pres <-raster::extract(ras.landUse, y = data.sp.pres, cellnumbers=FALSE, df=TRUE)
row.id.lu.NA <- which(is.na(lu.sp.pres[,2]))
if (length (row.id.lu.NA) != 0) {
output.comments <- rep (NA,length =nrow (xydat))
output.comments [row.id.lu.NA] <- paste0('No land cover available for this record;' )
}
lu.sp.value = lu.sp.pres[,2]
if (method %in% c('in','all')) {lu.sp.pres <- ( lu.sp.value %in% .landUseCodes)}
if (method %in% c('out')) {lu.sp.pres <- (!lu.sp.value %in% .landUseCodes)}
out$landUse_wrongLU_value= !lu.sp.pres * 1
out$landUse_wrongLU_test= !lu.sp.pres
out$landUse_wrongLU_comments= paste(method,.landUseCodes)
#compute score
out$landUse_score <- .gimme.score (out)
return (out)
}
# centroidsBIEN ====
#' @title Centroid assessment using BIEN methods
#'
#' @descriptions Detect centroids in occurrences data frame using BIEN methods
#' @details
#' @param occurrences data.frame with occurrence data
#' @param centroid_data data.frame with centroid coordinates for different administrative entities.
#' @return data.frame
#' @keywords internal
#' @author Brian Maitner
centroid_assessment<-function(occurrences,centroid_data){
occurrences$county<-as.character(occurrences$county)
occurrences$state_province<-as.character(occurrences$state_province)
occurrences$country<-as.character(occurrences$country)
#message("This function assumes that occurrences are in wgs84 format, and then converts to whatever projection is used in the centroid data. Only one projection at a time may be used in the centroid data, but multiple centroids per political division are fine.")
### NOT ACTIVATED BECAUSE WE HAVE DONE THIS BEFORE
#deal with nonsense points by dropping those lines
# if(length( which(occurrences$latitude>90 | occurrences$latitude< -90) )>0 ){
# occurrences<-occurrences[-which(occurrences$latitude>90 | occurrences$latitude< -90),]}
# if(length( which(occurrences$longitude>180 | occurrences$longitude< -180) )>0){
# occurrences<-occurrences[-which(occurrences$longitude>180 | occurrences$longitude< -180),]}
# if(any(is.na(occurrences$latitude) | (is.na(occurrences$longitude)))){
# occurrences<-occurrences[-which(is.na(occurrences$latitude) | (is.na(occurrences$longitude))),]
#
# }
temp_points<-sp::SpatialPoints(coords = occurrences[c("longitude","latitude")],proj4string = sp::CRS("+init=epsg:4326"))
temp_points<-sp::spTransform(x = temp_points,CRSobj = sp::CRS(projargs = unique(as.character(centroid_data$projection))) )
occurrences$latitude <- temp_points@coords[,'latitude']
occurrences$longitude <- temp_points@coords[,'longitude']
rm(temp_points)
#output<-NULL
output<-as.data.frame(matrix(nrow = nrow(occurrences),ncol=13))
colnames(output)<-c("taxonobservation_id","country_cent_dist","country_cent_dist_relative","country_cent_type","country_max_uncertainty",
"state_cent_dist","state_cent_dist_relative","state_cent_type","state_max_uncertainty",
"county_cent_dist","county_cent_dist_relative","county_cent_type","county_max_uncertainty")
output$taxonobservation_id<-occurrences$taxonobservation_id
for(i in 1:length(unique(occurrences$country))){
country_i<-unique(occurrences$country)[i]
centroid_i<-centroid_data[which(centroid_data$gnrsed_country==country_i & is.na(centroid_data$state) & is.na(centroid_data$county)),]
#test<-centroid_data[which(centroid_data$gnrsed_country==country_i),]
if(nrow(centroid_i)>0){
centroid_pt_i<-sp::SpatialPoints(coords = centroid_i[c("centroid_lon","centroid_lat")],proj4string = sp::CRS(projargs = unique(as.character(centroid_i$projection))))
country_data_i<-occurrences[which(occurrences$country==country_i),]
new_data<-matrix(nrow = nrow(country_data_i),ncol = nrow(centroid_i))
new_data<-as.data.frame(new_data)
country_pts_i<-sp::SpatialPointsDataFrame(coords = country_data_i[c("longitude","latitude")],data = as.data.frame(country_data_i$taxonobservation_id),proj4string = sp::CRS(projargs = unique(as.character(centroid_i$projection))))
for(p in 1:nrow(centroid_i)){
new_data[,p]<- sp::spDistsN1(pts = country_pts_i,pt = centroid_pt_i[p]) / as.numeric(centroid_i$max_uncertainty[p] )
}
new_data_absolute<-matrix(nrow = nrow(country_data_i),ncol = nrow(centroid_i))
new_data_absolute<-as.data.frame(new_data)
for(p in 1:nrow(centroid_i)){
new_data_absolute[,p]<-sp::spDistsN1(pts = country_pts_i,pt = centroid_pt_i[p])
}
mins<-apply(X = new_data, MARGIN = 1,FUN = which.min)
output$country_cent_dist_relative[which(occurrences$country==country_i)]<-unlist(apply(X = new_data,MARGIN = 1,FUN = function(x){x[which.min(x)]}) )
output$country_max_uncertainty[which(occurrences$country==country_i)]<-unlist(lapply(X = mins,FUN = function(x){centroid_i$max_uncertainty[x]}))
output$country_cent_type[which(occurrences$country==country_i)]<-as.character(unlist(lapply(X = mins,FUN = function(x){centroid_i$centroid_type[x]})) )
output$country_cent_dist[which(occurrences$country==country_i)]<-unlist(apply(X = new_data_absolute,MARGIN = 1,FUN = function(x){x[which.min(x)]}) )
rm(centroid_pt_i)
}
rm(centroid_i)
}#country level
rm(country_data_i,new_data,country_i,country_pts_i,i,p,new_data_absolute,mins)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#State level
for(i in 1:nrow(unique(occurrences[,c("country","state_province")]))){
state_i<-unique(occurrences[,c("country","state_province")])[i,]
centroid_i<-centroid_data[which(centroid_data$gnrsed_country==state_i$country & centroid_data$gnrsed_state==state_i$state_province & is.na(centroid_data$county)),]
if(nrow(centroid_i)>0){
centroid_pt_i<-sp::SpatialPoints(coords = centroid_i[c("centroid_lon","centroid_lat")],proj4string = sp::CRS(projargs = unique(as.character(centroid_i$projection))))
state_data_i<-occurrences[which(occurrences$country==state_i$country & occurrences$state_province==state_i$state_province),]
new_data<-matrix(nrow = nrow(state_data_i),ncol = nrow(centroid_i))
new_data<-as.data.frame(new_data)
state_pts_i<-sp::SpatialPointsDataFrame(coords = state_data_i[c("longitude","latitude")],data = as.data.frame(state_data_i$taxonobservation_id),proj4string = sp::CRS(projargs = unique(as.character(centroid_i$projection))))
for(p in 1:nrow(centroid_i)){
new_data[,p]<- sp::spDistsN1(pts = state_pts_i,pt = centroid_pt_i[p]) / as.numeric(centroid_i$max_uncertainty[p] )
}
new_data_absolute<-matrix(nrow = nrow(state_data_i),ncol = nrow(centroid_i))
new_data_absolute<-as.data.frame(new_data)
for(p in 1:nrow(centroid_i)){
new_data_absolute[,p]<-sp::spDistsN1(pts = state_pts_i,pt = centroid_pt_i[p])
}
mins<-apply(X = new_data, MARGIN = 1,FUN = which.min)
output$state_cent_dist_relative[which(occurrences$country==state_i$country & occurrences$state_province==state_i$state_province)]<-unlist(apply(X = new_data,MARGIN = 1,FUN = function(x){x[which.min(x)]}) )
output$state_max_uncertainty[which(occurrences$country==state_i$country & occurrences$state_province==state_i$state_province)]<-unlist(lapply(X = mins,FUN = function(x){centroid_i$max_uncertainty[x]}))
output$state_cent_type[which(occurrences$country==state_i$country & occurrences$state_province==state_i$state_province)]<-as.character(unlist(lapply(X = mins,FUN = function(x){centroid_i$centroid_type[x]})) )
output$state_cent_dist[which(occurrences$country==state_i$country & occurrences$state_province==state_i$state_province)]<-unlist(apply(X = new_data_absolute,MARGIN = 1,FUN = function(x){x[which.min(x)]}) )
rm(centroid_pt_i)
}#if there is a matching centroid
rm(centroid_i)
}#state level
#rm(state_data_i,new_data,state_i,state_pts_i,i,mins,p,new_data_absolute)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#county level
for(i in 1:nrow(unique(occurrences[,c("country","state_province","county")]))){
county_i<-unique(occurrences[,c("country","state_province","county")])[i,]
centroid_i<-centroid_data[which(centroid_data$gnrsed_country==county_i$country & centroid_data$gnrsed_state==county_i$state_province & centroid_data$gnrsed_county==county_i$county ),]
if(nrow(centroid_i)>0){
centroid_pt_i<-sp::SpatialPoints(coords = centroid_i[c("centroid_lon","centroid_lat")],proj4string = sp::CRS(projargs = unique(as.character(centroid_i$projection))))
county_data_i<-occurrences[which(occurrences$country==county_i$country & occurrences$state_province==county_i$state_province & occurrences$county==county_i$county),]
new_data<-matrix(nrow = nrow(county_data_i),ncol = nrow(centroid_i))
new_data<-as.data.frame(new_data)
county_pts_i<-sp::SpatialPointsDataFrame(coords = county_data_i[c("longitude","latitude")],data = as.data.frame(county_data_i$taxonobservation_id),proj4string = sp::CRS(projargs = unique(as.character(centroid_i$projection))))
for(p in 1:nrow(centroid_i)){
new_data[,p]<- sp::spDistsN1(pts = county_pts_i,pt = centroid_pt_i[p]) / as.numeric(centroid_i$max_uncertainty[p] )
}
new_data_absolute<-matrix(nrow = nrow(county_data_i),ncol = nrow(centroid_i))
new_data_absolute<-as.data.frame(new_data_absolute)
for(p in 1:nrow(centroid_i)){
new_data_absolute[,p]<-sp::spDistsN1(pts = county_pts_i,pt = centroid_pt_i[p])
}
mins<-apply(X = new_data, MARGIN = 1,FUN = which.min)
#Populate output file
output$county_cent_dist_relative[which(occurrences$country==county_i$country & occurrences$state_province==county_i$state_province & occurrences$county==county_i$county)]<-unlist(apply(X = new_data,MARGIN = 1,FUN = function(x){x[which.min(x)]}) )
output$county_max_uncertainty[which(occurrences$country==county_i$country & occurrences$state_province==county_i$state_province & occurrences$county==county_i$county)]<-unlist(lapply(X = mins,FUN = function(x){centroid_i$max_uncertainty[x]}))
output$county_cent_type[which(occurrences$country==county_i$country & occurrences$state_province==county_i$state_province & occurrences$county==county_i$county)]<-as.character(unlist(lapply(X = mins,FUN = function(x){centroid_i$centroid_type[x]})) )
output$county_cent_dist[which(occurrences$country==county_i$country & occurrences$state_province==county_i$state_province & occurrences$county==county_i$county)]<-unlist(apply(X = new_data_absolute,MARGIN = 1,FUN = function(x){x[which.min(x)]}) )
rm(centroid_pt_i)
}
rm(centroid_i)
}#county level
#rm(county_data_i,new_data,county_i,county_pts_i,i,mins,p,new_data_absolute)
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.