#' Calculate River Invertebrate Classification Tool (RICT) predictions
#'
#' @seealso \code{\link{rict_classify}} to run classification using rict_predict
#' outputs
#' @param data Dataframe of predictive environmental values - the variables
#' required depend on the input model: model 44 (GIS) or model 1 (physical),
#' for NI or GB
#' \describe{
#' \item{SITE}{Site reference number}
#' \item{Waterbody}{Water body identifier}
#' \item{LATITUDE}{Location point}
#' \item{LONGITUDE}{Location point}
#' \item{LOG.ALTITUDE}{Logarithmic value for Altitude}
#' \item{LOG.DISTANCE.FROM.SOURCE}{Logarithmic value for Distance from source}
#' \item{LOG.WIDTH}{Logarithmic value for Stream Width}
#' \item{LOG.DEPTH}{Logarithmic value for Stream Depth}
#' \item{MEAN.SUBSTRATUM}{Calculated mean substratum}
#' \item{DISCHARGE.CATEGORY}{Discharge category}
#' \item{ALKALINITY}{Alkalinity}
#' \item{LOG.ALKALINITY}{Logarithmic value for Alkalinity}
#' \item{LOG.SLOPE}{Logarithmic value for Slope}
#' \item{MEAN.AIR.TEMP}{Calculated Mean Air Temperature}
#' \item{AIR.TEMP.RANGE}{Calculated Air Temperature Range}
#' \item{p1 – p43}{Calculated probability of end group membership}
#' \item{SuitCode}{Calculated Suitability code for test site}
#' \item{BelongsTo_endGrp}{Predicted probable end group the test site belongs to}
#' \item{All Indices...}{Predicted index values for all indices (returned
#' when \code{all_indices = TRUE})}
#' \item{SPR_SEASON_ID}{ID number for spring season}
#' \item{SPR_TL2_WHPT_ASPT (ABW,DISTFAM)}{Observed value for WHPT ASPT for spring}
#' \item{SPR_TL2_WHPT_NTAXA (ABW,DISTFAM)}{Observed value for WHPT NTAXA for spring}
#' \item{SPR_NTAXA_BIAS}{Bias value used}
#' \item{SUM_SEASON_ID}{ID number for summer season}
#' \item{SUM_TL2_WHPT_ASPT (ABW,DISTFAM)}{Observed value for WHPT ASPT for summer}
#' \item{SUM_TL2_WHPT_NTAXA (ABW,DISTFAM)}{Observed value for WHPT NTAXA for summer}
#' \item{SUM_NTAXA_BIAS}{Bias value used}
#' \item{AUT_SEASON_ID}{ID number for autumn season}
#' \item{AUT_TL2_WHPT_ASPT (ABW,DISTFAM)}{Observed value for WHPT ASPT for autumn}
#' \item{AUT_TL2_WHPT_NTAXA (ABW,DISTFAM)}{Observed value for WHPT NTAXA for autumn}
#' \item{AUT_NTAXA_BIAS}{Bias value used}
#' \item{Area_CEH}{Catchment area (CEH GIS variable)}
#' \item{AltBar_CEH}{Mean catchment altitude (CEH GIS variable)}
#' \item{Alt_CEH}{Site altitude (CEH GIS variable)}
#' \item{DFS_CEH}{Distance from source (CEH GIS variable)}
#' \item{Slope_CEH}{Slope (CEH GIS variable)}
#' \item{QCat_CEH}{Discharge category (CEH GIS variable)}
#' \item{Peat_CEH}{Proportion of peat in catchment (CEH GIS variable)}
#' \item{Chalk_O1_CEH}{Proportion of chalk in catchment (CEH GIS variable)}
#' \item{Clay_O1_CEH}{Proportion of clay in catchment (CEH GIS variable)}
#' \item{Hardrock_O1_CEH}{Proportion of hard rock in catchment (CEH GIS variable)}
#' \item{Limestone_O1_CEH}{Proportion of limestone in catchment (CEH GIS variable)}
#' \item{area}{Area: "gb", "ni" or "iom"}
#' }
#' @param all_indices Logical - return all indices in the output (by default
#' only WHPT indices are returned).
#' @param taxa Logical - return taxa predictions (by default indices are
#' returned).
#' @param taxa_list Vector of taxa lists to predict; defaults to all lists,
#' i.e. c("TL1", "TL2", "TL3", "TL4", "TL5").
#' @param rows Number (integer) of rows (one site per row) to predict taxa
#' for. Default is all rows.
#' @param area Area is detected by default from the NGR, but the area
#' parameter can be set to 'iom', 'gb' or 'ni' for testing purposes.
#' @param crs Optionally set crs to `29903` for the Irish projection system.
#' @return Dataframe of predicted biotic scores and probability of observed
#' values falling into each statistical grouping of rivers.
#' @export
#' @importFrom rlang .data
#' @importFrom dplyr mutate n group_by filter select
#' @importFrom stats complete.cases
#'
#' @examples
#' \dontrun{
#' predictions <- rict_predict(demo_observed_values)
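#' # A sketch of the optional parameters (names are taken from the function
#' # signature; outputs depend on your input data):
#' predictions <- rict_predict(demo_observed_values, all_indices = TRUE)
#' taxa_predictions <- rict_predict(demo_observed_values,
#'   taxa = TRUE,
#'   taxa_list = c("TL2")
#' )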
#' }
rict_predict <- function(data = NULL,
                         all_indices = FALSE,
                         taxa = FALSE,
                         taxa_list = c("TL1", "TL2", "TL3", "TL4", "TL5"),
                         rows = NULL,
                         area = NULL,
                         crs = NULL) {
# Validate predictive input data
all_validation <- rict_validate(data, area = area, crs = crs)
model <- all_validation[["model"]] # returns model based on input headers
area <- all_validation[["area"]] # returns area based on NGR
# Change all column names to uppercase
names(data) <- toupper(names(data))
## load supporting tables ---------------------------------------------------
if (taxa == TRUE) {
taxa.input.data <- utils::read.csv(system.file("extdat",
"TAXA_AB_APR_PRAB.csv",
package = "rict"
),
stringsAsFactors = TRUE,
skip = 1
)
model_type <- ifelse(area == "gb", 1, 2)
taxa.input.data <- filter(taxa.input.data, .data$Model == model_type)
}
end_group_index <- utils::read.csv(system.file("extdat",
"x-103-end-group-means-formatted-jdb-17-dec-2019.csv",
package = "rict"
),
check.names = FALSE
)
end_group_index <- rename_end_group_means(end_group_index)
if (area == "iom") {
end_group_index <- utils::read.csv(system.file("extdat",
"x-103-end-group-means-formatted-iom.csv",
package = "rict"
),
check.names = FALSE
)
end_group_index$`TL2 33 Group ARMI NTaxa` <- NA
end_group_index$`TL2 33 Group ARMI Score` <- NA
end_group_index$`TL2 33 Group Flow Silt NTaxa` <- NA
end_group_index$`TL2 33 Group Flow Silt Score` <- NA
end_group_index <- rename_end_group_means(end_group_index)
}
nr_efg_groups <- utils::read.csv(system.file("extdat",
"end-grp-assess-scores.csv",
package = "rict"
))
if (model == "physical" && area == "gb") {
df_mean_gb685 <-
utils::read.delim(
system.file("extdat", "df-mean-gb-685.DAT", package = "rict"),
header = FALSE,
sep = "",
as.is = TRUE
)
df_coeff_gb685 <-
utils::read.delim(
system.file("extdat", "df-coeff-gb-685.DAT", package = "rict"),
header = FALSE,
sep = "",
as.is = TRUE
)
}
if (area == "ni") {
df_mean_gb685 <-
utils::read.delim(system.file("extdat", "DFMEAN_NI_RALPH.DAT",
package = "rict"),
header = FALSE, sep = "", as.is = TRUE
)
df_coeff_gb685 <-
utils::read.delim(system.file("extdat", "DFCOEFF_NI.DAT",
package = "rict"),
header = FALSE, sep = "", as.is = TRUE
)
nr_efg_groups <-
utils::read.csv(system.file("extdat", "EndGrp_AssessScoresNI.csv",
package = "rict"
))
}
if (model == "gis") {
df_mean_gb685 <-
utils::read.csv(
system.file("extdat",
"end-group-means-discriminant-scores-model-44.csv",
package = "rict"
)
)
df_mean_gb685 <- df_mean_gb685[, 3:19]
df_coeff_gb685 <-
utils::read.csv(
system.file("extdat", "discriminant-function-coefficients-model-44.csv",
package = "rict"
)
)
}
if (area == "iom") {
df_mean_gb685 <-
utils::read.delim(system.file("extdat", "df-mean-iom.DAT",
package = "rict"),
header = FALSE, sep = "", as.is = TRUE
)
df_coeff_gb685 <-
utils::read.delim(system.file("extdat", "df-coeff-iom.DAT",
package = "rict"),
header = FALSE, sep = "", as.is = TRUE
)
nr_efg_groups <-
utils::read.csv(system.file("extdat", "end-group-assess-scores-iom.csv",
package = "rict"
))
}
  # Check which seasons were provided
  seasons_to_run <-
    length(stats::na.omit(c(data$SPR_SEASON_ID[1],
                            data$AUT_SEASON_ID[1],
                            data$SUM_SEASON_ID[1])))
  if (seasons_to_run < 1) {
    warning("No '...SEASON_ID' provided, predicting all seasons",
      call. = FALSE
    )
  }
  # All three seasons are always predicted; seasons without observed values
  # simply return predictions only
  seasons_to_run <- 1:3
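  # Season codes are assumed to follow the convention implied by the column
  # names: 1 = spring (SPR), 2 = summer (SUM), 3 = autumn (AUT)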
data <- all_validation[[1]]
if (model == "gis") {
    # remove `TEST SITECODE` column - not required and causes issues later on
data$`TEST SITECODE` <- NULL
}
# Final Data for classification e.g. Linear discriminant Analysis (LDA)
# classifier/predictor
if (model == "physical" && area == "gb") {
final_predictors <- data.frame(
"SITE" = data$SITE,
"LATITUDE" = data$LATITUDE,
"LONGITUDE" = data$LONGITUDE,
"LOG.ALTITUDE" = data$vld_alt_src_log,
"LOG.DISTANCE.FROM.SOURCE" = data$vld_dist_src_log,
"LOG.WIDTH" = data$mn_width_log,
"LOG.DEPTH" = data$mn_depth_log,
"MEAN.SUBSTRATUM" = data$vld_substr_log,
"DISCHARGE.CATEGORY" = data$DISCHARGE, # data$disch_log,
"ALKALINITY" = data$ALKALINITY,
"LOG.ALKALINITY" = data$vld_alkal_log,
"LOG.SLOPE" = data$vld_slope_log,
"MEAN.AIR.TEMP" = data$TEMPM,
"AIR.TEMP.RANGE" = data$TEMPR
)
}
if (area == "iom") {
final_predictors <- data.frame(
"SITE" = data$SITE,
"LOG.ALTITUDE" = data$vld_alt_src_log,
"LOG.DISTANCE.FROM.SOURCE" = data$vld_dist_src_log,
"LOG.ALKALINITY" = data$vld_alkal_log,
"LOG.SLOPE" = data$vld_slope_log
)
}
if (area == "ni") {
final_predictors <- data.frame(
"SITE" = data$SITE,
"LATITUDE" = data$LATITUDE,
"LONGITUDE" = data$LONGITUDE,
"LOG.ALTITUDE" = data$vld_alt_src_log,
"LOG.DISTANCE.FROM.SOURCE" = data$vld_dist_src_log,
"LOG.WIDTH" = data$mn_width_log,
"LOG.DEPTH" = data$mn_depth_log,
"MEAN.SUBSTRATUM" = data$vld_substr_log,
"DISCHARGE.CATEGORY" = data$DISCHARGE, # data$disch_log,
"ALKALINITY" = data$ALKALINITY,
"LOG.ALKALINITY" = data$vld_alkal_log,
"LOG.SLOPE" = data$vld_slope_log
)
}
if (model == "gis") {
final_predictors <- data.frame(
"SITE" = data$SITE,
"LATITUDE" = data$LATITUDE,
"LONGITUDE" = data$LONGITUDE,
"TEMPM" = data$TEMPM,
"TEMPR" = data$TEMPR,
"ALKALINITY" = data$ALKALINITY,
"LgAlk" = data$LgAlk,
"LgArea_CEH" = data$LOG_AREA,
"LgAltBar_CEH" = data$LOGALTBAR,
"LgAlt_CEH" = data$LgAlt_CEH,
"LgDFS_CEH" = data$LgDFS_CEH,
"LgSlope_CEH" = data$LgSlope_CEH,
"QCat_CEH" = data$DISCH_CAT,
"Peat_CEH" = data$PEAT,
"Chalk_O1_CEH" = data$CHALK,
"Clay_O1_CEH" = data$CLAY,
"Hardrock_O1_CEH" = data$HARDROCK,
"Limestone_O1_CEH" = data$LIMESTONE
)
# convert proportions to percentage for geology variables
final_predictors[, 14:18] <- final_predictors[, 14:18] * 100
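    # e.g. a PEAT proportion of 0.25 becomes 25 (%)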
}
  # Prediction settings
  # 1. Enter prediction settings
  # 2. Find the df_scores of each row using one line of coefficients
  #    Note: df_coeff_gb685[1, -1] removes the first column
  # NRefg equals the number of reference sites in end group g; for GB = 685,
  # for NI = 11
  NRefg_all <- rowSums(nr_efg_groups[, -1])
  # DFScore_g <- DFCoef1 * Env1 + ... + DFCoefn * Envn; remove "SITE" (col 1)
  # from final_predictors, and remove col 1 from df_coeff_gb685
df_scores <- as.data.frame(getDFScores(
EnvValues = final_predictors,
DFCoeff = df_coeff_gb685
))
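  # Illustrative sketch of the score calculation with hypothetical values
  # (not the real coefficients): each discriminant function score is a
  # weighted sum of the environmental predictors, e.g.
  #   env   <- c(1.9, 0.7)   # two predictor values
  #   coeff <- c(0.5, -0.2)  # one discriminant function's coefficients
  #   sum(env * coeff)       # DF score = 0.81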
  # Calculate the Mahalanobis distance of point x from site g for all
  # reference sites
MahDist_g <- getMahDist(DFscore = df_scores, meanvalues = df_mean_gb685)
# Make column names
DistNames <- paste0("p", seq_len(ncol(MahDist_g))) # reused later
MahDistNames <- gsub("p", "Mah", DistNames)
colnames(MahDist_g) <- MahDistNames
  # Calculate the minimum Mahalanobis distance of point x from site g
MahDist_min <- getMahDist_min(df_scores, df_mean_gb685)
# Calculate the probability distribution
PDist_g <- PDist(NRefg_all, MahDist_g)
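  # A sketch of the weighting assumed here (standard RIVPACS form): the
  # unnormalised probability for end group g is
  #   PDist_g = NRefg_g * exp(-MahDist_g / 2)
  # so end groups with more reference sites and a smaller Mahalanobis
  # distance from the test site receive higher weights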
  # Calculate the probabilities of sites belonging to the end groups
  # (prob_g); the last column, "Total", contains the row total ("PGdistTot").
  # All probabilities p1..pn sum to 1 once the "Total" column is removed,
  # i.e. rowSums(PDistTot[, -ncol(PDistTot)]) == 1
PDistTot <- as.data.frame(PDistTotal(PDist_g))
# Rename the columns to probabilities with "Total" at the end.
colnames(PDistTot) <- c(DistNames, "Total")
final_predictors_try1 <- cbind(final_predictors, PDistTot[, -ncol(PDistTot)])
  # 3. Use chi-square to find suitability codes.
chi_square <-
utils::read.csv(
system.file("extdat", "chi_square.csv",
package = "rict"
)
)
suit_codes <- getSuitabilityCode(MahDist_min, chi_square, area, model)
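  # The suitability code is assumed to grade how environmentally similar the
  # test site is to the reference network: the minimum Mahalanobis distance
  # is compared against chi-square critical values, from code 1 (highly
  # suitable) through to code 5 (unsuitable)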
# Add suitability codes to the final data, using cbind
final_predictors_try2 <- cbind(final_predictors_try1, suit_codes)
# Find max class group belongs to by getting the column names
group_probabilities <- final_predictors_try2[, DistNames]
belongs_to_end_grp <- lapply(1:nrow(group_probabilities), function(n) {
return(which.max(group_probabilities[n, ]))
})
# Replace p with EndGr
belongs_to_end_grp <- gsub("p", "EndGr", names(unlist(belongs_to_end_grp)))
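  # e.g. a site whose highest membership probability is in column "p7" is
  # labelled "EndGr7"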
final_predictors_try3 <- cbind(final_predictors_try2, belongs_to_end_grp)
  # 4. Prediction: WE1.5 algorithm for predicting the expected value of any
  # index, based on the probability of end group membership and the average
  # values of the index among reference sites in each end group.
  # We predict WHPT NTAXA and WHPT ASPT.
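  # For each site and season the expected index value is therefore the
  # probability-weighted average of the end-group means:
  #   E[index] = sum over g of (p_g * mean of index in end group g)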
endgroup_index_frame <- end_group_index[grep(area, end_group_index$RIVPACS_Model, ignore.case = TRUE), ]
endgroup_index_frame <- dplyr::select(endgroup_index_frame, -.data$RIVPACS_Model)
endgroup_index_frame <- dplyr::rename(endgroup_index_frame,
EndGrp = .data$End_Group,
SeasonCode = .data$Season_Code
)
# Sort by the columns "EndGrp", "SeasonCode"
endgroup_index_frame <- dplyr::arrange(endgroup_index_frame, .data$EndGrp, .data$SeasonCode)
  # Subset the end-group data to the season codes being run
  endgroup_index_frame <- filter(endgroup_index_frame, .data$SeasonCode %in% seasons_to_run)
  # Indices to predict - the user's input columns map to the index names
  # below
indices_to_run <- c(
"TL2_WHPT_NTAXA_AbW_DistFam", "TL2_WHPT_ASPT_AbW_DistFam",
"TL2_WHPT_NTAXA_AbW_CompFam", "TL2_WHPT_ASPT_AbW_CompFam"
)
  # Run the index scores
if (taxa == TRUE && area != "iom") { # This block predicts and returns taxa predictions
# Declare a variable where we append all sites
taxa_pred <- list()
taxa_predictions <- data.frame()
# Use complete cases removing null values
taxa.input.data <- taxa.input.data[complete.cases(taxa.input.data), ]
nsites <- nrow(final_predictors_try2)
if (is.null(rows)) {
chooseSite <- seq_len(nsites)
} else {
chooseSite <- seq_len(rows)
}
taxa.input.data$EndGrp_Probs <- taxa.input.data$End.Group
taxa.input.data <- taxa.input.data[taxa.input.data$TL %in% taxa_list, ]
## Loop for each SITE
for (i in chooseSite) {
message("Processing input row: ", i)
site1 <- taxa.input.data
siteIndex <- ifelse(i < 10, paste0("0", i), i)
measuredColumns <- site1[, c(
"Average_Numerical_Abundance",
"Average_Log10_Abundance", "Prob_Occurrence",
"Prob_Log1", "Prob_Log2", "Prob_Log3", "Prob_Log4", "Prob_Log5"
)]
names_measuredColumns <- names(measuredColumns)
site1$EndGrp_Probs <- EndGrpProb_Replacement(
site1$EndGrp_Probs,
final_predictors_try3,
which(chooseSite == i)
)
      b1 <- site1 %>%
        group_by(.data$Season_Code, .data$TL, .data$Furse_Code) %>%
        mutate(count = n(), mlist_endGrps = list(.data$EndGrp_Probs))
      # Taking the unique rows of b1 gives 3,532 groups; multiplied by
      # 12 sites this gives 3,532 x 12 = 42,384 groups in total.
      allUniqueSites <- unique(b1[, c(4, 3, 5, 6, 7, 20, 21)])
for (k in seq_len(nrow(allUniqueSites))) { ## loop over these unique rows per SITE
sitex <- groupSitesFunction(allUniqueSites, k, siteIndex, b1)
sitex$siteName <- final_predictors_try2$SITE[i]
taxa_pred[[k]] <- sitex
} # for k
taxa_preds <- data.frame(do.call("rbind", taxa_pred))
taxa_predictions <- rbind(taxa_predictions, taxa_preds)
} # for i
# Remove the "End.Group" column
taxa_predictions$End.Group <- NULL
    # Arrange the sites by siteName, TL, Season_Code, Furse_Code
    taxa_predictions <- dplyr::arrange(
      taxa_predictions,
      .data$siteName,
      .data$TL,
      .data$Season_Code,
      .data$Furse_Code
    )
# In multi-year predictions there can be multiple rows with the same
# name/predictors. Therefore processing by row creates dups. Think about
# re-factoring this to only process unique rows.
taxa_predictions <- taxa_predictions[!duplicated(taxa_predictions), ]
return(taxa_predictions)
}
  if (taxa == TRUE && area == "iom") {
message("Isle of Man model cannot predict taxa")
taxa_predictions <- NULL
return(taxa_predictions)
}
# Predict indices scores
indices_predictions <- getSeasonIndexScores(
data_to_bindTo = final_predictors_try3,
season_to_run = seasons_to_run,
index_id = indices_to_run,
end_group_IndexDFrame = endgroup_index_frame,
DistNames = DistNames,
all_indices = all_indices
)
  # Append the biological data to the main output dataframe
  # Get all the biological data
names_biological <- c(
"SITE",
"WATERBODY",
"YEAR",
"SPR_SEASON_ID",
"SPR_TL2_WHPT_ASPT (ABW,DISTFAM)",
"SPR_TL2_WHPT_NTAXA (ABW,DISTFAM)",
"SPR_NTAXA_BIAS",
"SUM_SEASON_ID",
"SUM_TL2_WHPT_ASPT (ABW,DISTFAM)",
"SUM_TL2_WHPT_NTAXA (ABW,DISTFAM)",
"SUM_NTAXA_BIAS",
"AUT_SEASON_ID",
"AUT_TL2_WHPT_ASPT (ABW,DISTFAM)",
"AUT_TL2_WHPT_NTAXA (ABW,DISTFAM)",
"AUT_NTAXA_BIAS"
)
# Check user input data contains biological values
if (any(names_biological %in% names(data))) {
biological_data <- data[, names(data) %in% names_biological]
    if (length(biological_data[names(biological_data) %in%
      c("SITE", "WATERBODY", "YEAR")]) < 3) {
      stop("You provided data missing either SITE, WATERBODY or YEAR; ",
        "we expect all three columns to be present")
    }
    # remove the "SITE" column (the first column)
biological_data <- select(biological_data, -.data$SITE)
# Check at least one season's worth of data present
spring <- length(grep("SPR", names(biological_data)))
summer <- length(grep("SUM", names(biological_data)))
autumn <- length(grep("AUT", names(biological_data)))
    # If no season/observation columns are present, run prediction only -
    # set all season counts to 4 so the completeness check below passes
    if (all(c(spring, summer, autumn) == 0)) {
      spring <- 4
      summer <- 4
      autumn <- 4
    }
    # A complete season has 4 columns: SEASON_ID, ASPT, NTAXA and BIAS
    if (!any(c(spring, summer, autumn) == 4)) {
      stop("At least one season must have SEASON_ID, ASPT, NTAXA and ",
        "BIAS columns present.")
    }
indices_predictions <- cbind(indices_predictions, biological_data)
indices_predictions$area <- area # needed for classify function
  } else {
    stop("You provided data with no observation columns; we expect at least ",
      "one season's worth of observations and SITE, WATERBODY, YEAR: ",
      paste(names_biological, collapse = ", "))
  }
# Add in missing columns to allow classification to run
# - will return empty values for missing seasons etc
missing_columns <-
names_biological[!names_biological %in% names(data)]
  if (length(missing_columns) > 0) {
missing <- data.frame()
missing[missing_columns] <- list(numeric(0))
missing[1:nrow(indices_predictions), missing_columns] <- NA
indices_predictions <- cbind(indices_predictions, missing)
}
return(indices_predictions)
}