#' Substrate metrics
#'
#' @param data Input data
#'
#' @export
#'
#' @importFrom magrittr "%>%"
#'
#' @examples
#' sampdat <- phabformat(sampdat)
#' substrate(sampdat)
substrate <- function(data){
data <- data[which(data$AnalyteName %in% c('Substrate Size Class', 'Embeddedness', 'CPOM')),]
# the code that runs below this line messed up the XEMBED metric somehow. Specifically it messed up the counts
# I am preserving the data before it gets tweaked, so that XEMBED can work with "original.data" and the rest can work with "data"
original.data <- data
data$Result[data$Result == -88] <- NA
data <- data %>%
dplyr::select(id, LocationCode, AnalyteName, VariableResult, Result) %>%
unique %>%
tidyr::complete(id, LocationCode, AnalyteName) %>%
dplyr::group_by(id) %>%
dplyr::filter(!all(is.na(VariableResult) & !all(is.na(Result)))) %>%
dplyr::ungroup()
data$VariableResult <- as.character(data$VariableResult)
data$VariableResult[data$VariableResult=="a"]<-"SA"
data$VariableResult[data$VariableResult=="as"]<-"SA"
data$VariableResult[data$VariableResult=="ws"]<-"SA"
data$VariableResult[data$VariableResult=="sn"]<-"SA"
data$VariableResult[data$VariableResult=="N"]<-"Not Recorded"
data$VariableResult[data$VariableResult=="n"]<- "Not Recorded"
data$VariableResult[data$VariableResult=="LB"]<-"XB"
data$VariableResult[data$VariableResult=="G"]<- "GC"
data$VariableResult[data$VariableResult=="FM"]<-"FN"
data$VariableResult[data$VariableResult=="fd"]<-"SB"
data$VariableResult[data$VariableResult=="fb"]<-"SB"
sub <- data[data$AnalyteName =="Substrate Size Class",]
sub$Result <- as.numeric(as.character(sub$Result))
sub$VariableResult[sub$Result>= 1000 & sub$Result<4000] <- "XB"
sub$VariableResult[sub$Result>= 250 & sub$Result<1000] <- "SB"
sub$VariableResult[sub$Result>= 64 & sub$Result<250] <- "CB"
sub$VariableResult[sub$Result>= 16 & sub$Result<64] <- "GC"
sub$VariableResult[sub$Result>= 2 & sub$Result<16] <- "GF"
sub$VariableResult[sub$Result>= 0.06 & sub$Result<2] <- "SA"
# remove reach row
# Some metric calculations later on needed Reach to be in there
sub <- sub %>%
dplyr::filter(!LocationCode %in% 'Reach')
###Compute
sumna <- function(data){sum(data, na.rm=T)}
lengthna <- function(data){sum(!is.na(data))}
metric <- c('RS', 'RR', 'RC', 'XB', 'SB', 'CB', 'GC', 'GF', 'SA', 'FN', 'HP', 'WD', 'OT')
# All VariableResult values get transformed to uppercase here
# Not Recorded turns into NOT RECORDED
sub$VariableResult <- lapply(sub$VariableResult, toupper)
lengths <- function(data){
length(which(((data != "NOT RECORDED") &(data != "NA"))&(data != "FNOT RECORDED") & (!is.na(data))))}
totals <- tapply(unlist(sub$VariableResult), sub$id, lengths)
tnames <- as.vector(dimnames(totals))
qq <-unlist(tnames)
l <- matrix(NA, ncol=length(metric), nrow=length(totals))
for(j in 1:length(qq))
for(i in 1:length(metric)){
l[j, i] <- length(which(sub$VariableResult[sub$id == qq[j]] == metric[i]))
}
divd <- function(data) data*100/totals
result <- as.data.frame(apply(l, 2, divd))
if(length(colnames(result))==1){
result <- t(result)
rownames(result) <- unique(sub$id)
result <- as.data.frame(result)
}
colnames(result) <- c("PCT_RS.result","PCT_RR.result","PCT_RC.result","PCT_XB.result","PCT_SB.result","PCT_CB.result",
"PCT_GC.result","PCT_GF.result","PCT_SA.result","PCT_FN.result","PCT_HP.result",
"PCT_WD.result","PCT_OT.result")
# Below lines are to add counts to the results dataframe
# it kind of scares me though, because in this way, I am not sure if the counts will be assigned out of order
# We will have to test this thoroughly
# print("creating the PCTs vector")
PCTs <- gsub(".result", ".count", colnames(result)[grepl("PCT_", colnames(result))])
for (name in PCTs){
result[[name]] <- totals
}
# print("calculating the BDRK, BIGR, SFGF, SAFN")
result$PCT_BDRK.result <- result$PCT_RS.result + result$PCT_RR.result
result$PCT_BIGR.result <- result$PCT_RS.result + result$PCT_RR.result + result$PCT_XB.result + result$PCT_SB.result + result$PCT_CB.result + result$PCT_GC.result
result$PCT_SFGF.result <- result$PCT_GF.result + result$PCT_SA.result + result$PCT_FN.result
result$PCT_SAFN.result <- result$PCT_SA.result + result$PCT_FN.result
# print("Getting the counts for those metrics")
result$PCT_BDRK.count <- rowSums(!is.na(result[,c('PCT_RS.result', 'PCT_RR.result')]))
result$PCT_BIGR.count <- rowSums(!is.na(result[,c('PCT_RS.result', 'PCT_RR.result', 'PCT_XB.result', 'PCT_SB.result', 'PCT_CB.result', 'PCT_GC.result')]))
result$PCT_SFGF.count <- rowSums(!is.na(result[,c('PCT_GF.result', 'PCT_SA.result', 'PCT_FN.result')]))
result$PCT_SAFN.count <- rowSums(!is.na(result[,c('PCT_SA.result', 'PCT_FN.result')]))
result <- round(result)
# H_SubNat, Ev_SubNat
SubNat <- sub %>%
dplyr::select(id, LocationCode, VariableResult) %>%
tidyr::unnest() %>%
dplyr::group_by(id) %>%
tidyr::nest() %>%
dplyr::mutate(
H_SubNat.result = purrr::map(data, function(VariableResult){
VariableResult <- VariableResult %>% dplyr::pull(VariableResult)
# step 2 and 3
# the argument that says length in the aggregate function shoud probably be lengthna for correct measurement.
# I believe I put length, to get it to match the (what we are pretty sure is) wrong legacy calculation.
uniques <- aggregate(data.frame(count = VariableResult), list(value = VariableResult), lengthna) %>% dplyr::filter(!toupper(value) %in% c('RC', 'NOT RECORDED'))
if (length(intersect(c('RS','RR','HP'), uniques$value)) != 0){
RSRRHPgroup <- data.frame('RSRRHP', sum(uniques[uniques$value %in% c('RS','RR','HP'),]$count))
names(RSRRHPgroup) <- c('value','count')
uniques <- rbind(uniques, RSRRHPgroup)
}
uniques <- uniques %>% dplyr::filter(!value %in% c('RS','RR','HP'))
if ((sum(uniques$count) == 0) | (uniques %>% nrow == 0)) { return(0) }
# step 4
uniques$pi <- uniques$count / sum(uniques$count)
# step 5
uniques$imlt <- uniques$pi * log(uniques$pi)
# step 6
res <- round(sum(uniques$imlt) * (-1), 2)
return(res)
}
),
H_SubNat.count = purrr::map(data, function(VariableResult){
VariableResult <- VariableResult %>% dplyr::pull(VariableResult)
# total number of size classes used to calculate H_SubNat
uniques <- aggregate(data.frame(count = VariableResult), list(value = VariableResult), lengthna) %>% dplyr::filter(!toupper(value) %in% c('RC', 'NOT RECORDED'))
if (length(intersect(c('RS','RR','HP'), uniques$value)) != 0){
RSRRHPgroup <- data.frame('RSRRHP', sum(uniques[uniques$value %in% c('RS','RR','HP'),]$count))
names(RSRRHPgroup) <- c('value','count')
uniques <- rbind(uniques, RSRRHPgroup)
}
uniques <- uniques %>% dplyr::filter(!value %in% c('RS','RR','HP'))
n_size_classes <- length(uniques$value)
return(n_size_classes)
}),
Ev_SubNat.result = purrr::pmap(list(data, H_SubNat.result), function(VariableResult, H_SubNat.result){
VariableResult <- VariableResult %>% dplyr::pull(VariableResult)
# total number of size classes used to calculate H_SubNat
uniques <- aggregate(data.frame(count = VariableResult), list(value = VariableResult), length) %>% dplyr::filter(!toupper(value) %in% c('RC', 'NOT RECORDED'))
if (length(intersect(c('RS','RR','HP'), uniques$value)) != 0){
RSRRHPgroup <- data.frame('RSRRHP', sum(uniques[uniques$value %in% c('RS','RR','HP'),]$count))
names(RSRRHPgroup) <- c('value','count')
uniques <- rbind(uniques, RSRRHPgroup)
}
uniques <- uniques %>% dplyr::filter(!value %in% c('RS','RR','HP'))
if ((sum(uniques$count) == 0) | (uniques %>% nrow == 0)) {
return(0)
} else {
n_size_classes <- length(uniques$value)
if (n_size_classes > 1) {
return(round(H_SubNat.result / log(n_size_classes), 2))
} else {
return(0)
}
}
}),
Ev_SubNat.count = H_SubNat.count
) %>%
dplyr::select(-data) %>%
tidyr::unnest() %>%
as.data.frame(stringsAsFactors = F) %>%
tibble::column_to_rownames('id')
###Second set of computation
# I believe the metrics from here on out needed Reach to be in there
sub$value <- rep(NA, length(sub$id))
sub$value[which(!is.na(sub$Result))] <- sub$Result[which(!is.na(sub$Result))]
sub$value[which(sub$VariableResult == "RR")] <- 5660.0
sub$value[which(sub$VariableResult == "RS")] <- 5660.0
sub$value[which(sub$VariableResult == "XB")] <- 2500.0
sub$value[which(sub$VariableResult == "SB")] <- 625.0
sub$value[which(sub$VariableResult == "CB")] <- 157.0
sub$value[which(sub$VariableResult == "GC")] <- 40.0
sub$value[which(sub$VariableResult == "GF")] <- 9.0
sub$value[which(sub$VariableResult == "SA")] <- 1.03
sub$value[which(sub$VariableResult == "FN")] <- 0.03
sub$value[which(sub$VariableResult == "HP")] <- 5660.0
sub$value[which(sub$VariableResult == "RC")] <- 5660.0
# Thi put this in because the instructions say "If result column contains a value, leave as is"
sub <- sub %>% dplyr::mutate(value = dplyr::case_when(
is.na(Result) ~ value,
TRUE ~ Result
))
sub$value <- as.numeric(as.character(sub$value))
sub$log <- log10(sub$value)
XSDGM_sum <- tapply(sub$log, sub$id, sumna)
XSDGM_count <- tapply(sub$log, sub$id, lengthna)
XSDGM <- 10^(XSDGM_sum/XSDGM_count)
result$XSDGM.result <- XSDGM %>% round(1)
result$XSDGM.count <- XSDGM_count
sub$value2 <- rep(NA, length(sub$id))
sub$value2[which(!is.na(sub$Result))] <- sub$Result[which(!is.na(sub$Result))]
sub$value2[which(sub$VariableResult == "RR")] <- NA
sub$value2[which(sub$VariableResult == "RS")] <- NA
sub$value2[which(sub$VariableResult == "XB")] <- 2500.0
sub$value2[which(sub$VariableResult == "SB")] <- 625.0
sub$value2[which(sub$VariableResult == "CB")] <- 157.0
sub$value2[which(sub$VariableResult == "GC")] <- 40.0
sub$value2[which(sub$VariableResult == "GF")] <- 9.0
sub$value2[which(sub$VariableResult == "SA")] <- 1.03
sub$value2[which(sub$VariableResult == "FN")] <- 0.03
sub$value2[which(sub$VariableResult == "HP")] <- NA
sub$value2[which(sub$VariableResult == "RC")] <- NA
#Result value should be preserved, midpoint value should only be used when Result is NA
sub <- sub %>%
dplyr::mutate(value2 = dplyr::case_when(
is.na(Result) ~ value2,
TRUE ~ Result
))
sub$value2 <- as.numeric(as.character(sub$value2))
sub$log2 <- log10(sub$value2)
XSPDGM_sum <- tapply(sub$log2, sub$id, sumna)
XSPDGM_count <- tapply(sub$log2, sub$id, lengthna)
XSDPGM <- 10^(XSPDGM_sum/XSPDGM_count)
result$XSPGM.result <- XSDPGM
# Below is code that does what the original VBA code did. Sorting values and grabbing a value from a certain index
# It appears as if SWAMP takes the floor instead of rounding
PT_ind <- sub %>% dplyr::select(id, value) %>% dplyr::group_by(id) %>% tidyr::nest() %>%
dplyr::mutate(perc = purrr::map(data, function(df){
percentiles <- c('PTD10index','PTD25index','PTD50index','PTD75index','PTD90index')
indices <- (sum(!is.na(df$value)) * c(0.1,0.25,0.5,0.75,0.9)) %>% ceiling #I think SWAWMP does ceiling rather than rounding
output <- data.frame(percentiles = percentiles, indices = indices)
return(output)
})) %>%
dplyr::select(-data) %>% tidyr::unnest() %>% dplyr::group_by_at(dplyr::vars(id)) %>% tidyr::spread(key = percentiles, value = indices)
PP_ind <- sub %>% dplyr::select(id, value2) %>% dplyr::group_by(id) %>% tidyr::nest() %>%
dplyr::mutate(perc = purrr::map(data, function(df){
percentiles <- c('PPD10index','PPD25index','PPD50index','PPD75index','PPD90index')
indices <- (sum(!is.na(df$value2)) * c(0.1,0.25,0.5,0.75,0.9)) %>% ceiling #I think SWAWMP does ceiling rather than rounding
indices <- replace(indices, which(indices == 0), 1)
output <- data.frame(percentiles = percentiles, indices = indices)
return(output)
})) %>%
dplyr::select(-data) %>% tidyr::unnest() %>% dplyr::group_by_at(dplyr::vars(id)) %>% tidyr::spread(key = percentiles, value = indices)
sub <- merge(sub, PT_ind, by = 'id')
sub <- merge(sub, PP_ind, by = 'id')
percentiles <- sub %>%
dplyr::group_by(id) %>%
tidyr::nest() %>%
dplyr::mutate(
SB_PT_D50.result = purrr::map(data, function(df){
median(df$value, na.rm = T)
}),
SB_PT_D10.result = purrr::map(data, function(df){
sort(df$value)[df$PTD10index[1]]
}),
SB_PT_D25.result = purrr::map(data, function(df){
sort(df$value)[df$PTD25index[1]]
}),
SB_PT_D75.result = purrr::map(data, function(df){
sort(df$value)[df$PTD75index[1]]
}),
SB_PT_D90.result = purrr::map(data, function(df){
sort(df$value)[df$PTD90index[1]]
}),
SB_PP_D50.result = purrr::map(data, function(df){
sort(df$value)[df$PPD50index[1]]
}),
SB_PP_D10.result = purrr::map(data, function(df){
sort(df$value)[df$PPD10index[1]]
}),
SB_PP_D25.result = purrr::map(data, function(df){
sort(df$value)[df$PPD25index[1]]
}),
SB_PP_D75.result = purrr::map(data, function(df){
sort(df$value)[df$PPD75index[1]]
}),
SB_PP_D90.result = purrr::map(data, function(df){
sort(df$value)[df$PPD90index[1]]
}),
SB_PT_D50.count = purrr::map(data, function(df){
sum(!is.na(df$value))
}),
SB_PT_D10.count = purrr::map(data, function(df){
sum(!is.na(df$value))
}),
SB_PT_D25.count = purrr::map(data, function(df){
sum(!is.na(df$value))
}),
SB_PT_D75.count = purrr::map(data, function(df){
sum(!is.na(df$value))
}),
SB_PT_D90.count = purrr::map(data, function(df){
sum(!is.na(df$value))
}),
SB_PP_D50.count = purrr::map(data, function(df){
sum(!is.na(df[!(df$VariableResult %in% c('HP','RS','RR','RC','WD','OT')),]$value))
}),
SB_PP_D10.count = purrr::map(data, function(df){
sum(!is.na(df[!(df$VariableResult %in% c('HP','RS','RR','RC','WD','OT')),]$value))
}),
SB_PP_D25.count = purrr::map(data, function(df){
sum(!is.na(df[!(df$VariableResult %in% c('HP','RS','RR','RC','WD','OT')),]$value))
}),
SB_PP_D75.count = purrr::map(data, function(df){
sum(!is.na(df[!(df$VariableResult %in% c('HP','RS','RR','RC','WD','OT')),]$value))
}),
SB_PP_D90.count = purrr::map(data, function(df){
sum(!is.na(df[!(df$VariableResult %in% c('HP','RS','RR','RC','WD','OT')),]$value))
})
) %>% dplyr::select(-data) %>%
tidyr::unnest() %>% as.data.frame %>% tibble::column_to_rownames('id')
sdna <- function(data) sd(data, na.rm=T)
embed <- original.data[which((original.data$AnalyteName=="Embeddedness")&
(!(original.data$LocationCode=="X"))),]
embed$Result <- as.numeric(as.character(embed$Result))
XEMBED_sum <- tapply(embed$Result, embed$id, sumna)
XEMBED_count <- tapply(embed$Result, embed$id, lengthna)
result$XEMBED.result <- round(XEMBED_sum/XEMBED_count)
result$XEMBED.count <- XEMBED_count
result$XEMBED.sd <- round(tapply(embed$Result, embed$id, sdna), 1)
cpom <- data[data$AnalyteName=="CPOM",]
present <- function(data){
sum(data == "Present", na.rm = TRUE)
}
cpomtotal <- function(data){
sum((data == "Present") | (data == "Absent"), na.rm = TRUE)
}
cpresent <- tapply(cpom$VariableResult, cpom$id, present)
ctotal <- tapply(cpom$VariableResult, cpom$id, cpomtotal)
result$PCT_CPOM.result <- round(cpresent*100/ctotal)
result$PCT_CPOM.count <- ctotal
# add H_SubNat, Ev_SubNat
result$H_SubNat.result <- SubNat$H_SubNat.result
result$H_SubNat.count <- SubNat$H_SubNat.count
result$Ev_SubNat.result <- SubNat$Ev_SubNat.result
result$Ev_SubNat.count <- SubNat$Ev_SubNat.count
# merge result with the percentile metrics on row names
result <- merge(result, percentiles, by = 'row.names') %>% tibble::column_to_rownames('Row.names')
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.