# rm(list = ls())
# install.packages('cowplot')
#*******************************************************************************************************
# summary function
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=F,
conf.interval=.95, .drop=TRUE) {
library(plyr)
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- ddply(data, groupvars, .drop=.drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm=na.rm),
mean = mean (xx[[col]], na.rm=na.rm),
median = median (xx[[col]], na.rm=na.rm),
#do.call("rbind", tapply(xx[[col]], measurevar, quantile, c(0.25, 0.5, 0.75)))
sd = sd (xx[[col]], na.rm=na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
#*******************************************************************************************************
# clean function 1
clean_data <- function (data) {
# data <- data_orig
sdata <- data
sdata %>%
filter(!is.na(Unsaturated_K2cm_cmhr) & !is.na(Percent_Sand) & !is.na(Type)) ->
sdata
sdata %>% select(Percent_Clay, Percent_Silt, Percent_Sand, Texture, Rock_group, BD,
Unsaturated_K2cm_cmhr, Percent_Rock_Fragment,
Type, Top_Type) ->
sdata
sdata <- sdata[!is.na(sdata$Texture),]
colnames(sdata) <- c("CLAY", "SILT", "SAND", "TEXTURE", "Rock_group", "BD", "Unsaturated_K2cm_cmhr", "ROCK", "Type", "Top_Type")
# handle NA data
sdata$sum <- sdata$CLAY + sdata$SILT + sdata$SAND
# convert sum to 100
sdata$ratio <- 100/sdata$sum
sdata$CLAY <- sdata$CLAY*sdata$ratio
sdata$SILT <- sdata$SILT*sdata$ratio
sdata$SAND <- sdata$SAND*sdata$ratio
sdata$UF <- "Urban Observed"
sdata <- sdata[!is.na(sdata$Unsaturated_K2cm_cmhr),]
sdata <- sdata[!is.na(sdata$CLAY),]
sdata$Rock_group <- as.factor(sdata$Rock_group)
sdata$BD <- ifelse(is.na(sdata$Rock_group), 1.36, sdata$Rock_group)
# sdata$Rock_group <- ifelse(is.na(sdata$Rock_group), "NotReport", sdata$Rock_group)
print(paste0('-----------------------obs(n)=',nrow(sdata)))
sdata
}
# clean function with rock considered
clean_data_rock <- function (data) {
sdata <- data
sdata <- sdata[!is.na(sdata$Unsaturated_K2cm_cmhr), ]
sdata <- sdata[!is.na(sdata$Percent_Sand), ]
sdata <- sdata[!is.na(sdata$Type), ]
sdata <- sdata[,c(which(colnames(sdata)=="Percent_Clay"),which(colnames(sdata)=="Percent_Silt")
, which(colnames(sdata)=="Percent_Sand"), which(colnames(sdata)=="Texture")
, which(colnames(sdata)=="Soil_Series_Type"), which(colnames(sdata)=="Unsaturated_K2cm_cmhr")
, which(colnames(sdata)=="Percent_Rock_Fragment")
, which(colnames(sdata)=="Top_Type")
, which(colnames(sdata)=="BD")
, which(colnames(sdata)=="Rock_group")
)]
sdata <- sdata[!is.na(sdata$Texture),]
colnames(sdata) <- c("CLAY", "SILT", "SAND", "TEXTURE", "UF", "Unsaturated_K2cm_cmhr",
"ROCK", "Top_Type", "BD", "Rock_group")
sdata <- sdata[!is.na(sdata$ROCK),]
# handle NA data
sdata$sum <- sdata$CLAY + sdata$SILT + sdata$SAND
sdata$sscr <- 100- sdata$ROCK
# convert sum to 100
sdata$ratio <- sdata$sscr/sdata$sum
sdata$CLAY <- sdata$CLAY*sdata$ratio
sdata$SILT <- sdata$SILT*sdata$ratio
sdata$SAND <- sdata$SAND*sdata$ratio
sdata <- sdata[sdata$UF == "Urban Observed",]
sdata <- sdata[!is.na(sdata$Unsaturated_K2cm_cmhr),]
sdata <- sdata[!is.na(sdata$CLAY),]
print(paste0('-----------------------obs(n)=',nrow(sdata)))
sdata
}
#*******************************************************************************************************
# clean function 2
# length(sdata[which(sdata$Type == "fine granular structure"),]$Type)
update_structure <- function( data ) {
sdata <- data
sdata <- sdata[!is.na(sdata$Unsaturated_K2cm_cmhr), ]
sdata <- sdata[!is.na(sdata$Percent_Sand), ]
sdata <- sdata[!is.na(sdata$Type), ]
sdata$Top_Type <- as.character(sdata$Top_Type)
# sdata$Type <- as.character(sdata$Type)
for ( i in 1:nrow(sdata) ) {
if( sdata[i,]$Type == "fine granular structure" ) {
sdata[i,]$Top_Type <- "granular" }
else if ( sdata[i,]$Type == "single grain" ){
sdata[i,]$Top_Type <- "single grain"
}
else if ( sdata[i,]$Type == "medium granular structure" ){
sdata[i,]$Top_Type <- "granular"
}
else if (sdata[i,]$Type == "thin and medium plate-like structure"){
sdata[i,]$Top_Type <- "platy"
}
else if (sdata[i,]$Type == "massive"){
sdata[i,]$Top_Type <- "massive"
}
else if (sdata[i,]$Type == "medium subangular blocky"){
sdata[i,]$Top_Type <- "blocky"
}
else if (sdata[i,]$Type == "medium and fine granular"){
sdata[i,]$Top_Type <- "granular"
}
else if (sdata[i,]$Type == "coarse granular blocky" ){
sdata[i,]$Top_Type <- "blocky"}
else if (sdata[i,]$Type == "fine subangular blocky"){
sdata[i,]$Top_Type <- "blocky"
}
else if (sdata[i,]$Type == "fine and medium granular structure" ){
sdata[i,]$Top_Type <- "granular"
}
else if (sdata[i,]$Type == "medium platy structure" ){
sdata[i,]$Top_Type <- "platy"}
else if (sdata[i,]$Type == "fine and medium subangular blocky"){
sdata[i,]$Top_Type <- "blocky"
}
else if ( sdata[i,]$Type == "fine and medium prismatic structure" ){
sdata[i,]$Top_Type <- "prismatic"}
else if (sdata[i,]$Type == "medium granular and strong"){
sdata[i,]$Top_Type <- "granular"
}
else if (sdata[i,]$Type == "medium angular blocky"){
sdata[i,]$Top_Type <- "blocky" }
else if (sdata[i,]$Type == "fine angular structure"){
sdata[i,]$Top_Type <- "blocky"}
else if (sdata[i,]$Type == "medium prismatic parting to moderate medium subangular blocky"){
sdata[i,]$Top_Type <- "blocky"}
else if (sdata[i,]$Type == "medium prismatic structure parting to moderate medium subangular blocky"){
sdata[i,]$Top_Type <- "blocky"}
else if (sdata[i,]$Type == "coarse prismatic"){
sdata[i,]$Top_Type <- "prismatic"}
else if (sdata[i,]$Type == "medium prismatic"){
sdata[i,]$Top_Type <- "prismatic"}
else if (sdata[i,]$Type == "angular blocky"){
sdata[i,]$Top_Type <- "blocky"}
else if (sdata[i,]$Type == "very coarse prismatic structure"){
sdata[i,]$Top_Type <- "prismatic"}
else if (sdata[i,]$Type == "very fine granular structure"){
sdata[i,]$Top_Type <- "granular"}
else if (sdata[i,]$Type == "coarse subangular blocky"){
sdata[i,]$Top_Type <- "blocky"}
else if (sdata[i,]$Type == "fine subangular and angular blocky"){
sdata[i,]$Top_Type <- "blocky"}
else if (sdata[i,]$Type == "very fine and fine subangular blocky"){
sdata[i,]$Top_Type <- "blocky"}
else if (sdata[i,]$Type == "medium and coarse subangular blocky"){
sdata[i,]$Top_Type <- "blocky"}
else if (sdata[i,]$Type == "subangular blocky"){
sdata[i,]$Top_Type <- "blocky"}
else if (sdata[i,]$Type == "fine granular structure and weak very fine subangular blocky"){
sdata[i,]$Top_Type <- "granular" }
else if (sdata[i,]$Type == "coarse granular"){
sdata[i,]$Top_Type <- "granular" }
else if (sdata[i,]$Type == "medium granular and strong coarse angular blocky"){
sdata[i,]$Top_Type <- "granular" }
else { sdata[i,]$Top_Type <- NA }
print (paste0('******',i))
}
# 2 convert sum to 100
sdata$sum <- sdata$Percent_Sand + sdata$Percent_Silt + sdata$Percent_Clay
sdata %>%
filter(sum > 99 & sum < 101) ->
sdata
sdata$ratio <- 100/sdata$sum
sdata$Percent_Clay <- sdata$Percent_Clay*sdata$ratio
sdata$Percent_Silt <- sdata$Percent_Silt*sdata$ratio
sdata$Percent_Sand <- sdata$Percent_Sand*sdata$ratio
sdata$Top_Type <- as.factor(sdata$Top_Type)
return (sdata)
}
#*******************************************************************************************************
# prepare data for ANN and RF modeling
data_ann <- function (sdata) {
data <- sdata[,c(which(colnames(sdata)=="Percent_Sand")
, which(colnames(sdata)=="Percent_Silt")
, which(colnames(sdata)=="Percent_Clay")
, which(colnames(sdata)=="Ksat_cmhr")
, which(colnames(sdata)=="Unsaturated_K2cm_cmhr") )]
return(data)
}
data_rf <- function (sdata) {
dataRF <- sdata[,c(which(colnames(sdata)=="Percent_Sand")
, which(colnames(sdata)=="Percent_Silt")
, which(colnames(sdata)=="Percent_Clay")
, which(colnames(sdata)=="Grade")
, which(colnames(sdata)=="Top_Type")
, which(colnames(sdata)=="Ksat_cmhr")
, which(colnames(sdata)=="Unsaturated_K2cm_cmhr")
, which(colnames(sdata)=="BulkDensity_gcm3") )]
return(dataRF)
}
#*******************************************************************************************************
# ANN model
ann_ssc <- function(sdata) {
neuralnet(Unsaturated_K2cm_cmhr ~ Percent_Sand + Percent_Silt + Percent_Clay
,data=sdata, hidden=c(5,3), linear.output=T, stepmax=1e6)
}
#*******************************************************************************************************
# RF1 model
rf_ssc <- function(sdata) {
randomForest(Unsaturated_K2cm_cmhr ~ Percent_Sand + Percent_Silt + Percent_Clay , data=sdata,
ntree = 100,
mtry = 2,
importance = TRUE,
proximity = TRUE)
}
#*******************************************************************************************************
# RF2 model
rf_sscs <- function(sdata) {
randomForest(Unsaturated_K2cm_cmhr ~ Percent_Sand + Percent_Silt + Percent_Clay + Top_Type, data=sdata,
ntree = 100,
mtry = 2,
importance = TRUE,
proximity = TRUE)
}
#*******************************************************************************************************
# plot and test RF2
rf2_visual <- function (model) {
p1 <- qplot(1:100, model$mse ) + geom_line() + theme_bw() +
xlab ("Number of trees (n)") + ylab(" MSE ") +
theme(axis.text.x = element_text(face = "bold", size = 12),
axis.text.y = element_text(face = "bold", size = 12),
axis.title = element_text(face = "bold", size = 12),
axis.title.y = element_text(angle = 0, vjust = 0.5)
)
# annotate("text", x = 8, y = 7.15, label = "( a )", size = 6)
# panel (b)
p2 <- qplot(sort(importance(model, type = 1)[1:4]), 1:4) + geom_line() + theme_bw() +
xlab ("Change of MSE (%)") +
theme(axis.text.x = element_text(face = "bold", size = 12),
axis.text.y = element_text(face = "bold", size = 12),
axis.title = element_text(face = "bold", size = 12),
axis.title.y = element_blank()
) + scale_y_continuous(labels = c("Structure", "%Silt", "%Clay", "%Sand") )
# annotate("text", x =4.25, y = 3.75, label = "( b )", size = 6)
# panel (c)
p3 <- qplot(sort(importance(model, type = 2)[1:4]), 1:4) + geom_line() + theme_bw() +
xlab ("Change of node purity") +
theme(axis.text.x = element_text(face = "bold", size = 12),
axis.text.y = element_text(face = "bold", size = 12),
axis.title = element_text(face = "bold", size = 12),
axis.title.y = element_blank()
) + scale_y_continuous(labels = c(" ", " ", " ", " ") )
# annotate("text", x = 650, y = 3.75, label = "( c )", size = 6)
panel_b <- plot_grid(p2, p3, ncol = 2, labels = c("b", "c"), hjust = c(-9.5, -4), vjust = 2.75)
# print(panel_b)
print(plot_grid(p1, panel_b, nrow = 2, labels = c("a", "")
, hjust = -10, vjust = 2.75 )
)
invisible(list(p1,p2,p3, panel_b))
}
#*******************************************************************************************************
# model evaluation functions
model_evaluation1 <- function (sdata) {
sdata$S_M <- sdata$kfs_m - sdata$kfs
# hist(sdata$S_M)
E <- round(sum(sdata$S_M)/length(sdata$S_M), 5)
# t-test
p <- round(t.test(sdata$S_M)$p.value, 5)
# calculate d
d <- round(1- sum(sdata$S_M^2)/sum((abs(sdata$kfs_m-mean(sdata$kfs))+abs(sdata$kfs-mean(sdata$kfs)))^2), 5)
# calculate EF
EF <- round(1- sum(sdata$S_M^2)/sum((sdata$kfs-mean(sdata$kfs))^2), 5)
# calculate RMSE
RMSE <- round((sum(sdata$S_M^2)/length(sdata$S_M))^0.5, 5)
df <- t.test(sdata$S_M)[2]
eval_mat <- data.frame(E, p, d, EF, RMSE, df)
return(eval_mat)
}
#*******************************************************************************************************
# plot figure 4
# colnames(train)
# colnames(test_rosetta)
# max(data$Unsaturated_K2cm_cmhr)
model_evaluation2 <- function () {
# a,b,c,d,e,f,g are data for panels a-h
panel_a <- qplot(kfs_m, kfs, data = train) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 50) + xlim(0, 50) +
theme(axis.title.x = element_blank()) + theme(axis.title.y = element_blank())
panel_b <- qplot(kfs_m, kfs, data = train_RF) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 50) + xlim(0, 50)+
theme(axis.title.x = element_blank())+ theme(axis.title.y = element_blank())
panel_c <- qplot(kfs_m, kfs, data = train_RF2) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 50) + xlim(0, 50)+
theme(axis.title.x = element_blank())+ theme(axis.title.y = element_blank())
panel_HSD <- qplot(kfs_m, kfs, data = train_RF2) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 50) + xlim(0, 50)+
theme(axis.title.x = element_blank())+ theme(axis.title.y = element_blank())
panel_d <- qplot(kfs_m, kfs, data = test) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 50) + xlim(0, 50)+
theme(axis.title.x = element_blank())+ theme(axis.title.y = element_blank())
panel_e <- qplot(kfs_m, kfs, data = test_RF) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 50) + xlim(0, 50)+
theme(axis.title.x = element_blank())+ theme(axis.title.y = element_blank())
panel_f <- qplot(kfs_m, kfs, data = test_RF2) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 50) + xlim(0, 50)+
theme(axis.title.x = element_blank())+ theme(axis.title.y = element_blank())
panel_g <- qplot(kfs_m, kfs, data = test_rosetta) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 50) +
theme(axis.title.x = element_blank())+ theme(axis.title.y = element_blank())
y.grob <- textGrob(expression(Measured~K[fs]~cm~h^-1),
gp=gpar(fontface="bold", col="black", fontsize=15), rot=90)
x.grob <- textGrob(expression(Predicted~K[fs]~cm~h^-1),
gp=gpar(fontface="bold", col="black", fontsize=15))
plot <- plot_grid (panel_a, panel_d, panel_b, panel_e, panel_c, panel_f, panel_HSD, panel_g, ncol = 2, nrow = 4
, labels = c('( a ) ANN train', '( e ) ANN test', '( b ) RF1 train', '( f ) RF1 test'
, '( c ) RF2 train', '( g ) RF2 test', '( d ) HSD ', '( h ) ROSETTA')
, vjust = 3, hjust = c(-0.8, -0.8, -0.8, -0.8, -0.8, -0.8, -0.8, -0.75), label_size = 8)
grid.arrange(arrangeGrob(plot, left = y.grob, bottom = x.grob))
}
#*******************************************************************************************************
# plot figure 5
model_evaluation3 <- function () {
panel_a <- qplot(Ksat_ANN, Ksat, data = histdata) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 25) + xlim(0, 25) +
theme(axis.title.x = element_blank()) + theme(axis.title.y = element_blank())
panel_b <- qplot(Ksat_RF, Ksat, data = histdata) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 25) + xlim(0, 25) +
theme(axis.title.x = element_blank()) + theme(axis.title.y = element_blank())
panel_c <- qplot(Ksat_RF2, Ksat, data = histdata) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 25) + xlim(0, 25) +
theme(axis.title.x = element_blank()) + theme(axis.title.y = element_blank())
panel_d <- qplot(Ksat_Rosseta, Ksat, data = histdata) + geom_abline(slope = 1, linetype = 2, size = 1.5, col = 'blue') +
geom_smooth(method = 'lm', col = 'red') + ylim (0, 25) +
theme(axis.title.x = element_blank()) + theme(axis.title.y = element_blank())
y.grob <- textGrob(expression(Measured~K[fs]~cm~h^-1),
gp=gpar(fontface="bold", col="black", fontsize=15), rot=90)
x.grob <- textGrob(expression(Predicted~K[fs]~cm~h^-1),
gp=gpar(fontface="bold", col="black", fontsize=15))
plot <- plot_grid (panel_a, panel_c, panel_b, panel_d, ncol = 2, nrow = 2
, labels = c('( a ) ANN', '( c ) RF2', '( b ) RF1', '( d ) ROSETTA')
, vjust = 3, hjust = c(-1.2, -1.2, -1.2, -0.7), label_size = 8 )
grid.arrange(arrangeGrob(plot, left = y.grob, bottom = x.grob))
}
#*******************************************************************************************************
# get bulk density from HWSD
#*******************************************************************************************************
get_bd <- function (hwsd_data, urban_ksat_data) {
hwsd_data <- nc_open("../extdata/HWSD_BULK_DEN.nc4")
for (i in 1:nrow(urban_ksat_data) ) {
# get the lat and lon from urban_ksat_data
target_lat <- urban_ksat_data$Latitude[i]
target_lon <- urban_ksat_data$Longitude[i]
if (!is.na(target_lat) & !is.na(target_lon)) {
# get the closest lat and lon from hwsd
lat <- ncvar_get(hwsd_data, "lat")
ilat <- which.min(abs(lat - target_lat))
lon <- ncvar_get(hwsd_data, "lon")
ilon <- which.min(abs(lon - target_lon))
# get the bd information
target_bd <- ncvar_get(hwsd_data, "T_BULK_DEN", start = c(ilon, ilat), count = c(1, 1))
urban_ksat_data[i, "hwsd_bd"] <- target_bd
}
else {next}
print(paste0("====================", i))
}
return (urban_ksat_data)
}
# Bill updated their dataset on April 2020
update_urban_kfs <- function(){
MFT3 <- read_excel("../extdata/MasterFlatTable_19Dec20.xlsx", sheet = 4) # updated in Dec 2020
# Surface_HC <- read_excel("../extdata/MasterFlatTable_3July19_preColorRemov.xlsx", sheet = 5) # no longer used
Borehole_ksat <- read_excel("../extdata/MasterFlatTable_3July19_preColorRemov.xlsx", sheet = 6) # no longer used
data_orig <- read.csv(here::here("extdata/old/AllCities_Victoria_RDS_rock_bd.csv"))
# set 99999 to 3600 and 0 to 0.0042 (column BZ)
# Borehole_ksat$Borehole_Ksat_cmhr[Borehole_ksat$Borehole_Ksat_cmhr == 0,] = 0.0042
# Borehole_ksat$Borehole_Ksat_cmhr[Borehole_ksat$Borehole_Ksat_cmhr == 99999,] = 3600
# Borehole_ksat %>% select(SampleLayer_ID, Borehole_Ksat_cmhr, BulkDensity_gcm3) %>%
# filter(!is.na(Borehole_Ksat_cmhr)) %>%
# group_by(SampleLayer_ID) %>%
# summarise(Borehole_Ksat_cmhr = mean(Borehole_Ksat_cmhr),
# BulkDensity_gcm3 = mean(BulkDensity_gcm3)) ->
# Borehole_ksat_agg
# Borehole_ksat_agg$SampleLayer_ID = as.character(Borehole_ksat_agg$SampleLayer_ID)
# left_join(
# MFT3,
# Surface_HC %>% select(SampleLayer_ID, Unsaturated_K2cm_cmhr) %>% filter(!is.na(Unsaturated_K2cm_cmhr)),
# by = c("SampleLayer_ID")
# ) -> All_cite_new
# left_join(
# MFT3,
# Borehole_ksat_agg,
# by = c("SampleLayer_ID")) ->
# All_cite_new
#
MFT3$SampleLayer_ID = as.character(MFT3$SampleLayer_ID)
data_orig$SampleLayer_ID = as.character(data_orig$SampleLayer_ID)
left_join(
MFT3,
data_orig %>% select(SampleLayer_ID, Latitude, Longitude, Type, hwsd_bd, Rock_group,
Texture_mod, Top_Type, Soil_Series_Type),
by = c("SampleLayer_ID")
) -> All_cite_new
# only data from minidisk was used
All_cite_new$Unsaturated_K2cm_cmhr <- coalesce(All_cite_new$K_minidisk_cmhr)
# All_cite_new$Ksat_cmhr <- ifelse(All_cite_new$Ksat_cmhr == 0,
# All_cite_new$Borehole_Ksat_cmhr,
# All_cite_new$Ksat_cmhr)
All_cite_new$BD <- ifelse(!is.na(All_cite_new$Bulk_Density_Avg_g_cm3),
All_cite_new$Bulk_Density_Avg_g_cm3,
All_cite_new$hwsd_bd)
return(All_cite_new)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.