testing/NullExt/NullExt_06_explore_sample_size.R

# LOAD PACKAGES AND DATA #######################################################

library(runoptGPP)
library(rgeos)
library(sperrorest)
library(raster)
library(rgdal)

setwd("/home/jason/Data/Chile/")
# elevation model
dem <- raster("elev_alos_12_5m_no_sinks.tif")

# slide start/source points
slide_point_vec <- readOGR(".", "debris_flow_source_points")

# actual/mapped debris flow polygons
slide_poly_vec <- readOGR(".", "debris_flow_polys_sample")
slide_poly_vec$objectid <- 1:100
crs(slide_point_vec) <- crs(slide_poly_vec)


# SPATIAL CROSS VALIDATION SETTINGS ###########################

repetitions <- 100
n_folds <- 5
n_smps <- seq(from = 10, to = 80, by = 10)


# RW SAMPLE SIZE ###############################################################

n_smps <- seq(from = 10, to = 80, by = 10)

setwd("/home/jason/Scratch/GPP_RW_Paper")
(load("rw_gridsearch_multi.Rd"))

rwslp_vec <- as.numeric(dimnames(rw_gridsearch_multi[[1]])[[1]])
rwexp_vec <- as.numeric(dimnames(rw_gridsearch_multi[[1]])[[2]])
rwper_vec <- as.numeric(dimnames(rw_gridsearch_multi[[1]])[[3]])


smp_sizes_rw <- vector("list", length = length(n_smps))


for(SMP in 1:length(n_smps)){

  n_smp <- n_smps[SMP]

  rep_spcv_rw <- vector("list", length = repetitions)

  for(n in 1:repetitions){

    print(paste("Sample size:", n_smp, "Rep:", n))

    rep_seed <- 11082017 + n

    # Random sample of slides
    #rndm_smp <- sample(length(polyid.vec), n_smp)
    #smp_slide_poly_vec <- slide_poly_vec[rndm_smp,]
    #plot(smp_slide_poly_vec)

    # Define folds for sample
    fold_labels <- spcvFoldsPoly(slide_poly_vec, n_folds = n_folds, seed = rep_seed)

    spcv_results <- list()



    # Folds ###############
    for(k in 1:n_folds){

      roc_median <- vector("list", length = n_folds)
      roc_iqr <- vector("list", length = n_folds)
      #roc_mean <- vector("list", length = n_folds)
      #roc_sd <- vector("list", length = n_folds)

      polyid_train.vec <- which(fold_labels != k)


      if(n_smp < length(polyid_train.vec)){

        polyid_train.vec <- sample(polyid_train.vec, n_smp, replace = FALSE)

      }

      #print(length(polyid_train.vec))




      polyid_test.vec <- which(fold_labels == k)

      for(PER in 1:length(rwper_vec)){

        per_list <- list()

        for(i in 1:length(polyid_train.vec)){
          obj.id <- polyid_train.vec[i]
          res <- rw_gridsearch_multi[[obj.id]]
          per_list[[i]] <- as.matrix(res[,,PER])

        }

        Y <- do.call(cbind, per_list)
        Y <- array(Y, dim=c(dim(per_list[[1]]), length(per_list)))

        Y.median <- apply(Y, c(1, 2), median, na.rm = TRUE)
        Y.iqr<- apply(Y, c(1, 2), IQR, na.rm = TRUE)
        #Y.mean <- apply(Y, c(1, 2), mean, na.rm = TRUE)
        #Y.sd<- apply(Y, c(1, 2), sd, na.rm = TRUE)

        roc_median[[PER]] <- Y.median
        roc_iqr[[PER]] <- Y.iqr
        #roc_mean[[PER]] <- Y.mean
        #roc_sd[[PER]] <- Y.sd

      }

      rocMedian <- do.call(cbind, roc_median)
      rocMedian <- array(rocMedian, dim=c(dim(roc_median[[1]]), length(roc_median)))

      rocIQR <- do.call(cbind, roc_iqr)
      rocIQR <- array(rocIQR, dim=c(dim(roc_iqr[[1]]), length(roc_iqr)))

      #rocMean <- do.call(cbind, roc_mean)
      #ocMean <- array(rocMean, dim=c(dim(roc_mean[[1]]), length(roc_mean)))

      #rocSD <- do.call(cbind, roc_sd)
      #rocSD <- array(rocSD, dim=c(dim(roc_sd[[1]]), length(roc_sd)))


      # Find which combination of RW parameters had the highest auroc
      ROCwh <- which(rocMedian==max(rocMedian), arr.ind=T)
      ROCwh

      #train_auc <- rocMedian[ROCwh]

      train_auc_median <- rocMedian[ROCwh]
      train_auc_iqr <- rocIQR[ROCwh]
      #train_auc_mean <- rocMean[ROCwh]
      #train_auc_sd <- rocSD[ROCwh]

      slp_opt <- rwslp_vec[ROCwh[1]]
      exp_opt <- rwexp_vec[ROCwh[2]]
      per_opt <- rwper_vec[ROCwh[3]]

      # to get test results
      #roc_median <- list()
      #roc_iqr <- list()
      #roc_mean <- list()
      #roc_sd <- list()

      for(PER in 1:length(rwper_vec)){

        per_list <- list()

        for(i in 1:length(polyid_test.vec)){
          obj.id <- polyid_test.vec[i]
          res <- rw_gridsearch_multi[[obj.id]]
          per_list[[i]] <- as.matrix(res[,,PER])

        }

        Y <- do.call(cbind, per_list)
        Y <- array(Y, dim=c(dim(per_list[[1]]), length(per_list)))

        Y.median <- apply(Y, c(1, 2), median, na.rm = TRUE)
        Y.iqr<- apply(Y, c(1, 2), IQR, na.rm = TRUE)
        #Y.mean <- apply(Y, c(1, 2), mean, na.rm = TRUE)
        #Y.sd<- apply(Y, c(1, 2), sd, na.rm = TRUE)

        roc_median[[PER]] <- Y.median
        roc_iqr[[PER]] <- Y.iqr
        #roc_mean[[PER]] <- Y.mean
        #roc_sd[[PER]] <- Y.sd

      }

      test_rocMedian <- do.call(cbind, roc_median)
      test_rocMedian <- array(test_rocMedian, dim=c(dim(roc_median[[1]]), length(roc_median)))

      test_rocIQR <- do.call(cbind, roc_iqr)
      test_rocIQR <- array(test_rocIQR, dim=c(dim(roc_iqr[[1]]), length(roc_iqr)))

      #test_rocMean <- do.call(cbind, roc_mean)
      #test_rocMean <- array(test_rocMean, dim=c(dim(roc_mean[[1]]), length(roc_mean)))

      #test_rocSD <- do.call(cbind, roc_sd)
      #test_rocSD <- array(test_rocSD, dim=c(dim(roc_sd[[1]]), length(roc_sd)))

      #test_roc <- do.call(cbind, roc_median)
      #test_roc<- array(test_roc, dim=c(dim(roc_median[[1]]), length(roc_median)))

      test_auc_median <- test_rocMedian[ROCwh]
      test_auc_iqr <- test_rocIQR[ROCwh]
      #test_auc_mean <- test_rocMean[ROCwh]
      #test_auc_sd <- test_rocSD[ROCwh]

      # result for test dataset
      result_list <- list(
        n_train = length(polyid_train.vec),
        n_test = length(polyid_test.vec),
        rw_slp_opt = slp_opt,
        rw_exp_opt = exp_opt,
        rw_per_opt = per_opt,
        test_auroc_median = test_auc_median,
        train_auroc_median = train_auc_median
      )

      spcv_results[[k]] <- result_list

    }


    spcv_rw_folds <- data.frame(matrix(unlist(spcv_results), nrow = length(spcv_results), byrow = T))
    names(spcv_rw_folds) <- names(spcv_results[[1]])

    rep_spcv_rw[[n]] <- spcv_rw_folds

  }

  smp_sizes_rw[[SMP]] <- rep_spcv_rw

}

#save(smp_sizes_rw, file = "smp_size_repeated_spcv_RW.Rd")

# PCM SAMPLE SIZE ##############################################################

setwd("/home/jason/Scratch/GPP_PCM_Paper")
(load("pcm_gridsearch_multi.Rd"))

pcmmd.vec <- as.numeric(colnames(pcm_gridsearch_multi[[1]][[1]]))
pcmmu.vec <- as.numeric(rownames(pcm_gridsearch_multi[[1]][[1]]))

spcv_pcm <- list()
smp_sizes_pcm <- vector("list", length = length(n_smps))


for(SMP in 1:length(n_smps)){

  n_smp <- n_smps[SMP]

  rep_spcv_pcm <- vector("list", length = length(n_smps))

  for(n in 1:repetitions){

    print(paste("Sample size:", n_smp, "Rep:", n))

    rep_seed <- 11082017 + n

    # Define folds for sample
    fold_labels <- spcvFoldsPoly(slide_poly_vec, n_folds = n_folds, seed = rep_seed)


    # Folds ###############
    for(k in 1:n_folds){

      train_relerr_list <- vector("list", length = n_folds)

      polyid_train.vec <- which(fold_labels != k)

      if(n_smp < length(polyid_train.vec)){

        polyid_train.vec <- sample(polyid_train.vec, n_smp, replace = FALSE)

      }

      polyid_test.vec <- which(fold_labels == k)

      for(i in 1:length(polyid_train.vec)){
        obj.id <- polyid_train.vec[i]
        res <- pcm_gridsearch_multi[[obj.id]]$relerr
        train_relerr_list[[i]] <- res
        # calc median for these using apply

      }


      rel_err <- apply(simplify2array(train_relerr_list), 1:2, median)
      iqr_relerr<- apply(simplify2array(train_relerr_list), 1:2, IQR)

      rel_err_wh <- which(rel_err==min(rel_err), arr.ind=T)
      rel_err_wh # an arrayInd() ...
      rel_err[rel_err_wh]

      #In case two optimal params found, take first
      if(length(rel_err_wh) > 2){
        rel_err_wh <- rel_err_wh[1,]
      }

      opt_md <- pcmmd.vec[rel_err_wh[2]]
      opt_mu <- pcmmu.vec[rel_err_wh[1]]

      # Test
      test_relerr_list <- vector("list", length = length(n_folds))

      for(i in 1:length(polyid_test.vec)){
        obj.id <- polyid_test.vec[i]
        res <- pcm_gridsearch_multi[[obj.id]]$relerr
        test_relerr_list[[i]] <- res
        # calc median for these using apply

      }


      test_rel_err <- apply(simplify2array(test_relerr_list), 1:2, median)
      test_rel_err[rel_err_wh[1], rel_err_wh[2]]
      test_iqr_relerr<- apply(simplify2array(test_relerr_list), 1:2, IQR)


      opt_gpp_par <- list(
        n_train = length(polyid_train.vec),
        n_test = length(polyid_test.vec),
        pcm_mu = opt_mu,
        pcm_md = opt_md,
        train_relerr = rel_err[rel_err_wh[1], rel_err_wh[2]],
        test_relerr = test_rel_err[rel_err_wh[1], rel_err_wh[2]]
      )

      spcv_pcm[[k]] <- opt_gpp_par

    }

    spcv_pcm_folds <- data.frame(matrix(unlist(spcv_pcm), nrow = length(spcv_pcm), byrow = T))
    names(spcv_pcm_folds) <- names(spcv_pcm[[1]])

    rep_spcv_pcm[[n]] <- spcv_pcm_folds

  }

  smp_sizes_pcm[[SMP]] <- rep_spcv_pcm
}


#save(smp_sizes_pcm, file = "smp_size_repeated_spcv_PCM.Rd")

# SUMMARY RW RESULTS ######################

setwd("/home/jason/Scratch/GPP_RW_Paper")
(load("smp_size_repeated_spcv_RW.Rd"))

smp_size_rw_l <- list()

for(SMP in 1:length(n_smps)){

  mean_rep <- rep(NA, repetitions)

  for(k in 1:repetitions){
    # summarize for each repetition
    mean_rep[k] <- mean(smp_sizes_rw[[SMP]][[k]]$test_auroc_median)
  }

  smp_size_rw_l[[SMP]] <- mean_rep

}

mean_auc <- sapply(smp_size_rw_l, FUN = mean)
sd_auc <- sapply(smp_size_rw_l, FUN = sd)
plot(n_smps, mean_auc,
     xlab = "Sample size", ylab = "AUROC",
     ylim = c(0.91, 0.94), type = "l")


epsilon = 0.3
for(i in 1:length(n_smps)) {
  up = mean_auc[i] + sd_auc[i]
  low = mean_auc[i] - sd_auc[i]
  segments(n_smps[i],low , n_smps[i], up)
  segments(n_smps[i]-epsilon, up , n_smps[i]+epsilon, up)
  segments(n_smps[i]-epsilon, low , n_smps[i]+epsilon, low)
}

points(n_smps, mean_auc, pch = 16)



# SUMMARY PCM RESULTS ####################

setwd("/home/jason/Scratch/GPP_PCM_Paper")

(load("smp_size_repeated_spcv_PCM.Rd"))

n_smps <- seq(from = 10, to = 80, by = 10)

smp_size_pcm_l <- list()
smp_size_pcm_sd_l <- list()

# test summarized mean k and mean rep.

for(SMP in 1:length(smp_sizes_pcm)){

  mean_rep <- rep(NA, length(smp_sizes_pcm[[1]]))

  for(k in 1:length(smp_sizes_pcm[[1]])){
    # summarize for each repetition
    mean_rep[k] <- mean(smp_sizes_pcm[[SMP]][[k]]$test_relerr)
  }

  smp_size_pcm_l[[SMP]] <- mean_rep

}

mean_relerr <- sapply(smp_size_pcm_l, FUN = mean)
sd_relerr <- sapply(smp_size_pcm_l, FUN = sd)
plot(n_smps, mean_relerr,
     xlab = "Sample size", ylab = "Relative error",
     ylim = c(0.1, 0.3), type = "l")


epsilon = 0.3
for(i in 1:length(n_smps)) {
  up = mean_relerr[i] + sd_relerr[i]
  low = mean_relerr[i] - sd_relerr[i]
  segments(n_smps[i],low , n_smps[i], up)
  segments(n_smps[i]-epsilon, up , n_smps[i]+epsilon, up)
  segments(n_smps[i]-epsilon, low , n_smps[i]+epsilon, low)
}

points(n_smps, mean_relerr, pch = 16)

# PLOT RW AND PCM RESULTS ######################################################

setwd("/home/jason/Scratch/Figures")

# Sample size performance and variation ###

png(filename="sampe_size_performance.png", res = 300, width = 7.5, height = 3,
    units = "in", pointsize = 11)

par(family = "Arial", mfrow = c(1,2), mar = c(4, 4, 1, 0.5),
    mgp = c(2, 0.75, 0))


#AUROC

mean_auc <- sapply(smp_size_rw_l, FUN = mean)
sd_auc <- sapply(smp_size_rw_l, FUN = sd)
plot(n_smps, mean_auc,
     xlab = "Sample size", ylab = "Runout path AUROC",
     ylim = c(0.91, 0.94), type = "l")


epsilon = 0.3
for(i in 1:length(n_smps)) {
  up = mean_auc[i] + sd_auc[i]
  low = mean_auc[i] - sd_auc[i]
  segments(n_smps[i],low , n_smps[i], up)
  segments(n_smps[i]-epsilon, up , n_smps[i]+epsilon, up)
  segments(n_smps[i]-epsilon, low , n_smps[i]+epsilon, low)
}

points(n_smps, mean_auc, pch = 16)


#Relative error
mean_relerr <- sapply(smp_size_pcm_l, FUN = mean)
sd_relerr <- sapply(smp_size_pcm_l, FUN = sd)
plot(n_smps, mean_relerr,
     xlab = "Sample size", ylab = "Runout distance relative error",
     ylim = c(0.1, 0.25), type = "l")


epsilon = 0.3
for(i in 1:length(n_smps)) {
  up = mean_relerr[i] + sd_relerr[i]
  low = mean_relerr[i] - sd_relerr[i]
  segments(n_smps[i],low , n_smps[i], up)
  segments(n_smps[i]-epsilon, up , n_smps[i]+epsilon, up)
  segments(n_smps[i]-epsilon, low , n_smps[i]+epsilon, low)
}

points(n_smps, mean_relerr, pch = 16)


dev.off()



# REL FREQ PCM PARAMS / SAMPLE SIZE SCATTER ####################################

setwd("/home/jason/Scratch/Figures")

pairs_pcm_df <- pairs_pcm[[1]]
pairs_pcm_df$smp_size <- n_smps[1]

for(i in 2:length(pairs_pcm)){
  tmp_pcm_df <- pairs_pcm[[i]]
  tmp_pcm_df$smp_size <- n_smps[i]

  pairs_pcm_df <- rbind(pairs_pcm_df, tmp_pcm_df)

}

summary(as.factor(pairs_pcm_df$smp_size))

pairs_pcm_df$labels <- as.factor(pairs_pcm_df$smp_size)

mybreaks <- c(1, 10, 50 )

ggplot(pairs_pcm_df, aes(x=md, y=mu)) +
  geom_point(alpha=0.7, aes(colour = smp_size, size = rel_freq)) +


  scale_size(name="Relative\nfrequency (%)", range = c(2, 6), breaks = mybreaks) +

  #scale_colour_gradient(high = "#1B4F72", low = "#85C1E9", name = "Sample size") +
  scale_color_viridis_c(name = "Sample size")+

  xlab("Mass-to-drag ratio (m)") +
  ylab("Sliding friction coefficient") +


  theme_light() +
  theme(text = element_text(family = "Arial", size = 8), axis.title = element_text(size = 9),
        axis.text = element_text(size = 8)) +
  guides(
    colour = guide_colourbar(order = 1),
    size = guide_legend(order = 2))

ggsave("sample_size_scatter_opt_pcm.png", dpi = 300, width = 5.5, height = 3.25, units = "in")
jngtz/runout.opt documentation built on Sept. 8, 2021, 2:15 p.m.