#' @name tunaboot
#' @title Boostrap for Model Predictions
#' @description Bootstrap function which compute proportion by species fishing mode and ocean for confindence interval
#' @param sample_data (data frame) Data used for the modelling. Output table from process 3.1.
#' @param allset_data (data frame) Data used for prediction.Output table from process 3.4.
# @param schooltype (integer) Fishing mode of the catch.
# @param ocean (integer) Target ocean.
# @param species (character) Target species. 'SKJ' for kipjack and 'YFT' for yellowfin.
#' @param Ntree (integer) Number of trees to grow. This should not be set to too small a number, to ensure that every input row gets predicted at least a few times. The default value is 1000.
#' @param Nmtry Number of variables randomly sampled as candidates at each split. The default value is 2.
#' @param Nseed Set the initial seed for the modelling. The default value is 7.
#' @param min_node Minimum size of terminal nodes. Setting this number larger causes smaller trees to be grown (and thus take less time).The default value is 5.
#' @param Nboot The number of bootstrap samples desired. The fefault value is 10
#' @param Nseed_boot Set the initial seed for the modelling. The default value is equal to Nseed parameter
#' @param target_period Time period for the predictions in year. Default is the year of the data to predict
#' @importFrom ranger ranger
#' @importFrom dplyr sample_n bind_rows rename
#' @importFrom stats predict
#' @return TODO
tunaboot <- function(sample_data,
allset_data,
Ntree = 1000,
Nmtry = 2,
Nseed = 7,
min_node = 5,
Nboot = 10,
Nseed_boot,
target_period) {
# local binding global variables ----
prop_t3 <- modrf0 <- NULL
if (missing(Nseed_boot)) {Nseed_boot = Nboot}
if (missing(target_period)) {target_period = allset_data$yr[1]}
# build structure of the output list for the bootstrap
boot_output_list <- vector("list", length = Nboot)
# subset of all sampled set
sub <- sample_data
# format columns
sub$resp <- (sub$prop_t3)
sub$tlb <- (sub$prop_lb)
sub$yr <- factor(sub$year)
sub$mon <- factor(sub$mon)
sub$vessel <- factor(sub$vessel)
sub <- droplevels(sub)
sub$ocean <- factor(sub$ocean)
sub$fmod <- factor(sub$fmod)
#### bootstrap
# select sample and data to predict
### remove set used to train models from data to predict
no_sampled_set <- droplevels(allset_data[!(allset_data$id_act %in% unique(sub$id_act)),])
no_sampled_set <- droplevels(no_sampled_set[no_sampled_set$yr %in% target_period,])
## Prepare data
no_sampled_set$tlb <- (no_sampled_set$prop_lb) # transform predictor if necessary
no_sampled_set$yr <- factor(no_sampled_set$yr)
no_sampled_set$mon <- factor(no_sampled_set$mon)
no_sampled_set$vessel <- factor(no_sampled_set$vessel)
### split dataframe for different treatment if needed
no_sampled_set$data_source <- NA # assign source later
no_sampled_set$fit_prop <- NA # stock final proportion
# not sampled vessel list
vessel_not_train <- setdiff(levels(no_sampled_set$vessel),
levels(sub$vessel))
# dataset with all information
newd <- (no_sampled_set[!no_sampled_set$vessel %in% vessel_not_train, ])
# dataset with vessel not sampled
new_wtv <- no_sampled_set[no_sampled_set$vessel %in% vessel_not_train & !is.na(no_sampled_set$prop_lb), ]
# dataset with no logbook
new_0 <- no_sampled_set[no_sampled_set$vessel %in% vessel_not_train & is.na(no_sampled_set$prop_lb), ]
new_0$data_source <- as.character(new_0$data_source)
# compute catch by species by set
sampled_set <- unique(sub[sub$yr %in% target_period,])
# add sample not corrected catch by species
if(nrow(sampled_set) > 0){
sampled_set$data_source <- "sample"
} # add flag
sampled_set <- dplyr::rename(sampled_set,
fit_prop = prop_t3)
# output columns
column_keep <- c("id_act",
"fmod",
"sp",
"lat",
"lon",
"date_act",
"vessel",
"ocean",
"yr",
"mon",
"fit_prop",
"wtot_lb_t3",
# "w_lb_t3",
"data_source")
for (i in seq.int(from = 1, to = Nboot)) {
print(i)
set.seed(i)
newsample <- dplyr::sample_n(tbl = sub,
size = nrow(sub),
replace = TRUE)
#-----------------------#
## models and predicts ##
#-----------------------#
# best case (all information available)###
if (nrow(newd)>0) {
set.seed(Nseed_boot)
model_rf_full <- ranger::ranger(resp ~ tlb + lon + lat + yr + mon + vessel,
data = newsample,
num.trees = Ntree,
mtry = Nmtry,
importance = "none",
min.node.size = min_node,
splitrule = "variance",
replace = TRUE,
quantreg = FALSE,
keep.inbag= FALSE
)
newd$fit_prop <- stats::predict(object = model_rf_full,
data = newd)$predictions
newd$data_source <- "full_model" # add flag
}
## without vessel information
if (nrow(new_wtv)>0) {
set.seed(Nseed_boot)
model_rf_wtvessel <- ranger::ranger(resp ~ tlb + lon + lat + yr + mon,
data = newsample,
num.trees = Ntree,
mtry = Nmtry,
importance = "none",
min.node.size = min_node,
splitrule = "variance",
replace = TRUE,
quantreg = FALSE,
keep.inbag= FALSE
)
new_wtv$fit_prop<- predict(model_rf_wtvessel,data=new_wtv)$predictions
new_wtv$data_source <- "model_wtv" # add flag
}
# without logbook information on the catch, location and date only
if (nrow(new_0)>0) {
set.seed(Nseed_boot)
model_rf_simple <- ranger::ranger(resp ~ lon + lat + yr + mon,
data = newsample,
num.trees = Ntree,
mtry = Nmtry,
importance = "none",
min.node.size = min_node,
splitrule = "variance",
replace = TRUE,
quantreg = FALSE,
keep.inbag= FALSE
)
new_0$fit_prop<- predict(modrf0,newdata=new_0)$predictions
new_0$data_source <- "simple_model" # add flag
}
# remove set with no data and combine all catch
all_set <- list(newd,new_wtv,new_0, sampled_set)
all_set <- all_set[which(unlist(lapply(all_set, nrow)) > 0)]
all_set <- dplyr::bind_rows(all_set)
boot_output_list[[i]] <- all_set
}
return(boot_output_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.