#' Uses PBT assignments and other data to estimate group composition by an "accounting- style" method
#'
#' \code{SCOBI_deux} Uses PBT assignments and tag rates to calculate composition
#' estimates and adjust estimates based on the tag rates. Confidence intervals are
#' generated by non-parametric bootstrapping.
#'
#' This function uses the "accounting-style" method (Ac) described by Delomas and Hess (in review) to estimate
#' composition of various groups when using PBT with tag rates less than 100%. This function is set up to
#' provide estimates for three groups of fish, based on fishery managment of Salmonids: Ad-clipped, Ad-intact
#' hatchery, and Ad-intact wild. The paper describes how the data for all Ad-intact fish sampled is used to
#' estimate the composition of Ad-intact hatchery and wild fish. If both Ad-clipped and Ad-intace fish are
#' present in the input dataset, the total number of Ad-clipped fish is estimated using the ratio of Ad-clipped
#' fish to Ad-intact fish in the input data. If the \code{pbtGroupVariable} is included in the
#' \code{Hierarch_variables} option, then the composition of the Ad-clipped fish is calculated analagously
#' to the method described in the paper for the Ad-intact hatchery and wild fish, but the wild group is now
#' the fish which are Unassigned by PBT. If the \code{pbtGroupVariable} is NOT included in the
#' \code{Hierarch_variables} option, then the estimates are simply made by calculating the relative numbers of
#' observed fich in each category of the \code{Hierarch_variables} option (ie, tag rates are NOT used). This is
#' because it is assumed the sampled PBT Untagged fish that are Ad-clipped are still identified as being
#' Ad-clipped, and so can be used to estimate composition.
#' This function is written to have an a user interface similar to that of a function written for a similar
#' purpose in a different package by different authors, \code{SCOBI}.
#'
#' @param adultData This is the detailed dataset that is used to determine composition. In
#' dam escapement, this is the "trap" data. It can either be a dataframe or the path to a
#' csv file with a header.
#' @param windowData This is the dataset that gives the census numbers for each strata (such
#' as time period or location). In dam escapement, this is the "window count" dataset.
#' It can either be a dataframe or the path to a csv file with a header. The first column has the groups
#' as listed in the \code{dataGroupVariable} column of the \code{adultData}. The second column is the census
#' count for that data group. The third column is the strata that data group belongs to. The purpose of this is to allow
#' a user to easily reconfigure groups of observations into different strata.
#' @param Run This is a prefix that will be appended to all output files.
#' @param RTYPE This is the type of fish that you want to analyze for the \code{Hierarch_variables}.
#' Options are: "clipped", "noclip_H", "wild", representing Ad-clipped hatchery, Ad-intact
#' hatchery, and wild fish, respectively.
#' @param Hierarch_variables These are the variables (column names in \code{adultData}) that you
#' want to assess RTYPE for in the order you want them to be assessed.
#' @param SizeCut Fish less than this value will be put into a "Small" category, and fish greater or equal
#' to this value will be put into a "Large" category. This is intended for easy analysis of jacks or
#' large and small steelhead.
#' @param alph This is the alpha value to use for confidence intervals. So, an alph of .05 will give a
#' 95\% CI, with quantiles at .025 and .975.
#' @param B This is the number of bootstrap iteratiosn to perform. To skip bootstraping and only produce
#' point estiamtes, use a value of 0.
#' @param writeBoot If \code{TRUE}, the values for each bootstrap iteration will be written.
#' @param pbtRates This is either a dataframe or the path to a csv file with a header. The first
#' column contains the names of all the pbtGroupVariable values, and the second column contains their
#' respective tag rates.
#' @param adClipVariable This is the name of the variable that has the ad-clip and ad-intact status of
#' all samples. Values in this column shoudl either be "AD" or "AI" for ad-clipped and ad-intact,
#' respectively. NA values are considered missing and are removed from the dataset prior to any analysis.
#' @param physTagsVariable This is the name of the variable that has information on physical tags (such
#' as CWTs). Fish with a physical tag that identifies them as hatchery origin should have the value "tag"
#' and fish without such a physical tag should have the value "notag". Fish that were not assessed for the
#' presence of a physical tag should have the value NA.
#' @param pbtGroupVariable This is the name of the variable that has the pbt group that a fish assigned to.
#' Fish that did not assign to any group should be given the value of "Unassigned". Fish that were not
#' assessed with PBT, for example they failed to genotype or were not genotyped, should be given the value
#' of NA.
#' @param lengthVariable This is the name of the variable that has numberic lengths if the SizeCut option
#' is being used.
#' @param dataGroupVariable TThis is the name of the variable that has the grouping that each sample
#' belongs to for pooling into strata, with the values matching the values given in the first column of windowData.
#' @param screenOutput This is the name of the file to write all output the would otherwise be printed to
#' the console. If not specified, output is printed to the console.
#' @param spibetr If this is \code{TRUE}, then the composition estimates of the wild fish will be corrected
#' to account for the expected number and composition of the unclipped, untagged hatchery fish present in
#' the wild sample. It this is \code{FALSE}, the total number of wild fish will be reduced in accordance with
#' the expected number of uncliipped, untagged hatchery fish, but the composition of the wild fish will not
#' be corrected. When \code{TRUE}, the method is the "Ac" method described in the paper. When \code{FALSE},
#' the method is the "TotEx" method described in the paper.
#' @return A string is returned simply stating that the analysis is complete. All of
#' the output of the function is written to the working directory. These files are:
#' ....
#' ....
#' ....
#' @export
SCOBI_deux <- function(adultData = NULL, windowData = NULL, Run = "output", RTYPE = "wild", Hierarch_variables = NULL,
SizeCut = NULL, alph = 0.1, B = 5000, writeBoot = FALSE, pbtRates = NULL, adClipVariable = "AdClip",
physTagsVariable = "PhysTag", pbtGroupVariable = "GenParentHatchery", lengthVariable = "LGDFLmm", dataGroupVariable = "WeekNumber",
screenOutput = NULL, spibetr = TRUE){
# read in data
if (is.character(adultData)){
Fishdata <- read.csv(file = adultData, header = TRUE, na.strings = c("NA",""), stringsAsFactors = FALSE)
} else {
Fishdata <- adultData
rm(adultData) # remove in case passed as a dataframe and is large
}
if (is.character(windowData)){
Windata <- read.csv(file = windowData, header = TRUE, stringsAsFactors = FALSE)
} else {
Windata <- windowData
rm(windowData)
}
if (is.character(pbtRates)){
pbtRate <- read.csv(file = pbtRates, header = TRUE, stringsAsFactors = FALSE)
} else {
pbtRate <- pbtRates
}
if (is.character(screenOutput)){
sink_out <- TRUE
} else {
sink_out <- FALSE
}
if(sink_out){
sink(paste0(Run, "_", screenOutput))
on.exit(sink()) # make sure writing to file is stopped regardless of exit type
}
collaps <- Windata[,3]
# combine (collapse) counts for each group according to user specifies strata
cat("\n", dataGroupVariable, "is collapsed according to: \n")
temp <- rbind(Windata[,1],collaps)
rownames(temp) <- c("Week","Strata")
print(temp)
u_collaps <- sort(unique(collaps)) # These are the strata used for the analysis
WinCounts <- c()
for(i in u_collaps){
WinCounts <- c(WinCounts, sum(as.numeric(Windata[Windata[,3] == i,2])))
}
WinData <- data.frame(u_collaps,WinCounts) # This is the window data reduced to strata defined by collaps
if("Strata" %in% colnames(Fishdata)){
stop("\nStrata is a reserved column name. Please rename this column in your input adultData dataset. Exiting.\n")
}
# Recode dataGroupVariable in fish data to collapsed strata
nullstrat <- c()#strata without any trapped fish
first <- TRUE
for(strat in Windata[,1]) { # Note that the original Windata is used here
juststrat <- Fishdata[Fishdata[,dataGroupVariable] == strat,] #trap data with only fish in the current strata
if( nrow(juststrat) == 0 ) {#if no fish in the strata
nullstrat <- c(nullstrat,strat)
} else {
juststrat$Strata <- Windata[Windata[,1] == strat,3] #add in the strata that matches that week
if(first) {
FishData <- juststrat
first <- FALSE
} else {
FishData <- rbind(FishData,juststrat)
}
}
}
##### input data is Fishdata, working data with Strata field is FishData, whith a capital D
# Check to see if any of the original weeks are missing trapped fish and if any recoded week are missing fish
if(length(nullstrat) == 0) {
cat("\nThere are trapped fish for every week\n")
} else if (sum(table(FishData$Strata) == 0) == 0) {
cat("\nWeeks ", nullstrat, " have no trapped fish but it appears that \n")
cat(" these weeks are collapsed with weeks having trapped fish.\n")
} else{
cat("\nWeeks ",nullstrat," have no trapped fish and it appears that \n")
cat("some of these weeks are NOT collapsed with weeks having trapped fish.\n")
cat("Here are the totals of trapped fish for each collapsed strata: \n" )
print(table(FishData$Strata))
stop("\nExiting.")
}
rm(Windata) # save memory, new window data has capital D
rm(Fishdata) # save memory, new fish data has capital D
# check adClipVariable column to make sure all values are recognized
checkInput <- unique(FishData[!is.na(FishData[,adClipVariable]), adClipVariable])
if (sum(!(checkInput %in% c("AD", "AI"))) > 0){
errorMessage <- paste0("Unrecognized entry \"", checkInput[!(checkInput %in% c("AD", "AI"))], "\" in the adClipVariable column.")
stop(errorMessage)
}
# check PhysTag column to make sure all values are recognized
checkInput <- unique(FishData[!is.na(FishData[,physTagsVariable]), physTagsVariable])
if (sum(!(checkInput %in% c("tag", "notag"))) > 0){
errorMessage <- paste0("Unrecognized entry \"", checkInput[!(checkInput %in% c("tag", "notag"))], "\" in the physTagsVariable column.")
stop(errorMessage)
}
#check that all PBT assigned groups are in the tag rate file
if("Unassigned" %in% pbtRate[,1]){
if(pbtRate[(pbtRate[,1] == "Unassigned"),2] != 1){
stop("\nError: Unassigned was found in your tag rate file with a tag rate not equal to 1. Exiting\n")
}
} else {
pbtRate <- rbind(pbtRate, c("Unassigned", 1)) #Add Unassigned with tag rate of 1 to make calculations below simple to code
pbtRate[,2] <- as.numeric(pbtRate[,2]) #fix R's automatic switch of column 2 to character
}
pbt_groups <- FishData[,pbtGroupVariable]
pbt_groups <- unique(pbt_groups[!is.na(pbt_groups)])
if (sum(!(pbt_groups %in% pbtRate[,1])) != 0){
cat("\nError: the below PBT groups were not found in the PBT tag rate input.\n")
cat(pbt_groups[!(pbt_groups %in% pbtRate[,1])], sep = "\n")
stop("Exiting.")
}
###estimate proportions hatchery, hatchery-no-clip, and wild
### with PBT expansion
if("pbt_expansion" %in% colnames(FishData)){
stop("pbt_expansion is a reserved column name. Please rename this column in your input adultData dataset. Exiting.")
}
#discard observations with:
##no entry for ad-clip
num_obs <- nrow(FishData)
FishData <- FishData[!is.na(FishData[,adClipVariable]),]
cat("\nRemoved", num_obs - nrow(FishData), "observations for missing ad-clip information.\n")
#add pbt expansion column - there will be NA in non pbt attempted fish, but that is ok
FishData$pbt_expansion <- 1/pbtRate[match(FishData[,pbtGroupVariable], pbtRate[,1]),2]
# Define large and small if one of the factors is fork length
if(lengthVariable %in% Hierarch_variables) {
FishData[!is.na(FishData[,lengthVariable]) & FishData[,lengthVariable] < SizeCut, lengthVariable] <- "Sm"
FishData[!is.na(FishData[,lengthVariable]) & FishData[,lengthVariable] != "Sm", lengthVariable] <- "Lg"
}
# point estimate of proportions hatchery, hatchery-no-clip, and wild
#calculate numbers of fish in each category, for each strata
#expand hatchery fish and remove fish from wild as appropriate
# calculate expansion for fish that are only tagged with PBT (no clip, CWT)
# subtract this from the wild count
cat("\nBegan analysis at:\n")
print(Sys.time())
cat("\n")
###Function to decompose into H, HNC and W
## organzed estimation procedures into functions to make non-parametric bootstrapping simpler
decompose_h_hnc_w <- function(func_FishData, func_WinData){
func_h_hnc_w_estimates <- matrix(nrow = nrow(func_WinData), ncol = 4) ## this will have proportion of each type trapped each strata
colnames(func_h_hnc_w_estimates) <- c("strata", "clipped_hatchery", "unclipped_hatchery", "wild")
func_h_hnc_w_estimates[,1] <- func_WinData[,1]
func_unexp_func_h_hnc_w_estimates <- func_h_hnc_w_estimates
#for each strata
for(i in 1:nrow(func_h_hnc_w_estimates)){
prop_all_data <- func_FishData[func_FishData$Strata == func_h_hnc_w_estimates[i,1],]
#calculate number clipped
clip_count <- sum(prop_all_data[,adClipVariable] == "AD")
#save "unexpanded" clipped count
func_unexp_func_h_hnc_w_estimates[i,2] <- clip_count
#unclipped only data
ai_data <- prop_all_data[prop_all_data[,adClipVariable] == "AI",]
#calculate number AI with phystical tag
phystag_count <- sum(!is.na(ai_data[,physTagsVariable]) & ai_data[,physTagsVariable] == "tag")
#calculate number AI without phys tag or unknown phystag status - this is the number that needs to be called HNC and W based on PBT and tag rates
noORna_phystag_count <- sum(is.na(ai_data[,physTagsVariable]) | ai_data[,physTagsVariable] == "notag")
# select pbt-ony fish that were successfully genotyped
ai_data <- ai_data[(is.na(ai_data[,physTagsVariable]) | ai_data[,physTagsVariable] == "notag") & !is.na(ai_data[,pbtGroupVariable]),]
# count pbt_only_HNC and wild with PBT expansions
pbtOnly_HNC <- sum(ai_data[ai_data[,pbtGroupVariable] != "Unassigned", "pbt_expansion"]) #this is expanded pbtonly hnc count
expected_untagged <- pbtOnly_HNC - sum(ai_data[,pbtGroupVariable] != "Unassigned") #this is the number expanded (expected number of untagged fish)
wild_count <- sum(ai_data[,pbtGroupVariable] == "Unassigned") # this is the wild count BEFORE subtracting pbt expanded fish
# save unexpanded wild count
func_unexp_func_h_hnc_w_estimates[i,4] <- wild_count
# save unexpanded HNC count
func_unexp_func_h_hnc_w_estimates[i,3] <- sum(ai_data[,pbtGroupVariable] != "Unassigned") + phystag_count
if(expected_untagged > wild_count){
cat("\nWarning: The expansion of HNC fish that appear wild in strata", func_h_hnc_w_estimates[i,1], "is estimated to be ", round(expected_untagged - wild_count, 2), "more than",
"the number of wild appearing fish. Capping the expansion to make the count of wild fish in this strata 0.\n")
expected_untagged <- wild_count
}
wild_count <- wild_count - expected_untagged # this is the wild count after subtracting pbt expanded fish
#turn into proportions
total_pbtHNCwild <- pbtOnly_HNC + wild_count
if (total_pbtHNCwild == 0){
### if all fish are clipped - used mainly when running clipped fish separately from AI fish
pbtOnly_HNC <- 0
wild_count <- 0
} else {
pbtOnly_HNC <- pbtOnly_HNC / total_pbtHNCwild
wild_count <- wild_count / total_pbtHNCwild
}
#multiply by noORna_phystag_count to get totals
pbtOnly_HNC <- pbtOnly_HNC * noORna_phystag_count
wild_count <- wild_count * noORna_phystag_count
# add hnc groups together, one from physical tags, one from expanded pbt
hnc_count <- phystag_count + pbtOnly_HNC
total <- sum(wild_count, hnc_count, clip_count)
func_h_hnc_w_estimates[i,2] <- clip_count/total
func_h_hnc_w_estimates[i,3] <- hnc_count/total
func_h_hnc_w_estimates[i,4] <- wild_count/total
rm(ai_data)
}
#use proportions combined with census (window) data to estimate counts of each category
func_h_hnc_w_count_est <- func_h_hnc_w_estimates[,2:4]*func_WinData[,2]
if(!is.matrix(func_h_hnc_w_count_est)){
func_h_hnc_w_count_est <- t(as.matrix(func_h_hnc_w_count_est))
}
func_category_totals <- apply(func_h_hnc_w_count_est,2,sum)
func_h_hnc_w_count_est <- cbind(func_h_hnc_w_estimates[,1], func_h_hnc_w_count_est)
colnames(func_h_hnc_w_count_est)[1] <- "strata"
return(list(func_unexp_func_h_hnc_w_estimates, func_h_hnc_w_estimates, func_h_hnc_w_count_est, func_category_totals))
}
Estimate_h_hnc_w <- decompose_h_hnc_w(FishData, WinData)
#split output into individual variables for easy referencing later on
unexp_h_hnc_w_estimates <- Estimate_h_hnc_w[[1]]
h_hnc_w_estimates <- Estimate_h_hnc_w[[2]]
h_hnc_w_count_est <- Estimate_h_hnc_w[[3]]
category_totals <- Estimate_h_hnc_w[[4]]
rm(Estimate_h_hnc_w) #save memory
cat("Estimated totals for\n\tClipped:\t", category_totals[1], "\n\tHatchery No-clip:\t", category_totals[2], "\n\tWild:\t", category_totals[3], "\n")
#write output to files
sink(paste0(Run,"_Rearing.txt"))
cat("Counts of fish of each type in the trap data\n")
write.table(unexp_h_hnc_w_estimates, row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
cat("\n\nPBT expanded proportions of each type\n")
write.table(h_hnc_w_estimates, row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
cat("\n\nPBT expanded counts of each type in the run\n")
write.table(h_hnc_w_count_est, row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
cat("Totals", category_totals[1], category_totals[2], category_totals[3], sep = "\t")
sink()
#run analyses of each factor, for rearing type of interest
#select rearing type of interest
hierarch_data <- FishData
#select rearing type of interest(clipped, noclip_h, wild)
if (RTYPE == "clipped"){
max_expand <- NULL
spibetr_data <- NULL
hierarch_data <- hierarch_data[hierarch_data[,adClipVariable] == "AD",]
} else if (RTYPE == "noclip_H"){
spibetr_data <- NULL
hierarch_data <- hierarch_data[hierarch_data[,adClipVariable] == "AI" & ((!is.na(hierarch_data[,physTagsVariable]) & hierarch_data[,physTagsVariable] == "tag") | (!is.na(hierarch_data[,pbtGroupVariable]) & hierarch_data[,pbtGroupVariable] != "Unassigned")),]
#calculate total number of putatively "wild" fish in each strata so that the algorithm knows how many it can expand the pbt-only fish for
max_expand <- matrix(nrow = 0, ncol = 2)
for(s in unique(FishData$Strata)){
max_expand <- rbind(max_expand, c(s,
sum(FishData$Strata == s & FishData[,adClipVariable] == "AI" &
(is.na(FishData[,physTagsVariable]) | FishData[,physTagsVariable] == "notag") &
!is.na(FishData[,pbtGroupVariable]) & FishData[,pbtGroupVariable] == "Unassigned")
))
}
} else if (RTYPE == "wild"){
max_expand <- NULL
#spibetr data is data for fish that are only known to be hatchery origin through PBT
if (spibetr){
spibetr_data <- hierarch_data[hierarch_data[,adClipVariable] =="AI" & (is.na(hierarch_data[,physTagsVariable]) | hierarch_data[,physTagsVariable] == "notag") & !is.na(hierarch_data[,pbtGroupVariable]) & hierarch_data[,pbtGroupVariable] != "Unassigned",]
} else {
spibetr_data <- NULL
}
hierarch_data <- hierarch_data[hierarch_data[,adClipVariable] == "AI" & (is.na(hierarch_data[,physTagsVariable]) | hierarch_data[,physTagsVariable] == "notag") & !is.na(hierarch_data[,pbtGroupVariable]) & hierarch_data[,pbtGroupVariable] == "Unassigned",]
} else {
stop("Unrecognized RTYPE. Must be one of: \"clipped\", \"noclip_h\", \"wild\"\n")
}
#check that there is data
if(nrow(hierarch_data) < 1){
stop("No fish present in the dataset of the RTYPE that you chose for the hierarchical analysis. Exiting.")
}
# For troubleshooting
# func_hierarch_data=hierarch_data
# func_Hierarch_variables=Hierarch_variables
# func_pbtGroupVariable=pbtGroupVariable
# func_RTYPE=RTYPE
# func_h_hnc_w_count_est=h_hnc_w_count_est
#
#
decompose_hierarchical <- function(func_hierarch_data, func_Hierarch_variables, func_pbtGroupVariable, func_RTYPE, func_h_hnc_w_count_est, func_spibetr_data, func_spibetr, func_max_expand){
hierarch_output_list <- list() #storage of output, flexible for different numbers of factors
column_cats <- c() #this is the list of factors analyzed so far plus the current
#it is therefore the list of columns that the output for the current factor will have
#for each factor
for (factor in func_Hierarch_variables){
#remove NAs
removed <- sum(is.na(func_hierarch_data[,factor]))
func_hierarch_data <- func_hierarch_data[!is.na(func_hierarch_data[,factor]),]
cat("\nRemoved ", removed, " observations for missing data for ", factor, ".\n", sep="")
# remove NAs from spibetr data as appropriate
if(func_spibetr && func_RTYPE == "wild"){
func_spibetr_data <- func_spibetr_data[!is.na(func_spibetr_data[,factor]),]
}
column_cats <- c(column_cats, factor) #list columns for this hierarchical output variable
if(nrow(func_hierarch_data) < 1){
stop("There are no fish in any strata with complete data for factors", column_cats, " Exiting.")
}
if(length(column_cats) < 2){
unique_categories <- list(unique(func_hierarch_data[,column_cats]))
names(unique_categories) <- c(column_cats)
} else {
unique_categories <- apply(func_hierarch_data[,column_cats],2,unique) # get lists of all unique categories for each variable in this analysis, combinations will be the entries for each strata in the output
# if in rare case all column_cats have the same number of categories, a matrix is returned
# it must be turned into a list for later functionality
if(!is.list(unique_categories)){
if(length(dim(unique_categories)) < 2){ # in some cases only one category, so a vector is returned
unique_categories <- t(as.matrix(unique_categories))
}
unique_categoriesMatrix <- unique_categories
unique_categories <- list()
for(i in 1:ncol(unique_categoriesMatrix)){
unique_categories[[colnames(unique_categoriesMatrix)[i]]] <- unique_categoriesMatrix[,i]
}
}
}
hierarch_estimates <- as.data.frame(matrix(nrow = 0, ncol = (length(column_cats) + 2))) ## this will have columns for: strata, categories, proportion of each type trapped
hierarch_estimates_count <- as.data.frame(matrix(nrow = 0, ncol = (length(column_cats) + 2))) ## this will have columns for: strata, categories, count of each type trapped
replace_NA <- FALSE #this will record whether or not the function needs to check for and replace NA entries due to lack of data for one or more strata
#for each strata
for(s in func_h_hnc_w_count_est[,1]){
strata_data <- func_hierarch_data[func_hierarch_data$Strata == s,] #select strata
if(func_spibetr && func_RTYPE == "wild"){
strata_spibetr <- func_spibetr_data[func_spibetr_data$Strata == s,]
if(length(dim(strata_spibetr)) < 2){
strata_spibetr <- t(as.matrix(strata_spibetr))
}
}
#make sure there is data
if(nrow(strata_data) == 0){
cat("\nWarning. No fish with complete data for factors", column_cats, "in strata", s, "\n Using the mean proportions of the entire run for this strata.\n")
replace_NA <- TRUE
#write all NA as temporary measure to be fixed at the end
#build matrix with all combinations of variables
temp_output <- build_all_combos(unique_categories)
temp_output <- cbind(s, temp_output, NA)
colnames(temp_output) <- c("Strata", column_cats, "Proportion_of_strata")
hierarch_estimates <- rbind(hierarch_estimates, temp_output, stringsAsFactors = FALSE)
colnames(temp_output) <- c("Strata", column_cats, "Count_trapped")
hierarch_estimates_count <- rbind(hierarch_estimates_count, temp_output, stringsAsFactors = FALSE)
next #go to next strata
}
#calculate proportions of each category
#build matrix with all combinations of variables
temp_output <- build_all_combos(unique_categories)
temp_output <- cbind(s, temp_output, -9) #put -9 in as a placeholder
colnames(temp_output) <- c("Strata", column_cats, "Proportion_of_strata")
col_num <- ncol(temp_output)
if(func_RTYPE == "wild"){
temp_output2 <- temp_output #counts of samples trapped
matching_data <- strata_data[ , c("Strata", names(unique_categories))] #get data with just columns that you need, in the order you need them
if(func_spibetr && nrow(strata_spibetr) > 0){
matching_spibetr <- strata_spibetr[,c("Strata", names(unique_categories))]
if(length(dim(matching_spibetr)) < 2){
matching_spibetr <- t(as.matrix(matching_spibetr))
}
strata_spibetr_bool <- TRUE
}
else {
strata_spibetr_bool <- FALSE
}
for(i in 1:nrow(temp_output)){
#calculate number of observations that are an exact match for the category
temp_output[i,col_num] <- sum(find_matching_rows(matching_data, temp_output[i,1:ncol(matching_data)]))
temp_output2[i,col_num] <- temp_output[i,col_num]
if(strata_spibetr_bool){
bool_temp <- find_matching_rows(matching_spibetr, temp_output[i,1:ncol(matching_spibetr)])
subtract <- sum(as.numeric(strata_spibetr[bool_temp, "pbt_expansion"])) - sum(bool_temp) #get number of expanded fish (not including fish actually observed)
temp_output[i,col_num] <- as.numeric(temp_output[i,col_num]) - subtract
#prevent negative estimates
if (temp_output[i,col_num] < 0){
temp_output[i,col_num] <- 0
}
}
}
colnames(temp_output2) <- c("Strata", column_cats, "Count_trapped")
hierarch_estimates_count <- rbind(hierarch_estimates_count, temp_output2, stringsAsFactors = FALSE)
colnames(temp_output) <- c("Strata", column_cats, "Proportion_of_strata")
# if sum is not 0, divide to turn into proportions, otherwise leave all as zeroes
if(sum(as.numeric(temp_output[,col_num])) > 0){
temp_output[,col_num] <- as.numeric(temp_output[,col_num]) / sum(as.numeric(temp_output[,col_num]))
}
hierarch_estimates <- rbind(hierarch_estimates, temp_output, stringsAsFactors = FALSE)
} else if (func_RTYPE == "clipped" && !(func_pbtGroupVariable %in% column_cats)){
#if clipped and no pbt variable, just use counts, no need to expand b/c the fish that would be accounted for by expansion are still clipped and therefore in the dataset
#same routine as for wild, but no spibetr
matching_data <- strata_data[ , c("Strata", names(unique_categories))] #get data with just columns that you need, in the order you need them
for(i in 1:nrow(temp_output)){
#calculate number of observations that are an exact match for the category
temp_output[i,col_num] <- sum(find_matching_rows(matching_data, temp_output[i,1:ncol(matching_data)]))
}
colnames(temp_output) <- c("Strata", column_cats, "Count_trapped")
hierarch_estimates_count <- rbind(hierarch_estimates_count, temp_output, stringsAsFactors = FALSE)
colnames(temp_output) <- c("Strata", column_cats, "Proportion_of_strata")
temp_output[,col_num] <- as.numeric(temp_output[,col_num]) / nrow(strata_data)
hierarch_estimates <- rbind(hierarch_estimates, temp_output, stringsAsFactors = FALSE)
} else if (func_RTYPE == "noclip_H" && !(func_pbtGroupVariable %in% column_cats)){
# need to use expansion, but only for pbt-only fish
# expansion of phys tag fish not necessary b/c they will be included in the dataset
temp_output_pbt_only <- temp_output #this will be expansion for pbt-only fish (only the unobserved expanded fish, not counting the observed fish)
colnames(temp_output_pbt_only) <- c("Strata", column_cats, "Expanded_Count_trapped")
matching_data <- strata_data[ , c("Strata", names(unique_categories))] #get data with just columns that you need, in the order you need them
for(i in 1:nrow(temp_output)){
#calculate number of observations that are an exact match for the category
bool_temp <- find_matching_rows(matching_data, temp_output[i,1:ncol(matching_data)])
temp_output_pbt_only[i,col_num] <- sum(strata_data[bool_temp & (is.na(strata_data[,physTagsVariable]) | strata_data[,physTagsVariable] == "notag") ,"pbt_expansion"]) - sum(bool_temp & (is.na(strata_data[,physTagsVariable]) | strata_data[,physTagsVariable] == "notag"))
## temp_output_pbt_only is the extra expanded fish only (the sample count has been subtracted)
temp_output[i,col_num] <- sum(bool_temp) #this is sample count
}
#save estimates
colnames(temp_output) <- c("Strata", column_cats, "Count_trapped")
hierarch_estimates_count <- rbind(hierarch_estimates_count, temp_output, stringsAsFactors = FALSE)
# make sure there are enough wild fish to handle the expansion
expanded_counts <- as.numeric(temp_output_pbt_only[,col_num]) # this is already expanded - sample count from above, so "extra" fish predicted
if (sum(expanded_counts) > 0){ #only need to do this if there was an expansion. This prevents division by zero.
#this is the number of putatively wild fish in the strata (ie, no or unknown phystag and no PBT assignment)
num_expandable <- as.numeric(func_max_expand[func_max_expand[,1] == s,2])
if(num_expandable > sum(expanded_counts)){
#if there are more "wild" fish than there are predicted by expansion, only allocate as many as are predicted
#this is the typical case as there are some strays
num_expandable <- sum(expanded_counts)
}
# so we take the sample count and then divy up the expandable fish in proportion to the tag rate expansions
# temp_output[,col_num] is the sample count at this point
temp_output[,col_num] <- as.numeric(temp_output[,col_num]) + num_expandable*(expanded_counts / sum(expanded_counts))
}
colnames(temp_output) <- c("Strata", column_cats, "Proportion_of_strata")
temp_output[,col_num] <- as.numeric(temp_output[,col_num]) / sum(as.numeric(temp_output[,col_num])) #turn into proportions
hierarch_estimates <- rbind(hierarch_estimates, temp_output, stringsAsFactors = FALSE)
} else { #if group of interest is clipped or noclip_H with pbt variable, use expanded and subtract from unassigned
temp_output_2 <- temp_output
colnames(temp_output_2) <- c("Strata", column_cats, "Count_trapped")
if(func_RTYPE == "noclip_H"){
temp_output_3 <- temp_output #this will be only phystagged fish, so that there expansions can be easily calculated and subtracted from unassigned
temp_output_4 <- temp_output #this will be only phystagged fish, so that there expansions can be easily calculated and subtracted from unassigned
colnames(temp_output_3) <- c("Strata", column_cats, "Proportion_of_strata")
colnames(temp_output_4) <- c("Strata", column_cats, "Count_trapped")
}
matching_data <- strata_data[ , c("Strata", names(unique_categories))] #get data with just columns that you need, in the order you need them
#only physically tagged fish, for expansion to subtract from "Unassigned"
if (func_RTYPE == "noclip_H"){
matching_data_phystag <- strata_data[!is.na(strata_data[,physTagsVariable]) & strata_data[,physTagsVariable] == "tag" , c("Strata", names(unique_categories), "pbt_expansion")] #get data with just columns that you need, in the order you need them
if(length(dim(matching_data_phystag)) < 2){
matching_data_phystag <- t(as.matrix(matching_data_phystag))
}
}
for(i in 1:nrow(temp_output)){
#calculate number of observations that are an exact match for the category
bool_temp <- find_matching_rows(matching_data, temp_output[i,1:ncol(matching_data)])
temp_output[i,col_num] <- sum(as.numeric(strata_data[bool_temp,"pbt_expansion"])) #this will be proportions
temp_output_2[i,col_num] <- sum(bool_temp) #this will sample count
if (func_RTYPE == "noclip_H"){
#only physically tagged fish
#calculate number of observations that are an exact match for the category
bool_temp <- find_matching_rows(matching_data_phystag[,1:(ncol(matching_data_phystag) - 1)], temp_output[i,1:(ncol(matching_data_phystag) - 1)])
temp_output_3[i,col_num] <- sum(as.numeric(matching_data_phystag[bool_temp,"pbt_expansion"])) #this will be proportions
temp_output_4[i,col_num] <- sum(bool_temp) #this will sample count
}
}
#Using the tag rate expanded "counts" directly to calculate proportions can lead to some grou sbeing reduced below their
#sample count in cases where there aren't enough Unassigned fish to distribute
#to prevent this, we need to use the sample counts and the use the tag rates to only redistribute the Unassigned fish
#for example, is you have four fish and all PBT assign, then regardless of tag rates, you will say each one represents
# a quarter of the return. This is really only an issue with small samples b/c of sampling variation
if(func_RTYPE == "clipped"){ # if clipped, simple process:
expanded_counts <- as.numeric(temp_output[,col_num]) - as.numeric(temp_output_2[,col_num]) # this is expanded - sample count, so
#... it is the number of untagged fish expected
if (sum(expanded_counts) > 0){ #only need to do this if there was an expansion. This prevents division by zero.
#this is the number of fish that are available to be used up by expansion
num_expandable <- sum(as.numeric(temp_output_2[temp_output_2[,func_pbtGroupVariable] == "Unassigned",col_num]))
if(num_expandable > sum(expanded_counts)){
#if there are more Unassigned fish than there are predicted by expansion, only allocate as many as are predicted
#this is the typical case as there are some strays
num_expandable <- sum(expanded_counts)
}
# so we take the sample count and then divy up the expandable fish in proportion to the tag rate expansions
temp_output[,col_num] <- as.numeric(temp_output_2[,col_num]) + num_expandable*(expanded_counts / sum(expanded_counts))
}
} else { # if noclip_H, have to use W count for PBT only expansion, and have to use Unassigned for physTag expansion
# temp_output and _2 are all fish
# temp_output_3 and 4 are phystag only
#so first create pbt only data structures
temp_output_pbt_only <- temp_output
temp_output_pbt_only_2 <- temp_output_2
temp_output_pbt_only[,col_num] <- as.numeric(temp_output_pbt_only[,col_num]) - as.numeric(temp_output_3[,col_num])
temp_output_pbt_only_2[,col_num] <- as.numeric(temp_output_pbt_only_2[,col_num]) - as.numeric(temp_output_4[,col_num])
# first the physTag
expanded_counts <- as.numeric(temp_output_3[,col_num]) - as.numeric(temp_output_4[,col_num]) # this is expanded - sample count
if (sum(expanded_counts) > 0){ #only need to do this if there was an expansion. This prevents division by zero.
num_expandable <- sum(as.numeric(temp_output_4[temp_output_4[,func_pbtGroupVariable] == "Unassigned",col_num]))
if(num_expandable > sum(expanded_counts)){
#if there are more Unassigned fish than there are predicted by expansion, only allocate as many as are predicted
#this is the typical case as there are some strays
num_expandable <- sum(expanded_counts)
}
# so we take the sample count and then divy up the expandable fish in proportion to the tag rate expansions
temp_output_3[,col_num] <- as.numeric(temp_output_4[,col_num]) + num_expandable*(expanded_counts / sum(expanded_counts))
}
# now the pbt only
expanded_counts <- as.numeric(temp_output_pbt_only[,col_num]) - as.numeric(temp_output_pbt_only_2[,col_num]) # this is expanded - sample count, so
if (sum(expanded_counts) > 0){ #only need to do this if there was an expansion. This prevents division by zero.
#this is the number of putatively wild fish in the strata (ie, no or unknown phystag and no PBT assignment)
num_expandable <- as.numeric(func_max_expand[func_max_expand[,1] == s,2])
if(num_expandable > sum(expanded_counts)){
#if there are more "wild" fish than there are predicted by expansion, only allocate as many as are predicted
#this is the typical case as there are some strays
num_expandable <- sum(expanded_counts)
}
# so we take the sample count and then divy up the expandable fish in proportion to the tag rate expansions
temp_output_pbt_only[,col_num] <- as.numeric(temp_output_pbt_only_2[,col_num]) + num_expandable*(expanded_counts / sum(expanded_counts))
}
#and now combine phystag and pbt only
temp_output[,col_num] <- as.numeric(temp_output_3[,col_num]) + as.numeric(temp_output_pbt_only[,col_num])
}
#### need to adjust unassigned group, if it exists
if("Unassigned" %in% temp_output[,func_pbtGroupVariable]){
#need to loop through all combinations of other variables
# count number of fish expanded, then subtract that from the
# corresponding Unassigned category, with floor of 0
categories_to_sum <- temp_output[,1:(ncol(temp_output) - 1)]
categories_to_sum <- categories_to_sum[,colnames(categories_to_sum) != func_pbtGroupVariable]
categories_to_sum <- unique(categories_to_sum)
if(!is.matrix(categories_to_sum)){
categories_to_sum <- as.matrix(categories_to_sum)
colnames(categories_to_sum) <- "Strata"
#this happens when the pbt group is the only variable
#causes problems for algorithm below (b/c categories_to_sum needs to be passed as a matrix)
}
## if analyzing clipped fish, then expanded from all clipped can be subtracted from all clipped unassigned
# if analyzing unclipped hatchery fish, then only expanded from phystaged fish can be subtracted from unassigned
###### expanded form nonphystag fish will appear in the "wild" group and must be subtracted there
# if analyzing wild fish, then pbt should not be a variable, and so will not get here
if (func_RTYPE == "clipped"){
assigned_data <- temp_output[temp_output[,func_pbtGroupVariable] != "Unassigned", ]
assigned_data2 <- temp_output_2[temp_output_2[,func_pbtGroupVariable] != "Unassigned", ]
if(length(dim(assigned_data)) < 2){ ## sometimes only one category present
assigned_data <- t(as.matrix(assigned_data))
assigned_data2 <- t(as.matrix(assigned_data2))
}
} else if (func_RTYPE == "noclip_H"){
assigned_data <- temp_output_3[temp_output_3[,func_pbtGroupVariable] != "Unassigned", ]
assigned_data2 <- temp_output_4[temp_output_4[,func_pbtGroupVariable] != "Unassigned", ]
if(length(dim(assigned_data)) < 2){ ## sometimes only one category present
assigned_data <- t(as.matrix(assigned_data))
assigned_data2 <- t(as.matrix(assigned_data2))
}
} else {
stop("The PBT group is a variable in your Hierarchical variables list, but the group of interest is not \"clipped\" or \"noclip_H\".")
}
assigned_data_categories <- assigned_data[, colnames(categories_to_sum)]
if(length(dim(assigned_data_categories)) < 2){
if(length(colnames(categories_to_sum)) > 1){ ## multiple variables, one category present
assigned_data_categories <- t(as.matrix(assigned_data_categories))
} else { # one variable
assigned_data_categories <- as.matrix(assigned_data_categories)
}
}
temp_output_categories <- temp_output[, colnames(categories_to_sum)]
if(length(dim(temp_output_categories)) < 2){
if(length(colnames(categories_to_sum)) > 1){ ## multiple variables, one category present
temp_output_categories <- t(as.matrix(temp_output_categories))
} else { # one variable
temp_output_categories <- as.matrix(temp_output_categories)
}
}
for (i in 1:nrow(categories_to_sum)){
bool_temp <- find_matching_rows(assigned_data_categories, categories_to_sum[i,])
num_expanded <- sum(as.numeric(assigned_data[bool_temp, col_num])) - sum(as.numeric(assigned_data2[bool_temp, col_num])) #number of untagged fish accounted for by expansion
#now subtract from unassigned group
bool_temp <- find_matching_rows(temp_output_categories, categories_to_sum[i,])
temp_output[bool_temp & temp_output[,func_pbtGroupVariable] == "Unassigned", col_num] <- as.numeric(temp_output[bool_temp & temp_output[,func_pbtGroupVariable] == "Unassigned", col_num]) - num_expanded
#prevent negative estimates - using 1e-7 b/c sometiems roundign error above can cause Unassigned to only be reduced
# to a very small fraction (the calculations are trying to make them eaqual, so round can make it a small fraction)
if (as.numeric(temp_output[bool_temp & temp_output[,func_pbtGroupVariable] == "Unassigned", col_num]) < .0000001){
temp_output[bool_temp & temp_output[,func_pbtGroupVariable] == "Unassigned", col_num] <- 0
}
}
}
hierarch_estimates_count <- rbind(hierarch_estimates_count, temp_output_2, stringsAsFactors = FALSE)
# now we make them proportions of the whole strata
temp_output[,col_num] <- as.numeric(temp_output[,col_num]) / sum(as.numeric(temp_output[,col_num]))
hierarch_estimates <- rbind(hierarch_estimates, temp_output, stringsAsFactors = FALSE)
}
}#end of for each strata loop
#replace NAs written to strata that did not have any data
if(replace_NA){
#calculate the means
all_categories <- as.matrix(build_all_combos(unique_categories))
#calculate means
means <- rep(-9, nrow(all_categories)) #-9 as a placeholder
bool_mat <- as.matrix(hierarch_estimates[,2:(ncol(all_categories) + 1)])
for(i in 1:nrow(all_categories)){
#calculate number of observations that are an exact match for the category
bool_temp <- find_matching_rows(bool_mat, all_categories[i,])
means[i] <- mean(as.numeric(hierarch_estimates[bool_temp,col_num]), na.rm = TRUE)
}
for(i in which(is.na(hierarch_estimates[,col_num]))){
#find the matching entry in all_categories, order of means is the same
bool_temp <- find_matching_rows(all_categories, unlist(hierarch_estimates[i,2:(col_num - 1)]))
#make proportion for this strata the mean over strata for the run
hierarch_estimates[i,col_num] <- means[bool_temp]
}
}
#save output
hierarch_output_list[[factor]] <- list(hierarch_estimates, hierarch_estimates_count)
}# end of hierachical factor loop
# apply window counts to generate estimates of total counts
# apply proportions in a hierarchical fashion
# so that numbers are estimated using as much data as possible
####get column number to use when pulling out estimates from the H HNC W decomposition
if (func_RTYPE == "clipped") {
col_n <- 2
} else if (func_RTYPE == "noclip_H") {
col_n <- 3
} else {
col_n <- 4
}
for(i in 1:length(hierarch_output_list)){
props <- hierarch_output_list[[i]][[1]]
props_col <- ncol(props)
colnames(props)[props_col] <- "Total_number_in_run"
if(i == 1){
#first factor, so pull numbers from the H HNC W estimates
for (j in 1:nrow(func_h_hnc_w_count_est)){
props[props[,1] == func_h_hnc_w_count_est[j,1],props_col] <- as.numeric(props[props[,1] == func_h_hnc_w_count_est[j,1],props_col]) * as.numeric(func_h_hnc_w_count_est[j,col_n])
}
hierarch_output_list[[i]][[3]] <- props
} else {
replace_NA_2 <- c() #keep track of row number for which there are NAs to replace with mean
props_col_2 <- (props_col-2) # minus 2 because we are expressing everything in terms of the last estimates groups
for (j in 1:nrow(last_estim)){
#find matching categories
## have to use unlist here b/c last_estim is a dataframe, and it needs to be passed as a one dimensional vector, not a one row dataframe
bool_temp <- find_matching_rows(props[,1:props_col_2], unlist(last_estim[j,1:props_col_2]))
if (sum(as.numeric(props[bool_temp,props_col])) != 0){
# take proportions of strata and turn into proportions of group as analyzed by the previous factor
props[bool_temp,props_col] <- (as.numeric(props[bool_temp,props_col]) / sum(as.numeric(props[bool_temp,props_col])))
# multiply by estimate from analysis for the previous factor
props[bool_temp,props_col] <- as.numeric(props[bool_temp,props_col]) * as.numeric(last_estim[j,(props_col-1)])
} else if (as.numeric(last_estim[j,(props_col-1)]) != 0) { #in cases where there is data to estimate a non-zero value for the previous variable, but there is not data to estimate the current category conditional on the previous variables
replace_NA_2 <- c(replace_NA_2, j)
props[bool_temp,props_col] <- NA
cat("\nWarning. There are no fish in")
print(last_estim[j,1:props_col_2], row.names = FALSE)
cat("with complete data.")
cat("\nUsing the mean composition for all other fish in this strata to estimate this group.\n")
}
}
#replace NAs with means for all other categories if necessary
if (length(replace_NA_2) > 0){
#calculate means for all categories
cats_means <- unique(props[,c(1,(props_col-1))]) #this will be the order of means
cats_means <- cbind(cats_means, -9)
colnames(cats_means)[3] <- "Total_number_in_run"
for(j in 1:nrow(cats_means)){
#have to unlist cat_means to prevent being passed as a one row dataframe
bool_temp <- find_matching_rows(props[,c(1, (props_col-1))], unlist(cats_means[j,1:2]))
cats_means[j,3] <- mean(as.numeric(props[bool_temp,props_col]), na.rm = TRUE)
}
for(j in unique(cats_means[,1])){
bool_temp <- cats_means[,1] == j
cats_means[bool_temp,3] <- as.numeric(cats_means[bool_temp,3]) / sum(as.numeric(cats_means[bool_temp,3]))
}
#apply means to categories with no data to estimate
for(j in 1:length(replace_NA_2)){
#build matrix to add on to props with the missing categories
add_on <- cbind(last_estim[replace_NA_2[j],1:(props_col-2)], cats_means[cats_means[,1] == last_estim[replace_NA_2[j],1],2:3], row.names = NULL)
add_on[,props_col] <- as.numeric(add_on[,props_col]) * as.numeric(last_estim[replace_NA_2[j],(props_col-1)])
props <- rbind(props, add_on)
}
# remove lines with NA, they have been replaced by adding on rows
props <- props[!is.na(props[,props_col]),]
}
#save as a third matrix in list for output
hierarch_output_list[[i]][[3]] <- props
}
#use estimated number in the last estimate to estimate numbers in the next estimate
last_estim <- props
}
return(hierarch_output_list)
} # end of decompose hierarchical function
if(length(Hierarch_variables) > 0){
Hierarchical_decomposition <- decompose_hierarchical(hierarch_data, Hierarch_variables, pbtGroupVariable, RTYPE, h_hnc_w_count_est, spibetr_data, spibetr, max_expand)
# Write hierarchical estimate outputs
for(i in 1:length(Hierarchical_decomposition)){
## the propor_hier output file can be helpful for troublshooting but is not very informative for the end-user
#write.table(Hierarchical_decomposition[[i]][[1]], paste0(Run, "_Propor_Hier_", names(Hierarchical_decomposition)[i], ".txt"), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
write.table(Hierarchical_decomposition[[i]][[2]], paste0(Run, "_Trap_Counts_Hier_", names(Hierarchical_decomposition)[i], ".txt"), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
write.table(Hierarchical_decomposition[[i]][[3]], paste0(Run, "_Estim_Totals_Hier_", names(Hierarchical_decomposition)[i], ".txt"), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
#calculate total estimates over the entire run
for(i in 1:length(Hierarchical_decomposition)){
estimates_by_strata <- Hierarchical_decomposition[[i]][[3]]
categories_to_sum <- estimates_by_strata[,2:(ncol(estimates_by_strata) - 1)]
categories_to_sum <- unique(categories_to_sum)
if(!is.matrix(categories_to_sum) && !is.data.frame(categories_to_sum)){
categories_to_sum <- as.matrix(categories_to_sum)
colnames(categories_to_sum) <- colnames(Hierarchical_decomposition[[i]][[3]])[2]
#this happens when there is only variable
#causes problems below if categories to sum is not passed as a matrix
}
totals_for_run <- rep(-9, nrow(categories_to_sum))
est_by_s_mat_comp <- as.matrix(estimates_by_strata[,colnames(categories_to_sum)])
for (j in 1:nrow(categories_to_sum)){
bool_temp <- find_matching_rows(est_by_s_mat_comp, unlist(categories_to_sum[j,]))
totals_for_run[j] <- sum(as.numeric(estimates_by_strata[bool_temp,"Total_number_in_run"]))
}
totals_for_run <- cbind(categories_to_sum, totals_for_run)
colnames(totals_for_run)[ncol(totals_for_run)] <- "Estimated_total_for_run"
write.table(totals_for_run, paste0(Run, "_Estim_Grand_Totals_Hier_", names(Hierarchical_decomposition)[i], ".txt"), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
Hierarchical_decomposition[[i]][[4]] <- totals_for_run #save for use in bootstrapping
}
}
#### bootstrap to obtain CIs for quantities/groups of interest
#### nonparametric b/c adjusting for tag rates can't be simulated by parametric with simple multinomial distribution
# build data storage
boot_h_hnc_w <- matrix(nrow = B, ncol = 3)
colnames(boot_h_hnc_w) <- c("clipped_hatchery", "unclipped_hatchery", "wild")
if(length(Hierarch_variables) > 0){
boot_hier <- list()
for (i in 1:length(Hierarch_variables)){
#storing replicates as columns b/c can keep row order identical to the rownames of the Grand Totals output
#and then just pull rownames from the Grand Totals output to label the CI output
boot_hier[[i]] <- matrix(nrow = nrow(Hierarchical_decomposition[[i]][[4]]), ncol = B)
}
}
#bootstrap
if(B < 1){
cat("\nEnd analysis at:\n")
print(Sys.time())
cat("\n")
return("B is less than 1. Not performing any bootstrapping. SCOBI Deux is complete.")
}
for (b in 1:B){
cat("Beginning bootstrap iteration", b, "\n")
#select data for each strata
data_boot <- FishData #to get R to assign appropriate column types to variables, start with original data and then overwrite
for(s in WinData[,1]){
strata_bool <- data_boot$Strata == s
num_obs <- sum(strata_bool)
strata_data <- data_boot[data_boot$Strata == s,]
data_boot[data_boot$Strata == s,] <- strata_data[sample(1:num_obs, num_obs, replace = TRUE),]
}
#run algorithms
#H, HNC, W estimation
Estimate_h_hnc_w_boot <- decompose_h_hnc_w(data_boot, WinData)
#save category totals - we are only interested in CIs for the estimates of the run as a whole
boot_h_hnc_w[b,] <- Estimate_h_hnc_w_boot[[4]]
if(length(Hierarch_variables) > 0){
#select rearing type of interest(clipped, noclip_h, wild)
if (RTYPE == "clipped"){
boot_max_expand <- NULL
boot_spibetr_data <- NULL
data_boot <- data_boot[data_boot[,adClipVariable] == "AD",]
} else if (RTYPE == "noclip_H"){
boot_spibetr_data <- NULL
#calculate total number of putatively "wild" fish in each strata so that the algorithm knows how many it can expand the pbt-only fish for
boot_max_expand <- matrix(nrow = 0, ncol = 2)
for(s in unique(data_boot$Strata)){
boot_max_expand <- rbind(boot_max_expand, c(s,
sum(data_boot$Strata == s & data_boot[,adClipVariable] == "AI" &
(is.na(data_boot[,physTagsVariable]) | data_boot[,physTagsVariable] == "notag") &
!is.na(data_boot[,pbtGroupVariable]) & data_boot[,pbtGroupVariable] == "Unassigned")
))
}
data_boot <- data_boot[data_boot[,adClipVariable] == "AI" & ((!is.na(data_boot[,physTagsVariable]) & data_boot[,physTagsVariable] == "tag") | (!is.na(data_boot[,pbtGroupVariable]) & data_boot[,pbtGroupVariable] != "Unassigned")),]
} else if (RTYPE == "wild"){
boot_max_expand <- NULL
#spibetr data is data for fish that are only known to be hatchery origin through PBT
if (spibetr){
boot_spibetr_data <- data_boot[data_boot[,adClipVariable] =="AI" & (is.na(data_boot[,physTagsVariable]) | data_boot[,physTagsVariable] == "notag") & !is.na(data_boot[,pbtGroupVariable]) & data_boot[,pbtGroupVariable] != "Unassigned",]
} else {
boot_spibetr_data <- NULL
}
data_boot <- data_boot[data_boot[,adClipVariable] == "AI" & (is.na(data_boot[,physTagsVariable]) | data_boot[,physTagsVariable] == "notag") & !is.na(data_boot[,pbtGroupVariable]) & data_boot[,pbtGroupVariable] == "Unassigned",]
} else {
stop("Unrecognized RTYPE. Must be one of: \"clipped\", \"noclip_h\", \"wild\".")
}
if(nrow(data_boot) < 1){
cat("\nNo fish present in the dataset of the RTYPE that you chose for the hierarchical analysis. Writing all zeros for this iteration.\n")
for(i in 1:length(Hierarchical_decomposition_boot)){
boot_hier[[i]][,b] <- 0
}
next
}
#hierarchical estimation
Hierarchical_decomposition_boot <- decompose_hierarchical(data_boot, Hierarch_variables, pbtGroupVariable, RTYPE, Estimate_h_hnc_w_boot[[3]], boot_spibetr_data, spibetr, boot_max_expand)
# hierarchical estimate outputs
for(i in 1:length(Hierarchical_decomposition_boot)){
estimates_by_strata <- Hierarchical_decomposition_boot[[i]][[3]]
#pull categories to sum from the original data estimate, in case any group was eliminated by sampling and to maintain order
categories_to_sum <- Hierarchical_decomposition[[i]][[4]]
categories_to_sum <- categories_to_sum[,1:(ncol(categories_to_sum) - 1)]
if(!is.matrix(categories_to_sum) && !is.data.frame(categories_to_sum)){
categories_to_sum <- as.matrix(categories_to_sum)
colnames(categories_to_sum) <- colnames(Hierarchical_decomposition_boot[[i]][[3]])[2]
#this happens when there is only variable
#causes problems below if categories to sum is not passed as a matrix
}
totals_for_run <- rep(-9, nrow(categories_to_sum))
est_by_s_mat_comp <- as.matrix(estimates_by_strata[,colnames(categories_to_sum)])
for (j in 1:nrow(categories_to_sum)){
bool_temp <- find_matching_rows(est_by_s_mat_comp, unlist(categories_to_sum[j,]))
totals_for_run[j] <- sum(as.numeric(estimates_by_strata[bool_temp,"Total_number_in_run"]))
}
boot_hier[[i]][,b] <- totals_for_run
}
}
}
#if write bootstrap estimates
if (writeBoot){
write.table(boot_h_hnc_w, paste0(Run, "_Boot_Rearing.txt"), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
if(length(Hierarch_variables) > 0){
for(i in 1:length(boot_hier)){
write.table(boot_hier[[i]], paste0(Run, "_Boot_Hier_", names(Hierarchical_decomposition)[i], ".txt"), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
}
}
#compute CIs
cat("\nComputing CIs\n")
#H HNC wild
CI_h_hnc_w <- matrix(nrow = 3, ncol = 8)
colnames(CI_h_hnc_w) <- c("Group", "Estimate", paste0("Lower_(", alph/2, ")"), paste0("Upper_(", 1-(alph/2), ")"), "Percen_half_width", "Lower_simul", "Upper_simul", "Percen_half_width_simul")
CI_h_hnc_w[1,1] <- "clipped_hatchery"
CI_h_hnc_w[2,1] <- "unclipped_hatchery"
CI_h_hnc_w[3,1] <- "wild"
#calculate simultaneous CIs
sim_ci <- simulConfInt(boot_h_hnc_w, alph)
for(i in 1:3){
CI_h_hnc_w[i,2] <- category_totals[i]
CI_h_hnc_w[i,3:4] <- quantile(boot_h_hnc_w[,i],c(alph/2, 1-(alph/2)))
CI_h_hnc_w[i,5] <- round(100 * ( (as.numeric(CI_h_hnc_w[i,4]) - as.numeric(CI_h_hnc_w[i,3]) )/ (category_totals[i]*2)),2)
CI_h_hnc_w[i,6] <- sim_ci[i,1]
CI_h_hnc_w[i,7] <- sim_ci[i,2]
CI_h_hnc_w[i,8] <- round(100 * ( (sim_ci[i,2] - sim_ci[i,1] )/ (category_totals[i]*2)),2)
}
#output CIs
write.table(CI_h_hnc_w, paste0(Run, "_CI_Rearing.txt"), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
#hierarchical
if(length(Hierarch_variables) > 0){
boot_CI <- list()
for(i in 1:length(boot_hier)){
#one less column, will stitch on categories at end
sim_ci_hier <- simulConfInt(t(boot_hier[[i]]), alph)
CI_hier <- matrix(nrow = nrow(boot_hier[[i]]), ncol = 7)
colnames(CI_hier) <- c("Estimate", paste0("Lower_(", alph/2, ")"), paste0("Upper_(", 1-(alph/2), ")"), "Percen_half_width", "Lower_simul", "Upper_simul", "Percen_half_width_simul")
col_index <- ncol(Hierarchical_decomposition[[i]][[4]])
for (j in 1:nrow(boot_hier[[i]])){
CI_hier[j,1] <- Hierarchical_decomposition[[i]][[4]][j,col_index]
CI_hier[j,2:3] <- quantile(boot_hier[[i]][j,],c(alph/2, 1-(alph/2)))
CI_hier[j,4] <- round(100 * ( (as.numeric(CI_hier[j,3]) - as.numeric(CI_hier[j,2]) )/ (as.numeric(CI_hier[j,1])*2)),2)
CI_hier[j,5] <- sim_ci_hier[j,1]
CI_hier[j,6] <- sim_ci_hier[j,2]
CI_hier[j,7] <- round(100 * ( (sim_ci_hier[j,2] - sim_ci_hier[j,1] )/ (as.numeric(CI_hier[j,1])*2)),2)
}
CI_hier <- cbind(Hierarchical_decomposition[[i]][[4]][,1:(col_index - 1)], CI_hier)
if(col_index - 1 == 1){
colnames(CI_hier)[1] <- colnames(Hierarchical_decomposition[[i]][[4]])[1]
}
boot_CI[[i]] <- CI_hier
}
#output CIs
for(i in 1:length(boot_CI)){
write.table(boot_CI[[i]], paste0(Run, "_CI_Hier_", names(Hierarchical_decomposition)[i], ".txt"), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
}
}
cat("\nEnd analysis at:\n")
print(Sys.time())
cat("\n")
return("Scobi Deux is complete")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.