#' Format RUM data base on resampled tows
#' Function calls mlogit
#' @param data_in Data going in to the function; default is filt_clusts
##' @param trip_dists Distances covered by each trip
#' @param the_port Port of focus; Default is Astoria
#' @param min_year Minimum year used to filter the data
#' @param max_year Maximum year used to filter the data, also RUM data is filtered to be from the max_year
#' @param risk_coefficient Coefficient to adjust the quota prices up or down, feeding into net revenue calculations
#' @param ndays Number of previous days data to use in revenue expectations
#' @param focus_year Year to focus on for the models
#' @param nhauls_sampled Number of hauls to sample from the full data set
#' @param seed Seed for sampling tows
#' @export
#Add argument for focus_year
# focus_year <- 2012
# nhauls_sampled <- 50
sampled_rums_clust <- function(data_in = filt_clusts, the_port = "ASTORIA / WARRENTON",
min_year = 2011, max_year = 2012, risk_coefficient = 1,
ndays = 60, focus_year = 2012, nhauls_sampled = 50, seed = 300, ncores){
# browser()
#Start by sampling 50 tows within the same fleet
#Figure out how close the different clusters are
# browser()
##Filter the data
dat <- data_in %>% filter(dport_desc == the_port, set_year >= min_year,
set_year <= max_year)
#---------------------------------------------------------------
#Calculate haul revenues
#Turn NA prices into zeroes
dat[which(is.na(dat$avg_quota_price)), 'avg_quota_price'] <- 0
#Adjust the net prices based on risk coefficient
dat$rc <- risk_coefficient
# print("only weak stock species adjusted for risk")
dat$net_price <- dat$exval_pound
weak_inds <- which(dat$type == 'weaks')
dat[weak_inds, 'net_price'] <- dat$exval_pound[weak_inds] - dat$rc[weak_inds] *
dat$avg_quota_price[weak_inds]
# dat$net_price <- (dat$exval_pound - dat$rc * dat$avg_quota_price)
dat$net_revenue <- dat$net_price * dat$hpounds
#Sum the haul revenues
dat <- dat %>% group_by(haul_id) %>%
mutate(haul_net_revenue = sum(net_revenue, na.rm = T))
#Create data set, for each tow
dist_hauls <- dat %>% distinct(haul_id, .keep_all = T) %>% select(haul_id, unq_clust, set_month,
drvid, trip_id, set_day, set_year, haul_net_revenue, avg_long_clust, avg_lat_clust,
haul_num, avg_long, avg_lat, avg_depth, unq, unq_clust_bin) %>% as.data.frame
dist_hauls_catch_shares <- dist_hauls %>% filter(set_year >= 2011)
set.seed(seed)
#For each tow in the focus year, sample other tows
#Hauls in focus year
hauls <- dist_hauls %>% filter(set_year == focus_year) %>% arrange(trip_id, haul_num)
hauls$prev_haul_num <- hauls$haul_num - 1
#Data frame of previous haul locations
prev_hauls <- hauls %>% select(trip_id, haul_num, avg_long, avg_lat)
#add in zero haul_num values
zero_hauls <- prev_hauls %>% distinct(trip_id)
zero_hauls$haul_num <- 0
zero_hauls$avg_long <- unique(dat$d_port_long)
zero_hauls$avg_lat <- unique(dat$d_port_lat)
#Add into previous hauls data frame
prev_hauls <- rbind(prev_hauls, zero_hauls) %>% arrange(trip_id, haul_num)
names(prev_hauls)[2:4] <- c('prev_haul_num', "prev_avg_long", 'prev_avg_lat')
#Add this into the hauls data frame
hauls <- hauls %>% left_join(prev_hauls, by = c('trip_id', 'prev_haul_num'))
#-----------------------------------------------------------------------------
#Sample hauls and calculate distances
#For each haul in the focus year, sample nhauls_sampled tows
sampled_hauls <- mclapply(1:nrow(hauls), FUN = function(xx){
the_samples <- dist_hauls_catch_shares %>% filter(haul_id != hauls[xx, 'haul_id']) %>%
sample_n(size = nhauls_sampled, replace = F)
#Now calculate the distances between the points and the actual points
actual_haul <- hauls[xx, ]
#calculate distances in km
prev_point <- actual_haul %>% select(prev_avg_long, prev_avg_lat)
the_samples$prev_avg_long <- prev_point$prev_avg_long
the_samples$prev_avg_lat <- prev_point$prev_avg_lat
the_samples[, c('avg_long', 'avg_lat', 'prev_avg_lat', 'prev_avg_long')] <- deg2rad(the_samples[,
c('avg_long', 'avg_lat', 'prev_avg_lat', 'prev_avg_long')])
the_samples$distance <- gcd_slc(the_samples$prev_avg_long,
the_samples$prev_avg_lat, the_samples$avg_long, the_samples$avg_lat)
#Calculate distance of empirical haul
actual_haul[, c('avg_long', 'avg_lat', 'prev_avg_lat', 'prev_avg_long')] <- deg2rad(actual_haul[,
c('avg_long', 'avg_lat', 'prev_avg_lat', 'prev_avg_long')])
actual_haul$distance <- gcd_slc(actual_haul$prev_avg_long,
actual_haul$prev_avg_lat, actual_haul$avg_long, actual_haul$avg_lat)
#Combine the sampled values and the empirical haul
actual_haul$prev_haul_num <- NULL
actual_haul$fished <- TRUE
actual_haul$fished_haul <- actual_haul$haul_id
the_samples$fished <- FALSE
the_samples$fished_haul <- actual_haul$haul_id
the_samples <- rbind(actual_haul, the_samples)
#Define the set_date
the_samples$set_date <- ymd(paste(actual_haul$set_year, actual_haul$set_month, actual_haul$set_day, sep = "-"))
return(the_samples)
}, mc.cores = ncores)
print("Done sampling hauls")
sampled_hauls <- ldply(sampled_hauls)
#-----------------------------------------------------------------------------
#Calculate revenues from each period
sampled_hauls$prev_days_date <- sampled_hauls$set_date - days(ndays)
sampled_hauls$prev_year_set_date <- sampled_hauls$set_date - days(365)
sampled_hauls$prev_year_days_date <- sampled_hauls$prev_days_date - days(365)
#What were the average revenues in each location
tow_dates <- sampled_hauls %>%
select(unq_clust, set_date, prev_days_date, prev_year_set_date, prev_year_days_date,
avg_long, avg_lat, avg_depth, unq_clust_bin, unq)
#Look at the unique dates and clusters only
# td1 <- tow_dates %>% distinct(unq_clust, set_date)
tow_dates$days_inter <- interval(tow_dates$prev_days_date, tow_dates$set_date)
tow_dates$prev_year_days_inter <- interval(tow_dates$prev_year_days_date, tow_dates$prev_year_set_date)
td1 <- tow_dates %>% distinct(unq_clust, set_date, .keep_all = T)
#-----------------------------------------------------------------------------
dummys <- mclapply(1:nrow(td1), FUN = function(xx){
temp_dat <- td1[xx, ]
#Filter based on distance from the sampled point
#Sum the hauls for each haul in clust_dat, and keep only previous hauls
clust_dat <- dat %>% filter(unq_clust == temp_dat$unq_clust) %>%
distinct(haul_id, .keep_all = T) %>%
filter(set_date <= temp_dat$set_date)
#Filter based on unq_bin rather than cluster, I may be missing data by using clusters
#Filter based on distance from sampled point
# clust_dat <- dat %>% filter(unq_clust >= temp_dat$unq_clust - 5,
# unq_clust <= temp_dat$unq_clust + 5) %>%
# distinct(haul_id, .keep_all = T) %>%
# filter(set_date <= temp_dat$set_date)
# clust_dat <- dat %>% filter(unq == temp_dat$unq) %>%
# distinct(haul_id, .keep_all = T) %>%
# filter(set_date <= temp_dat$set_date)
#Convert degrees to radians
clust_dat$avg_long <- deg2rad(clust_dat$avg_long)
clust_dat$avg_lat <- deg2rad(clust_dat$avg_lat)
#Calculate distances
clust_dat$dist_from_samp_tow <- gcd_slc(temp_dat$avg_long, temp_dat$avg_lat,
clust_dat$avg_long, clust_dat$avg_lat)
##NEED TO add distance to filter argument
#Remove points that are greater than 5 km away
clust_dat <- clust_dat %>% filter(dist_from_samp_tow <= 5)
#Filter based on the depths also,
clust_dat <- clust_dat %>% filter(avg_depth >= temp_dat$avg_depth - 25,
avg_depth <= temp_dat$avg_depth + 25)
###End of filtering by location and depth
#If towed in the previous ndays
towed_prev_days <- sum(clust_dat$set_date %within% temp_dat$days_inter)
towed_prev_days_rev <- 0
if(towed_prev_days != 0){
hauls_in_period <- clust_dat %>% filter(set_date %within% temp_dat$days_inter) %>%
distinct(haul_id, .keep_all = T)
towed_prev_days_rev <- mean(hauls_in_period$haul_net_revenue, na.rm = TRUE)
}
#If towed in the previous year's ndays
towed_prev_year_days <- sum(clust_dat$set_date %within% temp_dat$prev_year_days_inter)
towed_prev_year_days_rev <- 0
if(towed_prev_year_days != 0){
hauls_in_period <- clust_dat %>% filter(set_date %within% temp_dat$prev_year_days_inter) %>%
distinct(haul_id, .keep_all = T)
towed_prev_year_days_rev <- mean(hauls_in_period$haul_net_revenue, na.rm = TRUE)
}
outs <- data_frame(dummy_prev_days = towed_prev_days, prev_days_rev = towed_prev_days_rev,
dummy_prev_year_days = towed_prev_year_days, prev_year_days_rev = towed_prev_year_days_rev)
return(outs)
}, mc.cores = ncores)
print("Done calculating dummys and revenues")
dummys1 <- ldply(dummys)
#Change values to be 0 and 1 for dummy variables
# tow_dates <- cbind(tow_dates, dummys1)
td1 <- cbind(td1, dummys1)
# td1 %>% ggplot(aes(x = prev_days_rev)) + geom_histogram() + facet_wrap(~ unq_clust)
#converte set_date to character string to merge with the sampled_hauls
tow_dates$set_date_chr <- as.character(tow_dates$set_date)
td1$set_date_chr <- as.character(td1$set_date)
#Add in charcater set_date for sampled_hauls
sampled_hauls$set_date_chr <- as.character(sampled_hauls$set_date)
td1 <- td1 %>% select(unq_clust, set_date_chr, dummy_prev_days, prev_days_rev, dummy_prev_year_days,
prev_year_days_rev)
sampled_hauls <- sampled_hauls %>% left_join(td1, by = c("unq_clust", "set_date_chr"))
#Add in dummy variable for first tow
first_hauls <- sampled_hauls %>% filter(fished == TRUE, haul_num == 1) %>% select(fished_haul)
sampled_hauls$dummy_first <- 0
sampled_hauls[which(sampled_hauls$fished_haul %in% first_hauls$fished_haul), 'dummy_first'] <- 1
sampled_hauls$dummy_not_first <- sampled_hauls$dummy_first
sampled_hauls[which(sampled_hauls$dummy_not_first == 1), "dummy_not_first"] <- 2
sampled_hauls[which(sampled_hauls$dummy_not_first == 0), "dummy_not_first"] <- 1
sampled_hauls[which(sampled_hauls$dummy_not_first == 2), "dummy_not_first"] <- 0
#Make sure that missing values have dummy variable value of 1 for prev_days
sampled_hauls[which(sampled_hauls$dummy_prev_days != 0), 'dummy_prev_days'] <- 0
sampled_hauls[which(sampled_hauls$prev_days_rev == 0), "dummy_prev_days"] <- 1
sampled_hauls[which(sampled_hauls$dummy_prev_year_days != 0), 'dummy_prev_year_days'] <- 1
# sampled_hauls %>% filter(fished_haul == 128004)
#-----------------------------------------------------------------------------
#Format as mlogit.data
rdo <- sampled_hauls %>% select(haul_id, unq_clust, haul_num, distance, fished, fished_haul,
dummy_prev_days, prev_days_rev, dummy_prev_year_days, prev_year_days_rev,
dummy_first, dummy_not_first)
rdo <- rdo %>% group_by(fished_haul) %>% mutate(alt_tow = 1:length(haul_id)) %>% as.data.frame
#-----------------------------------------------------------------------------
#Fit mlogit models returning the coefficients, the models, and the data going into the
# models
#Filter out tows with missing values for distance
rdo <- rdo %>% filter(is.na(distance) == FALSE)
#Split first tows and later tows
# tows1 <- rdo %>% filter(fished == TRUE, haul_num == 1) %>% select(fished_haul)
# first_tows <- rdo %>% filter(fished_haul %in% tows1$fished_haul)
# first_tows <- mlogit.data(first_tows, shape = 'long', choice = 'fished', alt.var = 'alt_tow',
# chid.var = 'fished_haul')
#Fit the model for everything at once
the_tows <- mlogit.data(rdo, shape = 'long', choice = 'fished', alt.var = 'alt_tow',
chid.var = 'fished_haul')
mf <- mFormula(fished ~ prev_days_rev * dummy_first +
distance * dummy_first + prev_days_rev * dummy_not_first +
distance * dummy_not_first - distance - prev_days_rev - 1 -
dummy_first - dummy_not_first + dummy_prev_days + dummy_prev_year_days)
res <- mlogit(mf, the_tows)
#Second tows
# second_tows <- rdo[which(rdo$fished_haul %in% tows1$fished_haul == FALSE), ]
# second_tows <- mlogit.data(second_tows, shape = 'long', choice = 'fished', alt.var = 'alt_tow',
# chid.var = 'fished_haul')
# # res2 <- mlogit(fished ~ distance + prev_days_rev + dummy_prev_days - 1, second_tows)
# res2 <- mlogit(fished ~ distance + prev_days_rev + dummy_prev_days + dummy_prev_year_days - 1, second_tows)
# outs <- list(coefs1 = coef(res1), coefs2 = coef(res2), mod1 = res1, mod2 = res2,
# first_tows = first_tows, second_tows = second_tows)
#List coefficients and rename to align with jeem paper
coefs <- coef(res)
coefs <- plyr::rename(coefs, c('dummy_prev_days' = 'dum30',
"dummy_prev_year_days" = "dum30y", "prev_days_rev:dummy_first" = "rev1",
"dummy_first:distance" = 'dist1', "prev_days_rev:dummy_not_first" = "rev",
"distance:dummy_not_first" = 'dist'))
coefs <- data.frame(coefs = round(coefs[c('dist', 'dist1', 'rev', 'rev1', 'dum30', 'dum30y')],
digits = 5))
ps <- summary(res)$CoefTable[, 4]
ps <- plyr::rename(ps, c('dummy_prev_days' = 'dum30',
"dummy_prev_year_days" = "dum30y", "prev_days_rev:dummy_first" = "rev1",
"dummy_first:distance" = 'dist1', "prev_days_rev:dummy_not_first" = "rev",
"distance:dummy_not_first" = 'dist'))
ps <- ps[c('dist', 'dist1', 'rev', 'rev1', 'dum30', 'dum30y')]
#Add significance values
coefs$p_values <- round(ps, digits = 5)
coefs$significance <- " "
coefs[which(coefs$p_values <= .10), 'significance'] <- "."
coefs[which(coefs$p_values <= .05), 'significance'] <- "*"
coefs[which(coefs$p_values <= .01), 'significance'] <- "**"
coefs[which(coefs$p_values <= .001), 'significance'] <- "***"
outs <- list(coefs = coefs, mod = res)
return(outs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.