Performance = R6::R6Class("Performance",
public = list(
list.reward.epi = NULL, # reward vector of each episode
list_discount_reward_epi = NULL, # discounted reward vector of each episode
list.rewardPerEpisode = NULL, # total (summed) reward of each episode
rewardPerStep = NULL, # total reward divided by number of steps, per episode
list_steps_epi = NULL, # number of steps of each episode
list.infos = NULL, # info objects extracted from the agent's memory
epiLookBack = NULL, # window size (in episodes) for the running average reward
epi_idx = NULL, # index of the current episode
glogger = NULL, # logger taken over from the agent
agent = NULL, # back reference to the owning agent
r.vec.epi = NULL, # per-step reward buffer of the current episode
gamma = NULL, # discount factor
total_steps = NULL, # number of steps of the most recent episode
list_models = NULL, # optional model snapshots taken after each episode
store_model_flag = NULL, # whether to snapshot the model after each episode
initialize = function(agent) {
self$epiLookBack = 100L
self$agent = agent
self$gamma = self$agent$conf$get("agent.gamma")
self$glogger = self$agent$glogger
self$list.reward.epi = list()
self$list.infos = list()
self$list_discount_reward_epi = list()
self$epi_idx = 0L
self$list.rewardPerEpisode = list()
self$list_steps_epi = list()
self$r.vec.epi = vector(mode = "numeric", length = self$agent$env$maxStepPerEpisode)
self$store_model_flag = self$agent$conf$get("agent.store.model")
if (is.null(self$store_model_flag)) self$store_model_flag = FALSE
if (self$store_model_flag) self$list_models = list()
},
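# computeDiscount: backward recursion for the discounted return,
# G_t = r_t + gamma * G_{t+1}, applied to a per-episode reward vector.
# For example, with gamma = 0.5, computeDiscount(c(1, 1, 1)) gives c(1.75, 1.5, 1).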
computeDiscount = function(rewardvec) {
discounted_r = vector(mode = "double", length = length(rewardvec))
running_add = 0
i = length(rewardvec)
while (i > 0) {
running_add = running_add * self$gamma + rewardvec[i]
discounted_r[i] = running_add
i = i - 1L
}
discounted_r
},
persist = function(path) {
perf = self$clone()
save(perf, file = path)
},
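# getAccPerf: mean total reward over (at most) the last `interval` episodes.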
getAccPerf = function(interval = 100L) {
self$list.rewardPerEpisode = lapply(self$list.reward.epi, function(x) sum(x))
epi_idx = length(self$list.rewardPerEpisode)
winstart = max(1L, epi_idx - interval + 1L)
vec = unlist(self$list.rewardPerEpisode)
mean(vec[winstart:epi_idx], na.rm = TRUE)
},
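# psummary: build a text summary of steps, total reward, and reward per step for each episode.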
psummary = function() {
s1 = sprintf("steps per episode: %s\n", toString(self$list_steps_epi))
self$list.rewardPerEpisode = lapply(self$list.reward.epi, function(x) sum(x))
s2 = sprintf("total reward per episode: %s\n", toString(self$list.rewardPerEpisode))
self$rewardPerStep = unlist(self$list.rewardPerEpisode) / unlist(self$list_steps_epi)
s3 = sprintf("reward per step per episode: %s\n", toString(self$rewardPerStep))
paste(s1, s2, s3)
},
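# plot: rewards-per-episode curve; with smooth = TRUE a scatter plot with a
# smoothed trend line and a dashed horizontal line at the median reward,
# otherwise a plain line plot.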
plot = function(smooth = TRUE) {
self$list.rewardPerEpisode = lapply(self$list.reward.epi, function(x) sum(x))
env_name = self$agent$env$name
class_name = class(self$agent)[1]
title = substitute(paste("Rewards per Episode of ", class_name, " for ", env_name, sep = ""), list(class_name = class_name, env_name = env_name))
rewards = unlist(self$list.rewardPerEpisode)
df = data.frame(episode = seq_along(rewards),
rewards = rewards)
if (smooth) {
ggplot2::ggplot(df, aes(episode, rewards)) +
geom_point(alpha = 0.2) +
theme_bw() +
labs(
title = title,
x = "Episode",
y = "Rewards per episode"
) +
coord_cartesian(ylim = range(rewards)) +
geom_smooth(se = FALSE, size = 1) +
geom_hline(yintercept = median(rewards), size = 1, col = "black", lty = 2)
} else {
ggplot2::ggplot(df, aes(episode, rewards)) +
geom_line() +
theme_bw() +
labs(
title = title,
x = "Episode",
y = "Rewards per episode"
) + coord_cartesian(ylim = range(rewards))
}
},
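# toScalar: reduce performance to a single number, the average reward over the last 100 episodes.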
toScalar = function() {
self$getAccPerf(100L)
},
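# extractInfo: collect the `info` element of every sample stored in the agent's memory.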
extractInfo = function() {
self$list.infos = lapply(self$agent$mem$samples, function(x) x$info)
},
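# afterAll: hook run once at the end of training; builds the performance summary,
# optionally persists this object to the configured result path, and extracts infos.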
afterAll = function() {
self$psummary() # print out performance
ns = self$agent$conf$conf.log.perf$resultTbPath
if (self$glogger$flag) self$persist(file.path(ns))
self$extractInfo()
},
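# afterEpisode: hook run after every episode; logs episode statistics, stores the
# raw and discounted reward vectors and the step count, reports a running average,
# and optionally snapshots the current model.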
afterEpisode = function() {
self$agent$interact$idx_episode = self$agent$interact$idx_episode + 1L
self$agent$interact$glogger$log.nn$info("Episode: %i, steps:%i\n", self$agent$interact$idx_episode, self$agent$interact$step_in_episode)
rewards = sum(self$r.vec.epi[1L:self$agent$interact$step_in_episode])
self$agent$interact$toConsole("Episode: %i finished with steps:%i, rewards:%f global step %i \n", self$agent$interact$idx_episode, self$agent$interact$step_in_episode, rewards, self$agent$interact$global_step_len)
self$epi_idx = self$epi_idx + 1L
self$list.reward.epi[[self$epi_idx]] = self$r.vec.epi[1L:self$agent$interact$step_in_episode] # the reward vector of this episode
self$list_discount_reward_epi[[self$epi_idx]] = self$computeDiscount(self$r.vec.epi[1L:self$agent$interact$step_in_episode])
self$list_steps_epi[[self$epi_idx]] = self$total_steps = self$agent$interact$step_in_episode # the number of steps
rew = self$getAccPerf(self$epiLookBack)
self$agent$interact$toConsole("Last %d episodes average reward %f \n", self$epiLookBack, rew)
if (self$store_model_flag) {
len = length(self$list_models)
self$list_models[[len + 1L]] = self$agent$model$clone(deep = TRUE)
}
},
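# print: empty method, so printing a Performance object produces no output.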
print = function() {
}
)
)
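# ---------------------------------------------------------------------------
# Standalone illustration (a minimal sketch, not part of the class above): the
# two core computations of Performance, the backward discounted-return
# recursion of computeDiscount and the sliding-window average of getAccPerf.
# The names demo_discount and demo_acc_perf are hypothetical helpers.
demo_discount = function(rewardvec, gamma = 0.99) {
  discounted = numeric(length(rewardvec))
  running = 0
  for (i in rev(seq_along(rewardvec))) {
    running = running * gamma + rewardvec[i] # G_t = r_t + gamma * G_{t+1}
    discounted[i] = running
  }
  discounted
}

demo_acc_perf = function(episode_returns, interval = 100L) {
  n = length(episode_returns)
  mean(episode_returns[max(1L, n - interval + 1L):n], na.rm = TRUE)
}

# demo_discount(c(1, 1, 1), gamma = 0.5)  # 1.75 1.50 1.00
# demo_acc_perf(1:10, interval = 3L)      # mean of 8, 9, 10 = 9
# ---------------------------------------------------------------------------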
PerfRescue = R6::R6Class("PerfRescue",
inherit = Performance,
public = list(
epi_wait_ini = NULL, # number of episodes to wait before reinitializing
epi_wait_expl = NULL, # number of episodes to wait before increasing epsilon for exploration
recent_win = NULL, # size (in episodes) of the short recent window
recent_door = NULL, # size (in episodes) of the longer comparison window
bad_ratio = NULL, # tolerance factor when comparing window averages
good_cnt = NULL, # count of consecutive episodes judged not bad
wait_epi = NULL, # number of episodes to wait before a rescue action
wait_cnt = NULL, # number of episodes waited so far under bad performance
wait_middle = NULL, # wait_cnt threshold for an early reset when all flags are bad
reset_cnt = NULL, # number of brain resets performed so far
initialize = function() {
self$reset_cnt = 0L
self$wait_epi = rlR.conf4log[["policy.epi_wait_ini"]]
self$wait_cnt = 0L
self$good_cnt = 0L
self$recent_win = 20L
self$recent_door = 40L
self$bad_ratio = 0.99
self$wait_middle = rlR.conf4log[["policy.epi_wait_middle"]]
self$epi_wait_ini = rlR.conf4log[["policy.epi_wait_ini"]]
self$epi_wait_expl = rlR.conf4log[["policy.epi_wait_expl"]]
},
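# success: TRUE when the average reward over the last `ok_step` episodes
# exceeds the environment's `ok_reward` threshold; FALSE if either is unset.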
success = function() {
ok_reward = self$agent$env$ok_reward
ok_step = self$agent$env$ok_step
if (is.null(ok_reward) || is.null(ok_step)) {
return(FALSE)
}
if (self$getAccPerf(ok_step) > ok_reward) {
return(TRUE)
}
return(FALSE)
},
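# isBad: compare the short-window average (recent_win episodes) against longer
# windows and against the overall median/mean; returns a named logical vector of flags.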
isBad = function() {
pwin = self$getAccPerf(self$recent_win)
pdoor = self$getAccPerf(self$recent_door)
self$agent$interact$toConsole("Last %d episodes average reward %f \n", self$recent_win, pwin)
self$agent$interact$toConsole("Last %d episodes average reward %f \n", self$recent_door, pdoor)
all_rewards = unlist(self$list.rewardPerEpisode)
flag1 = pwin < self$bad_ratio * pdoor
flag2 = pwin < (1 / self$bad_ratio) * self$getAccPerf(100L)
flag3 = pwin < median(all_rewards)
flag4 = pwin < mean(all_rewards)
flag22 = flag2 # duplicated under a second name, reported below as "bad_middle2"
if (!flag22) self$good_cnt = self$good_cnt + 1L
else self$good_cnt = 0L
res = c(flag1, flag2, flag3, flag4, flag22)
names(res) = c("bad_small", "bad_middle", "bad_big1", "bad_big2", "bad_middle2")
self$agent$interact$toConsole("%s", toString(res))
return(res)
},
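# rescue: react to degraded performance; after waiting long enough, either reset
# the network via setBrain() and restore maximum epsilon, or back off the wait counter.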
rescue = function() {
flag = self$isBad()
self$wait_epi = min(self$epi_wait_expl, self$wait_epi + 1)
if (flag[1]) {
self$agent$interact$toConsole("\n bad perform for last window, %d times \n", self$wait_cnt + 1L)
self$wait_cnt = self$wait_cnt + 1L
ratio = exp(-self$agent$policy$logdecay * self$total_steps)
#self$agent$policy$epsilon = min(1, self$agent$policy$epsilon * ratio) # FIXME: should we increase exploration here? Repeated exploration will never converge.
flag_new_start = self$wait_cnt > self$wait_middle
flag_start = all(flag) && flag_new_start
if (self$wait_cnt > self$wait_epi || flag_start) {
if (flag[2] || flag[3]) {
self$agent$interact$toConsole("\n\n### going to reset brain ###\n\n\n")
self$agent$setBrain()
self$wait_epi = self$agent$conf$get("policy.epi_wait_expl")
self$reset_cnt = self$reset_cnt + 1L
self$agent$policy$epsilon = self$agent$policy$maxEpsilon
self$wait_cnt = 0
} else {
self$wait_cnt = max(0, self$wait_cnt - 1)
self$agent$policy$epsilon = self$agent$policy$maxEpsilon
}
}
} else {
if (self$good_cnt > 5L) {
self$agent$interact$toConsole("\n# success more than 5 \n")
self$wait_cnt = max(0, self$wait_cnt - self$wait_epi)
}
}
# else if (flag["bad_middle2"]) self$wait_cnt = max(0, self$wait_cnt - 1)
self$agent$interact$toConsole("\n wait cnt: %d times \n", self$wait_cnt)
} # fun
)
)