# @title Deep Deterministic Policy Gradient
#
# @format \code{\link{R6Class}} object
# @description Continuous action agent
# @references Lillicrap, T. P., Hunt, J. J., Pritzel, A., Heess, N., Erez, T., Tassa, Y., … Wierstra, D. (2016). Continuous control with deep reinforcement learning. In ICLR.
# Inherits from \code{AgentActorCritic}:
# @section Methods:
# @inheritSection AgentArmed Methods
# @return [\code{\link{AgentDDPG}}].
AgentDDPG = R6::R6Class("AgentDDPG",
inherit = AgentActorCritic,
public = list(
tau = NULL, # soft-update rate: target weights become tau * update + (1 - tau) * target
optimize = NULL,
grad2a = NULL,
explore = NULL,
a_bound = NULL,
ph_critic2act = NULL, # placeholder for the critic gradient w.r.t. the action ($\nabla_a Q$)
actor_pred = NULL,
model = NULL,
list.states.next = NULL,
list.states.old = NULL,
input_action_update = NULL,
input_state_update = NULL,
input_state_actor_update = NULL,
input_actor_update_weights = NULL,
brain_actor_update = NULL,
brain_critic_update = NULL,
brain_actor_target = NULL, # used to predict $a$ in the Bellman equation
brain_critic_target = NULL, # used to create the target in the Bellman equation
replay_actions = NULL,
np = NULL,
batch_acts = NULL, # acts from replay memory
batch_acts_target_policy = NULL, # actions from the target policy on the next states (used in the Bellman target)
batch_predicted_acts = NULL, # actions from the update policy on the current states (used for the policy gradient)
batch_state = NULL,
batch_state_new = NULL,
batch_targets_critic = NULL,
initialize = function(env, conf) {
K = keras::backend()
self$explore = 1.0
self$np = reticulate::import("numpy", convert = FALSE)
self$tau = 0.1
super$initialize(env, conf)
K$set_session(self$sess)
if (!is.null(self$env$env$action_space$high)) {
self$a_bound = self$env$env$action_space$high
} else {
self$a_bound = 1.0
}
self$ph_critic2act = tf$placeholder(dtype = tf$float32, shape = shape(NULL, self$act_cnt), name = "criticQ2a") # placeholder for the critic gradient w.r.t. the action
},
createBrain = function() {
if (self$task == "value_fun") {
tuple = createCriticNetwork.AgentDDPG(state_dim = self$state_dim, action_dim = 1L)
self$input_action_update = tuple$input_action
self$input_state_update = tuple$input_state
return(tuple$model)
} else if (self$task == "policy_fun"){
tuple = createActorNetwork.AgentDDPG(state_dim = self$state_dim, action_dim = 1L, a_bound = self$a_bound)
self$input_state_actor_update = tuple$input_state
self$input_actor_update_weights = tuple$weights
return(tuple$model)
}
},
customizeBrain = function(fun) {
# DDPG builds its own actor and critic networks, so the `fun` argument is not used here
self$setBrain()
},
setBrain = function() {
self$task = "value_fun"
self$brain_critic_target = SurroDDPG$new(self)
self$brain_critic_update = SurroDDPG$new(self)
self$task = "policy_fun"
self$brain_actor_target = SurroDDPG$new(self)
self$brain_actor_update = SurroDDPG$new(self)
self$model = self$brain_critic_update
self$trainActorSessInit()
self$sess$run(tf$global_variables_initializer())
},
# target: $r_i + \gamma Q_{target}(s_{new}, \mu(s_{new}))$; the action in the target is computed through the target policy (used in self$p.next), while the input $a_i$ for the update critic comes from replay memory
# policy_action is generated from the target policy network
extractCriticTarget = function(i) {
done = ReplayMem$extractDone(self$list.replay[[i]])
y = self$list.rewards[[i]] + self$gamma * self$p.next[i, ]
if (done) y = self$list.rewards[[i]]
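# worked example: with r_i = 1, gamma = 0.99 and Q_target(s_new, \mu(s_new)) = 5, y = 1 + 0.99 * 5 = 5.95;
# for a terminal transition y = r_i = 1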
return(as.array(y))
},
# input: (state, action)
# output: state-action value
trainCritic = function() {
self$getYhat()
self$batch_acts = Reduce(rbind, self$list.acts)
self$batch_state = Reduce(rbind, self$list.states.old)
len = length(self$list.replay)
list.targets = lapply(1:len, self$extractCriticTarget)
self$batch_targets_critic = Reduce(rbind, list.targets)
# yhat of critic is Q_{update}(s_i, a_i)
self$fitUpdateCriticNetwork(action_input = self$batch_acts, state_input = self$batch_state, yhat = self$batch_targets_critic)
},
# the actor outputs a continuous action; its weights are updated with the deterministic policy gradient constructed below rather than through a standard Keras fit
#trainActorSessInit = function(state_input, input_criticQ2act) {
trainActorSessInit = function() {
# chain rule: seed the backpropagation with -ph_critic2act, i.e. $-\nabla_a Q$
tensor_grad_policy2theta = tf$gradients(ys = self$brain_actor_update$model$output, xs = self$brain_actor_update$model$weights, grad_ys = tf$negative(self$ph_critic2act)) # grad_ys weights the upstream gradient; the negation turns the minimizing optimizer into gradient ascent on Q
# the resulting gradients are those of Q(s_t, a = \mu(s_t)) with respect to \theta^{\mu} (the actor weights); the graph is \theta^{\mu} (weights of the actor network) -> action (a = \mu(s)) -> Q(s, a)
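# A toy illustration of grad_ys (illustrative only, TF1 graph mode as above, run in a session such as self$sess):
#   x = tf$constant(2.0); y = x * x
#   g = tf$gradients(ys = y, xs = x, grad_ys = tf$constant(3.0))
#   sess$run(g)  # 3 * dy/dx = 3 * 2 * 2 = 12, i.e. grad_ys seeds/scales the backpropagated gradient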
# grad_and_vars is a list of (gradient, variable) pairs; the gradients were computed with respect to the update (online) actor weights, so they are applied to those same weights (the target actor is only soft-updated in updateModel)
#grad_and_vars = reticulate::tuple(tensor_grad_policy2theta, self$brain_actor_update$model$weights)
grad_and_vars = mapply(reticulate::tuple, tensor_grad_policy2theta, self$brain_actor_update$model$weights)
#x <- 1:3
#y <- 4:6
#mapply(list, x, y, SIMPLIFY = F) # gives a list of 3 tuples res[[3]][[1L]] = 3, res[[3]][[2L]] = 6
#mapply(c, x, y, SIMPLIFY=F) # gives a list of 3 tuples, res[[3]] = c(3, 6)
opt = tf$train$AdamOptimizer(0.001)
self$optimize = opt$apply_gradients(grad_and_vars) # grad_and_vars is a "list of (gradient, variable) pairs as returned by compute_gradients()"; apply_gradients() is the second half of opt$minimize(), the first half being opt$compute_gradients()
},
trainActor = function() {
self$setCriticGradient()
sname = self$brain_actor_update$model$input$name
aname = self$ph_critic2act$name
sstate = self$np$array(self$batch_state)
scritic2act = self$np$array(self$grad2a)
feed_dict = py_dict(c(sname, aname), c(sstate, scritic2act))
self$sess$run(self$optimize, feed_dict = feed_dict)
},
setCriticGradient = function() {
# $a = \mu(s_i)$
self$batch_predicted_acts = self$brain_actor_update$pred(self$batch_state)
# $\nabla_aQ(s_i, a = \mu(s_i))$
self$grad2a = self$brain_critic_update$calGradients2Action(state_input = self$batch_state, action_input = self$batch_predicted_acts)
self$grad2a = self$grad2a[[1L]] # tensorflow returns a list; grad2a is then a batch_size x act_cnt matrix
#NOTE: grad2a is $\nabla_a Q(s_i, a = \mu(s_i))$ where $\mu(s_i)$ is the policy network
# s ->[policy \mu_{\theta}(s)] a | (a, s) ->[value] Q(w, a, s)
# w in Q(w, a, s) is updated via the Bellman equation with fixed (a = a_i, s = s_i)
# $\theta$ in $\mu_{\theta}(s)$ is updated so as to maximize Q(s_i, a = \mu(s_i))
# the gradient for $\theta$ is $\nabla_{\theta} Q(s_i, a = \mu(s_i)) = \nabla_{\theta} \mu(s_i) \, \nabla_{a} Q(s_i, a = \mu(s_i))$, a matrix product from the chain rule, so the element-wise multiplication used for the usual stochastic policy network does not apply here.
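# A minimal numerical check of the chain rule above (illustrative only, not used by the agent):
# with a linear toy policy a = mu(s) = s %*% theta and a quadratic critic Q(s, a) = -sum(a^2)
# s = matrix(c(1, 2), nrow = 1)                  # 1 x 2 state
# theta = matrix(c(0.5, -0.3, 0.1, 0.2), 2, 2)   # 2 x 2 policy weights
# a = s %*% theta                                # 1 x 2 action
# grad_a_Q = -2 * a                              # 1 x 2, \nabla_a Q(s, a)
# grad_theta_Q = t(s) %*% grad_a_Q               # 2 x 2, chain rule: \nabla_\theta \mu %*% \nabla_a Q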
},
replay = function(size) {
self$setBatch(size)
self$trainCritic()
self$trainActor()
self$updateModel() # time consuming 0.3 s
},
# Ornstein–Uhlenbeck process: a Gaussian, Markov, temporally homogeneous process
# a random walk in continuous time (driven by a Wiener process), the continuous-time analogue of an AR(1) process
# Over time the process drifts towards its long-term mean, i.e. it is mean-reverting: the walk is pulled back towards a central location, with a stronger pull the further the process is from the center.
# $dx_t = \theta(\mu - x_t)dt + \sigma dW_t$ where $W_t$ is the Wiener process
# the probability density follows the Fokker–Planck equation, with the infinite-time solution $f(x) = \sqrt{\theta / (\pi \sigma^2)}\, e^{-\theta(x-\mu)^2/\sigma^2}$
# i.e. the stationary distribution is Gaussian with var(x) = \sigma^2 / (2\theta); see the simulation sketch after the ou() method below
ou = function(act) {
mu = 0 # going back and forth around 0
theta = 0.60 # mean-reversion rate
sigma = 0.30
# one step differential equation change, since action_new = action_old + ou(action_old)
# so d(action) = action_new - action_old = ou(action_old) = \theta(\mu - x_t)dt + \sigma dWt
# where the difference of Wiener process dWt is white noise
theta * (mu - act) + sigma * rnorm(n = 1, mean = 0, sd = 1)
},
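# A simulation sketch (illustrative only; the update above implicitly uses dt = 1, here a small explicit dt is assumed):
# dt = 0.01; x = numeric(1e5)
# for (i in 2:1e5) x[i] = x[i - 1] + 0.60 * (0 - x[i - 1]) * dt + 0.30 * sqrt(dt) * rnorm(1)
# var(x)  # approaches the stationary value sigma^2 / (2 * theta) = 0.09 / 1.2 = 0.075 for long runs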
evaluateArm = function(state) {
act_cc_nn = self$brain_actor_update$pred(state)
#cat(sprintf("action: %f", act_cc_nn))
#noise = self$explore * self$ou(act_cc_nn)  # OU exploration noise, currently superseded by the Gaussian noise below
noise = rnorm(n = 1, mean = 0, sd = self$explore)
#cat(sprintf("noise: %f", noise))
self$vec.arm.q = act_cc_nn + noise
self$explore = self$explore * 0.9995
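# the exploration scale decays geometrically with each action: after 1000 steps 1.0 * 0.9995^1000 ≈ 0.61, after 5000 steps ≈ 0.08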
},
act = function(state) {
checkmate::assert_array(state)
state = array_reshape(state, c(1, self$state_dim))
self$evaluateArm(state)
return(self$vec.arm.q) # gym needs an array as the action
},
updateModel = function() {
# actor
uaw = self$brain_actor_update$getWeights()
uaw = lapply(uaw, function(x) x * self$tau)
taw = self$brain_actor_target$getWeights()
taw = lapply(taw, function(x) x * (1.0 - self$tau))
www = mapply("+", uaw, taw)
self$brain_actor_target$setWeights(www)
# critic
uaw = self$brain_critic_update$getWeights()
uaw = lapply(uaw, function(x) x * self$tau)
taw = self$brain_critic_target$getWeights()
taw = lapply(taw, function(x) x * (1.0 - self$tau))
www = mapply("+", uaw, taw)
self$brain_critic_target$setWeights(www)
},
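# A toy illustration of the soft update above (illustrative only, scalar "weights"):
# tau = 0.1; uaw = list(1.0, 2.0); taw = list(10.0, 20.0)
# mapply("+", lapply(uaw, function(x) x * tau), lapply(taw, function(x) x * (1 - tau)))
# # gives c(9.1, 18.2): the target network moves 10% of the way towards the update network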
predTargetCritic = function(action_input, state_input) {
#FIXME: the fixed order of action_input and state_input might be problematic
res = keras::predict_on_batch(self$brain_critic_target$model, x = list(action_input, state_input))
return(res)
},
fitUpdateCriticNetwork = function(action_input, state_input, yhat) {
#FIXME: the fixed order of action_input and state_input might be problematic
keras::fit(self$brain_critic_update$model, x = list(action_input, state_input), y = yhat, epochs = 1L, verbose = FALSE)
},
getYhat = function(...) {
self$batch_state_new = Reduce(rbind, self$list.states.next)
self$batch_acts_target_policy = self$brain_actor_target$pred(self$batch_state_new)
self$p.next = self$predTargetCritic(self$batch_acts_target_policy, self$batch_state_new)
},
setBatch = function(batchsize) {
self$list.replay = self$mem$sample.fun(batchsize)
self$list.states.old = lapply(self$list.replay, ReplayMem$extractOldState)
self$list.states.next = lapply(self$list.replay, ReplayMem$extractNextState)
self$list.rewards = lapply(self$list.replay, ReplayMem$extractReward)
self$list.acts = lapply(self$list.replay, ReplayMem$extractAction)
temp = simplify2array(self$list.states.old) # R arrays store elements column-wise
mdim = dim(temp)
norder = length(mdim)
self$replay.x = aperm(temp, c(norder, 1:(norder - 1)))
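# worked example: for two 3-dimensional states s1 = 1:3 and s2 = 4:6,
# simplify2array(list(s1, s2)) is a 3 x 2 matrix (one state per column);
# aperm(., c(2, 1)) turns it into a 2 x 3 matrix with one state per row (batch-first)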
},
afterStep = function() {
self$policy$afterStep()
self$replay(self$replay.size)
},
afterEpisode = function(interact) {
cat(sprintf("explore factor: %f\n", self$explore))
self$policy$afterEpisode()
self$mem$afterEpisode()
}
))
agent.brain.dict.AgentDDPG = function() list(policy_fun = createActorNetwork.AgentDDPG, value_fun = createCriticNetwork.AgentDDPG)
AgentDDPG$info = function() {
"Deep Deterministic Policy Gradient for Continous Control"
}
AgentDDPG$test = function() {
library("profvis")
library("rlR")
profvis({
env = makeGymEnv("Pendulum-v0")
conf = getDefaultConf("AgentDDPG")
agent = initAgent("AgentDDPG", env, conf)
agent$learn(300L)
}
)
}