#' @title Wrapper for an OpenAI Gym environment
#' @description Depends on the OpenAI Gym API; the Python package gym must be installed and reachable through reticulate.
#' @param name The environment name as registered in gym, for example "CartPole-v0"
#' @param state_preprocess A list of the form list(fun = identity, par = NULL): a function and its optional parameters used to preprocess each state
#' @param act_cheat Optional integer vector mapping the agent's arms (1-based) to gym actions (0-based); for example c(5, 7) maps arm 1 to gym action 5 and arm 2 to gym action 7
#' @param repeat_n_act The number of times the action arm selected by the surrogate is repeated per step (frame skipping)
#' @param observ_stack_len The number of consecutive observations to stack into one aggregated state
#' @return The wrapped environment
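#' @examples
#' \dontrun{
#' # Minimal usage sketch; requires a Python installation of gym reachable via reticulate:
#' env = makeGymEnv("CartPole-v0")
#' env$overview()
#' env$reset()
#' srd = env$step(1L) # take arm 1 (R convention, 1-based)
#' env$afterAll()
#' }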
#' @export
makeGymEnv = function(name = "CartPole-v0", state_preprocess = list(fun = identity, par = NULL), act_cheat = NULL, repeat_n_act = 1L, observ_stack_len = 1L) {
gspace = reticulate::import("gym.spaces", delay_load = TRUE)
gym = reticulate::import("gym", delay_load = TRUE)
gym$logger$set_level(40) # suppress warnings
gym$logger$setLevel(40)
genv = gym$make(name)
flag_continous = grepl("float", toString(genv$action_space$dtype)) # TRUE if the action space is continuous
atari = grepl("AtariEnv", toString(genv$unwrapped))
env = NULL
if (atari) {
env = EnvGymAtari$new(genv, name, state_preprocess = state_preprocess, act_cheat = act_cheat, repeat_n_act = repeat_n_act, observ_stack_len = observ_stack_len)
} else {
env = EnvGym$new(genv, name, state_preprocess = state_preprocess, act_cheat = act_cheat, repeat_n_act = repeat_n_act, observ_stack_len = observ_stack_len) # EnvGym is a wrapper around the original gym environment
}
return(env)
}
# A Discrete space allows a fixed range of non-negative integers; for CartPole-v0 the valid actions are 0 or 1.
# A Box space represents an n-dimensional box; for CartPole-v0 a valid observation is an array of 4 numbers.
# Note: all image processing must be consistent across EnvGym$reset, EnvGym$step and the replay memory.
subsample = function(state) {
I = state[seq(30L, 210L, 3L), seq(1L, 160L, 2L), ] # crop and subsample: every 3rd row starting at 30, every 2nd column
I = 0.299 * I[, , 1L] + 0.587 * I[, , 2L] + 0.114 * I[, , 3L] # RGB to grayscale (luma weights)
res = reticulate::array_reshape(I, c(dim(I), 1L)) # append an extra channel dimension
# res = array(as.integer(res), dim = dim(res)) # store integer is less memory hungry
# res = res/128.0 - 1.0 # normalize to -1 till +1
return(res)
}
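# Usage sketch (assumes an Atari environment emitting 210x160x3 RGB frames):
# env = makeGymEnv("Pong-v0", state_preprocess = list(fun = subsample, par = NULL),
#   repeat_n_act = 4L, observ_stack_len = 4L)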
EnvGym = R6::R6Class("EnvGym",
inherit = Environment,
private = list(
# keeping more elements in private is more stable, but makes debugging harder
old_dim = NULL, # original dimension
new_dim = NULL,
old_state = NULL, # for video to remove flickering
initActCnt = function() {
if (!"n" %in% names(self$env$action_space)) {
# if "n" is not in the names of the list, i.e. genv$action_space$n does not exist
flag_multiple_shape = length(self$env$action_space$shape) > 1L
if (flag_multiple_shape) {
stop("currently no support for action space that have multiple shapes!")
}
self$act_cnt = self$env$action_space$shape[[1L]]
} else {
self$act_cnt = self$env$action_space$n # get the number of actions/control bits
}
## FIXME: this should be set by user
# if (self$flag_tensor) {
## since which("NOOP" == env$env$unwrapped$get_action_meanings()) will always generate 1
# self$act_cheat = 1L:(self$act_cnt - 1L) # do not allow NO-OP operation
# }
#
if (!is.null(self$act_cheat)) {
self$act_cnt = length(self$act_cheat)
}
},
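# e.g. CartPole-v0 has action_space$n == 2, so act_cnt becomes 2;
# Pendulum-v0 has a Box action space of shape (1,), so act_cnt becomes 1.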
initStateDim = function() {
self$state_dim = unlist(self$env$observation_space$shape)
if (is.null(self$state_dim)) {
self$state_dim = self$env$observation_space$n
}
if (is.null(self$state_dim)) {
stop("environment$observation_space has neither shape or n field!")
}
self$flag_tensor = length(self$state_dim) > 1L # decide whether the input is an image before state_dim is changed
private$old_dim = self$state_dim
# keep the array order (only change the dimensions) rather than increasing the order,
# since the DNN can take the difference of two frames automatically and only handles arrays up to order 3
# FIXME: by default, rlR handles tensors up to order 3 (RGB images). This might need to change in future.
private$new_dim = self$initSubsample()
if (self$observ_stack_len > 1L) {
self$flag_stack_frame = TRUE
self$state_dim = c(private$new_dim[1L:2L], private$new_dim[3] * self$observ_stack_len)
} else {
self$state_dim = private$new_dim # no stacking
}
}
),
public = list(
# some fields are defined in the parent class
env = NULL,
state_preprocess = NULL,
act_cheat = NULL,
repeat_n_act = NULL, # number of times the chosen action is repeated (frame skipping)
state_cache = NULL, # store adjacent states to stack into short history
flag_stack_frame = NULL,
flag_tensor = NULL,
flag_box = NULL,
# act_cheat is a vector like c(5, 7) that maps arm 1 to gym action 5 and arm 2 to gym action 7.
# Rendering Pong-v0 in an IPython notebook shows that the ball needs about 20 frames to travel across the screen.
# observ_stack_len is the number of observations to stack; stacking does not change the order of the state tensor.
# observ_stack_len is set here since the Env should always return states of the same dimension to the agent.
# replaymem$sample.fun has to be changed if observation stacking is to be used.
# subsample_dim: the size of the image after subsampling.
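# Usage sketch (the gym action meanings are version dependent and given only for illustration):
# makeGymEnv("Pong-v0", act_cheat = c(2L, 3L)) exposes two arms, mapping
# arm 1 to gym action 2 (RIGHT) and arm 2 to gym action 3 (LEFT).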
initialize = function(genv, name, state_preprocess = list(fun = identity, par = NULL), act_cheat = NULL, repeat_n_act = 1L, observ_stack_len = 1L) {
self$flag_stack_frame = FALSE
self$state_cache = vector(mode = "list", observ_stack_len)
self$observ_stack_len = observ_stack_len
self$env = genv
self$flag_continous = grepl("float", toString(genv$action_space$dtype)) # TRUE if the action space is continuous
self$flag_box = grepl("Box", toString(self$env$action_space))
self$name = name
self$state_preprocess = state_preprocess$fun
if (grepl("Discrete", toString(self$env$observation_space))) self$state_preprocess = function(x) x + 1L
self$act_cheat = act_cheat
self$repeat_n_act = repeat_n_act
private$initStateDim()
private$initActCnt()
env_max_step = self$env$spec$max_episode_steps
self$maxStepPerEpisode = if (!is.null(env_max_step)) env_max_step else 1e4
},
setActCheat = function(act_cheat) {
self$act_cheat = act_cheat
self$act_cnt = length(self$act_cheat)
},
initSubsample = function() {
state = self$env$reset() #FIXME: only return state when reset is called
hstate = self$state_preprocess(state)
new_state_dim = dim(hstate)
if (is.null(new_state_dim)) {
new_state_dim = private$old_dim
}
return(new_state_dim)
},
render = function(...) {
self$env$render(...)
},
step = function(action_input) {
action = action_input
if (!is.null(self$act_cheat)) {
# act_cheat must be applied before the minus-1 operation below since R has no 0 index!
action = as.integer(self$act_cheat[action] + 1L) # act_cheat uses the gym convention (a vector mapping arms 1, 2, 3, ... to gym actions starting from 0); +1 converts back to the R convention
}
if (!self$flag_continous) {
action = action - 1L # convert to the 0-based gym convention; this class is gym specific
action = as.integer(action)
}
s_r_d_info = self$env$step(action)
names(s_r_d_info) = c("state", "reward", "done", "info")
s_r_d_info[["state"]] = self$state_preprocess(s_r_d_info[["state"]]) # preprocessing
if (self$flag_box) s_r_d_info[["state"]] = t(s_r_d_info[["state"]]) # for continuous actions transpose the state: for e.g. "Pendulum-v0" the returned state is 3x1 instead of 1x3
s_r_d_info
},
#FIXME: self$state_cache is initialized in reset by stacking the same frame self$observ_stack_len times
stackLatestFrame = function(cur_state) {
if (self$observ_stack_len >= 2L) {
for (i in self$observ_stack_len:2L) {
self$state_cache[[i]] = self$state_cache[[i - 1L]]
}
}
self$state_cache[[1L]] = cur_state
arr_stack = abind::abind(self$state_cache)
return(arr_stack)
},
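# e.g. with observ_stack_len = 4 and cache (f4, f3, f2, f1), newest first,
# a new frame f5 yields cache (f5, f4, f3, f2); abind::abind then binds the
# frames along the last (channel) dimension.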
reset = function() {
s = self$env$reset()
s = self$state_preprocess(s)
##FIXME: is this the right way to initialize old_state? reset is called at episode start?
# if (self$agent$interact$global_step_len == 0) {
# private$old_state = s
# self$state_cache = lapply(1L:self$observ_stack_len, function(x) s)
# }
# if (self$flag_stack_frame) {
# s = self$stackLatestFrame(s)
# }
r = NULL
return(list(s, r, FALSE, ""))
},
afterAll = function() {
self$env$close()
},
randomRun = function(steps, render = TRUE) {
checkmate::assert_int(steps)
ss = self$env$reset()
for (i in seq_len(steps)) {
if (render) self$env$render()
a = self$env$action_space$sample()
r = self$env$step(a)
}
self$env$close()
invisible(self)
},
overview = function() {
cat(sprintf("\naction cnt: %s \n", toString(self$act_cnt)))
cat(sprintf("state original dim: %s \n", toString(private$old_dim)))
flag_changed = !identical(private$new_dim, private$old_dim)
if (flag_changed) {
cat(sprintf("state dim after preprocessing: %s \n", toString(private$new_dim)))
cat(sprintf("with stacking: %s \n", toString(self$state_dim)))
}
cat(sprintf("%s\n", ifelse(self$flag_continous, "continous action", "discrete action")))
},
showImage = function(img) {
img %>%
imager::as.cimg() %>% # to image
imager::mirror("y") %>% # mirror along the y axis
imager::imrotate(90L) %>% # rotate by 90 degrees
graphics::plot(axes = FALSE)
},
showPreprocess = function() {
s = self$env$reset()
s = self$state_preprocess(s)
self$showImage(s)
},
snapshot = function(steps = 25L, preprocess) {
checkmate::assert_int(steps)
ss = self$env$reset()
if (is.null(self$env$action_space$sample)) {
stop("no support for snapshot for this environment")
}
for (i in seq_len(steps)) {
a = self$env$action_space$sample()
r = self$env$step(a)
}
if (preprocess) {
pimg = self$state_preprocess(r[[1L]])
self$showImage(pimg)
} else {
img = self$env$render(mode = "rgb_array")
img = img / 255.
self$env$close()
self$showImage(img)
}
}
)
)
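# Interaction-loop sketch with a random policy (illustrative only; the names
# follow the wrapper API defined above):
# env = makeGymEnv("CartPole-v0")
# env$reset()
# repeat {
#   arm = sample.int(env$act_cnt, 1L) # draw a random arm, 1-based R convention
#   srd = env$step(arm)               # list(state, reward, done, info)
#   if (srd$done) break
# }
# env$afterAll()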
EnvGymAtari = R6::R6Class("EnvGymAtari",
inherit = EnvGym,
public = list(
reset = function() {
s = self$env$reset()
s = self$state_preprocess(s)
#FIXME: is this the right way to initialize old_state? reset is called at episode start?
if (self$agent$interact$global_step_len == 0) {
private$old_state = s
self$state_cache = lapply(1L:self$observ_stack_len, function(x) s)
}
if (self$flag_stack_frame) {
s = self$stackLatestFrame(s)
}
r = NULL
return(list(s, r, FALSE, ""))
},
# action_input starts from 1 (R convention) since the action is calculated by the policy
step = function(action_input) {
action = action_input
if (!is.null(self$act_cheat)) {
# act_cheat must be applied before the minus-1 operation below since R has no 0 index!
action = as.integer(self$act_cheat[action] + 1L) # act_cheat uses the gym convention (0-based actions); +1 converts back to the R convention
}
if (!self$flag_continous) {
action = action - 1L # convert to the 0-based gym convention; this class is gym specific
action = as.integer(action)
}
list_s_r_d_info = lapply(1:self$repeat_n_act, function(i) self$env$step(action)) # repeat the same action self$repeat_n_act times; length(list_s_r_d_info) == self$repeat_n_act
rewards = sapply(list_s_r_d_info, function(x) x[[2L]]) # extract the reward (second element) of each repeated step
#rewards = sapply(rewards, sign) # reward clipping
dones = sapply(list_s_r_d_info, function(x) x[[3L]])
s_r_d_info = list_s_r_d_info[[self$repeat_n_act]]
names(s_r_d_info) = c("state", "reward", "done", "info")
s_r_d_info[["reward"]] = sum(rewards)
s_r_d_info[["done"]] = any(dones)
s_r_d_info[["state"]] = self$state_preprocess(s_r_d_info[["state"]]) # preprocessing
# if (self$flag_tensor) {
# s_r_d_info[["state"]] = pmax(s_r_d_info[["state"]], private$old_state) # remove flickering
# private$old_state = s_r_d_info[["state"]]
# }
if (self$flag_stack_frame) s_r_d_info[["state"]] = self$stackLatestFrame(s_r_d_info[["state"]])
#FIXME: might be buggy if a continuous space gets preprocessed
if (grepl("Box", toString(self$env$action_space))) s_r_d_info[["state"]] = t(s_r_d_info[["state"]]) # for continuous actions transpose the state: for e.g. "Pendulum-v0" the returned state is 3x1 instead of 1x3
s_r_d_info
}
)
)
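# Behavior sketch for action repetition (using the classes defined above): with
# repeat_n_act = 4L, EnvGymAtari$step executes the chosen action 4 times,
# returns the last observed state, sums the 4 intermediate rewards, and
# reports done = TRUE if any intermediate step terminated.
# env = makeGymEnv("Breakout-v0", repeat_n_act = 4L, observ_stack_len = 4L)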
EnvGymActCheat = R6::R6Class("EnvGymActCheat",
inherit = EnvGym,
public = list(
step = function(action_input) {
action = action_input
if (!is.null(self$act_cheat)) {
# act_cheat must be applied before the minus-1 operation below since R has no 0 index!
action = as.integer(self$act_cheat[action] + 1L) # act_cheat uses the gym convention (0-based actions); +1 converts back to the R convention
}
if (!self$flag_continous) {
action = action - 1L # convert to the 0-based gym convention; this class is gym specific
action = as.integer(action)
}
s_r_d_info = self$env$step(action)
names(s_r_d_info) = c("state", "reward", "done", "info")
s_r_d_info[["state"]] = self$state_preprocess(s_r_d_info[["state"]]) # preprocessing
#FIXME: might be buggy if a continuous space gets preprocessed
if (grepl("Box", toString(self$env$action_space))) s_r_d_info[["state"]] = t(s_r_d_info[["state"]]) # for continuous actions transpose the state: for e.g. "Pendulum-v0" the returned state is 3x1 instead of 1x3
s_r_d_info
}
)
)