\setcounter{section}{3}
\section{Policy Gradients}
\begin{itemize}
    \item Create two graphs (a minimal plotting sketch follows this list):
    \begin{itemize}
        \item In the first graph, compare the learning curves (average return vs. number of environment steps) for the experiments prefixed with \verb|cartpole|. (The small-batch experiments.)
        \item In the second graph, compare the learning curves for the experiments prefixed with \verb|cartpole_lb|. (The large-batch experiments.)
    \end{itemize}
    \textbf{For all plots in this assignment, the $x$-axis should be the number of environment steps, logged as} \verb|Train_EnvstepsSoFar| \textbf{(\textit{not} the number of policy gradient iterations).}
    \item Answer the following questions briefly:
    \begin{itemize}
        \item Which value estimator has better performance without advantage normalization: the trajectory-centric one, or the one using reward-to-go?
        \item Did advantage normalization help?
        \item Did the batch size make an impact?
    \end{itemize}
    \item Provide the exact command-line configurations (or \texttt{\#@params} settings in Colab) you used to run your experiments, including any parameters changed from their defaults.
\end{itemize}
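The following is a minimal sketch of one way to pull the logged scalars out of the TensorBoard event files and plot return against environment steps. It assumes the eval return is logged under a tag such as \verb|Eval_AverageReturn| and that the log directories are named as shown (both are assumptions; adjust them to match your runs), and it requires the \texttt{tensorboard} and \texttt{matplotlib} Python packages.
\begin{verbatim}
# Sketch: plot eval return vs. environment steps from TensorBoard logs.
# Tag and directory names are assumptions; adjust to match your logger.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import matplotlib.pyplot as plt

def load_curve(logdir, x_tag="Train_EnvstepsSoFar", y_tag="Eval_AverageReturn"):
    ea = EventAccumulator(logdir)
    ea.Reload()
    steps = [e.value for e in ea.Scalars(x_tag)]    # environment steps so far
    returns = [e.value for e in ea.Scalars(y_tag)]  # average eval return
    n = min(len(steps), len(returns))               # guard against length mismatch
    return steps[:n], returns[:n]

for logdir, label in [("data/cartpole", "cartpole"),
                      ("data/cartpole_rtg", "cartpole_rtg")]:
    x, y = load_curve(logdir)
    plt.plot(x, y, label=label)
plt.xlabel("environment steps")
plt.ylabel("average eval return")
plt.legend()
plt.show()
\end{verbatim}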
\newpage\section{Neural Network Baseline}
\begin{itemize}
    \item Plot a learning curve for the baseline loss.
    \item Plot a learning curve for the eval return. You should expect to achieve an average return over 300 for the baselined version.
    \item Run another experiment with a decreased number of baseline gradient steps (\verb|-bgs|) and/or baseline learning rate (\verb|-blr|). How does this affect (a) the baseline learning curve and (b) the performance of the policy? (A minimal sketch of the baseline regression these flags control follows this list.)
    \item \textbf{Optional:} Add \verb|-na| back to see how much it improves things. Also, set \verb|video_log_freq 10|, then open TensorBoard and go to the ``Images'' tab to see some videos of your HalfCheetah walking along!
\end{itemize}
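A minimal sketch of the idea behind the baseline flags, assuming a PyTorch setup; all names here are hypothetical, and the assignment's own code may be organized differently. The baseline is a small network $V_\phi(s)$ regressed onto reward-to-go targets, and \verb|-bgs| / \verb|-blr| correspond to the number of gradient steps and the learning rate of this regression.
\begin{verbatim}
# Sketch (not the assignment's actual code): a state-value baseline fit by
# regression onto reward-to-go targets.
import torch
import torch.nn as nn

class ValueBaseline(nn.Module):
    def __init__(self, ob_dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(ob_dim, hidden), nn.Tanh(),
            nn.Linear(hidden, hidden), nn.Tanh(),
            nn.Linear(hidden, 1),
        )

    def forward(self, obs):
        return self.net(obs).squeeze(-1)

def update_baseline(baseline, optimizer, obs, q_values, num_grad_steps):
    """Regress V_phi(s) onto reward-to-go targets; num_grad_steps plays
    the role of -bgs, the optimizer's learning rate that of -blr."""
    for _ in range(num_grad_steps):
        loss = nn.functional.mse_loss(baseline(obs), q_values)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss.item()

# advantages = q_values - baseline(obs).detach()  # used by the policy update
\end{verbatim}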
\newpage\section{Generalized Advantage Estimation}
\begin{itemize}
    \item Provide a single plot with the learning curves for the \verb|LunarLander-v2| experiments that you tried. Describe in words how $\lambda$ affected task performance. The run with the best performance should achieve an average score close to 200 (180+).
    \item Consider the parameter $\lambda$. What does $\lambda = 0$ correspond to? What about $\lambda = 1$? Relate this to the task performance in \verb|LunarLander-v2| in one or two sentences. (A sketch of the GAE recursion is given after this list for reference.)
\end{itemize}
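For reference, generalized advantage estimation (Schulman et al., 2016) sums discounted TD residuals and is usually computed by a backward recursion over each trajectory. A minimal sketch, assuming \texttt{values} holds $V(s_t)$ for the trajectory plus one bootstrap entry for the state after the final step (zero if the episode terminated); names are hypothetical:
\begin{verbatim}
# Sketch: backward recursion for GAE(gamma, lambda) on one finite trajectory.
import numpy as np

def gae_advantages(rewards, values, gamma, lam):
    T = len(rewards)                  # values has length T + 1 (bootstrap appended)
    advantages = np.zeros(T)
    next_advantage = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] - values[t]  # TD residual
        next_advantage = delta + gamma * lam * next_advantage
        advantages[t] = next_advantage
    return advantages
\end{verbatim}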
\newpage\section{Hyperparameter Tuning}
\begin{enumerate}
    \item Provide a set of hyperparameters that achieve high return on \verb|InvertedPendulum-v4| in as few environment steps as possible.
    \item Show learning curves for the average returns with your hyperparameters and with the default settings, with environment steps on the $x$-axis. Returns should be averaged over 5 seeds (a sketch of one way to average curves across seeds follows this list).
\end{enumerate}
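One way to satisfy the seed-averaging requirement is to interpolate each seed's curve onto a common grid of environment steps before averaging, since different runs log at different step counts. A minimal sketch (helper and variable names are hypothetical):
\begin{verbatim}
# Sketch: average learning curves over several seeds by interpolating each
# run onto a shared grid of environment steps before taking the mean.
import numpy as np

def average_over_seeds(curves, num_points=100):
    """curves: list of (steps, returns) pairs, one per seed."""
    max_step = min(steps[-1] for steps, _ in curves)   # common support
    grid = np.linspace(0, max_step, num_points)
    stacked = np.stack([np.interp(grid, steps, rets) for steps, rets in curves])
    return grid, stacked.mean(axis=0), stacked.std(axis=0)

# grid, mean, std = average_over_seeds([(steps_1, returns_1), ..., (steps_5, returns_5)])
# plt.plot(grid, mean); plt.fill_between(grid, mean - std, mean + std, alpha=0.3)
\end{verbatim}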
\newpage\section{(Extra Credit) Humanoid}
\begin{enumerate}
    \item Plot a learning curve for the \verb|Humanoid-v4| environment. You should expect to achieve an average return of at least 600 by the end of training. Discuss what changes, if any, you made to complete this problem (for example: optimizations to the original code, hyperparameter changes, algorithmic changes).
\end{enumerate}
\newpage\section{Analysis}
\label{sec:analysis}
Consider the following infinite-horizon MDP:
% https://q.uiver.app/#q=WzAsMixbMCwwLCJzIl0sWzAsMSwic19GIl0sWzAsMSwiYV8yIl1d
\[\begin{tikzcd}
	s_1 & {s_F}
	\arrow["{a_2}", from=1-1, to=1-2]
	\arrow["{a_1}", loop left, from=1-1, to=1-1]
\end{tikzcd}\]
\newcommand{\Rmax}[0]{R_{\textrm{max}}}
\newcommand{\E}[0]{\mathbb{E}}
\newcommand{\var}[0]{\textrm{Var}}
\crefname{question}{part}{parts}
\Crefname{question}{Part}{Parts}
\newcommand\question[1][]{\item\refstepcounter{subsection}\label[question]{#1}}
At each step, the agent stays in state $s_1$ and receives reward 1 if it takes action $a_1$, and receives reward 0 and terminates the episode otherwise. Parametrize the policy as stationary (not dependent on time) with a single parameter:
\[\pi_\theta(a_1 \mid s_1) = \theta, \qquad \pi_\theta(a_2 \mid s_1) = 1-\theta.\]
\begin{enumerate}
\question[sec:analysis1] Applying policy gradients
\begin{enumerate}
    \item Use policy gradients to compute the gradient of the expected return $\E_{\tau \sim \pi_\theta} R(\tau)$ with respect to the parameter $\theta$. \textbf{Do not use discounting.}
\textbf{Hint}: to compute $\sum_{k=1}^\infty k\alpha^{k-1}$, you can write: \[\sum_{k=1}^\infty k\alpha^{k-1} = \sum_{k=1}^\infty \frac{d}{d\alpha}\alpha^k = \frac{d}{d\alpha}\sum_{k=1}^\infty\alpha^k\]
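For reference, the inner sum is a geometric series: for $|\alpha| < 1$, $\sum_{k=1}^\infty \alpha^k = \frac{\alpha}{1-\alpha}$, so differentiating gives
\[\sum_{k=1}^\infty k\alpha^{k-1} = \frac{d}{d\alpha}\,\frac{\alpha}{1-\alpha} = \frac{1}{(1-\alpha)^2}.\]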
\textbf{Solution:}
\item \label{exact_gradient} Compute the expected return of the policy $\E_{\tau \sim \pi_\theta} R(\tau)$ directly. Compute the gradient of this expression with respect to $\theta$ and verify that this matches the policy gradient.
\textbf{Solution:}
\end{enumerate}
\newpage
\question[sec:analysis2] Compute the variance of the policy gradient in closed form and describe the properties of the variance with respect to $\theta$. For what value(s) of $\theta$ is the variance minimal? Maximal? (Once you have an exact expression for the variance, you can eyeball the min/max.)
\textbf{Hint:} Once you have it expressed as a sum of terms $P(\theta)/Q(\theta)$, where $P$ and $Q$ are polynomials, you can use a symbolic computing program (Mathematica, SymPy, etc.) to simplify it to a single rational expression; a short SymPy sketch follows.
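A minimal SymPy sketch of the kind of simplification the hint has in mind; the expression below is a placeholder, not the variance you are asked to derive.
\begin{verbatim}
# Sketch: collapse a sum of rational terms in theta into one simplified expression.
import sympy as sp

theta = sp.symbols("theta", positive=True)
expr = theta / (1 - theta) + (1 - theta) / theta     # placeholder P/Q terms
simplified = sp.simplify(sp.together(expr))
print(simplified)
# sp.solve(sp.diff(simplified, theta), theta) can then locate stationary points.
\end{verbatim}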
\textbf{Solution:}
\newpage \question[sec:analysis3] Apply return-to-go as an advantage estimator. \begin{enumerate} \item Write the modified policy gradient and confirm that it is unbiased.
\textbf{Solution:}
\item Compute the variance of the return-to-go policy gradient and plot it on $[0, 1]$ alongside the variance of the original estimator.
\textbf{Solution:}
\end{enumerate}
\newpage
\question[sec:analysis4] Consider a finite-horizon $H$-step MDP with sparse reward:
% https://q.uiver.app/#q=WzAsNixbMCwwLCJzXzEiXSxbMSwwLCJzXzIiXSxbMiwwLCJzXzMiXSxbMiwxLCJzX0YiXSxbMywwLCJcXGRvdHMiXSxbNCwwLCJzX0giXSxbMCwxLCJhXzEiXSxbMSwyLCJhXzEiXSxbMCwzLCJhXzIiLDJdLFsyLDQsImFfMSJdLFs0LDVdLFsxLDMsImFfMiJdLFsyLDMsImFfMiIsMV0sWzQsMywiYV8yIiwxXV0=
\[\begin{tikzcd}
	{s_1} & {s_2} & {s_3} & \dots & {s_H} \\
	&& {s_F}
	\arrow["{a_1}", from=1-1, to=1-2]
	\arrow["{a_1}", from=1-2, to=1-3]
	\arrow["{a_2}"', from=1-1, to=2-3]
	\arrow["{a_1}", from=1-3, to=1-4]
	\arrow[from=1-4, to=1-5]
	\arrow["{a_2}", from=1-2, to=2-3]
	\arrow["{a_2}"{description}, from=1-3, to=2-3]
	\arrow["{a_2}"{description}, from=1-4, to=2-3]
\end{tikzcd}\]
The agent receives reward $\Rmax$ if it arrives at $s_H$ and reward $0$ if it arrives at $s_F$ (a terminal state). In other words, the return for a trajectory $\tau$ is given by:
\[R(\tau) = \begin{cases}\Rmax & \tau \textrm{ ends at } s_H \\ 0 & \tau \textrm{ ends at } s_F\end{cases}\]
Using the same policy parametrization as above, consider off-policy policy gradients via importance sampling. Assume we want to compute policy gradients for a policy $\pi_\theta$ with samples drawn from $\pi_{\theta'}$.
\begin{enumerate}
    \item Write the policy gradient with importance sampling. (The generic importance-sampled form is recalled after this list for reference.)
\item Compute its variance.
\end{enumerate}
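For reference, the generic identity (not the worked answer for this MDP): importance sampling rewrites the on-policy expectation under $\pi_\theta$ as an expectation under the behavior policy $\pi_{\theta'}$, with the dynamics cancelling in the trajectory ratio:
\[\nabla_\theta \E_{\tau \sim \pi_\theta}\!\left[R(\tau)\right]
  = \E_{\tau \sim \pi_{\theta'}}\!\left[\frac{\pi_\theta(\tau)}{\pi_{\theta'}(\tau)}\,\nabla_\theta \log \pi_\theta(\tau)\, R(\tau)\right],
  \qquad
  \frac{\pi_\theta(\tau)}{\pi_{\theta'}(\tau)} = \prod_{t} \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta'}(a_t \mid s_t)}.\]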
\end{enumerate}
\newpage\section{Survey}
\label{sec:survey}
Please estimate, in minutes, for each problem, how much time you spent (a) writing code and (b) waiting for the results. This will help us calibrate the difficulty for future homeworks.
\begin{itemize}
    \item \textbf{Policy Gradients:}
    \item \textbf{Neural Network Baseline:}
    \item \textbf{Generalized Advantage Estimation:}
    \item \textbf{Hyperparameters and Sample Efficiency:}
    \item \textbf{Humanoid:}
    \item \textbf{Analysis -- applying policy gradients:}
    \item \textbf{Analysis -- PG variance:}
    \item \textbf{Analysis -- return-to-go:}
    \item \textbf{Analysis -- importance sampling:}
\end{itemize}