From f216fdf1e0e2cd1a1450f9ce5621dd5af814ba89 Mon Sep 17 00:00:00 2001 From: Kaixiang Lin Date: Mon, 24 Jun 2019 18:22:46 -0700 Subject: [PATCH] first commit. --- .gitignore | 175 ++++ README.md | 48 + code.md | 65 ++ dopamine/dopamine/__init__.py | 15 + dopamine/dopamine/agents/__init__.py | 15 + dopamine/dopamine/agents/agent_utils.py | 81 ++ dopamine/dopamine/agents/dqn/__init__.py | 15 + dopamine/dopamine/agents/dqn/configs/dqn.gin | 35 + .../dopamine/agents/dqn/configs/dqn_icml.gin | 33 + .../agents/dqn/configs/dqn_nature.gin | 36 + dopamine/dopamine/agents/dqn/dqn_agent.py | 521 +++++++++++ dopamine/dopamine/agents/dqnrpg/__init__.py | 15 + .../dopamine/agents/dqnrpg/configs/dqnrpg.gin | 35 + .../dopamine/agents/dqnrpg/dqnrpg_agent.py | 585 ++++++++++++ dopamine/dopamine/agents/epg/__init__.py | 15 + dopamine/dopamine/agents/epg/configs/epg.gin | 36 + .../dopamine/agents/epg/configs/epg_pong.gin | 35 + dopamine/dopamine/agents/epg/epg_agent.py | 550 ++++++++++++ .../agents/implicit_quantile/__init__.py | 15 + .../configs/implicit_quantile.gin | 40 + .../configs/implicit_quantile_icml.gin | 37 + .../implicit_quantile_agent.py | 358 ++++++++ .../agents/implicit_quantilerpg/__init__.py | 15 + .../configs/implicit_quantilerpg.gin | 41 + .../implicit_quantilerpg_agent.py | 431 +++++++++ dopamine/dopamine/agents/lpg/__init__.py | 15 + dopamine/dopamine/agents/lpg/configs/lpg.gin | 36 + dopamine/dopamine/agents/lpg/lpg_agent.py | 590 +++++++++++++ dopamine/dopamine/agents/rainbow/__init__.py | 15 + .../dopamine/agents/rainbow/configs/c51.gin | 35 + .../agents/rainbow/configs/c51_icml.gin | 36 + .../agents/rainbow/configs/rainbow.gin | 35 + .../agents/rainbow/configs/rainbow_aaai.gin | 37 + .../dopamine/agents/rainbow/rainbow_agent.py | 504 +++++++++++ .../dopamine/agents/rainbowrpg/__init__.py | 15 + .../agents/rainbowrpg/configs/c51rpg.gin | 35 + .../agents/rainbowrpg/configs/rainbowrpg.gin | 35 + .../agents/rainbowrpg/rainbowrpg_agent.py | 699 +++++++++++++++ dopamine/dopamine/agents/repg/__init__.py | 15 + .../dopamine/agents/repg/configs/repg.gin | 36 + dopamine/dopamine/agents/repg/repg_agent.py | 607 +++++++++++++ dopamine/dopamine/agents/rpg/__init__.py | 15 + dopamine/dopamine/agents/rpg/configs/rpg.gin | 36 + .../dopamine/agents/rpg/configs/rpg_pong.gin | 36 + dopamine/dopamine/agents/rpg/rpg_agent.py | 613 +++++++++++++ dopamine/dopamine/atari/__init__.py | 15 + dopamine/dopamine/atari/preprocessing.py | 216 +++++ dopamine/dopamine/atari/run_experiment.py | 592 +++++++++++++ dopamine/dopamine/atari/train.py | 186 ++++ dopamine/dopamine/common/__init__.py | 14 + dopamine/dopamine/common/checkpointer.py | 177 ++++ .../dopamine/common/iteration_statistics.py | 49 + dopamine/dopamine/common/logger.py | 105 +++ dopamine/dopamine/replay_memory/__init__.py | 15 + .../replay_memory/circular_replay_buffer.py | 835 ++++++++++++++++++ .../prioritized_replay_buffer.py | 327 +++++++ dopamine/dopamine/replay_memory/sum_tree.py | 205 +++++ dopamine/dopamine/utils/__init__.py | 15 + dopamine/dopamine/utils/test_utils.py | 34 + dopamine/gym/preprocessing.py | 54 ++ dopamine/setup.py | 92 ++ 61 files changed, 9623 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 code.md create mode 100644 dopamine/dopamine/__init__.py create mode 100644 dopamine/dopamine/agents/__init__.py create mode 100644 dopamine/dopamine/agents/agent_utils.py create mode 100644 dopamine/dopamine/agents/dqn/__init__.py create mode 100644 dopamine/dopamine/agents/dqn/configs/dqn.gin 
create mode 100644 dopamine/dopamine/agents/dqn/configs/dqn_icml.gin create mode 100644 dopamine/dopamine/agents/dqn/configs/dqn_nature.gin create mode 100644 dopamine/dopamine/agents/dqn/dqn_agent.py create mode 100644 dopamine/dopamine/agents/dqnrpg/__init__.py create mode 100644 dopamine/dopamine/agents/dqnrpg/configs/dqnrpg.gin create mode 100644 dopamine/dopamine/agents/dqnrpg/dqnrpg_agent.py create mode 100644 dopamine/dopamine/agents/epg/__init__.py create mode 100644 dopamine/dopamine/agents/epg/configs/epg.gin create mode 100644 dopamine/dopamine/agents/epg/configs/epg_pong.gin create mode 100644 dopamine/dopamine/agents/epg/epg_agent.py create mode 100644 dopamine/dopamine/agents/implicit_quantile/__init__.py create mode 100644 dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile.gin create mode 100644 dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile_icml.gin create mode 100644 dopamine/dopamine/agents/implicit_quantile/implicit_quantile_agent.py create mode 100644 dopamine/dopamine/agents/implicit_quantilerpg/__init__.py create mode 100644 dopamine/dopamine/agents/implicit_quantilerpg/configs/implicit_quantilerpg.gin create mode 100644 dopamine/dopamine/agents/implicit_quantilerpg/implicit_quantilerpg_agent.py create mode 100644 dopamine/dopamine/agents/lpg/__init__.py create mode 100644 dopamine/dopamine/agents/lpg/configs/lpg.gin create mode 100644 dopamine/dopamine/agents/lpg/lpg_agent.py create mode 100644 dopamine/dopamine/agents/rainbow/__init__.py create mode 100644 dopamine/dopamine/agents/rainbow/configs/c51.gin create mode 100644 dopamine/dopamine/agents/rainbow/configs/c51_icml.gin create mode 100644 dopamine/dopamine/agents/rainbow/configs/rainbow.gin create mode 100644 dopamine/dopamine/agents/rainbow/configs/rainbow_aaai.gin create mode 100644 dopamine/dopamine/agents/rainbow/rainbow_agent.py create mode 100644 dopamine/dopamine/agents/rainbowrpg/__init__.py create mode 100644 dopamine/dopamine/agents/rainbowrpg/configs/c51rpg.gin create mode 100644 dopamine/dopamine/agents/rainbowrpg/configs/rainbowrpg.gin create mode 100644 dopamine/dopamine/agents/rainbowrpg/rainbowrpg_agent.py create mode 100644 dopamine/dopamine/agents/repg/__init__.py create mode 100644 dopamine/dopamine/agents/repg/configs/repg.gin create mode 100644 dopamine/dopamine/agents/repg/repg_agent.py create mode 100644 dopamine/dopamine/agents/rpg/__init__.py create mode 100644 dopamine/dopamine/agents/rpg/configs/rpg.gin create mode 100644 dopamine/dopamine/agents/rpg/configs/rpg_pong.gin create mode 100644 dopamine/dopamine/agents/rpg/rpg_agent.py create mode 100644 dopamine/dopamine/atari/__init__.py create mode 100644 dopamine/dopamine/atari/preprocessing.py create mode 100644 dopamine/dopamine/atari/run_experiment.py create mode 100644 dopamine/dopamine/atari/train.py create mode 100644 dopamine/dopamine/common/__init__.py create mode 100644 dopamine/dopamine/common/checkpointer.py create mode 100644 dopamine/dopamine/common/iteration_statistics.py create mode 100644 dopamine/dopamine/common/logger.py create mode 100644 dopamine/dopamine/replay_memory/__init__.py create mode 100644 dopamine/dopamine/replay_memory/circular_replay_buffer.py create mode 100644 dopamine/dopamine/replay_memory/prioritized_replay_buffer.py create mode 100644 dopamine/dopamine/replay_memory/sum_tree.py create mode 100644 dopamine/dopamine/utils/__init__.py create mode 100644 dopamine/dopamine/utils/test_utils.py create mode 100644 dopamine/gym/preprocessing.py create mode 100644 
dopamine/setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3c3d5de --- /dev/null +++ b/.gitignore @@ -0,0 +1,175 @@ +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +.DS_Store +*/temp/* +*.pyc +*./.idea/* +.idea/* +*.DS_Store* +*.ipynb_checkpoints/* +notebooks/.ipynb_checkpoints/* +*.dropbox* +*Icon* +*/__pycache__/* +*/.ipynb_checkpoints/* +## Intermediate documents: +*.dvi +*-converted-to.* +# these rules might exclude image files for figures etc. +# *.ps +# *.eps +# *.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.brf +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.snm +*.vrb + +# cprotect +*.cpt + +#(e)ledmac/(e)ledpar +*.end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls + +# gnuplottex +*-gnuplottex-* + +# hyperref +*.brf + +# knitr +*-concordance.tex +*.tikz +*-tikzDictionary + +# listings +*.lol + +# makeidx +*.idx +*.ilg +*.ind +*.ist + +# minitoc +*.maf +*.mtc +*.mtc[0-9] +*.mtc[1-9][0-9] + +# minted +_minted* +*.pyg +*.pyc +# morewrites +*.mw + +# mylatexformat +*.fmt + +# nomencl +*.nlo + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +#pythontex +*.pytxcode +pythontex-files-*/ + +# Texpad +.texpadtmp + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# todonotes +*.tdo + +# xindy +*.xdy + +# xypic precompiled matrices +*.xyc + +# WinEdt +*.bak +*.sav + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +main.pdf + +*.dropbox* + diff --git a/README.md b/README.md new file mode 100644 index 0000000..b5c107d --- /dev/null +++ b/README.md @@ -0,0 +1,48 @@ +# Ranking Policy Gradient +Ranking Policy Gradient (RPG) is a sample-efficienct policy gradient method +that learns optimal ranking of actions with respect to the long term reward. +This codebase contains the implementation of RPG using the +[dopamine](https://github.com/google/dopamine) framework. + + +## Instructions + + +### Install via source +#### Step 1. +Follow the install [instruction](https://github.com/KaixiangLin/dopamine/blob/master/README.md#install-via-source) of +dopamine framework for [Ubuntu](https://github.com/KaixiangLin/dopamine/blob/master/README.md#ubuntu) +or [Max OS X](https://github.com/KaixiangLin/dopamine/blob/master/README.md#mac-os-x). + +#### Step 2. +Download the RPG source, i.e. + +``` +git clone git@github.com:illidanlab/rpg.git +``` + + +## Running the tests + +``` +cd ./rpg/dopamine +python -um dopamine.atari.train \ + --agent_name=rpg \ + --base_dir=/tmp/dopamine \ + --random_seed 1 \ + --game_name=Pong \ + --gin_files='dopamine/agents/rpg/configs/rpg.gin' +``` + +## Reproduce +To reproduce the results in the paper, please refer to the instruction in [here](code.md). + +### Reference + +If you use this RPG implementation in your work, please consider citing the following papers: +``` +TODO(RPG): +``` + +## Acknowledgments +TODO(dopamine framework, fundings). 
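RPG, as described in the README above, learns an ordering of actions rather than their exact values: the supervision signal is a pairwise hinge (ranking) loss over per-action scores, minimized on near-optimal trajectories. A minimal sketch of that loss in the TF1 style used by these agents; the function and argument names are illustrative, not identifiers from this patch:

```python
import tensorflow as tf  # TensorFlow 1.x, as used throughout this patch


def ranking_hinge_loss(scores, actions, num_actions, margin=1.0):
  """Pairwise hinge loss over per-action scores (sketch).

  Every action other than the one taken on a near-optimal trajectory should
  score at least `margin` below the taken action.

  Args:
    scores: [batch, num_actions] float tensor of per-action scores (e.g. the
      RPG network's Q-values).
    actions: [batch] int tensor of actions from near-optimal trajectories.
    num_actions: int, number of discrete actions.
    margin: float, required gap between taken and non-taken action scores.

  Returns:
    A scalar loss tensor.
  """
  one_hot = tf.one_hot(actions, num_actions, 1.0, 0.0)
  chosen = tf.reduce_sum(scores * one_hot, axis=1)       # score of taken action
  gap = scores + margin - tf.expand_dims(chosen, 1)      # per-action violation
  hinge = tf.maximum(0.0, gap) * (1.0 - one_hot)         # ignore the taken action
  return tf.reduce_mean(hinge)
```

This mirrors the hinge term built in `_build_train_op` of the RPG-style agents below (e.g. `dqnrpg_agent.py`), where the scores are the RPG network's Q-values and the actions come from the near-optimal replay buffer.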
diff --git a/code.md b/code.md
new file mode 100644
index 0000000..8e2f717
--- /dev/null
+++ b/code.md
@@ -0,0 +1,65 @@
+# Overview
+
+This document explains the structure of this codebase and the hyperparameters of the experiments.
+
+
+## File organization
+
+### Step 1.
+Please refer to the description of the dopamine file organization [here](https://github.com/KaixiangLin/dopamine/blob/master/docs/README.md#file-organization).
+
+### Step 2.
+We add the RPG agent variants in [this folder](dopamine/dopamine/agents) and explain each of them below:
+
+
+| Folder | Exploration | Supervision |
+|---|---|---|
+| rpg | epsilon-greedy | RPG (Hinge loss) |
+| lpg | epsilon-greedy | LPG (Cross-Entropy) |
+| epg | EPG | LPG (Cross-Entropy) |
+| repg | EPG | RPG (Hinge loss) |
+| implicit_quantilerpg | implicit_quantile | RPG (Hinge loss) |
+
+
+* EPG: EPG is the stochastic listwise policy gradient with off-policy supervised learning,
+i.e., the vanilla policy gradient trained with off-policy supervised learning.
+The exploration and supervision agents are parameterized
+by the same neural network. The supervision agent minimizes the cross-entropy loss
+over the near-optimal trajectories collected in an online fashion.
+
+* LPG: LPG is the deterministic listwise policy gradient with off-policy supervised learning.
+During evaluation we choose an action greedily based on the logits, while during training it
+explores the environment stochastically, as EPG does.
+
+* RPG: RPG explores the environment using a separate agent: epsilon-greedy or EPG in Pong, and
+Implicit Quantile in the other games. RPG then conducts supervised
+learning by minimizing the hinge loss.
+
+In this codebase, the folder [rpg](dopamine/dopamine/agents/rpg)
+contains the implementation of RPG with epsilon-greedy exploration; similarly, [repg](dopamine/dopamine/agents/repg) uses EPG exploration and
+[implicit_quantilerpg](dopamine/dopamine/agents/implicit_quantilerpg)
+uses implicit quantile network exploration.
+
+The agents with relatively simple exploration strategies (rpg, lpg, epg, repg) perform well on Pong
+compared to the state of the art, since there is a higher chance of hitting good trajectories in Pong.
+For more complicated games, we adopt the implicit quantile network as the exploration agent.
+
+## Hyperparameters
+The hyperparameters of the networks, optimizers, etc. are the same as for the [baselines](https://github.com/KaixiangLin/dopamine/tree/master/baselines) in dopamine.
+The trajectory reward threshold c (see Def. 5 in the paper (TODO)) for each game is given below:
+
+| game | c |
+|---|---|
+| Boxing | 100 |
+| Breakout | 400 |
+| Bowling | 80 |
+| BankHeist | 1100 |
+| DoubleDunk | 18 |
+| Pitfall | 0 |
+| Pong | 1 |
+| Robotank | 65 |
+
+
+
+
diff --git a/dopamine/dopamine/__init__.py b/dopamine/dopamine/__init__.py
new file mode 100644
index 0000000..f2b5d90
--- /dev/null
+++ b/dopamine/dopamine/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2018 The Dopamine Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+name = 'dopamine' diff --git a/dopamine/dopamine/agents/__init__.py b/dopamine/dopamine/agents/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/agent_utils.py b/dopamine/dopamine/agents/agent_utils.py new file mode 100644 index 0000000..38b6d78 --- /dev/null +++ b/dopamine/dopamine/agents/agent_utils.py @@ -0,0 +1,81 @@ +import random +from collections import deque + +def collect_trajectory(agent, reward): + """for pong """ + if reward < 0: + agent.replay_buffer.clear() + elif reward > 0: + agent.replay_buffer.add(agent._last_observation, agent.action, reward, False) + while agent.replay_buffer.size() > 0: + experience = agent.replay_buffer.get_sample() + state, action, reward, _ = experience + agent._store_transition(state, action, reward, False) + else: + agent.replay_buffer.add(agent._last_observation, agent.action, reward, False) + + + +class ReplayBufferRegular(object): + """ for uniformly sampling. + + """ + + def __init__(self, buffer_size, random_seed=1234): + self.buffer_size = buffer_size + self.count = 0 + # Right side of deque contains newest experience + self.buffer = deque() + random.seed(random_seed) + self.ptr, self.path_start_idx = 0, 0 + + def add(self, state, action, reward, terminal): + experience = [state, action, reward, terminal] + assert self.count < self.buffer_size + self.buffer.append(experience) + self.count += 1 + self.ptr += 1 + # else: + # self.path_start_idx -= 1 + # self.ptr = self.buffer_size - 1 + # self.buffer.popleft() + # self.buffer.append(experience) + + def get_sample(self): + self.count -= 1 + return self.buffer.popleft() + + def size(self): + return self.count + + def clear(self): + self.buffer.clear() + self.count = 0 + self.ptr = 0 + self.path_start_idx = 0 + + + +""" Threshold of episodic return for each game """ +# we only collect trajectory that has return larger than following +episodic_return = {"Pong": 21, + "Breakout": 200, + "Bowling": 80, + "Boxing": 100, + "Freeway": 33, + "BankHeist": 1100, + "Robotank": 65, + "Pitfall": 0, + "DoubleDunk":18} + +# When we have the return on evaluation phase that is greater than following, +# we stop training +episodic_return_switch = {"Pong": 21, + "Breakout": 200, + "Bowling": 80, # maximum can be more than as 93 + "Boxing": 100, + "Freeway": 32, + "BankHeist": 1100, + "Robotank": 65, + "Pitfall": -0.1, + "DoubleDunk":18} diff --git a/dopamine/dopamine/agents/dqn/__init__.py b/dopamine/dopamine/agents/dqn/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/dqn/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/dqn/configs/dqn.gin b/dopamine/dopamine/agents/dqn/configs/dqn.gin new file mode 100644 index 0000000..06c3427 --- /dev/null +++ b/dopamine/dopamine/agents/dqn/configs/dqn.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.dqn.dqn_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +DQNAgent.gamma = 0.99 +DQNAgent.update_horizon = 1 +DQNAgent.min_replay_history = 20000 # agent steps +DQNAgent.update_period = 4 +DQNAgent.target_update_period = 8000 # agent steps +DQNAgent.epsilon_train = 0.01 +DQNAgent.epsilon_eval = 0.001 +DQNAgent.epsilon_decay_period = 250000 # agent steps +DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +DQNAgent.optimizer = @tf.train.RMSPropOptimizer() + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = True +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/dqn/configs/dqn_icml.gin b/dopamine/dopamine/agents/dqn/configs/dqn_icml.gin new file mode 100644 index 0000000..c4dcb24 --- /dev/null +++ b/dopamine/dopamine/agents/dqn/configs/dqn_icml.gin @@ -0,0 +1,33 @@ +# Hyperparameters used for reporting DQN results in Bellemare et al. (2017). +import dopamine.atari.run_experiment +import dopamine.agents.dqn.dqn_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +DQNAgent.gamma = 0.99 +DQNAgent.update_horizon = 1 +DQNAgent.min_replay_history = 50000 # agent steps +DQNAgent.update_period = 4 +DQNAgent.target_update_period = 10000 # agent steps +DQNAgent.epsilon_train = 0.01 +DQNAgent.epsilon_eval = 0.001 +DQNAgent.epsilon_decay_period = 1000000 # agent steps +DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +DQNAgent.optimizer = @tf.train.RMSPropOptimizer() + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015). 
+Runner.sticky_actions = False +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/dqn/configs/dqn_nature.gin b/dopamine/dopamine/agents/dqn/configs/dqn_nature.gin new file mode 100644 index 0000000..024bff4 --- /dev/null +++ b/dopamine/dopamine/agents/dqn/configs/dqn_nature.gin @@ -0,0 +1,36 @@ +# Hyperparameters used in Mnih et al. (2015). +import dopamine.atari.preprocessing +import dopamine.atari.run_experiment +import dopamine.agents.dqn.dqn_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +DQNAgent.gamma = 0.99 +DQNAgent.update_horizon = 1 +DQNAgent.min_replay_history = 50000 # agent steps +DQNAgent.update_period = 4 +DQNAgent.target_update_period = 10000 # agent steps +DQNAgent.epsilon_train = 0.1 +DQNAgent.epsilon_eval = 0.05 +DQNAgent.epsilon_decay_period = 1000000 # agent steps +DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +DQNAgent.optimizer = @tf.train.RMSPropOptimizer() + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015). +Runner.sticky_actions = False +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +AtariPreprocessing.terminal_on_life_loss = True + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/dqn/dqn_agent.py b/dopamine/dopamine/agents/dqn/dqn_agent.py new file mode 100644 index 0000000..3509441 --- /dev/null +++ b/dopamine/dopamine/agents/dqn/dqn_agent.py @@ -0,0 +1,521 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a DQN agent.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random + + + +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf + +import gin.tf + +slim = tf.contrib.slim + + +NATURE_DQN_OBSERVATION_SHAPE = (84, 84) # Size of downscaled Atari 2600 frame. +NATURE_DQN_DTYPE = tf.uint8 # DType of Atari 2600 observations. +NATURE_DQN_STACK_SIZE = 4 # Number of frames in the state stack. + + +def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon): + """Returns the current epsilon for the agent's epsilon-greedy policy. 
+ + This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et + al., 2015). The schedule is as follows: + Begin at 1. until warmup_steps steps have been taken; then + Linearly decay epsilon from 1. to epsilon in decay_period steps; and then + Use epsilon from there on. + + Args: + decay_period: float, the period over which epsilon is decayed. + step: int, the number of training steps completed so far. + warmup_steps: int, the number of steps taken before epsilon is decayed. + epsilon: float, the final value to which to decay the epsilon parameter. + + Returns: + A float, the current epsilon value computed according to the schedule. + """ + steps_left = decay_period + warmup_steps - step + bonus = (1.0 - epsilon) * steps_left / decay_period + bonus = np.clip(bonus, 0., 1. - epsilon) + return epsilon + bonus + + +@gin.configurable +class DQNAgent(object): + """An implementation of the DQN agent.""" + + def __init__(self, + sess, + num_actions, + observation_shape=NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=NATURE_DQN_DTYPE, + stack_size=NATURE_DQN_STACK_SIZE, + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + target_update_period=8000, + epsilon_fn=linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + tf_device='/cpu:*', + use_staging=True, + max_tf_checkpoints_to_keep=3, + optimizer=tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints describing the observation shape. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to + keep. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. 
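+
+      Example (illustrative; in this codebase the agent is normally built and
+      initialized by the Runner in dopamine/atari/run_experiment.py from gin
+      bindings):
+
+        sess = tf.Session()
+        agent = DQNAgent(sess, num_actions=4)
+        sess.run(tf.global_variables_initializer())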
+ """ + assert isinstance(observation_shape, tuple) + tf.logging.info('Creating %s agent with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t gamma: %f', gamma) + tf.logging.info('\t update_horizon: %f', update_horizon) + tf.logging.info('\t min_replay_history: %d', min_replay_history) + tf.logging.info('\t update_period: %d', update_period) + tf.logging.info('\t target_update_period: %d', target_update_period) + tf.logging.info('\t epsilon_train: %f', epsilon_train) + tf.logging.info('\t epsilon_eval: %f', epsilon_eval) + tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period) + tf.logging.info('\t tf_device: %s', tf_device) + tf.logging.info('\t use_staging: %s', use_staging) + tf.logging.info('\t optimizer: %s', optimizer) + + self.num_actions = num_actions + self.observation_shape = tuple(observation_shape) + self.observation_dtype = observation_dtype + self.stack_size = stack_size + self.gamma = gamma + self.update_horizon = update_horizon + self.cumulative_gamma = math.pow(gamma, update_horizon) + self.min_replay_history = min_replay_history + self.target_update_period = target_update_period + self.epsilon_fn = epsilon_fn + self.epsilon_train = epsilon_train + self.epsilon_eval = epsilon_eval + self.epsilon_decay_period = epsilon_decay_period + self.update_period = update_period + self.eval_mode = False + self.training_steps = 0 + self.optimizer = optimizer + self.summary_writer = summary_writer + self.summary_writing_frequency = summary_writing_frequency + + with tf.device(tf_device): + # Create a placeholder for the state input to the DQN network. + # The last axis indicates the number of consecutive frames stacked. + state_shape = (1,) + self.observation_shape + (stack_size,) + self.state = np.zeros(state_shape) + self.state_ph = tf.placeholder(self.observation_dtype, state_shape, + name='state_ph') + self._replay = self._build_replay_buffer(use_staging) + + self._build_networks() + + self._train_op = self._build_train_op() + self._sync_qt_ops = self._build_sync_op() + + if self.summary_writer is not None: + # All tf.summaries should have been defined prior to running this. + self._merged_summaries = tf.summary.merge_all() + self._sess = sess + self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep) + + # Variables to be initialized by the agent once it interacts with the + # environment. + self._observation = None + self._last_observation = None + + def _get_network_type(self): + """Returns the type of the outputs of a Q value network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('DQN_network', ['q_values']) + + def _network_template(self, state): + """Builds the convolutional network used to compute the agent's Q-values. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) + net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. 
+ self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + self._net_outputs = self.online_convnet(self.state_ph) + # TODO(bellemare): Ties should be broken. They are unlikely to happen when + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + self._replay_next_target_net_outputs = self.target_convnet( + self._replay.next_states) + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_target_q_op(self): + """Build an op used as a target for the Q-value. + + Returns: + target_q_op: An op calculating the Q-value. + """ + # Get the maximum Q-value across the actions dimension. + replay_next_qt_max = tf.reduce_max( + self._replay_next_target_net_outputs.q_values, 1) + # Calculate the Bellman target value. + # Q_t = R_t + \gamma^N * Q'_t+1 + # where, + # Q'_t+1 = \argmax_a Q(S_t+1, a) + # (or) 0 if S_t is a terminal state, + # and + # N is the update horizon (by default, N=1). + return self._replay.rewards + self.cumulative_gamma * replay_next_qt_max * ( + 1. - tf.cast(self._replay.terminals, tf.float32)) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + replay_action_one_hot = tf.one_hot( + self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') + replay_chosen_q = tf.reduce_sum( + self._replay_net_outputs.q_values * replay_action_one_hot, + reduction_indices=1, + name='replay_chosen_q') + + target = tf.stop_gradient(self._build_target_q_op()) + loss = tf.losses.huber_loss( + target, replay_chosen_q, reduction=tf.losses.Reduction.NONE) + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('HuberLoss', tf.reduce_mean(loss)) + return self.optimizer.minimize(tf.reduce_mean(loss)) + + def _build_sync_op(self): + """Builds ops for assigning weights from online to target network. + + Returns: + ops: A list of ops assigning weights from online to target network. 
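+
+      Each op is a hard copy of an online variable into the corresponding
+      target variable (no Polyak averaging); _train_step runs these ops every
+      target_update_period training steps.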
+ """ + # Get trainable variables from online and target DQNs + sync_qt_ops = [] + trainables_online = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, scope='Online') + trainables_target = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, scope='Target') + for (w_online, w_target) in zip(trainables_online, trainables_target): + # Assign weights from online to target network. + sync_qt_ops.append(w_target.assign(w_online, use_locking=True)) + return sync_qt_ops + + def begin_episode(self, observation): + """Returns the agent's first action for this episode. + + Args: + observation: numpy array, the environment's initial observation. + + Returns: + int, the selected action. + """ + self._reset_state() + self._record_observation(observation) + + if not self.eval_mode: + self._train_step() + + self.action = self._select_action() + return self.action + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. + + Args: + reward: float, the reward received from the agent's most recent action. + observation: numpy array, the most recent observation. + + Returns: + int, the selected action. + """ + self._last_observation = self._observation + self._record_observation(observation) + + if not self.eval_mode: + self._store_transition(self._last_observation, self.action, reward, False) + self._train_step() + + self.action = self._select_action() + return self.action + + def end_episode(self, reward): + """Signals the end of the episode to the agent. + + We store the observation of the current time step, which is the last + observation of the episode. + + Args: + reward: float, the last reward from the environment. + """ + if not self.eval_mode: + self._store_transition(self._observation, self.action, reward, True) + + def _select_action(self): + """Select an action from the set of available actions. + + Chooses an action randomly with probability self._calculate_epsilon(), and + otherwise acts greedily according to the current Q-value estimates. + + Returns: + int, the selected action. + """ + epsilon = self.epsilon_eval if self.eval_mode else self.epsilon_fn( + self.epsilon_decay_period, + self.training_steps, + self.min_replay_history, + self.epsilon_train) + if random.random() <= epsilon: + # Choose a random action with probability epsilon. + return random.randint(0, self.num_actions - 1) + else: + # Choose the action with highest Q-value at the current state. + return self._sess.run(self._q_argmax, {self.state_ph: self.state}) + + def _train_step(self): + """Runs a single training step. + + Runs a training op if both: + (1) A minimum number of frames have been added to the replay buffer. + (2) `training_steps` is a multiple of `update_period`. + + Also, syncs weights from online to target network if training steps is a + multiple of target update period. + """ + # Run a train op at the rate of self.update_period if enough training steps + # have been run. This matches the Nature DQN behaviour. 
+ if self._replay.memory.add_count > self.min_replay_history: + if self.training_steps % self.update_period == 0: + self._sess.run(self._train_op) + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + if self.training_steps % self.target_update_period == 0: + self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + def _record_observation(self, observation): + """Records an observation and update state. + + Extracts a frame from the observation vector and overwrites the oldest + frame in the state buffer. + + Args: + observation: numpy array, an observation from the environment. + """ + # Set current observation. We do the reshaping to handle environments + # without frame stacking. + observation = np.reshape(observation, self.observation_shape) + self._observation = observation[..., 0] + self._observation = np.reshape(observation, self.observation_shape) + # Swap out the oldest frame with the current frame. + self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. + self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. 
If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True diff --git a/dopamine/dopamine/agents/dqnrpg/__init__.py b/dopamine/dopamine/agents/dqnrpg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/dqnrpg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/dqnrpg/configs/dqnrpg.gin b/dopamine/dopamine/agents/dqnrpg/configs/dqnrpg.gin new file mode 100644 index 0000000..da6d1bb --- /dev/null +++ b/dopamine/dopamine/agents/dqnrpg/configs/dqnrpg.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.dqnrpg.dqnrpg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +DQNRPGAgent.gamma = 0.99 +DQNRPGAgent.update_horizon = 1 +DQNRPGAgent.min_replay_history = 20000 # agent steps +DQNRPGAgent.update_period = 4 +DQNRPGAgent.target_update_period = 8000 # agent steps +DQNRPGAgent.epsilon_train = 0.01 +DQNRPGAgent.epsilon_eval = 0 +DQNRPGAgent.epsilon_decay_period = 250000 # agent steps +DQNRPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +DQNRPGAgent.optimizer = @tf.train.RMSPropOptimizer() + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Breakout' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
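+# This configuration, however, disables sticky actions (deterministic ALE).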
+Runner.sticky_actions = False +Runner.num_iterations = 30 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/dqnrpg/dqnrpg_agent.py b/dopamine/dopamine/agents/dqnrpg/dqnrpg_agent.py new file mode 100644 index 0000000..6291d8c --- /dev/null +++ b/dopamine/dopamine/agents/dqnrpg/dqnrpg_agent.py @@ -0,0 +1,585 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a DQN agent.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random + +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf + +import gin.tf +from dopamine.agents.agent_utils import * +slim = tf.contrib.slim + +NATURE_DQN_OBSERVATION_SHAPE = (84, 84) # Size of downscaled Atari 2600 frame. +NATURE_DQN_DTYPE = tf.uint8 # DType of Atari 2600 observations. +NATURE_DQN_STACK_SIZE = 4 # Number of frames in the state stack. + + +def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon): + """Returns the current epsilon for the agent's epsilon-greedy policy. + + This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et + al., 2015). The schedule is as follows: + Begin at 1. until warmup_steps steps have been taken; then + Linearly decay epsilon from 1. to epsilon in decay_period steps; and then + Use epsilon from there on. + + Args: + decay_period: float, the period over which epsilon is decayed. + step: int, the number of training steps completed so far. + warmup_steps: int, the number of steps taken before epsilon is decayed. + epsilon: float, the final value to which to decay the epsilon parameter. + + Returns: + A float, the current epsilon value computed according to the schedule. + """ + steps_left = decay_period + warmup_steps - step + bonus = (1.0 - epsilon) * steps_left / decay_period + bonus = np.clip(bonus, 0., 1. 
- epsilon) + return epsilon + bonus + + +@gin.configurable +class DQNRPGAgent(object): + """An implementation of the DQN agent.""" + + def __init__(self, + sess, + num_actions, + observation_shape=NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=NATURE_DQN_DTYPE, + stack_size=NATURE_DQN_STACK_SIZE, + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + target_update_period=8000, + epsilon_fn=linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + tf_device='/cpu:*', + use_staging=True, + max_tf_checkpoints_to_keep=3, + optimizer=tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints describing the observation shape. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to + keep. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. 
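+
+    Compared to DQNAgent, this agent additionally builds an RPG network
+    (variable scope 'RPG') with its own RMSProp optimizer and a second
+    replay buffer (self._replay_opt) that only receives near-optimal
+    trajectories. The RPG network is trained with a hinge loss and is the
+    network used for action selection in eval mode.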
+ """ + assert isinstance(observation_shape, tuple) + tf.logging.info('Creating %s agent with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t gamma: %f', gamma) + tf.logging.info('\t update_horizon: %f', update_horizon) + tf.logging.info('\t min_replay_history: %d', min_replay_history) + tf.logging.info('\t update_period: %d', update_period) + tf.logging.info('\t target_update_period: %d', target_update_period) + tf.logging.info('\t epsilon_train: %f', epsilon_train) + tf.logging.info('\t epsilon_eval: %f', epsilon_eval) + tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period) + tf.logging.info('\t tf_device: %s', tf_device) + tf.logging.info('\t use_staging: %s', use_staging) + tf.logging.info('\t optimizer: %s', optimizer) + + self.num_actions = num_actions + self.observation_shape = tuple(observation_shape) + self.observation_dtype = observation_dtype + self.stack_size = stack_size + self.gamma = gamma + self.update_horizon = update_horizon + self.cumulative_gamma = math.pow(gamma, update_horizon) + self.min_replay_history = min_replay_history + self.target_update_period = target_update_period + self.epsilon_fn = epsilon_fn + self.epsilon_train = epsilon_train + self.epsilon_eval = epsilon_eval + self.epsilon_decay_period = epsilon_decay_period + self.update_period = update_period + self.eval_mode = False + self.training_steps = 0 + self.optimizer = optimizer # DQN optimizer. + self.optimizer_rpg = tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True) # optimizer for RPG + self.summary_writer = summary_writer + self.summary_writing_frequency = summary_writing_frequency + self.start_training = 1000 # todo task specific + + with tf.device(tf_device): + # Create a placeholder for the state input to the DQN network. + # The last axis indicates the number of consecutive frames stacked. + state_shape = (1,) + self.observation_shape + (stack_size,) + self.state = np.zeros(state_shape) + self.state_ph = tf.placeholder(self.observation_dtype, state_shape, + name='state_ph') + self._replay = self._build_replay_buffer(use_staging) + self._replay_opt = self._build_replay_buffer(use_staging) # store optimal trajectory + self._build_networks() + + self._train_op, self._train_op_rpg = self._build_train_op() + self._sync_qt_ops = self._build_sync_op() + + # replay buffer for rpg. only store good trajectories. + self.replay_buffer_temp = ReplayBufferRegular(100000) # temporarily + + + if self.summary_writer is not None: + # All tf.summaries should have been defined prior to running this. + self._merged_summaries = tf.summary.merge_all() + self._sess = sess + self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep) + + # Variables to be initialized by the agent once it interacts with the + # environment. + self._observation = None + self._last_observation = None + + def _get_network_type(self): + """Returns the type of the outputs of a Q value network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('DQN_network', ['q_values']) + + def _network_template(self, state): + """Builds the convolutional network used to compute the agent's Q-values. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) 
+ net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + # DQN explore net. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + + self._net_outputs = self.online_convnet(self.state_ph) + # TODO(bellemare): Ties should be broken. They are unlikely to happen when + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + self._replay_next_target_net_outputs = self.target_convnet( + self._replay.next_states) + + # RPG learning net. + self.rpg_convnet = tf.make_template('RPG', self._network_template) + self._rpg_net_outputs = self.rpg_convnet(self.state_ph) + self._q_argmax_rpg = tf.argmax(self._rpg_net_outputs.q_values, axis=1)[0] + self._replay_rpg_net_outputs = self.rpg_convnet(self._replay_opt.states) + + + + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_target_q_op(self): + """Build an op used as a target for the Q-value. + + Returns: + target_q_op: An op calculating the Q-value. + """ + # Get the maximum Q-value across the actions dimension. + replay_next_qt_max = tf.reduce_max( + self._replay_next_target_net_outputs.q_values, 1) + # Calculate the Bellman target value. + # Q_t = R_t + \gamma^N * Q'_t+1 + # where, + # Q'_t+1 = \argmax_a Q(S_t+1, a) + # (or) 0 if S_t is a terminal state, + # and + # N is the update horizon (by default, N=1). + return self._replay.rewards + self.cumulative_gamma * replay_next_qt_max * ( + 1. - tf.cast(self._replay.terminals, tf.float32)) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. 
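+
+      In this agent, two ops are built and returned: a Huber-loss op that
+      trains the exploration DQN on samples from self._replay, and a
+      hinge-loss op that trains the RPG network on the near-optimal
+      trajectories stored in self._replay_opt. For each stored action a, the
+      hinge loss penalizes every other action whose score is not at least
+      `margin` below the score of a.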
+ """ + + # RPG loss + replay_opt_action_one_hot = tf.one_hot( + self._replay_opt.actions, self.num_actions, 1., 0., name='action_one_hot_rpg') + replay_chosen_q_rpg = tf.reduce_sum( + self._replay_rpg_net_outputs.q_values * replay_opt_action_one_hot, + reduction_indices=1, + name='replay_chosen_q_rpg') + margin = 1 + qvalue = self._replay_rpg_net_outputs.q_values + # debug self.temp_action_one_hot = replay_action_one_hot + # self.temp_qvalue = qvalue + self.temp1 = (qvalue + margin) * (1 - replay_opt_action_one_hot) + qvalue * replay_opt_action_one_hot + self.temp2 = -(tf.reshape(replay_chosen_q_rpg, [-1, 1]) * tf.ones([1, self.num_actions])) \ + * ((1 - replay_opt_action_one_hot) + (replay_opt_action_one_hot)) + self.hingeloss = tf.maximum(0.0, self.temp1 + self.temp2) + rpg_loss = tf.reduce_mean(self.hingeloss) + + # DQN loss + replay_action_one_hot = tf.one_hot( + self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') + replay_chosen_q = tf.reduce_sum( + self._replay_net_outputs.q_values * replay_action_one_hot, + reduction_indices=1, + name='replay_chosen_q') + target = tf.stop_gradient(self._build_target_q_op()) + loss = tf.losses.huber_loss( + target, replay_chosen_q, reduction=tf.losses.Reduction.NONE) + mean_loss = tf.reduce_mean(loss) + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('HuberLoss', mean_loss) + tf.summary.scalar("hingeLossRPG", rpg_loss) + return self.optimizer.minimize(mean_loss), self.optimizer_rpg.minimize(rpg_loss) + + def _build_sync_op(self): + """Builds ops for assigning weights from online to target network. + + Returns: + ops: A list of ops assigning weights from online to target network. + """ + # Get trainable variables from online and target DQNs + sync_qt_ops = [] + trainables_online = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, scope='Online') + trainables_target = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, scope='Target') + for (w_online, w_target) in zip(trainables_online, trainables_target): + # Assign weights from online to target network. + sync_qt_ops.append(w_target.assign(w_online, use_locking=True)) + return sync_qt_ops + + def begin_episode(self, observation): + """Returns the agent's first action for this episode. + + Args: + observation: numpy array, the environment's initial observation. + + Returns: + int, the selected action. + """ + self._reset_state() + self._record_observation(observation) + + if not self.eval_mode: + self._train_step() + + self.action = self._select_action() + return self.action + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. + + Args: + reward: float, the reward received from the agent's most recent action. + observation: numpy array, the most recent observation. + + Returns: + int, the selected action. + """ + self._last_observation = self._observation + self._record_observation(observation) + + if not self.eval_mode: + self._store_transition(self._last_observation, self.action, reward, False) + self.replay_buffer_temp.add(self._last_observation, self.action, reward, False) + self._train_step() + + self.action = self._select_action() + return self.action + + def end_episode(self, reward): + """Signals the end of the episode to the agent. + + We store the observation of the current time step, which is the last + observation of the episode. 
+ + Args: + reward: float, the last reward from the environment. + """ + if not self.eval_mode: + self.replay_buffer_temp.clear() # this episode is not optimal + self._store_transition(self._observation, self.action, reward, True) + + def end_episode_(self, reward, total_reward, step_number): + """ This episodes is optimal trajectory """ + if not self.eval_mode: + # for DQN + self._store_transition(self._observation, self.action, reward, True) + + # replay buffer for RPG. + self.replay_buffer_temp.add(self._observation, self.action, reward, True) + count = step_number + while count > 0: + experience = self.replay_buffer_temp.get_sample() + state, action, reward, _ = experience + count -= 1 + # self.replay_buffer_opt.add(state, action, reward, False) + self._replay_opt.add(state, action, reward, False) + + def _select_action(self): + """Select an action from the set of available actions. + + Chooses an action randomly with probability self._calculate_epsilon(), and + otherwise acts greedily according to the current Q-value estimates. + + Returns: + int, the selected action. + """ + if self.eval_mode is not True: + epsilon = self.epsilon_fn( + self.epsilon_decay_period, + self.training_steps, + self.min_replay_history, + self.epsilon_train) + if random.random() <= epsilon: + # Choose a random action with probability epsilon. + return random.randint(0, self.num_actions - 1) + else: + # Choose the action with highest Q-value at the current state. + return self._sess.run(self._q_argmax, {self.state_ph: self.state}) + else: + # evaluation mode: use rpg. + return self._sess.run(self._q_argmax_rpg, {self.state_ph: self.state}) + + def _train_step(self): + """Runs a single training step. + + Runs a training op if both: + (1) A minimum number of frames have been added to the replay buffer. + (2) `training_steps` is a multiple of `update_period`. + + Also, syncs weights from online to target network if training steps is a + multiple of target update period. + """ + # Run a train op at the rate of self.update_period if enough training steps + # have been run. This matches the Nature DQN behaviour. + if self._replay.memory.add_count > self.min_replay_history: + if self.training_steps % self.update_period == 0: + self._sess.run(self._train_op) + if self._replay_opt.memory.add_count > self.start_training: + self._sess.run(self._train_op_rpg) + + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + if self.training_steps % self.target_update_period == 0: + self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + def _record_observation(self, observation): + """Records an observation and update state. + + Extracts a frame from the observation vector and overwrites the oldest + frame in the state buffer. + + Args: + observation: numpy array, an observation from the environment. + """ + # Set current observation. We do the reshaping to handle environments + # without frame stacking. + observation = np.reshape(observation, self.observation_shape) + self._observation = observation[..., 0] + self._observation = np.reshape(observation, self.observation_shape) + # Swap out the oldest frame with the current frame. 
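The `replay_buffer_temp` / `_replay_opt` pattern used in `step`, `end_episode`, and `end_episode_` above stages each episode and promotes its transitions to the RPG buffer only when the episode is flagged as (near-)optimal. A self-contained sketch of that pattern; the class and argument names here are hypothetical, not the patch's API:

from collections import deque

class EpisodeStagingBuffer(object):
    """Stages one episode's transitions; they reach the RPG replay buffer
    only if the episode turns out to be (near-)optimal."""

    def __init__(self):
        self._staged = deque()

    def add(self, observation, action, reward, terminal):
        self._staged.append((observation, action, reward, terminal))

    def flush(self, rpg_replay, episode_was_optimal):
        # Drain the staged transitions; copy them only for optimal episodes.
        while self._staged:
            observation, action, reward, _ = self._staged.popleft()
            if episode_was_optimal:
                rpg_replay.add(observation, action, reward, False)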
+ self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. + self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True diff --git a/dopamine/dopamine/agents/epg/__init__.py b/dopamine/dopamine/agents/epg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/epg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/epg/configs/epg.gin b/dopamine/dopamine/agents/epg/configs/epg.gin new file mode 100644 index 0000000..1188e19 --- /dev/null +++ b/dopamine/dopamine/agents/epg/configs/epg.gin @@ -0,0 +1,36 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.epg.epg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +EPGAgent.gamma = 0.99 +EPGAgent.game_name = 'Pong' # Boxing, Pong +EPGAgent.update_horizon = 1 +EPGAgent.min_replay_history = 20000 # agent steps, step more than this, stop exploration. +EPGAgent.update_period = 4 +EPGAgent.epsilon_train = 0 +EPGAgent.epsilon_eval = 0 +EPGAgent.epsilon_decay_period = 250000 # agent steps +EPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +EPGAgent.optimizer = @tf.train.RMSPropOptimizer() +EPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' # Boxing, Pong +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = True +Runner.num_iterations = 15 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 10000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/epg/configs/epg_pong.gin b/dopamine/dopamine/agents/epg/configs/epg_pong.gin new file mode 100644 index 0000000..c8c6445 --- /dev/null +++ b/dopamine/dopamine/agents/epg/configs/epg_pong.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.epg.epg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +EPGAgent.gamma = 0.99 +EPGAgent.update_horizon = 1 +EPGAgent.min_replay_history = 20000 # agent steps, step more than this, stop exploration. 
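The gin files above and below only bind constructor arguments; nothing in them runs by itself. A rough sketch of how such a file could be consumed, assuming the package layout in this patch and gin-config installed (the path, the session handling, and the num_actions value are illustrative):

import gin.tf
import tensorflow as tf
from dopamine.agents.epg import epg_agent

gin.parse_config_file('dopamine/agents/epg/configs/epg.gin')
sess = tf.Session()
# Arguments bound in the gin file (gamma, margin, optimizer, ...) are injected
# automatically; only unbound arguments are passed explicitly.
agent = epg_agent.EPGAgent(sess, num_actions=6)   # e.g. 6 actions for Pong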
+EPGAgent.update_period = 4 +EPGAgent.epsilon_train = 0.0001 +EPGAgent.epsilon_eval = 0 +EPGAgent.epsilon_decay_period = 250000 # agent steps +EPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +EPGAgent.optimizer = @tf.train.RMSPropOptimizer() +EPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = False +Runner.num_iterations = 15 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 10000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/epg/epg_agent.py b/dopamine/dopamine/agents/epg/epg_agent.py new file mode 100644 index 0000000..6a17c8f --- /dev/null +++ b/dopamine/dopamine/agents/epg/epg_agent.py @@ -0,0 +1,550 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a DQN agent.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random +from dopamine.agents.agent_utils import * +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf + +import gin.tf +from collections import deque +import tensorflow +from tensorflow.distributions import Categorical + +slim = tf.contrib.slim + +NATURE_DQN_OBSERVATION_SHAPE = (84, 84) # Size of downscaled Atari 2600 frame. +NATURE_DQN_DTYPE = tf.uint8 # DType of Atari 2600 observations. +NATURE_DQN_STACK_SIZE = 4 # Number of frames in the state stack. + + +def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon): + """Returns the current epsilon for the agent's epsilon-greedy policy. + + This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et + al., 2015). The schedule is as follows: + Begin at 1. until warmup_steps steps have been taken; then + Linearly decay epsilon from 1. to epsilon in decay_period steps; and then + Use epsilon from there on. + + Args: + decay_period: float, the period over which epsilon is decayed. + step: int, the number of training steps completed so far. + warmup_steps: int, the number of steps taken before epsilon is decayed. + epsilon: float, the final value to which to decay the epsilon parameter. + + Returns: + A float, the current epsilon value computed according to the schedule. + """ + steps_left = decay_period + warmup_steps - step + bonus = (1.0 - epsilon) * steps_left / decay_period + bonus = np.clip(bonus, 0., 1. 
- epsilon) + return epsilon + bonus + + +@gin.configurable +class EPGAgent(object): + """An implementation of the DQN agent.""" + + def __init__(self, + sess, + num_actions, + game_name="Pong", + observation_shape=NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=NATURE_DQN_DTYPE, + stack_size=NATURE_DQN_STACK_SIZE, + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + epsilon_fn=linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + margin=1, + tf_device='/cpu:*', + use_staging=True, + max_tf_checkpoints_to_keep=3, + optimizer=tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints describing the observation shape. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to + keep. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. 
+ """ + assert isinstance(observation_shape, tuple) + tf.logging.info('Creating %s agent with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t gamma: %f', gamma) + tf.logging.info('\t update_horizon: %f', update_horizon) + tf.logging.info('\t min_replay_history: %d', min_replay_history) + tf.logging.info('\t update_period: %d', update_period) + # tf.logging.info('\t random_seed: %d', random_seed) + tf.logging.info('\t epsilon_train: %f', epsilon_train) + tf.logging.info('\t epsilon_eval: %f', epsilon_eval) + tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period) + tf.logging.info('\t tf_device: %s', tf_device) + tf.logging.info('\t use_staging: %s', use_staging) + tf.logging.info('\t optimizer: %s', optimizer) + tf.logging.info('\t game: %s', game_name) + + self.game_name = game_name + self.num_actions = num_actions + self.observation_shape = tuple(observation_shape) + self.observation_dtype = observation_dtype + self.stack_size = stack_size + self.gamma = gamma + self.update_horizon = update_horizon + self.cumulative_gamma = math.pow(gamma, update_horizon) + self.min_replay_history = min_replay_history + self.epsilon_fn = epsilon_fn + self.epsilon_train = epsilon_train + self.epsilon_eval = epsilon_eval + self.epsilon_decay_period = epsilon_decay_period + self.update_period = update_period + self.eval_mode = False + self.training_steps = 0 + self.optimizer = optimizer + self.summary_writer = summary_writer + self.summary_writing_frequency = summary_writing_frequency + self.margin = margin + self.start_training = 1000 # todo task specific PONG IS 1000 + self.highest_reward = 6 # todo task specific + self.isPrinted = False + self.current_replay_size = 0 + self.epsilon_current = 1 + + with tf.device(tf_device): + # Create a placeholder for the state input to the DQN network. + # The last axis indicates the number of consecutive frames stacked. + state_shape = (1,) + self.observation_shape + (stack_size,) + self.state = np.zeros(state_shape) + self.state_ph = tf.placeholder(self.observation_dtype, state_shape, + name='state_ph') + self._replay = self._build_replay_buffer(use_staging) + + self._build_networks() + + self._train_op = self._build_train_op() + + self.replay_buffer = ReplayBufferRegular(100000) + + if self.summary_writer is not None: + # All tf.summaries should have been defined prior to running this. + self._merged_summaries = tf.summary.merge_all() + self._sess = sess + self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep) + + # Variables to be initialized by the agent once it interacts with the + # environment. + self._observation = None + self._last_observation = None + + def _get_network_type(self): + """Returns the type of the outputs of a Q value network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('DQN_network', ['q_values']) + + def _network_template(self, state): + """Builds the convolutional network used to compute the agent's Q-values. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) 
+ net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + + self._net_outputs = self.online_convnet(self.state_ph) + # using a deep network, but may affect performance with a linear + # approximation scheme. + # self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + # treat self._net_outputs.q_values as logits + self.logsoftmaxprob = tf.nn.log_softmax(self._net_outputs.q_values) + self.sample = Categorical(logits=self.logsoftmaxprob).sample(1) + + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + + replay_action_one_hot = tf.one_hot( + self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') + logits = self._replay_net_outputs.q_values + self.logsoftmaxprob = tf.nn.log_softmax(logits) + self.neglogprob = - tf.reduce_sum(self.logsoftmaxprob * replay_action_one_hot, axis=1) + # self.temp_loss = self.neglogprob # * self.y_pl + loss = self.actor_loss = tf.reduce_mean(self.neglogprob) + self.replay_action_one_hot = replay_action_one_hot + + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('hingeLoss', loss) + return self.optimizer.minimize(loss) + + def begin_episode(self, observation): + """Returns the agent's first action for this episode. + + Args: + observation: numpy array, the environment's initial observation. + + Returns: + int, the selected action. + """ + self._reset_state() + self._record_observation(observation) + + if not self.eval_mode: + self._train_step() + + self.action = self._select_action() + return self.action + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. 
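The loss in `EPGAgent._build_train_op` above is the mean negative log-probability of the replayed actions under a softmax over the Q-head outputs, with no return weighting (the `* self.y_pl` factor is commented out); since the surrounding logic controls which trajectories enter the buffer, it acts like supervised imitation of the stored actions. A NumPy sketch; the helper name is illustrative:

import numpy as np

def epg_loss(logits, actions):
    # Numerically stable log-softmax, then the mean negative log-probability
    # of the actions actually stored in the replay buffer.
    z = logits - logits.max(axis=1, keepdims=True)
    log_softmax = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    batch = np.arange(logits.shape[0])
    return -log_softmax[batch, actions].mean()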
+ + Args: + reward: float, the reward received from the agent's most recent action. + observation: numpy array, the most recent observation. + + Returns: + int, the selected action. + """ + self._last_observation = self._observation + self._record_observation(observation) + + if not self.eval_mode: + if self.game_name in ["Pong"]: + collect_trajectory(self, reward) + else: + raise ValueError("collection wrong trajectory") + + self._train_step() + + self.action = self._select_action() + if isinstance(self.action, np.ndarray): + pass + return self.action + + def end_episode(self, reward): + """Signals the end of the episode to the agent. + + We store the observation of the current time step, which is the last + observation of the episode. + + Args: + reward: float, the last reward from the environment. + """ + if not self.eval_mode: + if self.game_name in ["Pong"]: + collect_trajectory(self, reward) + else: + raise ValueError("collection wrong trajectory") + + def _select_action(self): + """Select an action from the set of available actions. + + Chooses an action randomly with probability self._calculate_epsilon(), and + otherwise acts greedily according to the current Q-value estimates. + + Returns: + int, the selected action. + """ + + self.epsilon_current = self.training_steps + self.current_replay_size = self._replay.memory.add_count + return self._sess.run(self.sample, {self.state_ph: self.state})[0][0] + + + def _train_step(self): + """Runs a single training step. + + Runs a training op if both: + (1) A minimum number of frames have been added to the replay buffer. + (2) `training_steps` is a multiple of `update_period`. + + Also, syncs weights from online to target network if training steps is a + multiple of target update period. + """ + # Run a train op at the rate of self.update_period if enough training steps + # have been run. This matches the Nature DQN behaviour. + + if self._replay.memory.add_count > self.start_training: + if self.training_steps % self.update_period == 0: + + # debug checked. + # _, neglogprob, logsoftmaxprob, \ + # actor_loss, replay_action_one_hot = self._sess.run([self._train_op, + # self.neglogprob, + # self.logsoftmaxprob, + # self.actor_loss, + # self.replay_action_one_hot]) + self._sess.run(self._train_op) + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + # if self.training_steps % self.target_update_period == 0: + # self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + if (self._replay.memory.add_count > self.start_training) and self.isPrinted is False: + print("start training at {}".format(self.training_steps)) + self.isPrinted = True + + def _record_observation(self, observation): + """Records an observation and update state. + + Extracts a frame from the observation vector and overwrites the oldest + frame in the state buffer. + + Args: + observation: numpy array, an observation from the environment. + """ + # Set current observation. We do the reshaping to handle environments + # without frame stacking. + observation = np.reshape(observation, self.observation_shape) + self._observation = observation[..., 0] + self._observation = np.reshape(observation, self.observation_shape) + # Swap out the oldest frame with the current frame. 
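`_select_action` above runs the Categorical sampling op built in `_build_networks`, so the agent acts stochastically from a softmax over the Q-head outputs rather than greedily. A NumPy sketch of the same policy; the helper name is illustrative. (Feeding log-softmax values to Categorical as logits, as the code does, yields the same distribution, because a per-row constant shift of logits does not change the softmax.)

import numpy as np

def sample_action(q_values, rng=np.random):
    # Treat the Q-head outputs as logits and sample from the implied softmax.
    z = q_values - q_values.max()
    probs = np.exp(z) / np.exp(z).sum()
    return int(rng.choice(len(probs), p=probs))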
+ self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. + self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True + + +class ReplayBufferRegular(object): + """ for uniformly sampling. 
+ + """ + + def __init__(self, buffer_size, random_seed=1234): + self.buffer_size = buffer_size + self.count = 0 + # Right side of deque contains newest experience + self.buffer = deque() + random.seed(random_seed) + self.ptr, self.path_start_idx = 0, 0 + + def add(self, state, action, reward, terminal): + experience = [state, action, reward, terminal] + assert self.count < self.buffer_size + self.buffer.append(experience) + self.count += 1 + self.ptr += 1 + # else: + # self.path_start_idx -= 1 + # self.ptr = self.buffer_size - 1 + # self.buffer.popleft() + # self.buffer.append(experience) + + def get_sample(self): + self.count -= 1 + return self.buffer.popleft() + + def size(self): + return self.count + + def clear(self): + self.buffer.clear() + self.count = 0 + self.ptr = 0 + self.path_start_idx = 0 diff --git a/dopamine/dopamine/agents/implicit_quantile/__init__.py b/dopamine/dopamine/agents/implicit_quantile/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantile/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile.gin b/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile.gin new file mode 100644 index 0000000..4719e02 --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile.gin @@ -0,0 +1,40 @@ +# Hyperparameters follow Dabney et al. (2018), but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. + +import dopamine.agents.implicit_quantile.implicit_quantile_agent +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +ImplicitQuantileAgent.kappa = 1.0 +ImplicitQuantileAgent.num_tau_samples = 64 +ImplicitQuantileAgent.num_tau_prime_samples = 64 +ImplicitQuantileAgent.num_quantile_samples = 32 +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 3 +RainbowAgent.min_replay_history = 20000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 250000 # agent steps +# IQN currently does not support prioritized replay. +RainbowAgent.replay_scheme = 'uniform' +RainbowAgent.tf_device = '/gpu:0' # '/cpu:*' use for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +Runner.game_name = 'Breakout' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False +Runner.num_iterations = 30 +Runner.training_steps = 250000 +Runner.evaluation_steps = 125000 +Runner.max_steps_per_episode = 27000 + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile_icml.gin b/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile_icml.gin new file mode 100644 index 0000000..265dde1 --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile_icml.gin @@ -0,0 +1,37 @@ +# Hyperparameters follow Dabney et al. (2018) +import dopamine.agents.implicit_quantile.implicit_quantile_agent +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +ImplicitQuantileAgent.kappa = 1.0 +ImplicitQuantileAgent.num_tau_samples = 64 +ImplicitQuantileAgent.num_tau_prime_samples = 64 +ImplicitQuantileAgent.num_quantile_samples = 32 +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 1 +RainbowAgent.min_replay_history = 50000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 10000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 1000000 # agent steps +RainbowAgent.replay_scheme = 'uniform' +RainbowAgent.tf_device = '/gpu:0' # '/cpu:*' use for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.00005 +tf.train.AdamOptimizer.epsilon = 0.0003125 + +Runner.game_name = 'Pong' +Runner.sticky_actions = False +Runner.num_iterations = 200 +Runner.training_steps = 250000 +Runner.evaluation_steps = 125000 +Runner.max_steps_per_episode = 27000 + +AtariPreprocessing.terminal_on_life_loss = True + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/implicit_quantile/implicit_quantile_agent.py b/dopamine/dopamine/agents/implicit_quantile/implicit_quantile_agent.py new file mode 100644 index 0000000..1b689bc --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantile/implicit_quantile_agent.py @@ -0,0 +1,358 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The implicit quantile networks (IQN) agent. + +The agent follows the description given in "Implicit Quantile Networks for +Distributional RL" (Dabney et. al, 2018). 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +import math + + +from dopamine.agents.rainbow import rainbow_agent +import numpy as np +import tensorflow as tf + +import gin.tf + +slim = tf.contrib.slim + + +@gin.configurable +class ImplicitQuantileAgent(rainbow_agent.RainbowAgent): + """An extension of Rainbow to perform implicit quantile regression.""" + + def __init__(self, + sess, + num_actions, + kappa=1.0, + num_tau_samples=32, + num_tau_prime_samples=32, + num_quantile_samples=32, + quantile_embedding_dim=64, + double_dqn=False, + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the Graph. + + Most of this constructor's parameters are IQN-specific hyperparameters whose + values are taken from Dabney et al. (2018). + + Args: + sess: `tf.Session` object for running associated ops. + num_actions: int, number of actions the agent can take at any state. + kappa: float, Huber loss cutoff. + num_tau_samples: int, number of online quantile samples for loss + estimation. + num_tau_prime_samples: int, number of target quantile samples for loss + estimation. + num_quantile_samples: int, number of quantile samples for computing + Q-values. + quantile_embedding_dim: int, embedding dimension for the quantile input. + double_dqn: boolean, whether to perform double DQN style learning + as described in Van Hasselt et al.: https://arxiv.org/abs/1509.06461. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. + """ + self.kappa = kappa + # num_tau_samples = N below equation (3) in the paper. + self.num_tau_samples = num_tau_samples + # num_tau_prime_samples = N' below equation (3) in the paper. + self.num_tau_prime_samples = num_tau_prime_samples + # num_quantile_samples = k below equation (3) in the paper. + self.num_quantile_samples = num_quantile_samples + # quantile_embedding_dim = n above equation (4) in the paper. + self.quantile_embedding_dim = quantile_embedding_dim + # option to perform double dqn. + self.double_dqn = double_dqn + + super(ImplicitQuantileAgent, self).__init__( + sess=sess, + num_actions=num_actions, + summary_writer=summary_writer, + summary_writing_frequency=summary_writing_frequency) + + def _get_network_type(self): + """Returns the type of the outputs of the implicit quantile network. + + Returns: + _network_type object defining the outputs of the network. + """ + return collections.namedtuple( + 'iqn_network', ['quantile_values', 'quantiles']) + + def _network_template(self, state, num_quantiles): + r"""Builds an Implicit Quantile ConvNet. + + Takes state and quantile as inputs and outputs state-action quantile values. + + Args: + state: A `tf.placeholder` for the RL state. + num_quantiles: int, number of quantile inputs. + + Returns: + _network_type object containing quantile value outputs of the network. + """ + + weights_initializer = slim.variance_scaling_initializer( + factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True) + + state_net = tf.cast(state, tf.float32) + state_net = tf.div(state_net, 255.) 
+ state_net = slim.conv2d( + state_net, 32, [8, 8], stride=4, + weights_initializer=weights_initializer) + state_net = slim.conv2d( + state_net, 64, [4, 4], stride=2, + weights_initializer=weights_initializer) + state_net = slim.conv2d( + state_net, 64, [3, 3], stride=1, + weights_initializer=weights_initializer) + state_net = slim.flatten(state_net) + state_net_size = state_net.get_shape().as_list()[-1] + state_net_tiled = tf.tile(state_net, [num_quantiles, 1]) + + batch_size = state_net.get_shape().as_list()[0] + quantiles_shape = [num_quantiles * batch_size, 1] + quantiles = tf.random_uniform( + quantiles_shape, minval=0, maxval=1, dtype=tf.float32) + + quantile_net = tf.tile(quantiles, [1, self.quantile_embedding_dim]) + pi = tf.constant(math.pi) + quantile_net = tf.cast(tf.range( + 1, self.quantile_embedding_dim + 1, 1), tf.float32) * pi * quantile_net + quantile_net = tf.cos(quantile_net) + quantile_net = slim.fully_connected(quantile_net, state_net_size, + weights_initializer=weights_initializer) + # Hadamard product. + net = tf.multiply(state_net_tiled, quantile_net) + + net = slim.fully_connected( + net, 512, weights_initializer=weights_initializer) + quantile_values = slim.fully_connected( + net, + self.num_actions, + activation_fn=None, + weights_initializer=weights_initializer) + + return self._get_network_type()(quantile_values=quantile_values, + quantiles=quantiles) + + def _build_networks(self): + """Builds the IQN computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's quantile values. + self.target_convnet: For computing the next state's target quantile + values. + self._net_outputs: The actual quantile values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' quantile values. + self._replay_next_target_net_outputs: The replayed next states' target + quantile values. + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + + # Compute the Q-values which are used for action selection in the current + # state. + self._net_outputs = self.online_convnet(self.state_ph, + self.num_quantile_samples) + # Shape of self._net_outputs.quantile_values: + # num_quantile_samples x num_actions. + # e.g. if num_actions is 2, it might look something like this: + # Vals for Quantile .2 Vals for Quantile .4 Vals for Quantile .6 + # [[0.1, 0.5], [0.15, -0.3], [0.15, -0.2]] + # Q-values = [(0.1 + 0.15 + 0.15)/3, (0.5 + 0.15 + -0.2)/3]. + self._q_values = tf.reduce_mean(self._net_outputs.quantile_values, axis=0) + self._q_argmax = tf.argmax(self._q_values, axis=0) + + self._replay_net_outputs = self.online_convnet(self._replay.states, + self.num_tau_samples) + # Shape: (num_tau_samples x batch_size) x num_actions. + self._replay_net_quantile_values = self._replay_net_outputs.quantile_values + self._replay_net_quantiles = self._replay_net_outputs.quantiles + + # Do the same for next states in the replay buffer. + self._replay_net_target_outputs = self.target_convnet( + self._replay.next_states, self.num_tau_prime_samples) + # Shape: (num_tau_prime_samples x batch_size) x num_actions. 
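A NumPy sketch of the cosine quantile embedding built in `_network_template` above, where each sampled quantile tau is expanded to cos(pi * j * tau) for j = 1..quantile_embedding_dim before the dense layer and the Hadamard product with the state features; the names here are illustrative:

import numpy as np

def quantile_embedding(taus, embedding_dim=64):
    # taus: [num_taus] values in [0, 1). Returns [num_taus, embedding_dim].
    j = np.arange(1, embedding_dim + 1, dtype=np.float32)
    return np.cos(np.pi * j[None, :] * taus[:, None])

taus = np.random.uniform(size=8).astype(np.float32)
phi = quantile_embedding(taus)   # then: dense layer to the conv-feature width,
                                 # elementwise product with the tiled state features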
+ vals = self._replay_net_target_outputs.quantile_values + self._replay_net_target_quantile_values = vals + + # Compute Q-values which are used for action selection for the next states + # in the replay buffer. Compute the argmax over the Q-values. + if self.double_dqn: + outputs_action = self.online_convnet(self._replay.next_states, + self.num_quantile_samples) + else: + outputs_action = self.target_convnet(self._replay.next_states, + self.num_quantile_samples) + + # Shape: (num_quantile_samples x batch_size) x num_actions. + target_quantile_values_action = outputs_action.quantile_values + # Shape: num_quantile_samples x batch_size x num_actions. + target_quantile_values_action = tf.reshape(target_quantile_values_action, + [self.num_quantile_samples, + self._replay.batch_size, + self.num_actions]) + # Shape: batch_size x num_actions. + self._replay_net_target_q_values = tf.squeeze(tf.reduce_mean( + target_quantile_values_action, axis=0)) + self._replay_next_qt_argmax = tf.argmax( + self._replay_net_target_q_values, axis=1) + + def _build_target_quantile_values_op(self): + """Build an op used as a target for return values at given quantiles. + + Returns: + An op calculating the target quantile return. + """ + batch_size = tf.shape(self._replay.rewards)[0] + # Shape of rewards: (num_tau_prime_samples x batch_size) x 1. + rewards = self._replay.rewards[:, None] + rewards = tf.tile(rewards, [self.num_tau_prime_samples, 1]) + + is_terminal_multiplier = 1. - tf.to_float(self._replay.terminals) + # Incorporate terminal state to discount factor. + # size of gamma_with_terminal: (num_tau_prime_samples x batch_size) x 1. + gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier + gamma_with_terminal = tf.tile(gamma_with_terminal[:, None], + [self.num_tau_prime_samples, 1]) + + # Get the indices of the maximium Q-value across the action dimension. + # Shape of replay_next_qt_argmax: (num_tau_prime_samples x batch_size) x 1. + + replay_next_qt_argmax = tf.tile( + self._replay_next_qt_argmax[:, None], [self.num_tau_prime_samples, 1]) + + # Shape of batch_indices: (num_tau_prime_samples x batch_size) x 1. + batch_indices = tf.cast(tf.range( + self.num_tau_prime_samples * batch_size)[:, None], tf.int64) + + # Shape of batch_indexed_target_values: + # (num_tau_prime_samples x batch_size) x 2. + batch_indexed_target_values = tf.concat( + [batch_indices, replay_next_qt_argmax], axis=1) + + # Shape of next_target_values: (num_tau_prime_samples x batch_size) x 1. + target_quantile_values = tf.gather_nd( + self._replay_net_target_quantile_values, + batch_indexed_target_values)[:, None] + + return rewards + gamma_with_terminal * target_quantile_values + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + batch_size = tf.shape(self._replay.rewards)[0] + + target_quantile_values = tf.stop_gradient( + self._build_target_quantile_values_op()) + # Reshape to self.num_tau_prime_samples x batch_size x 1 since this is + # the manner in which the target_quantile_values are tiled. + target_quantile_values = tf.reshape(target_quantile_values, + [self.num_tau_prime_samples, + batch_size, 1]) + # Transpose dimensions so that the dimensionality is batch_size x + # self.num_tau_prime_samples x 1 to prepare for computation of + # Bellman errors. + # Final shape of target_quantile_values: + # batch_size x num_tau_prime_samples x 1. 
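A NumPy sketch of `_build_target_quantile_values_op` above: rewards and discounts are tiled across the num_tau_prime_samples target quantile samples, and the target network's quantile values are gathered at the greedy next action. Argument names and shapes follow the shape comments above, but the helper itself is illustrative:

import numpy as np

def target_quantile_values(rewards, terminals, target_quantiles, next_q_argmax,
                           cumulative_gamma, num_tau_prime_samples):
    # rewards, terminals, next_q_argmax: [batch];
    # target_quantiles: [(num_tau_prime_samples * batch), num_actions].
    batch_size = rewards.shape[0]
    r = np.tile(rewards[:, None], (num_tau_prime_samples, 1))            # [N'*B, 1]
    gamma = cumulative_gamma * (1.0 - terminals.astype(np.float32))
    gamma = np.tile(gamma[:, None], (num_tau_prime_samples, 1))          # [N'*B, 1]
    actions = np.tile(next_q_argmax, num_tau_prime_samples)              # [N'*B]
    rows = np.arange(num_tau_prime_samples * batch_size)
    picked = target_quantiles[rows, actions][:, None]                    # [N'*B, 1]
    return r + gamma * picked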
+ target_quantile_values = tf.transpose(target_quantile_values, [1, 0, 2]) + + # Shape of indices: (num_tau_samples x batch_size) x 1. + # Expand dimension by one so that it can be used to index into all the + # quantiles when using the tf.gather_nd function (see below). + indices = tf.range(self.num_tau_samples * batch_size)[:, None] + + # Expand the dimension by one so that it can be used to index into all the + # quantiles when using the tf.gather_nd function (see below). + reshaped_actions = self._replay.actions[:, None] + reshaped_actions = tf.tile(reshaped_actions, [self.num_tau_samples, 1]) + # Shape of reshaped_actions: (num_tau_samples x batch_size) x 2. + reshaped_actions = tf.concat([indices, reshaped_actions], axis=1) + + chosen_action_quantile_values = tf.gather_nd( + self._replay_net_quantile_values, reshaped_actions) + # Reshape to self.num_tau_samples x batch_size x 1 since this is the manner + # in which the quantile values are tiled. + chosen_action_quantile_values = tf.reshape(chosen_action_quantile_values, + [self.num_tau_samples, + batch_size, 1]) + # Transpose dimensions so that the dimensionality is batch_size x + # self.num_tau_samples x 1 to prepare for computation of + # Bellman errors. + # Final shape of chosen_action_quantile_values: + # batch_size x num_tau_samples x 1. + chosen_action_quantile_values = tf.transpose( + chosen_action_quantile_values, [1, 0, 2]) + + # Shape of bellman_erors and huber_loss: + # batch_size x num_tau_prime_samples x num_tau_samples x 1. + bellman_errors = target_quantile_values[ + :, :, None, :] - chosen_action_quantile_values[:, None, :, :] + # The huber loss (see Section 2.3 of the paper) is defined via two cases: + # case_one: |bellman_errors| <= kappa + # case_two: |bellman_errors| > kappa + huber_loss_case_one = tf.to_float( + tf.abs(bellman_errors) <= self.kappa) * 0.5 * bellman_errors ** 2 + huber_loss_case_two = tf.to_float( + tf.abs(bellman_errors) > self.kappa) * self.kappa * ( + tf.abs(bellman_errors) - 0.5 * self.kappa) + huber_loss = huber_loss_case_one + huber_loss_case_two + + # Reshape replay_quantiles to batch_size x num_tau_samples x 1 + replay_quantiles = tf.reshape( + self._replay_net_quantiles, [self.num_tau_samples, batch_size, 1]) + replay_quantiles = tf.transpose(replay_quantiles, [1, 0, 2]) + + # Tile by num_tau_prime_samples along a new dimension. Shape is now + # batch_size x num_tau_prime_samples x num_tau_samples x 1. + # These quantiles will be used for computation of the quantile huber loss + # below (see section 2.3 of the paper). + replay_quantiles = tf.to_float(tf.tile( + replay_quantiles[:, None, :, :], [1, self.num_tau_prime_samples, 1, 1])) + # Shape: batch_size x num_tau_prime_samples x num_tau_samples x 1. + quantile_huber_loss = (tf.abs(replay_quantiles - tf.stop_gradient( + tf.to_float(bellman_errors < 0))) * huber_loss) / self.kappa + # Sum over current quantile value (num_tau_samples) dimension, + # average over target quantile value (num_tau_prime_samples) dimension. + # Shape: batch_size x num_tau_prime_samples x 1. + loss = tf.reduce_sum(quantile_huber_loss, axis=2) + # Shape: batch_size x 1. + loss = tf.reduce_mean(loss, axis=1) + + # TODO(kumasaurabh): Add prioritized replay functionality here. 
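The loss assembled above is the quantile Huber loss of Dabney et al. (2018): Huber errors between every pair of target and online quantile samples, weighted by |tau - 1{error < 0}|, summed over the online samples and averaged over the target samples and the batch. A NumPy sketch with dense arrays; the names are illustrative:

import numpy as np

def quantile_huber_loss(target_q, chosen_q, taus, kappa=1.0):
    # target_q: [batch, N'], chosen_q: [batch, N], taus: [batch, N].
    u = target_q[:, :, None] - chosen_q[:, None, :]          # [batch, N', N]
    abs_u = np.abs(u)
    huber = np.where(abs_u <= kappa,
                     0.5 * u ** 2,
                     kappa * (abs_u - 0.5 * kappa))
    weight = np.abs(taus[:, None, :] - (u < 0).astype(np.float32))
    per_example = (weight * huber / kappa).sum(axis=2).mean(axis=1)
    return per_example.mean()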
+ update_priorities_op = tf.no_op() + with tf.control_dependencies([update_priorities_op]): + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss)) + return self.optimizer.minimize(tf.reduce_mean(loss)), tf.reduce_mean(loss) diff --git a/dopamine/dopamine/agents/implicit_quantilerpg/__init__.py b/dopamine/dopamine/agents/implicit_quantilerpg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantilerpg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/implicit_quantilerpg/configs/implicit_quantilerpg.gin b/dopamine/dopamine/agents/implicit_quantilerpg/configs/implicit_quantilerpg.gin new file mode 100644 index 0000000..06ebcdf --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantilerpg/configs/implicit_quantilerpg.gin @@ -0,0 +1,41 @@ +# Hyperparameters follow Dabney et al. (2018), but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. + +import dopamine.agents.implicit_quantilerpg.implicit_quantilerpg_agent +import dopamine.agents.rainbow.rainbow_agent +import dopamine.agents.rainbowrpg.rainbowrpg_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +ImplicitQuantileRPGAgent.kappa = 1.0 +ImplicitQuantileRPGAgent.num_tau_samples = 64 +ImplicitQuantileRPGAgent.num_tau_prime_samples = 64 +ImplicitQuantileRPGAgent.num_quantile_samples = 32 +RainbowRPGAgent.gamma = 0.99 +RainbowRPGAgent.update_horizon = 3 +RainbowRPGAgent.min_replay_history = 20000 # agent steps +RainbowRPGAgent.update_period = 4 +RainbowRPGAgent.target_update_period = 8000 # agent steps +RainbowRPGAgent.epsilon_train = 0.01 +RainbowRPGAgent.epsilon_eval = 0.001 +RainbowRPGAgent.epsilon_decay_period = 250000 # agent steps +# IQN currently does not support prioritized replay. +RainbowRPGAgent.replay_scheme = 'uniform' +RainbowRPGAgent.tf_device = '/gpu:0' # '/cpu:*' use for non-GPU version +RainbowRPGAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +Runner.game_name = 'Breakout' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False +Runner.num_iterations = 15 +Runner.training_steps = 250000 +Runner.evaluation_steps = 125000 +Runner.max_steps_per_episode = 27000 + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/implicit_quantilerpg/implicit_quantilerpg_agent.py b/dopamine/dopamine/agents/implicit_quantilerpg/implicit_quantilerpg_agent.py new file mode 100644 index 0000000..2e8140a --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantilerpg/implicit_quantilerpg_agent.py @@ -0,0 +1,431 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The implicit quantile networks (IQN) agent. + +The agent follows the description given in "Implicit Quantile Networks for +Distributional RL" (Dabney et. al, 2018). +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +import math +from dopamine.replay_memory import circular_replay_buffer +from dopamine.agents.rainbow import rainbow_agent +from dopamine.agents.rainbowrpg import rainbowrpg_agent +import numpy as np +import tensorflow as tf + +import gin.tf +from dopamine.agents.agent_utils import * + +slim = tf.contrib.slim + + +@gin.configurable +class ImplicitQuantileRPGAgent(rainbowrpg_agent.RainbowRPGAgent): + """An extension of Rainbow to perform implicit quantile regression.""" + + def __init__(self, + sess, + num_actions, + kappa=1.0, + num_tau_samples=32, + num_tau_prime_samples=32, + num_quantile_samples=32, + quantile_embedding_dim=64, + double_dqn=False, + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the Graph. + + Most of this constructor's parameters are IQN-specific hyperparameters whose + values are taken from Dabney et al. (2018). + + Args: + sess: `tf.Session` object for running associated ops. + num_actions: int, number of actions the agent can take at any state. + kappa: float, Huber loss cutoff. + num_tau_samples: int, number of online quantile samples for loss + estimation. + num_tau_prime_samples: int, number of target quantile samples for loss + estimation. + num_quantile_samples: int, number of quantile samples for computing + Q-values. + quantile_embedding_dim: int, embedding dimension for the quantile input. + double_dqn: boolean, whether to perform double DQN style learning + as described in Van Hasselt et al.: https://arxiv.org/abs/1509.06461. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. + """ + self.kappa = kappa + # num_tau_samples = N below equation (3) in the paper. + self.num_tau_samples = num_tau_samples + # num_tau_prime_samples = N' below equation (3) in the paper. 
+ self.num_tau_prime_samples = num_tau_prime_samples + # num_quantile_samples = k below equation (3) in the paper. + self.num_quantile_samples = num_quantile_samples + # quantile_embedding_dim = n above equation (4) in the paper. + self.quantile_embedding_dim = quantile_embedding_dim + # option to perform double dqn. + self.double_dqn = double_dqn + + super(ImplicitQuantileRPGAgent, self).__init__( + sess=sess, + num_actions=num_actions, + summary_writer=summary_writer, + summary_writing_frequency=summary_writing_frequency) + + self.start_training = 1000 + + + def _get_network_type_rpg(self): + """Returns the type of the outputs of a value distribution network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('c51_network', + ['q_values', 'logits', 'probabilities']) + + def _network_template_rpg(self, state): + """Builds a convolutional network that outputs Q-value distributions. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + weights_initializer = slim.variance_scaling_initializer( + factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True) + + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) + net = slim.conv2d( + net, 32, [8, 8], stride=4, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [4, 4], stride=2, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [3, 3], stride=1, weights_initializer=weights_initializer) + net = slim.flatten(net) + net = slim.fully_connected( + net, 512, weights_initializer=weights_initializer) + net = slim.fully_connected( + net, + self.num_actions * self._num_atoms, + activation_fn=None, + weights_initializer=weights_initializer) + + logits = tf.reshape(net, [-1, self.num_actions, self._num_atoms]) + probabilities = tf.contrib.layers.softmax(logits) + q_values = tf.reduce_sum(self._support * probabilities, axis=2) + return self._get_network_type_rpg()(q_values, logits, probabilities) + + def _build_networks_rpg(self): + # RPG learning net. + self.rpg_convnet = tf.make_template('RPG', self._network_template_rpg) + self._rpg_net_outputs = self.rpg_convnet(self.state_ph) + self._q_argmax_rpg = tf.argmax(self._rpg_net_outputs.q_values, axis=1)[0] + self._replay_rpg_net_outputs = self.rpg_convnet(self._replay_opt.states) + + def _build_train_op_rpg(self): + # RPG loss + replay_action_one_hot = tf.one_hot( + self._replay_opt.actions, self.num_actions, 1., 0., name='action_one_hot_rpg') + replay_chosen_q = tf.reduce_sum( + self._replay_rpg_net_outputs.q_values * replay_action_one_hot, + reduction_indices=1, + name='replay_chosen_q_rpg') + margin = 1 + qvalue = self._replay_rpg_net_outputs.q_values + # debug self.temp_action_one_hot = replay_action_one_hot + self.temp_qvalue = qvalue + self.temp1 = (qvalue + margin) * (1 - replay_action_one_hot) + qvalue * replay_action_one_hot + self.temp2 = -(tf.reshape(replay_chosen_q, [-1, 1]) * tf.ones([1, self.num_actions])) \ + * ((1 - replay_action_one_hot) + (replay_action_one_hot)) + self.hingeloss = tf.maximum(0.0, self.temp1 + self.temp2) + rpg_loss = tf.reduce_mean(self.hingeloss) + return self.optimizer_rpg.minimize(rpg_loss) + + def _get_network_type(self): + """Returns the type of the outputs of the implicit quantile network. + + Returns: + _network_type object defining the outputs of the network. 
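The RPG head defined in `_network_template_rpg` above is C51-style: per-action logits over `_num_atoms` support points are softmaxed into a distribution and collapsed to a Q-value as the expectation over the fixed support. A NumPy sketch of that final reduction; the helper name is illustrative:

import numpy as np

def c51_q_values(logits, support):
    # logits: [batch, num_actions, num_atoms]; support: [num_atoms].
    z = logits - logits.max(axis=-1, keepdims=True)
    probs = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
    return (support * probs).sum(axis=-1)    # [batch, num_actions]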
+ """ + return collections.namedtuple( + 'iqn_network', ['quantile_values', 'quantiles']) + + def _network_template(self, state, num_quantiles): + r"""Builds an Implicit Quantile ConvNet. + + Takes state and quantile as inputs and outputs state-action quantile values. + + Args: + state: A `tf.placeholder` for the RL state. + num_quantiles: int, number of quantile inputs. + + Returns: + _network_type object containing quantile value outputs of the network. + """ + + weights_initializer = slim.variance_scaling_initializer( + factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True) + + state_net = tf.cast(state, tf.float32) + state_net = tf.div(state_net, 255.) + state_net = slim.conv2d( + state_net, 32, [8, 8], stride=4, + weights_initializer=weights_initializer) + state_net = slim.conv2d( + state_net, 64, [4, 4], stride=2, + weights_initializer=weights_initializer) + state_net = slim.conv2d( + state_net, 64, [3, 3], stride=1, + weights_initializer=weights_initializer) + state_net = slim.flatten(state_net) + state_net_size = state_net.get_shape().as_list()[-1] + state_net_tiled = tf.tile(state_net, [num_quantiles, 1]) + + batch_size = state_net.get_shape().as_list()[0] + quantiles_shape = [num_quantiles * batch_size, 1] + quantiles = tf.random_uniform( + quantiles_shape, minval=0, maxval=1, dtype=tf.float32) + + quantile_net = tf.tile(quantiles, [1, self.quantile_embedding_dim]) + pi = tf.constant(math.pi) + quantile_net = tf.cast(tf.range( + 1, self.quantile_embedding_dim + 1, 1), tf.float32) * pi * quantile_net + quantile_net = tf.cos(quantile_net) + quantile_net = slim.fully_connected(quantile_net, state_net_size, + weights_initializer=weights_initializer) + # Hadamard product. + net = tf.multiply(state_net_tiled, quantile_net) + + net = slim.fully_connected( + net, 512, weights_initializer=weights_initializer) + quantile_values = slim.fully_connected( + net, + self.num_actions, + activation_fn=None, + weights_initializer=weights_initializer) + + return self._get_network_type()(quantile_values=quantile_values, + quantiles=quantiles) + + def _build_networks(self): + """Builds the IQN computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's quantile values. + self.target_convnet: For computing the next state's target quantile + values. + self._net_outputs: The actual quantile values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' quantile values. + self._replay_next_target_net_outputs: The replayed next states' target + quantile values. + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + + # Compute the Q-values which are used for action selection in the current + # state. + self._net_outputs = self.online_convnet(self.state_ph, + self.num_quantile_samples) + # Shape of self._net_outputs.quantile_values: + # num_quantile_samples x num_actions. + # e.g. if num_actions is 2, it might look something like this: + # Vals for Quantile .2 Vals for Quantile .4 Vals for Quantile .6 + # [[0.1, 0.5], [0.15, -0.3], [0.15, -0.2]] + # Q-values = [(0.1 + 0.15 + 0.15)/3, (0.5 + 0.15 + -0.2)/3]. 
+ self._q_values = tf.reduce_mean(self._net_outputs.quantile_values, axis=0) + self._q_argmax = tf.argmax(self._q_values, axis=0) + + self._replay_net_outputs = self.online_convnet(self._replay.states, + self.num_tau_samples) + # Shape: (num_tau_samples x batch_size) x num_actions. + self._replay_net_quantile_values = self._replay_net_outputs.quantile_values + self._replay_net_quantiles = self._replay_net_outputs.quantiles + + # Do the same for next states in the replay buffer. + self._replay_net_target_outputs = self.target_convnet( + self._replay.next_states, self.num_tau_prime_samples) + # Shape: (num_tau_prime_samples x batch_size) x num_actions. + vals = self._replay_net_target_outputs.quantile_values + self._replay_net_target_quantile_values = vals + + # Compute Q-values which are used for action selection for the next states + # in the replay buffer. Compute the argmax over the Q-values. + if self.double_dqn: + outputs_action = self.online_convnet(self._replay.next_states, + self.num_quantile_samples) + else: + outputs_action = self.target_convnet(self._replay.next_states, + self.num_quantile_samples) + + # Shape: (num_quantile_samples x batch_size) x num_actions. + target_quantile_values_action = outputs_action.quantile_values + # Shape: num_quantile_samples x batch_size x num_actions. + target_quantile_values_action = tf.reshape(target_quantile_values_action, + [self.num_quantile_samples, + self._replay.batch_size, + self.num_actions]) + # Shape: batch_size x num_actions. + self._replay_net_target_q_values = tf.squeeze(tf.reduce_mean( + target_quantile_values_action, axis=0)) + self._replay_next_qt_argmax = tf.argmax( + self._replay_net_target_q_values, axis=1) + + def _build_target_quantile_values_op(self): + """Build an op used as a target for return values at given quantiles. + + Returns: + An op calculating the target quantile return. + """ + batch_size = tf.shape(self._replay.rewards)[0] + # Shape of rewards: (num_tau_prime_samples x batch_size) x 1. + rewards = self._replay.rewards[:, None] + rewards = tf.tile(rewards, [self.num_tau_prime_samples, 1]) + + is_terminal_multiplier = 1. - tf.to_float(self._replay.terminals) + # Incorporate terminal state to discount factor. + # size of gamma_with_terminal: (num_tau_prime_samples x batch_size) x 1. + gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier + gamma_with_terminal = tf.tile(gamma_with_terminal[:, None], + [self.num_tau_prime_samples, 1]) + + # Get the indices of the maximium Q-value across the action dimension. + # Shape of replay_next_qt_argmax: (num_tau_prime_samples x batch_size) x 1. + + replay_next_qt_argmax = tf.tile( + self._replay_next_qt_argmax[:, None], [self.num_tau_prime_samples, 1]) + + # Shape of batch_indices: (num_tau_prime_samples x batch_size) x 1. + batch_indices = tf.cast(tf.range( + self.num_tau_prime_samples * batch_size)[:, None], tf.int64) + + # Shape of batch_indexed_target_values: + # (num_tau_prime_samples x batch_size) x 2. + batch_indexed_target_values = tf.concat( + [batch_indices, replay_next_qt_argmax], axis=1) + + # Shape of next_target_values: (num_tau_prime_samples x batch_size) x 1. + target_quantile_values = tf.gather_nd( + self._replay_net_target_quantile_values, + batch_indexed_target_values)[:, None] + + return rewards + gamma_with_terminal * target_quantile_values + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. 
+ """ + batch_size = tf.shape(self._replay.rewards)[0] + + target_quantile_values = tf.stop_gradient( + self._build_target_quantile_values_op()) + # Reshape to self.num_tau_prime_samples x batch_size x 1 since this is + # the manner in which the target_quantile_values are tiled. + target_quantile_values = tf.reshape(target_quantile_values, + [self.num_tau_prime_samples, + batch_size, 1]) + # Transpose dimensions so that the dimensionality is batch_size x + # self.num_tau_prime_samples x 1 to prepare for computation of + # Bellman errors. + # Final shape of target_quantile_values: + # batch_size x num_tau_prime_samples x 1. + target_quantile_values = tf.transpose(target_quantile_values, [1, 0, 2]) + + # Shape of indices: (num_tau_samples x batch_size) x 1. + # Expand dimension by one so that it can be used to index into all the + # quantiles when using the tf.gather_nd function (see below). + indices = tf.range(self.num_tau_samples * batch_size)[:, None] + + # Expand the dimension by one so that it can be used to index into all the + # quantiles when using the tf.gather_nd function (see below). + reshaped_actions = self._replay.actions[:, None] + reshaped_actions = tf.tile(reshaped_actions, [self.num_tau_samples, 1]) + # Shape of reshaped_actions: (num_tau_samples x batch_size) x 2. + reshaped_actions = tf.concat([indices, reshaped_actions], axis=1) + + chosen_action_quantile_values = tf.gather_nd( + self._replay_net_quantile_values, reshaped_actions) + # Reshape to self.num_tau_samples x batch_size x 1 since this is the manner + # in which the quantile values are tiled. + chosen_action_quantile_values = tf.reshape(chosen_action_quantile_values, + [self.num_tau_samples, + batch_size, 1]) + # Transpose dimensions so that the dimensionality is batch_size x + # self.num_tau_samples x 1 to prepare for computation of + # Bellman errors. + # Final shape of chosen_action_quantile_values: + # batch_size x num_tau_samples x 1. + chosen_action_quantile_values = tf.transpose( + chosen_action_quantile_values, [1, 0, 2]) + + # Shape of bellman_erors and huber_loss: + # batch_size x num_tau_prime_samples x num_tau_samples x 1. + bellman_errors = target_quantile_values[ + :, :, None, :] - chosen_action_quantile_values[:, None, :, :] + # The huber loss (see Section 2.3 of the paper) is defined via two cases: + # case_one: |bellman_errors| <= kappa + # case_two: |bellman_errors| > kappa + huber_loss_case_one = tf.to_float( + tf.abs(bellman_errors) <= self.kappa) * 0.5 * bellman_errors ** 2 + huber_loss_case_two = tf.to_float( + tf.abs(bellman_errors) > self.kappa) * self.kappa * ( + tf.abs(bellman_errors) - 0.5 * self.kappa) + huber_loss = huber_loss_case_one + huber_loss_case_two + + # Reshape replay_quantiles to batch_size x num_tau_samples x 1 + replay_quantiles = tf.reshape( + self._replay_net_quantiles, [self.num_tau_samples, batch_size, 1]) + replay_quantiles = tf.transpose(replay_quantiles, [1, 0, 2]) + + # Tile by num_tau_prime_samples along a new dimension. Shape is now + # batch_size x num_tau_prime_samples x num_tau_samples x 1. + # These quantiles will be used for computation of the quantile huber loss + # below (see section 2.3 of the paper). + replay_quantiles = tf.to_float(tf.tile( + replay_quantiles[:, None, :, :], [1, self.num_tau_prime_samples, 1, 1])) + # Shape: batch_size x num_tau_prime_samples x num_tau_samples x 1. 
+ quantile_huber_loss = (tf.abs(replay_quantiles - tf.stop_gradient( + tf.to_float(bellman_errors < 0))) * huber_loss) / self.kappa + # Sum over current quantile value (num_tau_samples) dimension, + # average over target quantile value (num_tau_prime_samples) dimension. + # Shape: batch_size x num_tau_prime_samples x 1. + loss = tf.reduce_sum(quantile_huber_loss, axis=2) + # Shape: batch_size x 1. + loss = tf.reduce_mean(loss, axis=1) + + # TODO(kumasaurabh): Add prioritized replay functionality here. + update_priorities_op = tf.no_op() + with tf.control_dependencies([update_priorities_op]): + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss)) + return self.optimizer.minimize(tf.reduce_mean(loss)), tf.reduce_mean(loss) diff --git a/dopamine/dopamine/agents/lpg/__init__.py b/dopamine/dopamine/agents/lpg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/lpg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/lpg/configs/lpg.gin b/dopamine/dopamine/agents/lpg/configs/lpg.gin new file mode 100644 index 0000000..532856e --- /dev/null +++ b/dopamine/dopamine/agents/lpg/configs/lpg.gin @@ -0,0 +1,36 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.lpg.lpg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +LPGAgent.gamma = 0.99 +LPGAgent.game_name = 'Pong' # Boxing, Pong +LPGAgent.update_horizon = 1 +LPGAgent.min_replay_history = 200000 # agent steps, step more than this, stop exploration. +LPGAgent.update_period = 4 +LPGAgent.epsilon_train = 0.0001 +LPGAgent.epsilon_eval = 0 +LPGAgent.epsilon_decay_period = 250000 # agent steps +LPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +LPGAgent.optimizer = @tf.train.RMSPropOptimizer() +LPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' # Boxing, Pong +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False
+Runner.num_iterations = 15
+Runner.training_steps = 250000 # agent steps
+Runner.evaluation_steps = 10000 # agent steps
+Runner.max_steps_per_episode = 27000 # agent steps
+
+WrappedReplayBuffer.replay_capacity = 1000000
+WrappedReplayBuffer.batch_size = 512
diff --git a/dopamine/dopamine/agents/lpg/lpg_agent.py b/dopamine/dopamine/agents/lpg/lpg_agent.py
new file mode 100644
index 0000000..f96d8e2
--- /dev/null
+++ b/dopamine/dopamine/agents/lpg/lpg_agent.py
@@ -0,0 +1,590 @@
+# coding=utf-8
+# Copyright 2018 The Dopamine Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Compact implementation of an LPG agent (adapted from the DQN agent)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+import os
+import random
+from dopamine.agents.agent_utils import *
+from dopamine.replay_memory import circular_replay_buffer
+import numpy as np
+import tensorflow as tf
+from tensorflow.distributions import Categorical
+
+import gin.tf
+from collections import deque
+
+slim = tf.contrib.slim
+
+NATURE_DQN_OBSERVATION_SHAPE = (84, 84)  # Size of downscaled Atari 2600 frame.
+NATURE_DQN_DTYPE = tf.uint8  # DType of Atari 2600 observations.
+NATURE_DQN_STACK_SIZE = 4  # Number of frames in the state stack.
+
+
+def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
+  """Returns the current epsilon for the agent's epsilon-greedy policy.
+
+  This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et
+  al., 2015). The schedule is as follows:
+    Begin at 1. until warmup_steps steps have been taken; then
+    Linearly decay epsilon from 1. to epsilon in decay_period steps; and then
+    Use epsilon from there on.
+
+  Args:
+    decay_period: float, the period over which epsilon is decayed.
+    step: int, the number of training steps completed so far.
+    warmup_steps: int, the number of steps taken before epsilon is decayed.
+    epsilon: float, the final value to which to decay the epsilon parameter.
+
+  Returns:
+    A float, the current epsilon value computed according to the schedule.
+  """
+  steps_left = decay_period + warmup_steps - step
+  bonus = (1.0 - epsilon) * steps_left / decay_period
+  bonus = np.clip(bonus, 0., 1.
- epsilon) + return epsilon + bonus + + +@gin.configurable +class LPGAgent(object): + """An implementation of the DQN agent.""" + + def __init__(self, + sess, + num_actions, + game_name="Pong", + observation_shape=NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=NATURE_DQN_DTYPE, + stack_size=NATURE_DQN_STACK_SIZE, + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + epsilon_fn=linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + margin=1, + tf_device='/cpu:*', + use_staging=True, + max_tf_checkpoints_to_keep=3, + optimizer=tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints describing the observation shape. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to + keep. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. 
+ """ + assert isinstance(observation_shape, tuple) + tf.logging.info('Creating %s agent with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t gamma: %f', gamma) + tf.logging.info('\t update_horizon: %f', update_horizon) + tf.logging.info('\t min_replay_history: %d', min_replay_history) + tf.logging.info('\t update_period: %d', update_period) + # tf.logging.info('\t random_seed: %d', random_seed) + tf.logging.info('\t epsilon_train: %f', epsilon_train) + tf.logging.info('\t epsilon_eval: %f', epsilon_eval) + tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period) + tf.logging.info('\t tf_device: %s', tf_device) + tf.logging.info('\t use_staging: %s', use_staging) + tf.logging.info('\t optimizer: %s', optimizer) + tf.logging.info('\t game: %s', game_name) + + self.game_name = game_name + self.num_actions = num_actions + self.observation_shape = tuple(observation_shape) + self.observation_dtype = observation_dtype + self.stack_size = stack_size + self.gamma = gamma + self.update_horizon = update_horizon + self.cumulative_gamma = math.pow(gamma, update_horizon) + self.min_replay_history = min_replay_history + self.epsilon_fn = epsilon_fn + self.epsilon_train = epsilon_train + self.epsilon_eval = epsilon_eval + self.epsilon_decay_period = epsilon_decay_period + self.update_period = update_period + self.eval_mode = False + self.training_steps = 0 + self.optimizer = optimizer + self.summary_writer = summary_writer + self.summary_writing_frequency = summary_writing_frequency + self.margin = margin + self.start_training = 1000 + # todo task specific FOR PONG IS 1000 IF THIS IS TOO SMALL WE END UP WITH A DETERMINISTIC POLICY QUCKKLY + self.highest_reward = 6 # todo task specific + self.isPrinted = False + self.current_replay_size = 0 + self.epsilon_current = 1 + + with tf.device(tf_device): + # Create a placeholder for the state input to the DQN network. + # The last axis indicates the number of consecutive frames stacked. + state_shape = (1,) + self.observation_shape + (stack_size,) + self.state = np.zeros(state_shape) + self.state_ph = tf.placeholder(self.observation_dtype, state_shape, + name='state_ph') + self._replay = self._build_replay_buffer(use_staging) + + self._build_networks() + + self._train_op = self._build_train_op() + + self.replay_buffer = ReplayBufferRegular(100000) + + if self.summary_writer is not None: + # All tf.summaries should have been defined prior to running this. + self._merged_summaries = tf.summary.merge_all() + self._sess = sess + self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep) + + # Variables to be initialized by the agent once it interacts with the + # environment. + self._observation = None + self._last_observation = None + + def _get_network_type(self): + """Returns the type of the outputs of a Q value network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('DQN_network', ['q_values']) + + def _network_template(self, state): + """Builds the convolutional network used to compute the agent's Q-values. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) 
+ net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + + self._net_outputs = self.online_convnet(self.state_ph) + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + # treat self._net_outputs.q_values as logits + self.logsoftmaxprob = tf.nn.log_softmax(self._net_outputs.q_values) + self.sample = Categorical(logits=self.logsoftmaxprob).sample(1) + + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + + replay_action_one_hot = tf.one_hot( + self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') + logits = self._replay_net_outputs.q_values + self.logsoftmaxprob = tf.nn.log_softmax(logits) + self.neglogprob = - tf.reduce_sum(self.logsoftmaxprob * replay_action_one_hot, axis=1) + # self.temp_loss = self.neglogprob # * self.y_pl + loss = self.actor_loss = tf.reduce_mean(self.neglogprob) + self.replay_action_one_hot = replay_action_one_hot + + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('hingeLoss', loss) + return self.optimizer.minimize(loss) + + def begin_episode(self, observation): + """Returns the agent's first action for this episode. + + Args: + observation: numpy array, the environment's initial observation. + + Returns: + int, the selected action. + """ + self._reset_state() + self._record_observation(observation) + + if not self.eval_mode: + self._train_step() + + self.action = self._select_action() + return self.action + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. 
+
+    Args:
+      reward: float, the reward received from the agent's most recent action.
+      observation: numpy array, the most recent observation.
+
+    Returns:
+      int, the selected action.
+    """
+    self._last_observation = self._observation
+    self._record_observation(observation)
+
+    if not self.eval_mode:
+      # if reward < 0:
+      #   self.replay_buffer.clear()
+      # elif reward > 0:
+      #   self.replay_buffer.add(self._last_observation, self.action, reward, False)
+      #   while self.replay_buffer.size() > 0:
+      #     experience = self.replay_buffer.get_sample()
+      #     state, action, reward, _ = experience
+      #     self._store_transition(state, action, reward, False)
+      # else:
+      #   self.replay_buffer.add(self._last_observation, self.action, reward, False)
+      if self.game_name in ["Pong"]:
+        collect_trajectory(self, reward)
+      else:
+        raise ValueError("Trajectory collection is only implemented for Pong.")
+
+      self._train_step()
+
+    self.action = self._select_action()
+    if isinstance(self.action, np.ndarray):
+      pass
+    return self.action
+
+  def end_episode(self, reward):
+    """Signals the end of the episode to the agent.
+
+    We store the observation of the current time step, which is the last
+    observation of the episode.
+
+    Args:
+      reward: float, the last reward from the environment.
+    """
+    if not self.eval_mode:
+      if self.game_name in ["Pong"]:
+        collect_trajectory(self, reward)
+      else:
+        raise ValueError("Trajectory collection is only implemented for Pong.")
+
+  def _select_action_training(self):
+    """Samples an action from the softmax over Q-values (EPG-style) during training."""
+    return self._sess.run(self.sample, {self.state_ph: self.state})[0][0]
+
+  def _select_action(self):
+    """Select an action from the set of available actions.
+
+    Chooses an action randomly with probability self._calculate_epsilon(), and
+    otherwise acts greedily according to the current Q-value estimates.
+
+    Returns:
+      int, the selected action.
+    """
+    exploration = "Randomexplore"
+    if exploration == "EPG":
+      self.epsilon_current = 0
+      self.current_replay_size = self._replay.memory.add_count
+      if self.eval_mode:
+        return self._sess.run(self._q_argmax, {self.state_ph: self.state})
+      return self._select_action_training()
+
+    elif exploration == "Randomexplore":
+      # epsilon greedy explore.
+      # epsilon = self.epsilon_eval if self.eval_mode else self.epsilon_fn(
+      #     self.epsilon_decay_period,
+      #     self.training_steps,
+      #     self.min_replay_history,
+      #     self.epsilon_train)
+      if self.training_steps < self.min_replay_history:
+        epsilon = 1
+      else:
+        epsilon = self.epsilon_train
+      if self.eval_mode:
+        epsilon = self.epsilon_eval
+      self.epsilon_current = epsilon
+      self.current_replay_size = self._replay.memory.add_count
+      #
+      if random.random() <= epsilon:
+        # Choose a random action with probability epsilon.
+        return random.randint(0, self.num_actions - 1)
+      else:
+        # Choose the action with highest Q-value at the current state.
+        return self._sess.run(self._q_argmax, {self.state_ph: self.state})
+
+
+  def _train_step(self):
+    """Runs a single training step.
+
+    Runs a training op if both:
+      (1) A minimum number of frames have been added to the replay buffer.
+      (2) `training_steps` is a multiple of `update_period`.
+
+    Also, syncs weights from online to target network if training steps is a
+    multiple of target update period.
+    """
+    # Run a train op at the rate of self.update_period if enough training steps
+    # have been run. This matches the Nature DQN behaviour.
+
+    if self._replay.memory.add_count > self.start_training:
+      if self.training_steps % self.update_period == 0:
+
+        # debug checked.
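For intuition, the train op executed below minimizes the actor loss built in _build_train_op above: plain cross-entropy on the replayed actions, with the Q-head outputs treated as policy logits. A scalar sketch (illustrative only, not part of the committed file):

    import numpy as np
    logits = np.array([2.0, 0.0])                      # one state, two actions
    log_probs = logits - np.log(np.exp(logits).sum())  # log-softmax: [-0.127, -2.127]
    neg_log_prob = -log_probs[0]                       # replayed action was 0: ~0.127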
+ # _, neglogprob, logsoftmaxprob, \ + # actor_loss, replay_action_one_hot = self._sess.run([self._train_op, + # self.neglogprob, + # self.logsoftmaxprob, + # self.actor_loss, + # self.replay_action_one_hot]) + self._sess.run(self._train_op) + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + # if self.training_steps % self.target_update_period == 0: + # self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + if (self._replay.memory.add_count > self.start_training) and self.isPrinted is False: + print("start training at {}".format(self.training_steps)) + self.isPrinted = True + + def _record_observation(self, observation): + """Records an observation and update state. + + Extracts a frame from the observation vector and overwrites the oldest + frame in the state buffer. + + Args: + observation: numpy array, an observation from the environment. + """ + # Set current observation. We do the reshaping to handle environments + # without frame stacking. + observation = np.reshape(observation, self.observation_shape) + self._observation = observation[..., 0] + self._observation = np.reshape(observation, self.observation_shape) + # Swap out the oldest frame with the current frame. + self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. 
+ self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True + + +class ReplayBufferRegular(object): + """ for uniformly sampling. + + """ + + def __init__(self, buffer_size, random_seed=1234): + self.buffer_size = buffer_size + self.count = 0 + # Right side of deque contains newest experience + self.buffer = deque() + random.seed(random_seed) + self.ptr, self.path_start_idx = 0, 0 + + def add(self, state, action, reward, terminal): + experience = [state, action, reward, terminal] + assert self.count < self.buffer_size + self.buffer.append(experience) + self.count += 1 + self.ptr += 1 + # else: + # self.path_start_idx -= 1 + # self.ptr = self.buffer_size - 1 + # self.buffer.popleft() + # self.buffer.append(experience) + + def get_sample(self): + self.count -= 1 + return self.buffer.popleft() + + def size(self): + return self.count + + def clear(self): + self.buffer.clear() + self.count = 0 + self.ptr = 0 + self.path_start_idx = 0 diff --git a/dopamine/dopamine/agents/rainbow/__init__.py b/dopamine/dopamine/agents/rainbow/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/rainbow/configs/c51.gin b/dopamine/dopamine/agents/rainbow/configs/c51.gin new file mode 100644 index 0000000..a73d4ed --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/configs/c51.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow the settings from Bellemare et al. 
(2017), but we +# modify as necessary to match those used in Rainbow (Hessel et al., 2018), to +# ensure apples-to-apples comparison. +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 1 +RainbowAgent.min_replay_history = 20000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 250000 # agent steps +RainbowAgent.replay_scheme = 'uniform' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.00025 +tf.train.AdamOptimizer.epsilon = 0.0003125 + +Runner.game_name = 'Breakout' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = False +Runner.num_iterations = 30 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rainbow/configs/c51_icml.gin b/dopamine/dopamine/agents/rainbow/configs/c51_icml.gin new file mode 100644 index 0000000..b06aa7d --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/configs/c51_icml.gin @@ -0,0 +1,36 @@ +# Hyperparameters used in Bellemare et al. (2017). +import dopamine.atari.preprocessing +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 1 +RainbowAgent.min_replay_history = 50000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 10000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 1000000 # agent steps +RainbowAgent.replay_scheme = 'uniform' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.00025 +tf.train.AdamOptimizer.epsilon = 0.0003125 + +Runner.game_name = 'Pong' +# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015). +Runner.sticky_actions = False +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +AtariPreprocessing.terminal_on_life_loss = True + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rainbow/configs/rainbow.gin b/dopamine/dopamine/agents/rainbow/configs/rainbow.gin new file mode 100644 index 0000000..6a4d92b --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/configs/rainbow.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow Hessel et al. (2018), except for sticky_actions, +# which was False (not using sticky actions) in the original paper. 
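Each binding in this file is applied by gin to the matching constructor argument when the config is parsed; a minimal sketch of loading it directly, assuming gin-config's standard gin.parse_config_file API (the repo's runner normally handles this step):

    import gin.tf
    gin.parse_config_file('dopamine/dopamine/agents/rainbow/configs/rainbow.gin')
    # After parsing, RainbowAgent(...) picks up num_atoms=51, update_horizon=3, etc.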
+import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 3 +RainbowAgent.min_replay_history = 20000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0 # changed by lkx +RainbowAgent.epsilon_decay_period = 250000 # agent steps +RainbowAgent.replay_scheme = 'prioritized' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +# Note these parameters are different from C51's. +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +Runner.game_name = 'Bowling' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = False # changed by lkx +Runner.num_iterations = 15 # changed by lkx +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rainbow/configs/rainbow_aaai.gin b/dopamine/dopamine/agents/rainbow/configs/rainbow_aaai.gin new file mode 100644 index 0000000..48be0f6 --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/configs/rainbow_aaai.gin @@ -0,0 +1,37 @@ +# Hyperparameters follow Hessel et al. (2018). +import dopamine.atari.preprocessing +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 3 +RainbowAgent.min_replay_history = 20000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 250000 # agent steps +RainbowAgent.replay_scheme = 'prioritized' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +# Note these parameters are different from C51's. +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +Runner.game_name = 'Pong' +# Deterministic ALE version used in the AAAI paper. +Runner.sticky_actions = False +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +AtariPreprocessing.terminal_on_life_loss = True + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rainbow/rainbow_agent.py b/dopamine/dopamine/agents/rainbow/rainbow_agent.py new file mode 100644 index 0000000..67ec08a --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/rainbow_agent.py @@ -0,0 +1,504 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a simplified Rainbow agent. + +Specifically, we implement the following components from Rainbow: + + * n-step updates; + * prioritized replay; and + * distributional RL. + +These three components were found to significantly impact the performance of +the Atari game-playing agent. + +Furthermore, our implementation does away with some minor hyperparameter +choices. Specifically, we + + * keep the beta exponent fixed at beta=0.5, rather than increase it linearly; + * remove the alpha parameter, which was set to alpha=0.5 throughout the paper. + +Details in "Rainbow: Combining Improvements in Deep Reinforcement Learning" by +Hessel et al. (2018). +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + + + +from dopamine.agents.dqn import dqn_agent +from dopamine.replay_memory import prioritized_replay_buffer +import numpy as np +import tensorflow as tf + +import gin.tf + +slim = tf.contrib.slim + + +@gin.configurable +class RainbowAgent(dqn_agent.DQNAgent): + """A compact implementation of a simplified Rainbow agent.""" + + def __init__(self, + sess, + num_actions, + observation_shape=dqn_agent.NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=dqn_agent.NATURE_DQN_DTYPE, + stack_size=dqn_agent.NATURE_DQN_STACK_SIZE, + num_atoms=51, + vmax=10., + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + target_update_period=8000, + epsilon_fn=dqn_agent.linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + replay_scheme='prioritized', + tf_device='/cpu:*', + use_staging=True, + optimizer=tf.train.AdamOptimizer( + learning_rate=0.00025, epsilon=0.0003125), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints or an int. If single int, the observation + is assumed to be a 2D square. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + num_atoms: int, the number of buckets of the value function distribution. + vmax: float, the value distribution support is [-vmax, vmax]. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. 
+ epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + replay_scheme: str, 'prioritized' or 'uniform', the sampling scheme of the + replay memory. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. + """ + # We need this because some tools convert round floats into ints. + vmax = float(vmax) + self._num_atoms = num_atoms + self._support = tf.linspace(-vmax, vmax, num_atoms) + self._replay_scheme = replay_scheme + # TODO(b/110897128): Make agent optimizer attribute private. + self.optimizer = optimizer + + super(RainbowAgent, self).__init__( + sess=sess, + num_actions=num_actions, + observation_shape=observation_shape, + observation_dtype=observation_dtype, + stack_size=stack_size, + gamma=gamma, + update_horizon=update_horizon, + min_replay_history=min_replay_history, + update_period=update_period, + target_update_period=target_update_period, + epsilon_fn=epsilon_fn, + epsilon_train=epsilon_train, + epsilon_eval=epsilon_eval, + epsilon_decay_period=epsilon_decay_period, + tf_device=tf_device, + use_staging=use_staging, + optimizer=self.optimizer, + summary_writer=summary_writer, + summary_writing_frequency=summary_writing_frequency) + + def _get_network_type(self): + """Returns the type of the outputs of a value distribution network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('c51_network', + ['q_values', 'logits', 'probabilities']) + + def _network_template(self, state): + """Builds a convolutional network that outputs Q-value distributions. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + weights_initializer = slim.variance_scaling_initializer( + factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True) + + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) + net = slim.conv2d( + net, 32, [8, 8], stride=4, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [4, 4], stride=2, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [3, 3], stride=1, weights_initializer=weights_initializer) + net = slim.flatten(net) + net = slim.fully_connected( + net, 512, weights_initializer=weights_initializer) + net = slim.fully_connected( + net, + self.num_actions * self._num_atoms, + activation_fn=None, + weights_initializer=weights_initializer) + + logits = tf.reshape(net, [-1, self.num_actions, self._num_atoms]) + probabilities = tf.contrib.layers.softmax(logits) + q_values = tf.reduce_sum(self._support * probabilities, axis=2) + return self._get_network_type()(q_values, logits, probabilities) + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. 
+ + Returns: + A `WrappedPrioritizedReplayBuffer` object. + + Raises: + ValueError: if given an invalid replay scheme. + """ + if self._replay_scheme not in ['uniform', 'prioritized']: + raise ValueError('Invalid replay scheme: {}'.format(self._replay_scheme)) + return prioritized_replay_buffer.WrappedPrioritizedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma) + + def _build_target_distribution(self): + """Builds the C51 target distribution as per Bellemare et al. (2017). + + First, we compute the support of the Bellman target, r + gamma Z'. Where Z' + is the support of the next state distribution: + + * Evenly spaced in [-vmax, vmax] if the current state is nonterminal; + * 0 otherwise (duplicated num_atoms times). + + Second, we compute the next-state probabilities, corresponding to the action + with highest expected value. + + Finally we project the Bellman target (support + probabilities) onto the + original support. + + Returns: + target_distribution: tf.tensor, the target distribution from the replay. + """ + batch_size = self._replay.batch_size + + # size of rewards: batch_size x 1 + rewards = self._replay.rewards[:, None] + + # size of tiled_support: batch_size x num_atoms + tiled_support = tf.tile(self._support, [batch_size]) + tiled_support = tf.reshape(tiled_support, [batch_size, self._num_atoms]) + + # size of target_support: batch_size x num_atoms + + is_terminal_multiplier = 1. - tf.cast(self._replay.terminals, tf.float32) + # Incorporate terminal state to discount factor. + # size of gamma_with_terminal: batch_size x 1 + gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier + gamma_with_terminal = gamma_with_terminal[:, None] + + target_support = rewards + gamma_with_terminal * tiled_support + + # size of next_qt_argmax: 1 x batch_size + next_qt_argmax = tf.argmax( + self._replay_next_target_net_outputs.q_values, axis=1)[:, None] + batch_indices = tf.range(tf.to_int64(batch_size))[:, None] + # size of next_qt_argmax: batch_size x 2 + batch_indexed_next_qt_argmax = tf.concat( + [batch_indices, next_qt_argmax], axis=1) + + # size of next_probabilities: batch_size x num_atoms + next_probabilities = tf.gather_nd( + self._replay_next_target_net_outputs.probabilities, + batch_indexed_next_qt_argmax) + + return project_distribution(target_support, next_probabilities, + self._support) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + target_distribution = tf.stop_gradient(self._build_target_distribution()) + + # size of indices: batch_size x 1. + indices = tf.range(tf.shape(self._replay_net_outputs.logits)[0])[:, None] + # size of reshaped_actions: batch_size x 2. + reshaped_actions = tf.concat([indices, self._replay.actions[:, None]], 1) + # For each element of the batch, fetch the logits for its selected action. + chosen_action_logits = tf.gather_nd(self._replay_net_outputs.logits, + reshaped_actions) + + loss = tf.nn.softmax_cross_entropy_with_logits( + labels=target_distribution, + logits=chosen_action_logits) + + if self._replay_scheme == 'prioritized': + # The original prioritized experience replay uses a linear exponent + # schedule 0.4 -> 1.0. Comparing the schedule to a fixed exponent of 0.5 + # on 5 games (Asterix, Pong, Q*Bert, Seaquest, Space Invaders) suggested + # a fixed exponent actually performs better, except on Pong. 
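Concretely, the importance weighting applied on the next lines scales each sampled transition's loss by the inverse square root of its sampling probability and renormalizes so the largest weight is 1; a small numeric example (illustrative only, not part of the committed file):

    import numpy as np
    probs = np.array([0.5, 0.25, 0.125, 0.125])
    loss_weights = 1.0 / np.sqrt(probs + 1e-10)  # [1.414, 2.0, 2.828, 2.828]
    loss_weights /= loss_weights.max()           # [0.5, 0.707, 1.0, 1.0]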
+ probs = self._replay.transition['sampling_probabilities'] + loss_weights = 1.0 / tf.sqrt(probs + 1e-10) + loss_weights /= tf.reduce_max(loss_weights) + + # Rainbow and prioritized replay are parametrized by an exponent alpha, + # but in both cases it is set to 0.5 - for simplicity's sake we leave it + # as is here, using the more direct tf.sqrt(). Taking the square root + # "makes sense", as we are dealing with a squared loss. + # Add a small nonzero value to the loss to avoid 0 priority items. While + # technically this may be okay, setting all items to 0 priority will cause + # troubles, and also result in 1.0 / 0.0 = NaN correction terms. + update_priorities_op = self._replay.tf_set_priority( + self._replay.indices, tf.sqrt(loss + 1e-10)) + + # Weight the loss by the inverse priorities. + loss = loss_weights * loss + else: + update_priorities_op = tf.no_op() + + with tf.control_dependencies([update_priorities_op]): + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('CrossEntropyLoss', tf.reduce_mean(loss)) + # Schaul et al. reports a slightly different rule, where 1/N is also + # exponentiated by beta. Not doing so seems more reasonable, and did not + # impact performance in our experiments. + return self.optimizer.minimize(tf.reduce_mean(loss)), loss + + def _store_transition(self, + last_observation, + action, + reward, + is_terminal, + priority=None): + """Stores a transition when in training mode. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer (last_observation, action, reward, + is_terminal, priority). + + Args: + last_observation: Last observation, type determined via observation_type + parameter in the replay_memory constructor. + action: An integer, the action taken. + reward: A float, the reward. + is_terminal: Boolean indicating if the current state is a terminal state. + priority: Float. Priority of sampling the transition. If None, the default + priority will be used. If replay scheme is uniform, the default priority + is 1. If the replay scheme is prioritized, the default priority is the + maximum ever seen [Schaul et al., 2015]. + """ + if priority is None: + priority = (1. if self._replay_scheme == 'uniform' else + self._replay.memory.sum_tree.max_recorded_priority) + + if not self.eval_mode: + self._replay.add(last_observation, action, reward, is_terminal, priority) + + +def project_distribution(supports, weights, target_support, + validate_args=False): + """Projects a batch of (support, weights) onto target_support. + + Based on equation (7) in (Bellemare et al., 2017): + https://arxiv.org/abs/1707.06887 + In the rest of the comments we will refer to this equation simply as Eq7. + + This code is not easy to digest, so we will use a running example to clarify + what is going on, with the following sample inputs: + + * supports = [[0, 2, 4, 6, 8], + [1, 3, 4, 5, 6]] + * weights = [[0.1, 0.6, 0.1, 0.1, 0.1], + [0.1, 0.2, 0.5, 0.1, 0.1]] + * target_support = [4, 5, 6, 7, 8] + + In the code below, comments preceded with 'Ex:' will be referencing the above + values. + + Args: + supports: Tensor of shape (batch_size, num_dims) defining supports for the + distribution. + weights: Tensor of shape (batch_size, num_dims) defining weights on the + original support points. Although for the CategoricalDQN agent these + weights are probabilities, it is not required that they are. 
+ target_support: Tensor of shape (num_dims) defining support of the projected + distribution. The values must be monotonically increasing. Vmin and Vmax + will be inferred from the first and last elements of this tensor, + respectively. The values in this tensor must be equally spaced. + validate_args: Whether we will verify the contents of the + target_support parameter. + + Returns: + A Tensor of shape (batch_size, num_dims) with the projection of a batch of + (support, weights) onto target_support. + + Raises: + ValueError: If target_support has no dimensions, or if shapes of supports, + weights, and target_support are incompatible. + """ + target_support_deltas = target_support[1:] - target_support[:-1] + # delta_z = `\Delta z` in Eq7. + delta_z = target_support_deltas[0] + validate_deps = [] + supports.shape.assert_is_compatible_with(weights.shape) + supports[0].shape.assert_is_compatible_with(target_support.shape) + target_support.shape.assert_has_rank(1) + if validate_args: + # Assert that supports and weights have the same shapes. + validate_deps.append( + tf.Assert( + tf.reduce_all(tf.equal(tf.shape(supports), tf.shape(weights))), + [supports, weights])) + # Assert that elements of supports and target_support have the same shape. + validate_deps.append( + tf.Assert( + tf.reduce_all( + tf.equal(tf.shape(supports)[1], tf.shape(target_support))), + [supports, target_support])) + # Assert that target_support has a single dimension. + validate_deps.append( + tf.Assert( + tf.equal(tf.size(tf.shape(target_support)), 1), [target_support])) + # Assert that the target_support is monotonically increasing. + validate_deps.append( + tf.Assert(tf.reduce_all(target_support_deltas > 0), [target_support])) + # Assert that the values in target_support are equally spaced. + validate_deps.append( + tf.Assert( + tf.reduce_all(tf.equal(target_support_deltas, delta_z)), + [target_support])) + + with tf.control_dependencies(validate_deps): + # Ex: `v_min, v_max = 4, 8`. + v_min, v_max = target_support[0], target_support[-1] + # Ex: `batch_size = 2`. + batch_size = tf.shape(supports)[0] + # `N` in Eq7. + # Ex: `num_dims = 5`. + num_dims = tf.shape(target_support)[0] + # clipped_support = `[\hat{T}_{z_j}]^{V_max}_{V_min}` in Eq7. + # Ex: `clipped_support = [[[ 4. 4. 4. 6. 8.]] + # [[ 4. 4. 4. 5. 6.]]]`. + clipped_support = tf.clip_by_value(supports, v_min, v_max)[:, None, :] + # Ex: `tiled_support = [[[[ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.]] + # [[ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.]]]]`. + tiled_support = tf.tile([clipped_support], [1, 1, num_dims, 1]) + # Ex: `reshaped_target_support = [[[ 4.] + # [ 5.] + # [ 6.] + # [ 7.] + # [ 8.]] + # [[ 4.] + # [ 5.] + # [ 6.] + # [ 7.] + # [ 8.]]]`. + reshaped_target_support = tf.tile(target_support[:, None], [batch_size, 1]) + reshaped_target_support = tf.reshape(reshaped_target_support, + [batch_size, num_dims, 1]) + # numerator = `|clipped_support - z_i|` in Eq7. + # Ex: `numerator = [[[[ 0. 0. 0. 2. 4.] + # [ 1. 1. 1. 1. 3.] + # [ 2. 2. 2. 0. 2.] + # [ 3. 3. 3. 1. 1.] + # [ 4. 4. 4. 2. 0.]] + # [[ 0. 0. 0. 1. 2.] + # [ 1. 1. 1. 0. 1.] + # [ 2. 2. 2. 1. 0.] + # [ 3. 3. 3. 2. 1.] + # [ 4. 4. 4. 3. 2.]]]]`. + numerator = tf.abs(tiled_support - reshaped_target_support) + quotient = 1 - (numerator / delta_z) + # clipped_quotient = `[1 - numerator / (\Delta z)]_0^1` in Eq7. + # Ex: `clipped_quotient = [[[[ 1. 1. 1. 0. 0.] + # [ 0. 0. 
0. 0. 0.] + # [ 0. 0. 0. 1. 0.] + # [ 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 1.]] + # [[ 1. 1. 1. 0. 0.] + # [ 0. 0. 0. 1. 0.] + # [ 0. 0. 0. 0. 1.] + # [ 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 0.]]]]`. + clipped_quotient = tf.clip_by_value(quotient, 0, 1) + # Ex: `weights = [[ 0.1 0.6 0.1 0.1 0.1] + # [ 0.1 0.2 0.5 0.1 0.1]]`. + weights = weights[:, None, :] + # inner_prod = `\sum_{j=0}^{N-1} clipped_quotient * p_j(x', \pi(x'))` + # in Eq7. + # Ex: `inner_prod = [[[[ 0.1 0.6 0.1 0. 0. ] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0.1 0. ] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0. 0.1]] + # [[ 0.1 0.2 0.5 0. 0. ] + # [ 0. 0. 0. 0.1 0. ] + # [ 0. 0. 0. 0. 0.1] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0. 0. ]]]]`. + inner_prod = clipped_quotient * weights + # Ex: `projection = [[ 0.8 0.0 0.1 0.0 0.1] + # [ 0.8 0.1 0.1 0.0 0.0]]`. + projection = tf.reduce_sum(inner_prod, 3) + projection = tf.reshape(projection, [batch_size, num_dims]) + return projection diff --git a/dopamine/dopamine/agents/rainbowrpg/__init__.py b/dopamine/dopamine/agents/rainbowrpg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/rainbowrpg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/rainbowrpg/configs/c51rpg.gin b/dopamine/dopamine/agents/rainbowrpg/configs/c51rpg.gin new file mode 100644 index 0000000..052cd13 --- /dev/null +++ b/dopamine/dopamine/agents/rainbowrpg/configs/c51rpg.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow the settings from Bellemare et al. (2017), but we +# modify as necessary to match those used in Rainbow (Hessel et al., 2018), to +# ensure apples-to-apples comparison. +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 1 +RainbowAgent.min_replay_history = 2000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0 +RainbowAgent.epsilon_decay_period = 250000 # agent steps +RainbowAgent.replay_scheme = 'uniform' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.00025 +tf.train.AdamOptimizer.epsilon = 0.0003125 + +Runner.game_name = 'Breakout' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False +Runner.num_iterations = 30 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rainbowrpg/configs/rainbowrpg.gin b/dopamine/dopamine/agents/rainbowrpg/configs/rainbowrpg.gin new file mode 100644 index 0000000..d88945b --- /dev/null +++ b/dopamine/dopamine/agents/rainbowrpg/configs/rainbowrpg.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow Hessel et al. (2018), except for sticky_actions, +# which was False (not using sticky actions) in the original paper. +import dopamine.agents.rainbowrpg.rainbowrpg_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowRPGAgent.num_atoms = 51 +RainbowRPGAgent.vmax = 10. +RainbowRPGAgent.gamma = 0.99 +RainbowRPGAgent.update_horizon = 3 +RainbowRPGAgent.min_replay_history = 20000 # agent steps lkx +RainbowRPGAgent.update_period = 4 +RainbowRPGAgent.target_update_period = 8000 # agent steps +RainbowRPGAgent.epsilon_train = 0.01 +RainbowRPGAgent.epsilon_eval = 0 # changed by lkx +RainbowRPGAgent.epsilon_decay_period = 250000 # agent steps +RainbowRPGAgent.replay_scheme = 'prioritized' +RainbowRPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowRPGAgent.optimizer = @tf.train.AdamOptimizer() + +# Note these parameters are different from C51's. +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +Runner.game_name = 'Pong' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = False # changed by lkx +Runner.num_iterations = 15 # changed by lkx +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 # changed by lkx diff --git a/dopamine/dopamine/agents/rainbowrpg/rainbowrpg_agent.py b/dopamine/dopamine/agents/rainbowrpg/rainbowrpg_agent.py new file mode 100644 index 0000000..a25a3c0 --- /dev/null +++ b/dopamine/dopamine/agents/rainbowrpg/rainbowrpg_agent.py @@ -0,0 +1,699 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a simplified Rainbow agent. + +Specifically, we implement the following components from Rainbow: + + * n-step updates; + * prioritized replay; and + * distributional RL. + +These three components were found to significantly impact the performance of +the Atari game-playing agent. + +Furthermore, our implementation does away with some minor hyperparameter +choices. 
Specifically, we + + * keep the beta exponent fixed at beta=0.5, rather than increase it linearly; + * remove the alpha parameter, which was set to alpha=0.5 throughout the paper. + +Details in "Rainbow: Combining Improvements in Deep Reinforcement Learning" by +Hessel et al. (2018). +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dopamine.agents.dqn import dqn_agent +from dopamine.replay_memory import prioritized_replay_buffer +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf + +import gin.tf +from dopamine.agents.agent_utils import * + +slim = tf.contrib.slim + + +@gin.configurable +class RainbowRPGAgent(dqn_agent.DQNAgent): + """A compact implementation of a simplified Rainbow agent.""" + + def __init__(self, + sess, + num_actions, + observation_shape=dqn_agent.NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=dqn_agent.NATURE_DQN_DTYPE, + stack_size=dqn_agent.NATURE_DQN_STACK_SIZE, + num_atoms=51, + vmax=10., + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + target_update_period=8000, + epsilon_fn=dqn_agent.linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + replay_scheme='prioritized', + tf_device='/gpu:*', + use_staging=True, + optimizer=tf.train.AdamOptimizer( + learning_rate=0.00025, epsilon=0.0003125), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints or an int. If single int, the observation + is assumed to be a 2D square. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + num_atoms: int, the number of buckets of the value function distribution. + vmax: float, the value distribution support is [-vmax, vmax]. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + replay_scheme: str, 'prioritized' or 'uniform', the sampling scheme of the + replay memory. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. 
Lower values will result in slower training. + """ + # We need this because some tools convert round floats into ints. + vmax = float(vmax) + self._num_atoms = num_atoms + self._support = tf.linspace(-vmax, vmax, num_atoms) + self._replay_scheme = replay_scheme + # TODO(b/110897128): Make agent optimizer attribute private. + self.optimizer = optimizer + self.optimizer_rpg = tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True) # optimizer for RPG'= + self.start_training = 1000 + + super(RainbowRPGAgent, self).__init__( + sess=sess, + num_actions=num_actions, + observation_shape=observation_shape, + observation_dtype=observation_dtype, + stack_size=stack_size, + gamma=gamma, + update_horizon=update_horizon, + min_replay_history=min_replay_history, + update_period=update_period, + target_update_period=target_update_period, + epsilon_fn=epsilon_fn, + epsilon_train=epsilon_train, + epsilon_eval=epsilon_eval, + epsilon_decay_period=epsilon_decay_period, + tf_device=tf_device, + use_staging=use_staging, + optimizer=self.optimizer, + summary_writer=summary_writer, + summary_writing_frequency=summary_writing_frequency) + + with tf.device(tf_device): + self._replay_opt = self._build_replay_buffer_opt(use_staging) + self._build_networks_rpg() + self._train_op_rpg = self._build_train_op_rpg() + + # replay buffer for rpg. only store good trajectories. + self.replay_buffer_temp = ReplayBufferRegular(100000) # temporarily + + def _get_network_type(self): + """Returns the type of the outputs of a value distribution network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('c51_network', + ['q_values', 'logits', 'probabilities']) + + def _network_template(self, state): + """Builds a convolutional network that outputs Q-value distributions. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + weights_initializer = slim.variance_scaling_initializer( + factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True) + + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) + net = slim.conv2d( + net, 32, [8, 8], stride=4, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [4, 4], stride=2, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [3, 3], stride=1, weights_initializer=weights_initializer) + net = slim.flatten(net) + net = slim.fully_connected( + net, 512, weights_initializer=weights_initializer) + net = slim.fully_connected( + net, + self.num_actions * self._num_atoms, + activation_fn=None, + weights_initializer=weights_initializer) + + logits = tf.reshape(net, [-1, self.num_actions, self._num_atoms]) + probabilities = tf.contrib.layers.softmax(logits) + q_values = tf.reduce_sum(self._support * probabilities, axis=2) + return self._get_network_type()(q_values, logits, probabilities) + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A `WrappedPrioritizedReplayBuffer` object. + + Raises: + ValueError: if given an invalid replay scheme. 
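The `_network_template` above keeps the C51 head: `num_atoms` logits per action, with Q-values taken as the expectation over the fixed support `tf.linspace(-vmax, vmax, num_atoms)`. A minimal NumPy sketch of that reduction (names and shapes here are illustrative only, not part of the patch):

    import numpy as np

    num_atoms, vmax = 51, 10.0
    support = np.linspace(-vmax, vmax, num_atoms)            # z_i, shape (num_atoms,)

    def expected_q(logits):
        # logits: (num_actions, num_atoms) -> per-action softmax over atoms,
        # then the expectation of the support under those probabilities.
        shifted = logits - logits.max(axis=1, keepdims=True)  # numerical stability
        probs = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)
        return (probs * support).sum(axis=1)                   # shape (num_actions,)

    # Uniform atom probabilities give Q ~= 0 on the symmetric support.
    print(expected_q(np.zeros((4, num_atoms))))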
+ """ + if self._replay_scheme not in ['uniform', 'prioritized']: + raise ValueError('Invalid replay scheme: {}'.format(self._replay_scheme)) + return prioritized_replay_buffer.WrappedPrioritizedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma) + + def _build_replay_buffer_opt(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_target_distribution(self): + """Builds the C51 target distribution as per Bellemare et al. (2017). + + First, we compute the support of the Bellman target, r + gamma Z'. Where Z' + is the support of the next state distribution: + + * Evenly spaced in [-vmax, vmax] if the current state is nonterminal; + * 0 otherwise (duplicated num_atoms times). + + Second, we compute the next-state probabilities, corresponding to the action + with highest expected value. + + Finally we project the Bellman target (support + probabilities) onto the + original support. + + Returns: + target_distribution: tf.tensor, the target distribution from the replay. + """ + batch_size = self._replay.batch_size + + # size of rewards: batch_size x 1 + rewards = self._replay.rewards[:, None] + + # size of tiled_support: batch_size x num_atoms + tiled_support = tf.tile(self._support, [batch_size]) + tiled_support = tf.reshape(tiled_support, [batch_size, self._num_atoms]) + + # size of target_support: batch_size x num_atoms + + is_terminal_multiplier = 1. - tf.cast(self._replay.terminals, tf.float32) + # Incorporate terminal state to discount factor. + # size of gamma_with_terminal: batch_size x 1 + gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier + gamma_with_terminal = gamma_with_terminal[:, None] + + target_support = rewards + gamma_with_terminal * tiled_support + + # size of next_qt_argmax: 1 x batch_size + next_qt_argmax = tf.argmax( + self._replay_next_target_net_outputs.q_values, axis=1)[:, None] + batch_indices = tf.range(tf.to_int64(batch_size))[:, None] + # size of next_qt_argmax: batch_size x 2 + batch_indexed_next_qt_argmax = tf.concat( + [batch_indices, next_qt_argmax], axis=1) + + # size of next_probabilities: batch_size x num_atoms + next_probabilities = tf.gather_nd( + self._replay_next_target_net_outputs.probabilities, + batch_indexed_next_qt_argmax) + + return project_distribution(target_support, next_probabilities, + self._support) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). 
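`_build_target_distribution` above shifts and scales the fixed support before projecting: each atom z_i becomes r + gamma^n * z_i, and the discount is zeroed at terminal states so the target collapses onto the reward. A tiny NumPy sketch of that shift, with made-up numbers and a small support for readability:

    import numpy as np

    support = np.linspace(-10.0, 10.0, 5)          # 5 atoms instead of 51, for readability
    rewards = np.array([[1.0], [2.0]])             # (batch, 1)
    terminals = np.array([[0.0], [1.0]])           # 1.0 marks a terminal transition
    cumulative_gamma = 0.99 ** 3                   # gamma^n for n-step updates

    gamma_with_terminal = cumulative_gamma * (1.0 - terminals)
    target_support = rewards + gamma_with_terminal * support   # (batch, num_atoms)
    print(target_support)
    # Row 0 is a shifted/scaled copy of the support; row 1 collapses to the reward 2.0.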
+ """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + # DQN explore net. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + + self._net_outputs = self.online_convnet(self.state_ph) + # TODO(bellemare): Ties should be broken. They are unlikely to happen when + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + self._replay_next_target_net_outputs = self.target_convnet( + self._replay.next_states) + + def _build_networks_rpg(self): + # RPG learning net. + self.rpg_convnet = tf.make_template('RPG', self._network_template) + self._rpg_net_outputs = self.rpg_convnet(self.state_ph) + self._q_argmax_rpg = tf.argmax(self._rpg_net_outputs.q_values, axis=1)[0] + self._replay_rpg_net_outputs = self.rpg_convnet(self._replay_opt.states) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + target_distribution = tf.stop_gradient(self._build_target_distribution()) + + # size of indices: batch_size x 1. + indices = tf.range(tf.shape(self._replay_net_outputs.logits)[0])[:, None] + # size of reshaped_actions: batch_size x 2. + reshaped_actions = tf.concat([indices, self._replay.actions[:, None]], 1) + # For each element of the batch, fetch the logits for its selected action. + chosen_action_logits = tf.gather_nd(self._replay_net_outputs.logits, + reshaped_actions) + + loss = tf.nn.softmax_cross_entropy_with_logits( + labels=target_distribution, + logits=chosen_action_logits) + + if self._replay_scheme == 'prioritized': + # The original prioritized experience replay uses a linear exponent + # schedule 0.4 -> 1.0. Comparing the schedule to a fixed exponent of 0.5 + # on 5 games (Asterix, Pong, Q*Bert, Seaquest, Space Invaders) suggested + # a fixed exponent actually performs better, except on Pong. + probs = self._replay.transition['sampling_probabilities'] + loss_weights = 1.0 / tf.sqrt(probs + 1e-10) + loss_weights /= tf.reduce_max(loss_weights) + + # Rainbow and prioritized replay are parametrized by an exponent alpha, + # but in both cases it is set to 0.5 - for simplicity's sake we leave it + # as is here, using the more direct tf.sqrt(). Taking the square root + # "makes sense", as we are dealing with a squared loss. + # Add a small nonzero value to the loss to avoid 0 priority items. While + # technically this may be okay, setting all items to 0 priority will cause + # troubles, and also result in 1.0 / 0.0 = NaN correction terms. + update_priorities_op = self._replay.tf_set_priority( + self._replay.indices, tf.sqrt(loss + 1e-10)) + + # Weight the loss by the inverse priorities. + loss = loss_weights * loss + else: + update_priorities_op = tf.no_op() + + with tf.control_dependencies([update_priorities_op]): + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('CrossEntropyLoss', tf.reduce_mean(loss)) + # Schaul et al. reports a slightly different rule, where 1/N is also + # exponentiated by beta. Not doing so seems more reasonable, and did not + # impact performance in our experiments. 
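The prioritized-replay correction above is easier to see outside the graph. A small NumPy sketch of the same arithmetic, assuming made-up per-example sampling probabilities and cross-entropy losses:

    import numpy as np

    # Hypothetical per-example sampling probabilities and cross-entropy losses.
    probs = np.array([0.9, 0.1, 0.4])
    ce_loss = np.array([0.2, 1.5, 0.7])

    # Importance weights with a fixed exponent of 0.5, normalized to at most 1.
    loss_weights = 1.0 / np.sqrt(probs + 1e-10)
    loss_weights /= loss_weights.max()

    weighted_loss = loss_weights * ce_loss       # what the optimizer minimizes (mean)
    new_priorities = np.sqrt(ce_loss + 1e-10)    # written back into the sum tree

    print(weighted_loss.mean(), new_priorities)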
+ return self.optimizer.minimize(tf.reduce_mean(loss)), loss + + def _build_train_op_rpg(self): + # RPG loss + replay_action_one_hot = tf.one_hot( + self._replay_opt.actions, self.num_actions, 1., 0., name='action_one_hot_rpg') + replay_chosen_q = tf.reduce_sum( + self._replay_rpg_net_outputs.q_values * replay_action_one_hot, + reduction_indices=1, + name='replay_chosen_q_rpg') + margin = 1 + qvalue = self._replay_rpg_net_outputs.q_values + # debug self.temp_action_one_hot = replay_action_one_hot + self.temp_qvalue = qvalue + self.temp1 = (qvalue + margin) * (1 - replay_action_one_hot) + qvalue * replay_action_one_hot + self.temp2 = -(tf.reshape(replay_chosen_q, [-1, 1]) * tf.ones([1, self.num_actions])) \ + * ((1 - replay_action_one_hot) + (replay_action_one_hot)) + self.hingeloss = tf.maximum(0.0, self.temp1 + self.temp2) + rpg_loss = tf.reduce_mean(self.hingeloss) + return self.optimizer_rpg.minimize(rpg_loss) + + def _store_transition(self, + last_observation, + action, + reward, + is_terminal, + priority=None): + """Stores a transition when in training mode. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer (last_observation, action, reward, + is_terminal, priority). + + Args: + last_observation: Last observation, type determined via observation_type + parameter in the replay_memory constructor. + action: An integer, the action taken. + reward: A float, the reward. + is_terminal: Boolean indicating if the current state is a terminal state. + priority: Float. Priority of sampling the transition. If None, the default + priority will be used. If replay scheme is uniform, the default priority + is 1. If the replay scheme is prioritized, the default priority is the + maximum ever seen [Schaul et al., 2015]. + """ + if priority is None: + priority = (1. if self._replay_scheme == 'uniform' else + self._replay.memory.sum_tree.max_recorded_priority) + + if not self.eval_mode: + self._replay.add(last_observation, action, reward, is_terminal, priority) + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. + + Args: + reward: float, the reward received from the agent's most recent action. + observation: numpy array, the most recent observation. + + Returns: + int, the selected action. + """ + self._last_observation = self._observation + self._record_observation(observation) + + if not self.eval_mode: + self._store_transition(self._last_observation, self.action, reward, False) + self.replay_buffer_temp.add(self._last_observation, self.action, reward, False) + self._train_step() + + self.action = self._select_action() + return self.action + + def end_episode(self, reward): + """Signals the end of the episode to the agent. + + We store the observation of the current time step, which is the last + observation of the episode. + + Args: + reward: float, the last reward from the environment. + """ + if not self.eval_mode: + self.replay_buffer_temp.clear() # this episode is not optimal + self._store_transition(self._observation, self.action, reward, True) + + def end_episode_(self, reward, total_reward, step_number): + """ This episodes is optimal trajectory """ + if not self.eval_mode: + # for DQN + self._store_transition(self._observation, self.action, reward, True) + + # replay buffer for RPG. 
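The RPG loss built in `_build_train_op_rpg` above is a one-vs-all margin (hinge) loss: every non-chosen action is pushed at least `margin` below the value of the action taken on the near-optimal trajectory, while the chosen action itself contributes zero. A NumPy sketch with made-up Q-values (for illustration only):

    import numpy as np

    def rpg_hinge_loss(q_values, actions, margin=1.0):
        # q_values: (batch, num_actions); actions: (batch,) chosen actions.
        batch, num_actions = q_values.shape
        one_hot = np.eye(num_actions)[actions]
        chosen_q = (q_values * one_hot).sum(axis=1, keepdims=True)
        # max(0, Q_j + margin - Q_a*) for j != a*, and exactly 0 for j == a*.
        hinge = np.maximum(
            0.0, (q_values + margin) * (1 - one_hot) + q_values * one_hot - chosen_q)
        return hinge.mean()

    q = np.array([[1.0, 3.0, 2.5]])
    print(rpg_hinge_loss(q, np.array([1])))  # (0 + 0 + max(0, 2.5 + 1 - 3)) / 3 = 0.1667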
+ self.replay_buffer_temp.add(self._observation, self.action, reward, True) + count = step_number + while count > 0: + experience = self.replay_buffer_temp.get_sample() + state, action, reward, _ = experience + count -= 1 + # self.replay_buffer_opt.add(state, action, reward, False) + self._replay_opt.add(state, action, reward, False) + + def _select_action(self): + """Select an action from the set of available actions. + + Chooses an action randomly with probability self._calculate_epsilon(), and + otherwise acts greedily according to the current Q-value estimates. + + Returns: + int, the selected action. + """ + if self.eval_mode is not True: + epsilon = self.epsilon_fn( + self.epsilon_decay_period, + self.training_steps, + self.min_replay_history, + self.epsilon_train) + if random.random() <= epsilon: + # Choose a random action with probability epsilon. + return random.randint(0, self.num_actions - 1) + else: + # Choose the action with highest Q-value at the current state. + return self._sess.run(self._q_argmax, {self.state_ph: self.state}) + else: + # evaluation mode: use rpg. + return self._sess.run(self._q_argmax_rpg, {self.state_ph: self.state}) + + def _train_step(self): + """Runs a single training step. + + Runs a training op if both: + (1) A minimum number of frames have been added to the replay buffer. + (2) `training_steps` is a multiple of `update_period`. + + Also, syncs weights from online to target network if training steps is a + multiple of target update period. + """ + # Run a train op at the rate of self.update_period if enough training steps + # have been run. This matches the Nature DQN behaviour. + if self._replay.memory.add_count > self.min_replay_history: + if self.training_steps % self.update_period == 0: + self._sess.run(self._train_op) + if self._replay_opt.memory.add_count > self.start_training: + self._sess.run(self._train_op_rpg) + + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + if self.training_steps % self.target_update_period == 0: + self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + +def project_distribution(supports, weights, target_support, + validate_args=False): + """Projects a batch of (support, weights) onto target_support. + + Based on equation (7) in (Bellemare et al., 2017): + https://arxiv.org/abs/1707.06887 + In the rest of the comments we will refer to this equation simply as Eq7. + + This code is not easy to digest, so we will use a running example to clarify + what is going on, with the following sample inputs: + + * supports = [[0, 2, 4, 6, 8], + [1, 3, 4, 5, 6]] + * weights = [[0.1, 0.6, 0.1, 0.1, 0.1], + [0.1, 0.2, 0.5, 0.1, 0.1]] + * target_support = [4, 5, 6, 7, 8] + + In the code below, comments preceded with 'Ex:' will be referencing the above + values. + + Args: + supports: Tensor of shape (batch_size, num_dims) defining supports for the + distribution. + weights: Tensor of shape (batch_size, num_dims) defining weights on the + original support points. Although for the CategoricalDQN agent these + weights are probabilities, it is not required that they are. + target_support: Tensor of shape (num_dims) defining support of the projected + distribution. The values must be monotonically increasing. Vmin and Vmax + will be inferred from the first and last elements of this tensor, + respectively. 
The values in this tensor must be equally spaced. + validate_args: Whether we will verify the contents of the + target_support parameter. + + Returns: + A Tensor of shape (batch_size, num_dims) with the projection of a batch of + (support, weights) onto target_support. + + Raises: + ValueError: If target_support has no dimensions, or if shapes of supports, + weights, and target_support are incompatible. + """ + target_support_deltas = target_support[1:] - target_support[:-1] + # delta_z = `\Delta z` in Eq7. + delta_z = target_support_deltas[0] + validate_deps = [] + supports.shape.assert_is_compatible_with(weights.shape) + supports[0].shape.assert_is_compatible_with(target_support.shape) + target_support.shape.assert_has_rank(1) + if validate_args: + # Assert that supports and weights have the same shapes. + validate_deps.append( + tf.Assert( + tf.reduce_all(tf.equal(tf.shape(supports), tf.shape(weights))), + [supports, weights])) + # Assert that elements of supports and target_support have the same shape. + validate_deps.append( + tf.Assert( + tf.reduce_all( + tf.equal(tf.shape(supports)[1], tf.shape(target_support))), + [supports, target_support])) + # Assert that target_support has a single dimension. + validate_deps.append( + tf.Assert( + tf.equal(tf.size(tf.shape(target_support)), 1), [target_support])) + # Assert that the target_support is monotonically increasing. + validate_deps.append( + tf.Assert(tf.reduce_all(target_support_deltas > 0), [target_support])) + # Assert that the values in target_support are equally spaced. + validate_deps.append( + tf.Assert( + tf.reduce_all(tf.equal(target_support_deltas, delta_z)), + [target_support])) + + with tf.control_dependencies(validate_deps): + # Ex: `v_min, v_max = 4, 8`. + v_min, v_max = target_support[0], target_support[-1] + # Ex: `batch_size = 2`. + batch_size = tf.shape(supports)[0] + # `N` in Eq7. + # Ex: `num_dims = 5`. + num_dims = tf.shape(target_support)[0] + # clipped_support = `[\hat{T}_{z_j}]^{V_max}_{V_min}` in Eq7. + # Ex: `clipped_support = [[[ 4. 4. 4. 6. 8.]] + # [[ 4. 4. 4. 5. 6.]]]`. + clipped_support = tf.clip_by_value(supports, v_min, v_max)[:, None, :] + # Ex: `tiled_support = [[[[ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.]] + # [[ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.]]]]`. + tiled_support = tf.tile([clipped_support], [1, 1, num_dims, 1]) + # Ex: `reshaped_target_support = [[[ 4.] + # [ 5.] + # [ 6.] + # [ 7.] + # [ 8.]] + # [[ 4.] + # [ 5.] + # [ 6.] + # [ 7.] + # [ 8.]]]`. + reshaped_target_support = tf.tile(target_support[:, None], [batch_size, 1]) + reshaped_target_support = tf.reshape(reshaped_target_support, + [batch_size, num_dims, 1]) + # numerator = `|clipped_support - z_i|` in Eq7. + # Ex: `numerator = [[[[ 0. 0. 0. 2. 4.] + # [ 1. 1. 1. 1. 3.] + # [ 2. 2. 2. 0. 2.] + # [ 3. 3. 3. 1. 1.] + # [ 4. 4. 4. 2. 0.]] + # [[ 0. 0. 0. 1. 2.] + # [ 1. 1. 1. 0. 1.] + # [ 2. 2. 2. 1. 0.] + # [ 3. 3. 3. 2. 1.] + # [ 4. 4. 4. 3. 2.]]]]`. + numerator = tf.abs(tiled_support - reshaped_target_support) + quotient = 1 - (numerator / delta_z) + # clipped_quotient = `[1 - numerator / (\Delta z)]_0^1` in Eq7. + # Ex: `clipped_quotient = [[[[ 1. 1. 1. 0. 0.] + # [ 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 1. 0.] + # [ 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 1.]] + # [[ 1. 1. 1. 0. 0.] + # [ 0. 0. 0. 1. 0.] + # [ 0. 0. 0. 0. 1.] + # [ 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 0.]]]]`. 
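As a quick way to verify the running example end to end, here is a compact NumPy re-implementation of the whole projection (a standalone sketch for checking values by hand, not part of the agent code):

    import numpy as np

    def project_distribution_np(supports, weights, target_support):
        # NumPy version of the Eq7 projection.
        delta_z = target_support[1] - target_support[0]
        v_min, v_max = target_support[0], target_support[-1]
        clipped = np.clip(supports, v_min, v_max)                    # (batch, num_dims)
        # |clipped - z_i| for every (target atom i, source atom j) pair.
        distance = np.abs(clipped[:, None, :] - target_support[None, :, None])
        quotient = np.clip(1.0 - distance / delta_z, 0.0, 1.0)
        return (quotient * weights[:, None, :]).sum(axis=2)          # (batch, num_dims)

    supports = np.array([[0., 2., 4., 6., 8.], [1., 3., 4., 5., 6.]])
    weights = np.array([[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.2, 0.5, 0.1, 0.1]])
    target_support = np.array([4., 5., 6., 7., 8.])
    print(project_distribution_np(supports, weights, target_support))
    # [[0.8 0.  0.1 0.  0.1]
    #  [0.8 0.1 0.1 0.  0. ]]   (matching the Ex values in the comments above)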
+ clipped_quotient = tf.clip_by_value(quotient, 0, 1) + # Ex: `weights = [[ 0.1 0.6 0.1 0.1 0.1] + # [ 0.1 0.2 0.5 0.1 0.1]]`. + weights = weights[:, None, :] + # inner_prod = `\sum_{j=0}^{N-1} clipped_quotient * p_j(x', \pi(x'))` + # in Eq7. + # Ex: `inner_prod = [[[[ 0.1 0.6 0.1 0. 0. ] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0.1 0. ] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0. 0.1]] + # [[ 0.1 0.2 0.5 0. 0. ] + # [ 0. 0. 0. 0.1 0. ] + # [ 0. 0. 0. 0. 0.1] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0. 0. ]]]]`. + inner_prod = clipped_quotient * weights + # Ex: `projection = [[ 0.8 0.0 0.1 0.0 0.1] + # [ 0.8 0.1 0.1 0.0 0.0]]`. + projection = tf.reduce_sum(inner_prod, 3) + projection = tf.reshape(projection, [batch_size, num_dims]) + return projection diff --git a/dopamine/dopamine/agents/repg/__init__.py b/dopamine/dopamine/agents/repg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/repg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/repg/configs/repg.gin b/dopamine/dopamine/agents/repg/configs/repg.gin new file mode 100644 index 0000000..a0dad17 --- /dev/null +++ b/dopamine/dopamine/agents/repg/configs/repg.gin @@ -0,0 +1,36 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.repg.repg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +REPGAgent.gamma = 0.99 +REPGAgent.game_name = 'Pong' # Boxing, Pong +REPGAgent.update_horizon = 1 +REPGAgent.min_replay_history = 200000 # agent steps, step more than this, stop exploration. +REPGAgent.update_period = 4 +REPGAgent.epsilon_train = 0.0001 +REPGAgent.epsilon_eval = 0 +REPGAgent.epsilon_decay_period = 250000 # agent steps +REPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +REPGAgent.optimizer = @tf.train.RMSPropOptimizer() +REPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' # Pong +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False +Runner.num_iterations = 15 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 10000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/repg/repg_agent.py b/dopamine/dopamine/agents/repg/repg_agent.py new file mode 100644 index 0000000..69e0e49 --- /dev/null +++ b/dopamine/dopamine/agents/repg/repg_agent.py @@ -0,0 +1,607 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a DQN agent.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random + +from dopamine.agents.agent_utils import * +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf +from tensorflow.distributions import Categorical +import gin.tf +from collections import deque + +slim = tf.contrib.slim + +NATURE_DQN_OBSERVATION_SHAPE = (84, 84) # Size of downscaled Atari 2600 frame. +NATURE_DQN_DTYPE = tf.uint8 # DType of Atari 2600 observations. +NATURE_DQN_STACK_SIZE = 4 # Number of frames in the state stack. + + +def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon): + """Returns the current epsilon for the agent's epsilon-greedy policy. + + This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et + al., 2015). The schedule is as follows: + Begin at 1. until warmup_steps steps have been taken; then + Linearly decay epsilon from 1. to epsilon in decay_period steps; and then + Use epsilon from there on. + + Args: + decay_period: float, the period over which epsilon is decayed. + step: int, the number of training steps completed so far. + warmup_steps: int, the number of steps taken before epsilon is decayed. + epsilon: float, the final value to which to decay the epsilon parameter. + + Returns: + A float, the current epsilon value computed according to the schedule. + """ + steps_left = decay_period + warmup_steps - step + bonus = (1.0 - epsilon) * steps_left / decay_period + bonus = np.clip(bonus, 0., 1. 
- epsilon) + return epsilon + bonus + + +@gin.configurable +class REPGAgent(object): + """An implementation of the RPG with EPG exploration agent.""" + + def __init__(self, + sess, + num_actions, + game_name="Pong", + observation_shape=NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=NATURE_DQN_DTYPE, + stack_size=NATURE_DQN_STACK_SIZE, + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + epsilon_fn=linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + margin=1, + tf_device='/cpu:*', + use_staging=True, + max_tf_checkpoints_to_keep=3, + optimizer=tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints describing the observation shape. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to + keep. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. 
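As a sanity check of `linearly_decaying_epsilon` above, with the default arguments of this agent (decay_period=250000, warmup_steps=min_replay_history=20000, final epsilon=0.01) the schedule stays at 1.0 through warmup and bottoms out at 0.01 once decay finishes. A short self-contained sketch:

    import numpy as np

    def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
        steps_left = decay_period + warmup_steps - step
        bonus = (1.0 - epsilon) * steps_left / decay_period
        return epsilon + np.clip(bonus, 0.0, 1.0 - epsilon)

    for step in (0, 20000, 145000, 270000, 500000):
        print(step, round(linearly_decaying_epsilon(250000, step, 20000, 0.01), 3))
    # 0 -> 1.0, 20000 -> 1.0, 145000 -> 0.505, 270000 -> 0.01, 500000 -> 0.01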
+ """ + assert isinstance(observation_shape, tuple) + tf.logging.info('Creating %s agent with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t gamma: %f', gamma) + tf.logging.info('\t update_horizon: %f', update_horizon) + tf.logging.info('\t min_replay_history: %d', min_replay_history) + tf.logging.info('\t update_period: %d', update_period) + # tf.logging.info('\t random_seed: %d', random_seed) + tf.logging.info('\t epsilon_train: %f', epsilon_train) + tf.logging.info('\t epsilon_eval: %f', epsilon_eval) + tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period) + tf.logging.info('\t tf_device: %s', tf_device) + tf.logging.info('\t use_staging: %s', use_staging) + tf.logging.info('\t optimizer: %s', optimizer) + tf.logging.info('\t game: %s', game_name) + + self.game_name = game_name + self.num_actions = num_actions + self.observation_shape = tuple(observation_shape) + self.observation_dtype = observation_dtype + self.stack_size = stack_size + self.gamma = gamma + self.update_horizon = update_horizon + self.cumulative_gamma = math.pow(gamma, update_horizon) + self.min_replay_history = min_replay_history + self.epsilon_fn = epsilon_fn + self.epsilon_train = epsilon_train + self.epsilon_eval = epsilon_eval + self.epsilon_decay_period = epsilon_decay_period + self.update_period = update_period + self.eval_mode = False + self.training_steps = 0 + self.optimizer = optimizer # optimizer for RPG + self.optimizer_exp = tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True) # optimizer for EPG + self.summary_writer = summary_writer + self.summary_writing_frequency = summary_writing_frequency + self.margin = margin + self.start_training = 1000 # todo task specific + self.highest_reward = 6 # todo task specific + # which deviate the hing loss. + self.isPrinted = False + self.current_replay_size = 0 + self.epsilon_current = 1 + + with tf.device(tf_device): + # Create a placeholder for the state input to the DQN network. + # The last axis indicates the number of consecutive frames stacked. + state_shape = (1,) + self.observation_shape + (stack_size,) + self.state = np.zeros(state_shape) + self.state_ph = tf.placeholder(self.observation_dtype, state_shape, + name='state_ph') + self._replay = self._build_replay_buffer(use_staging) + + self._build_networks() + + self._train_op, self._train_exp_op = self._build_train_op() + + self.replay_buffer = ReplayBufferRegular(100000) + + if self.summary_writer is not None: + # All tf.summaries should have been defined prior to running this. + self._merged_summaries = tf.summary.merge_all() + self._sess = sess + self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep) + + # Variables to be initialized by the agent once it interacts with the + # environment. + self._observation = None + self._last_observation = None + + def _get_network_type(self): + """Returns the type of the outputs of a Q value network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('DQN_network', ['q_values']) + + def _network_template(self, state): + """Builds the convolutional network used to compute the agent's Q-values. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) 
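    # With slim's default 'SAME' padding, the spatial sizes below are
    # 84x84 -> 21x21 (stride 4) -> 11x11 (stride 2) -> 11x11 (stride 1),
    # so the flattened feature has 11 * 11 * 64 = 7744 units before the
    # 512-unit fully connected layer and the final num_actions head.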
+ net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + self.explore_convnet = tf.make_template('Explore', self._network_template) + + self._net_outputs = self.online_convnet(self.state_ph) + self._exp_net_outputs = self.explore_convnet(self.state_ph) + + # TODO(bellemare): Ties should be broken. They are unlikely to happen when + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + self._replay_exp_net_outputs = self.explore_convnet( + self._replay.states) + + self.logsoftmaxprob = tf.nn.log_softmax(self._exp_net_outputs.q_values) + self.sample = Categorical(logits=self.logsoftmaxprob).sample(1) + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. 
+ """ + + # for hinge loss + margin = 1 + + replay_action_one_hot = tf.one_hot( + self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') + qvalue = self._replay_net_outputs.q_values + replay_chosen_q = tf.reduce_sum( + self._replay_net_outputs.q_values * replay_action_one_hot, + reduction_indices=1, + name='replay_chosen_q') + self.temp_action_one_hot = replay_action_one_hot + self.temp_qvalue = qvalue + + # Q_j + c - Q_* = temp1 + temp2 + # temp1 = [Q_j + c, Q_*] + # temp2 = [-Q_*, -Q_*] + self.temp1 = (qvalue + margin) * (1 - replay_action_one_hot) + qvalue * replay_action_one_hot + self.temp2 = -(tf.reshape(replay_chosen_q, [-1, 1]) * tf.ones([1, self.num_actions])) \ + * ((1 - replay_action_one_hot) + (replay_action_one_hot)) + self.hingeloss = tf.maximum(0.0, self.temp1 + self.temp2) + loss = tf.reduce_mean(self.hingeloss) + + # for cross entropy loss + logits = self._replay_exp_net_outputs.q_values + self.logsoftmaxprob = tf.nn.log_softmax(logits) + self.neglogprob = - tf.reduce_sum(self.logsoftmaxprob * replay_action_one_hot, axis=1) + # self.temp_loss = self.neglogprob # * self.y_pl + self.actor_loss = tf.reduce_mean(self.neglogprob) + self.replay_action_one_hot = replay_action_one_hot + + # target = tf.stop_gradient(self._build_target_q_op()) + # loss = tf.losses.huber_loss( + # target, replay_chosen_q, reduction=tf.losses.Reduction.NONE) + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('hingeLoss', loss) + if self.exploration_strategy == "EPG": + tf.summary.scalar('actorloss', self.actor_loss) + return self.optimizer.minimize(loss), self.optimizer_exp.minimize(self.actor_loss) + + def begin_episode(self, observation): + """Returns the agent's first action for this episode. + + Args: + observation: numpy array, the environment's initial observation. + + Returns: + int, the selected action. + """ + self._reset_state() + self._record_observation(observation) + + if not self.eval_mode: + self._train_step() + + self.action = self._select_action() + return self.action + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. + + Args: + reward: float, the reward received from the agent's most recent action. + observation: numpy array, the most recent observation. + + Returns: + int, the selected action. + """ + self._last_observation = self._observation + self._record_observation(observation) + + if not self.eval_mode: + if self.game_name in ["Pong"]: + collect_trajectory(self, reward) + else: + raise ValueError("collection wrong trajectory") + + self._train_step() + + self.action = self._select_action() + return self.action + + def end_episode(self, reward): + """Signals the end of the episode to the agent. + + We store the observation of the current time step, which is the last + observation of the episode. + + Args: + reward: float, the last reward from the environment. 
+ """ + if not self.eval_mode: + if self.game_name in ["Pong"]: + collect_trajectory(self, reward) + else: + raise ValueError("collection wrong trajectory") + # if reward < 0: + # self.replay_buffer.clear() + # elif reward > 0: + # self.replay_buffer.add(self._last_observation, self.action, reward, True) + # while self.replay_buffer.size() > 0: + # experience = self.replay_buffer.get_sample() + # state, action, reward, _ = experience + # self._store_transition(state, action, reward, True) + # # there is zero transition padding to the memory in self._replay. + # else: + # self.replay_buffer.add(self._last_observation, self.action, reward, True) + + def _select_action_training(self): + """Use EPG to select action during training, """ + return self._sess.run(self.sample, {self.state_ph: self.state})[0][0] + + def _select_action(self): + """Select an action from the set of available actions. + + Chooses an action randomly with probability self._calculate_epsilon(), and + otherwise acts greedily according to the current Q-value estimates. + + Returns: + int, the selected action. + """ + # epsilon = self.epsilon_eval if self.eval_mode else self.epsilon_fn( + # self.epsilon_decay_period, + # self.training_steps, + # self.min_replay_history, + # self.epsilon_train) + + self.epsilon_current = 0 + self.current_replay_size = self._replay.memory.add_count + if self.eval_mode: + return self._sess.run(self._q_argmax, {self.state_ph: self.state}) + return self._select_action_training() + + def _train_step(self): + """Runs a single training step. + + Runs a training op if both: + (1) A minimum number of frames have been added to the replay buffer. + (2) `training_steps` is a multiple of `update_period`. + + Also, syncs weights from online to target network if training steps is a + multiple of target update period. + """ + # Run a train op at the rate of self.update_period if enough training steps + # have been run. This matches the Nature DQN behaviour. + + if self._replay.memory.add_count > self.start_training: + if self.training_steps % self.update_period == 0: + + # # debug checked. + # _, temp1, temp2, taction, tqvalue, hingloss = self._sess.run([self._train_op, + # self.temp1, + # self.temp2, + # self.temp_action_one_hot, + # self.temp_qvalue, + # self.hingeloss]) + self._sess.run(self._train_op) + self._sess.run(self._train_exp_op) + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + # if self.training_steps % self.target_update_period == 0: + # self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + if (self._replay.memory.add_count > self.start_training) and self.isPrinted is False: + print("start training at {}".format(self.training_steps)) + self.isPrinted = True + + def _record_observation(self, observation): + """Records an observation and update state. + + Extracts a frame from the observation vector and overwrites the oldest + frame in the state buffer. + + Args: + observation: numpy array, an observation from the environment. + """ + # Set current observation. We do the reshaping to handle environments + # without frame stacking. + observation = np.reshape(observation, self.observation_shape) + self._observation = observation[..., 0] + self._observation = np.reshape(observation, self.observation_shape) + # Swap out the oldest frame with the current frame. 
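During training, `_select_action` above defers to `_select_action_training`, which samples from a softmax policy over the explorer head (the `Categorical` op built in `_build_networks`) rather than using epsilon-greedy. A NumPy sketch of that sampling rule with made-up logits (illustrative only):

    import numpy as np

    def sample_softmax_action(logits, rng):
        # Softmax over the explorer head's outputs; feeding log-softmax values
        # into Categorical changes only the numerical form, not the distribution.
        shifted = logits - logits.max()
        probs = np.exp(shifted) / np.exp(shifted).sum()
        return rng.choice(len(logits), p=probs)

    rng = np.random.default_rng(0)
    print(sample_softmax_action(np.array([0.5, 2.0, 1.0, -1.0]), rng))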
+ self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. + self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True + + +class ReplayBufferRegular(object): + """ for uniformly sampling. 
+ + """ + + def __init__(self, buffer_size, random_seed=1234): + self.buffer_size = buffer_size + self.count = 0 + # Right side of deque contains newest experience + self.buffer = deque() + random.seed(random_seed) + self.ptr, self.path_start_idx = 0, 0 + + def add(self, state, action, reward, terminal): + experience = [state, action, reward, terminal] + assert self.count < self.buffer_size + self.buffer.append(experience) + self.count += 1 + self.ptr += 1 + # else: + # self.path_start_idx -= 1 + # self.ptr = self.buffer_size - 1 + # self.buffer.popleft() + # self.buffer.append(experience) + + def get_sample(self): + self.count -= 1 + return self.buffer.popleft() + + def size(self): + return self.count + + def clear(self): + self.buffer.clear() + self.count = 0 + self.ptr = 0 + self.path_start_idx = 0 diff --git a/dopamine/dopamine/agents/rpg/__init__.py b/dopamine/dopamine/agents/rpg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/rpg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/rpg/configs/rpg.gin b/dopamine/dopamine/agents/rpg/configs/rpg.gin new file mode 100644 index 0000000..96a69fd --- /dev/null +++ b/dopamine/dopamine/agents/rpg/configs/rpg.gin @@ -0,0 +1,36 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.rpg.rpg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +RPGAgent.gamma = 0.99 +RPGAgent.game_name = 'Pong' +RPGAgent.update_horizon = 1 +RPGAgent.min_replay_history = 200000 # agent steps, step more than this, stop exploration. +RPGAgent.update_period = 4 +RPGAgent.epsilon_train = 0.0001 +RPGAgent.epsilon_eval = 0 +RPGAgent.epsilon_decay_period = 250000 # agent steps +RPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RPGAgent.optimizer = @tf.train.RMSPropOptimizer() +RPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False +Runner.num_iterations = 15 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 10000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rpg/configs/rpg_pong.gin b/dopamine/dopamine/agents/rpg/configs/rpg_pong.gin new file mode 100644 index 0000000..e155ad4 --- /dev/null +++ b/dopamine/dopamine/agents/rpg/configs/rpg_pong.gin @@ -0,0 +1,36 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.rpg.rpg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +RPGAgent.gamma = 0.99 +RPGAgent.game_name = 'Pong' +RPGAgent.update_horizon = 1 +RPGAgent.min_replay_history = 200000 # agent steps, step more than this, stop exploration. +RPGAgent.update_period = 4 +RPGAgent.epsilon_train = 0.0001 +RPGAgent.epsilon_eval = 0 +RPGAgent.epsilon_decay_period = 250000 # agent steps +RPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RPGAgent.optimizer = @tf.train.RMSPropOptimizer() +RPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = False +Runner.num_iterations = 15 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 10000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 256 diff --git a/dopamine/dopamine/agents/rpg/rpg_agent.py b/dopamine/dopamine/agents/rpg/rpg_agent.py new file mode 100644 index 0000000..e7ddbd5 --- /dev/null +++ b/dopamine/dopamine/agents/rpg/rpg_agent.py @@ -0,0 +1,613 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a DQN agent.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random + +from dopamine.agents.agent_utils import * +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf +from tensorflow.distributions import Categorical +import gin.tf +from collections import deque + +slim = tf.contrib.slim + +NATURE_DQN_OBSERVATION_SHAPE = (84, 84) # Size of downscaled Atari 2600 frame. +NATURE_DQN_DTYPE = tf.uint8 # DType of Atari 2600 observations. +NATURE_DQN_STACK_SIZE = 4 # Number of frames in the state stack. 
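+
+# A worked example (numbers are illustrative only) of the schedule computed by
+# linearly_decaying_epsilon() below, with decay_period=250000,
+# warmup_steps=20000 and a final epsilon of 0.01:
+#   step <= 20000  -> bonus clipped to 0.99           -> epsilon = 1.0
+#   step = 145000  -> bonus = 0.99 * 125000 / 250000  -> epsilon ~= 0.505
+#   step >= 270000 -> bonus clipped to 0.0            -> epsilon = 0.01
+# Note that, as written, RPGAgent._select_action below uses a simpler
+# two-phase schedule and only stores this helper as `epsilon_fn`.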
+
+
+def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
+  """Returns the current epsilon for the agent's epsilon-greedy policy.
+
+  This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et
+  al., 2015). The schedule is as follows:
+    Begin at 1. until warmup_steps steps have been taken; then
+    Linearly decay epsilon from 1. to epsilon in decay_period steps; and then
+    Use epsilon from there on.
+
+  Args:
+    decay_period: float, the period over which epsilon is decayed.
+    step: int, the number of training steps completed so far.
+    warmup_steps: int, the number of steps taken before epsilon is decayed.
+    epsilon: float, the final value to which to decay the epsilon parameter.
+
+  Returns:
+    A float, the current epsilon value computed according to the schedule.
+  """
+  steps_left = decay_period + warmup_steps - step
+  bonus = (1.0 - epsilon) * steps_left / decay_period
+  bonus = np.clip(bonus, 0., 1. - epsilon)
+  return epsilon + bonus
+
+
+@gin.configurable
+class RPGAgent(object):
+  """An implementation of the RPG agent, adapted from the DQN agent."""
+
+  def __init__(self,
+               sess,
+               num_actions,
+               game_name="Pong",
+               observation_shape=NATURE_DQN_OBSERVATION_SHAPE,
+               observation_dtype=NATURE_DQN_DTYPE,
+               stack_size=NATURE_DQN_STACK_SIZE,
+               gamma=0.99,
+               update_horizon=1,
+               min_replay_history=20000,
+               update_period=4,
+               epsilon_fn=linearly_decaying_epsilon,
+               epsilon_train=0.01,
+               epsilon_eval=0.001,
+               epsilon_decay_period=250000,
+               margin=1,
+               tf_device='/cpu:*',
+               use_staging=True,
+               max_tf_checkpoints_to_keep=3,
+               optimizer=tf.train.RMSPropOptimizer(
+                   learning_rate=0.00025,
+                   decay=0.95,
+                   momentum=0.0,
+                   epsilon=0.00001,
+                   centered=True),
+               summary_writer=None,
+               summary_writing_frequency=500):
+    """Initializes the agent and constructs the components of its graph.
+
+    Args:
+      sess: `tf.Session`, for executing ops.
+      num_actions: int, number of actions the agent can take at any state.
+      game_name: str, name of the Atari 2600 game being played.
+      observation_shape: tuple of ints describing the observation shape.
+      observation_dtype: tf.DType, specifies the type of the observations. Note
+        that if your inputs are continuous, you should set this to tf.float32.
+      stack_size: int, number of frames to use in state stack.
+      gamma: float, discount factor with the usual RL meaning.
+      update_horizon: int, horizon at which updates are performed, the 'n' in
+        n-step update.
+      min_replay_history: int, number of transitions that should be experienced
+        before the agent begins training its value function.
+      update_period: int, period between DQN updates.
+      epsilon_fn: function expecting 4 parameters:
+        (decay_period, step, warmup_steps, epsilon). This function should return
+        the epsilon value used for exploration during training.
+      epsilon_train: float, the value to which the agent's epsilon is eventually
+        decayed during training.
+      epsilon_eval: float, epsilon used when evaluating the agent.
+      epsilon_decay_period: int, length of the epsilon decay schedule.
+      margin: float, the margin used in the hinge (ranking) loss.
+      tf_device: str, Tensorflow device on which the agent's graph is executed.
+      use_staging: bool, when True use a staging area to prefetch the next
+        training batch, speeding training up by about 30%.
+      max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to
+        keep.
+      optimizer: `tf.train.Optimizer`, for training the value function.
+      summary_writer: SummaryWriter object for outputting training statistics.
+        Summary writing disabled if set to None.
+      summary_writing_frequency: int, frequency with which summaries will be
+        written. Lower values will result in slower training.
+    """
+    assert isinstance(observation_shape, tuple)
+    tf.logging.info('Creating %s agent with the following parameters:',
+                    self.__class__.__name__)
+    tf.logging.info('\t gamma: %f', gamma)
+    tf.logging.info('\t update_horizon: %f', update_horizon)
+    tf.logging.info('\t min_replay_history: %d', min_replay_history)
+    tf.logging.info('\t update_period: %d', update_period)
+    # tf.logging.info('\t random_seed: %d', random_seed)
+    tf.logging.info('\t epsilon_train: %f', epsilon_train)
+    tf.logging.info('\t epsilon_eval: %f', epsilon_eval)
+    tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period)
+    tf.logging.info('\t tf_device: %s', tf_device)
+    tf.logging.info('\t use_staging: %s', use_staging)
+    tf.logging.info('\t optimizer: %s', optimizer)
+    tf.logging.info('\t game: %s', game_name)
+
+    self.game_name = game_name
+    self.num_actions = num_actions
+    self.observation_shape = tuple(observation_shape)
+    self.observation_dtype = observation_dtype
+    self.stack_size = stack_size
+    self.gamma = gamma
+    self.update_horizon = update_horizon
+    self.cumulative_gamma = math.pow(gamma, update_horizon)
+    self.min_replay_history = min_replay_history
+    self.epsilon_fn = epsilon_fn
+    self.epsilon_train = epsilon_train
+    self.epsilon_eval = epsilon_eval
+    self.epsilon_decay_period = epsilon_decay_period
+    self.update_period = update_period
+    self.eval_mode = False
+    self.training_steps = 0
+    self.optimizer = optimizer
+    self.summary_writer = summary_writer
+    self.summary_writing_frequency = summary_writing_frequency
+    self.margin = margin
+    self.start_training = 1000  # TODO: task-specific; tune per game.
+    self.highest_reward = 6  # TODO: task-specific; tune per game.
+    # Any value other than "EPG" falls back to epsilon-greedy exploration.
+    self.exploration_strategy = "NonEPG"
+    # TODO: EPG exploration did not work here: once the hinge loss becomes
+    # small, the optimizer focuses on the cross-entropy term, which pulls the
+    # parameters away from the hinge-loss objective.
+    self.isPrinted = False
+    self.current_replay_size = 0
+    self.epsilon_current = 1
+
+    with tf.device(tf_device):
+      # Create a placeholder for the state input to the DQN network.
+      # The last axis indicates the number of consecutive frames stacked.
+      state_shape = (1,) + self.observation_shape + (stack_size,)
+      self.state = np.zeros(state_shape)
+      self.state_ph = tf.placeholder(self.observation_dtype, state_shape,
+                                     name='state_ph')
+      self._replay = self._build_replay_buffer(use_staging)
+
+      self._build_networks()
+
+      self._train_op = self._build_train_op()
+
+      self.replay_buffer = ReplayBufferRegular(100000)
+
+    if self.summary_writer is not None:
+      # All tf.summaries should have been defined prior to running this.
+      self._merged_summaries = tf.summary.merge_all()
+    self._sess = sess
+    self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep)
+
+    # Variables to be initialized by the agent once it interacts with the
+    # environment.
+    self._observation = None
+    self._last_observation = None
+
+  def _get_network_type(self):
+    """Returns the type of the outputs of a Q value network.
+
+    Returns:
+      net_type: _network_type object defining the outputs of the network.
+    """
+    return collections.namedtuple('DQN_network', ['q_values'])
+
+  def _network_template(self, state):
+    """Builds the convolutional network used to compute the agent's Q-values.
+
+    Args:
+      state: `tf.Tensor`, contains the agent's current state.
+
+    Returns:
+      net: _network_type object containing the tensors output by the network.
+ """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) + net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + self._net_outputs = self.online_convnet(self.state_ph) + # TODO(bellemare): Ties should be broken. They are unlikely to happen when + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + self._replay_next_target_net_outputs = self.target_convnet( + self._replay.next_states) + self.logsoftmaxprob = tf.nn.log_softmax(self._net_outputs.q_values) + self.sample = Categorical(logits=self.logsoftmaxprob).sample(1) + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. 
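+
+    The loss is a margin (hinge) loss over actions stored in the replay
+    buffer: for a stored action a* with value Q_*, every other action j is
+    pushed to satisfy Q_* >= Q_j + margin. As a small numeric illustration
+    (the numbers are made up), with Q = [0.2, 0.5, -0.1], a* = 1 and
+    margin = 1, the per-action hinge terms are
+    max(0, [1.2, 0.5, 0.9] - 0.5) = [0.7, 0.0, 0.4], and the loss is their
+    mean.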
+    """
+    margin = self.margin
+
+    replay_action_one_hot = tf.one_hot(
+        self._replay.actions, self.num_actions, 1., 0., name='action_one_hot')
+    qvalue = self._replay_net_outputs.q_values
+    replay_chosen_q = tf.reduce_sum(
+        self._replay_net_outputs.q_values * replay_action_one_hot,
+        reduction_indices=1,
+        name='replay_chosen_q')
+    self.temp_action_one_hot = replay_action_one_hot
+    self.temp_qvalue = qvalue
+
+    # Hinge (margin) loss: the stored action a* with value Q_* should dominate
+    # every other action by at least `margin`, i.e. max(0, Q_j + margin - Q_*).
+    # temp1 holds Q_j + margin for non-chosen actions and Q_* for the chosen
+    # one; temp2 broadcasts -Q_* across all actions, so temp1 + temp2 is zero
+    # for the chosen action and Q_j + margin - Q_* otherwise.
+    self.temp1 = (qvalue + margin) * (1 - replay_action_one_hot) + qvalue * replay_action_one_hot
+    self.temp2 = -(tf.reshape(replay_chosen_q, [-1, 1]) * tf.ones([1, self.num_actions])) \
+        * ((1 - replay_action_one_hot) + (replay_action_one_hot))
+    self.hingeloss = tf.maximum(0.0, self.temp1 + self.temp2)
+    loss = tf.reduce_mean(self.hingeloss)
+
+    if self.exploration_strategy == "EPG":
+      logits = qvalue
+      self.logsoftmaxprob = tf.nn.log_softmax(logits)
+      self.neglogprob = - tf.reduce_sum(self.logsoftmaxprob * replay_action_one_hot, axis=1)
+      self.actor_loss = tf.reduce_mean(self.neglogprob)
+      loss = self.actor_loss + loss
+
+    # target = tf.stop_gradient(self._build_target_q_op())
+    # loss = tf.losses.huber_loss(
+    #     target, replay_chosen_q, reduction=tf.losses.Reduction.NONE)
+    if self.summary_writer is not None:
+      with tf.variable_scope('Losses'):
+        tf.summary.scalar('hingeLoss', loss)
+        if self.exploration_strategy == "EPG":
+          tf.summary.scalar('actorloss', self.actor_loss)
+    return self.optimizer.minimize(loss)
+
+  def begin_episode(self, observation):
+    """Returns the agent's first action for this episode.
+
+    Args:
+      observation: numpy array, the environment's initial observation.
+
+    Returns:
+      int, the selected action.
+    """
+    self._reset_state()
+    self._record_observation(observation)
+
+    if not self.eval_mode:
+      self._train_step()
+
+    self.action = self._select_action()
+    return self.action
+
+  def step(self, reward, observation):
+    """Records the most recent transition and returns the agent's next action.
+
+    We store the observation of the last time step since we want to store it
+    with the reward.
+
+    Args:
+      reward: float, the reward received from the agent's most recent action.
+      observation: numpy array, the most recent observation.
+
+    Returns:
+      int, the selected action.
+    """
+    self._last_observation = self._observation
+    self._record_observation(observation)
+
+    if not self.eval_mode:
+      if self.game_name in ["Pong"]:
+        collect_trajectory(self, reward)
+      else:
+        raise ValueError("Trajectory collection is only implemented for Pong.")
+
+      self._train_step()
+
+    self.action = self._select_action()
+    return self.action
+
+  def end_episode(self, reward):
+    """Signals the end of the episode to the agent.
+
+    We store the observation of the current time step, which is the last
+    observation of the episode.
+
+    Args:
+      reward: float, the last reward from the environment.
+    """
+    if not self.eval_mode:
+      if self.game_name in ["Pong"]:
+        collect_trajectory(self, reward)
+      else:
+        raise ValueError("Trajectory collection is only implemented for Pong.")
+      # if reward < 0:
+      #   self.replay_buffer.clear()
+      # elif reward > 0:
+      #   self.replay_buffer.add(self._last_observation, self.action, reward, True)
+      #   while self.replay_buffer.size() > 0:
+      #     experience = self.replay_buffer.get_sample()
+      #     state, action, reward, _ = experience
+      #     self._store_transition(state, action, reward, True)
+      #     # there is zero transition padding to the memory in self._replay.
+      # else:
+      #   self.replay_buffer.add(self._last_observation, self.action, reward, True)
+
+  def _select_action_training(self):
+    """Samples an action from the softmax over Q-values (EPG) during training."""
+    return self._sess.run(self.sample, {self.state_ph: self.state})[0][0]
+
+  def _select_action(self):
+    """Selects an action from the set of available actions.
+
+    With the "EPG" exploration strategy, the action is sampled from the softmax
+    over Q-values during training and chosen greedily during evaluation.
+    Otherwise the agent is epsilon-greedy: epsilon stays at 1 until
+    min_replay_history training steps have elapsed and is epsilon_train
+    afterwards (epsilon_eval in evaluation mode).
+
+    Returns:
+      int, the selected action.
+    """
+    # epsilon = self.epsilon_eval if self.eval_mode else self.epsilon_fn(
+    #     self.epsilon_decay_period,
+    #     self.training_steps,
+    #     self.min_replay_history,
+    #     self.epsilon_train)
+
+    exploration = self.exploration_strategy  # anything other than "EPG" means epsilon-greedy
+    if exploration == "EPG":
+      self.epsilon_current = 0
+      self.current_replay_size = self._replay.memory.add_count
+      if self.eval_mode:
+        return self._sess.run(self._q_argmax, {self.state_ph: self.state})
+      return self._select_action_training()
+    else:
+      if self.training_steps < self.min_replay_history:
+        epsilon = 1
+      else:
+        epsilon = self.epsilon_train
+      if self.eval_mode:
+        epsilon = self.epsilon_eval
+
+      self.epsilon_current = epsilon
+      self.current_replay_size = self._replay.memory.add_count
+      if random.random() <= epsilon:
+        # Choose a random action with probability epsilon.
+        return random.randint(0, self.num_actions - 1)
+      else:
+        # Choose the action with highest Q-value at the current state.
+        return self._sess.run(self._q_argmax, {self.state_ph: self.state})
+
+  def _train_step(self):
+    """Runs a single training step.
+
+    Runs a training op if both:
+      (1) A minimum number of transitions have been added to the replay buffer.
+      (2) `training_steps` is a multiple of `update_period`.
+
+    Unlike DQN, no target-network synchronization is performed here.
+    """
+    # Run a train op at the rate of self.update_period if enough training steps
+    # have been run. This matches the Nature DQN behaviour.
+
+    if self._replay.memory.add_count > self.start_training:
+      if self.training_steps % self.update_period == 0:
+
+        # # debug checked.
+        # _, temp1, temp2, taction, tqvalue, hingloss = self._sess.run([self._train_op,
+        #                                                               self.temp1,
+        #                                                               self.temp2,
+        #                                                               self.temp_action_one_hot,
+        #                                                               self.temp_qvalue,
+        #                                                               self.hingeloss])
+        self._sess.run(self._train_op)
+        if (self.summary_writer is not None and
+            self.training_steps > 0 and
+            self.training_steps % self.summary_writing_frequency == 0):
+          summary = self._sess.run(self._merged_summaries)
+          self.summary_writer.add_summary(summary, self.training_steps)
+
+      # if self.training_steps % self.target_update_period == 0:
+      #   self._sess.run(self._sync_qt_ops)
+
+    self.training_steps += 1
+
+    if (self._replay.memory.add_count > self.start_training) and self.isPrinted is False:
+      print("Start training at step {}.".format(self.training_steps))
+      self.isPrinted = True
+
+  def _record_observation(self, observation):
+    """Records an observation and updates the agent's state.
+
+    Extracts a frame from the observation vector and overwrites the oldest
+    frame in the state buffer.
+
+    Args:
+      observation: numpy array, an observation from the environment.
+    """
+    # Set current observation. We do the reshaping to handle environments
+    # without frame stacking.
+    self._observation = np.reshape(observation, self.observation_shape)
+    # Swap out the oldest frame with the current frame.
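+    # For the default (1, 84, 84, 4) state this rolls the stack along the last
+    # axis so the oldest frame wraps into slot -1, where it is immediately
+    # overwritten by the new observation.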
+ self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. + self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True + + +class ReplayBufferRegular(object): + """ for uniformly sampling. 
+ + """ + + def __init__(self, buffer_size, random_seed=1234): + self.buffer_size = buffer_size + self.count = 0 + # Right side of deque contains newest experience + self.buffer = deque() + random.seed(random_seed) + self.ptr, self.path_start_idx = 0, 0 + + def add(self, state, action, reward, terminal): + experience = [state, action, reward, terminal] + assert self.count < self.buffer_size + self.buffer.append(experience) + self.count += 1 + self.ptr += 1 + # else: + # self.path_start_idx -= 1 + # self.ptr = self.buffer_size - 1 + # self.buffer.popleft() + # self.buffer.append(experience) + + def get_sample(self): + self.count -= 1 + return self.buffer.popleft() + + def size(self): + return self.count + + def clear(self): + self.buffer.clear() + self.count = 0 + self.ptr = 0 + self.path_start_idx = 0 diff --git a/dopamine/dopamine/atari/__init__.py b/dopamine/dopamine/atari/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/atari/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/atari/preprocessing.py b/dopamine/dopamine/atari/preprocessing.py new file mode 100644 index 0000000..861c544 --- /dev/null +++ b/dopamine/dopamine/atari/preprocessing.py @@ -0,0 +1,216 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A class implementing minimal Atari 2600 preprocessing. + +This includes: + . Emitting a terminal signal when losing a life (optional). + . Frame skipping and color pooling. + . Resizing the image before it is provided to the agent. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from gym.spaces.box import Box +import numpy as np +import gin.tf +import cv2 + + +@gin.configurable +class AtariPreprocessing(object): + """A class implementing image preprocessing for Atari 2600 agents. + + Specifically, this provides the following subset from the JAIR paper + (Bellemare et al., 2013) and Nature DQN paper (Mnih et al., 2015): + + * Frame skipping (defaults to 4). + * Terminal signal when a life is lost (off by default). + * Grayscale and max-pooling of the last two frames. + * Downsample the screen to a square image (defaults to 84x84). + + More generally, this class follows the preprocessing guidelines set down in + Machado et al. 
(2018), "Revisiting the Arcade Learning Environment: + Evaluation Protocols and Open Problems for General Agents". + """ + + def __init__(self, environment, frame_skip=4, terminal_on_life_loss=False, + screen_size=84): + """Constructor for an Atari 2600 preprocessor. + + Args: + environment: Gym environment whose observations are preprocessed. + frame_skip: int, the frequency at which the agent experiences the game. + terminal_on_life_loss: bool, If True, the step() method returns + is_terminal=True whenever a life is lost. See Mnih et al. 2015. + screen_size: int, size of a resized Atari 2600 frame. + + Raises: + ValueError: if frame_skip or screen_size are not strictly positive. + """ + if frame_skip <= 0: + raise ValueError('Frame skip should be strictly positive, got {}'. + format(frame_skip)) + if screen_size <= 0: + raise ValueError('Target screen size should be strictly positive, got {}'. + format(screen_size)) + + self.environment = environment + self.terminal_on_life_loss = terminal_on_life_loss + self.frame_skip = frame_skip + self.screen_size = screen_size + + obs_dims = self.environment.observation_space + # Stores temporary observations used for pooling over two successive + # frames. + self.screen_buffer = [ + np.empty((obs_dims.shape[0], obs_dims.shape[1]), dtype=np.uint8), + np.empty((obs_dims.shape[0], obs_dims.shape[1]), dtype=np.uint8) + ] + + self.game_over = False + self.lives = 0 # Will need to be set by reset(). + + @property + def observation_space(self): + # Return the observation space adjusted to match the shape of the processed + # observations. + return Box(low=0, high=255, shape=(self.screen_size, self.screen_size, 1), + dtype=np.uint8) + + @property + def action_space(self): + return self.environment.action_space + + @property + def reward_range(self): + return self.environment.reward_range + + @property + def metadata(self): + return self.environment.metadata + + def reset(self): + """Resets the environment. + + Returns: + observation: numpy array, the initial observation emitted by the + environment. + """ + self.environment.reset() + self.lives = self.environment.ale.lives() + self._fetch_grayscale_observation(self.screen_buffer[0]) + self.screen_buffer[1].fill(0) + return self._pool_and_resize() + + def render(self, mode): + """Renders the current screen, before preprocessing. + + This calls the Gym API's render() method. + + Args: + mode: Mode argument for the environment's render() method. + Valid values (str) are: + 'rgb_array': returns the raw ALE image. + 'human': renders to display via the Gym renderer. + + Returns: + if mode='rgb_array': numpy array, the most recent screen. + if mode='human': bool, whether the rendering was successful. + """ + return self.environment.render(mode) + + def step(self, action): + """Applies the given action in the environment. + + Remarks: + + * If a terminal state (from life loss or episode end) is reached, this may + execute fewer than self.frame_skip steps in the environment. + * Furthermore, in this case the returned observation may not contain valid + image data and should be ignored. + + Args: + action: The action to be executed. + + Returns: + observation: numpy array, the observation following the action. + reward: float, the reward following the action. + is_terminal: bool, whether the environment has reached a terminal state. + This is true when a life is lost and terminal_on_life_loss, or when the + episode is over. + info: Gym API's info data structure. + """ + accumulated_reward = 0. 
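+    # With the default frame_skip of 4, the loop below repeats `action` four
+    # times, sums the four rewards, and fetches grayscale screens only on the
+    # last two repeats so that _pool_and_resize() can max-pool them into a
+    # single resized observation.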
+ + for time_step in range(self.frame_skip): + # We bypass the Gym observation altogether and directly fetch the + # grayscale image from the ALE. This is a little faster. + _, reward, game_over, info = self.environment.step(action) + accumulated_reward += reward + + if self.terminal_on_life_loss: + new_lives = self.environment.ale.lives() + is_terminal = game_over or new_lives < self.lives + self.lives = new_lives + else: + is_terminal = game_over + + if is_terminal: + break + # We max-pool over the last two frames, in grayscale. + elif time_step >= self.frame_skip - 2: + t = time_step - (self.frame_skip - 2) + self._fetch_grayscale_observation(self.screen_buffer[t]) + + # Pool the last two observations. + observation = self._pool_and_resize() + + self.game_over = game_over + return observation, accumulated_reward, is_terminal, info + + def _fetch_grayscale_observation(self, output): + """Returns the current observation in grayscale. + + The returned observation is stored in 'output'. + + Args: + output: numpy array, screen buffer to hold the returned observation. + + Returns: + observation: numpy array, the current observation in grayscale. + """ + self.environment.ale.getScreenGrayscale(output) + return output + + def _pool_and_resize(self): + """Transforms two frames into a Nature DQN observation. + + For efficiency, the transformation is done in-place in self.screen_buffer. + + Returns: + transformed_screen: numpy array, pooled, resized screen. + """ + # Pool if there are enough screens to do so. + if self.frame_skip > 1: + np.maximum(self.screen_buffer[0], self.screen_buffer[1], + out=self.screen_buffer[0]) + + transformed_image = cv2.resize(self.screen_buffer[0], + (self.screen_size, self.screen_size), + interpolation=cv2.INTER_AREA) + int_image = np.asarray(transformed_image, dtype=np.uint8) + return np.expand_dims(int_image, axis=2) diff --git a/dopamine/dopamine/atari/run_experiment.py b/dopamine/dopamine/atari/run_experiment.py new file mode 100644 index 0000000..e82758c --- /dev/null +++ b/dopamine/dopamine/atari/run_experiment.py @@ -0,0 +1,592 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module defining classes and helper methods for running Atari 2600 agents.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import time + + +import atari_py +from dopamine.atari import preprocessing +from dopamine.common import checkpointer +from dopamine.common import iteration_statistics +from dopamine.common import logger +from dopamine.agents.agent_utils import * +import gym +import numpy as np +import tensorflow as tf + +import gin.tf + +RPG_AGENTS = ['dqnrpg', 'rainbowrpg', 'implicit_quantilerpg', 'c51rpg'] + + +def load_gin_configs(gin_files, gin_bindings): + """Loads gin configuration files. + + Args: + gin_files: list, of paths to the gin configuration files for this + experiment. 
+ gin_bindings: list, of gin parameter bindings to override the values in + the config files. + """ + gin.parse_config_files_and_bindings(gin_files, + bindings=gin_bindings, + skip_unknown=False) + + +def create_atari_environment(game_name, sticky_actions=True): + """Wraps an Atari 2600 Gym environment with some basic preprocessing. + + This preprocessing matches the guidelines proposed in Machado et al. (2017), + "Revisiting the Arcade Learning Environment: Evaluation Protocols and Open + Problems for General Agents". + + The created environment is the Gym wrapper around the Arcade Learning + Environment. + + The main choice available to the user is whether to use sticky actions or not. + Sticky actions, as prescribed by Machado et al., cause actions to persist + with some probability (0.25) when a new command is sent to the ALE. This + can be viewed as introducing a mild form of stochasticity in the environment. + We use them by default. + + Args: + game_name: str, the name of the Atari 2600 domain. + sticky_actions: bool, whether to use sticky_actions as per Machado et al. + + Returns: + An Atari 2600 environment with some standard preprocessing. + """ + game_version = 'v0' if sticky_actions else 'v4' + full_game_name = '{}NoFrameskip-{}'.format(game_name, game_version) + env = gym.make(full_game_name) + # Strip out the TimeLimit wrapper from Gym, which caps us at 100k frames. We + # handle this time limit internally instead, which lets us cap at 108k frames + # (30 minutes). The TimeLimit wrapper also plays poorly with saving and + # restoring states. + env = env.env + env = preprocessing.AtariPreprocessing(env) + return env + + +@gin.configurable +class Runner(object): + """Object that handles running Atari 2600 experiments. + + Here we use the term 'experiment' to mean simulating interactions between the + agent and the environment and reporting some statistics pertaining to these + interactions. + + A simple scenario to train a DQN agent is as follows: + + ```python + base_dir = '/tmp/simple_example' + def create_agent(sess, environment): + return dqn_agent.DQNAgent(sess, num_actions=environment.action_space.n) + runner = Runner(base_dir, create_agent, game_name='Pong') + runner.run() + ``` + """ + + def __init__(self, + base_dir, + create_agent_fn, + random_seed, + agent_name, + game_name, + num_iterations, + create_environment_fn=create_atari_environment, + sticky_actions=True, + checkpoint_file_prefix='ckpt', + logging_file_prefix='log', + log_every_n=1, + training_steps=250000, + evaluation_steps=125000, + max_steps_per_episode=27000): + """Initialize the Runner object in charge of running a full experiment. + + Args: + base_dir: str, the base directory to host all required sub-directories. + create_agent_fn: A function that takes as args a Tensorflow session and an + Atari 2600 Gym environment, and returns an agent. + create_environment_fn: A function which receives a game name and creates + an Atari 2600 Gym environment. + game_name: str, name of the Atari 2600 domain to run. + sticky_actions: bool, whether to enable sticky actions in the environment. + checkpoint_file_prefix: str, the prefix to use for checkpoint files. + logging_file_prefix: str, prefix to use for the log files. + log_every_n: int, the frequency for writing logs. + num_iterations: int, the iteration number threshold (must be greater than + start_iteration). + training_steps: int, the number of training steps to perform. + evaluation_steps: int, the number of evaluation steps to perform. 
+ max_steps_per_episode: int, maximum number of steps after which an episode + terminates. + + This constructor will take the following actions: + - Initialize an environment. + - Initialize a `tf.Session`. + - Initialize a logger. + - Initialize an agent. + - Reload from the latest checkpoint, if available, and initialize the + Checkpointer object. + """ + assert base_dir is not None + assert game_name is not None + self._logging_file_prefix = logging_file_prefix + self._log_every_n = log_every_n + self._num_iterations = num_iterations + self._training_steps = training_steps + self._evaluation_steps = evaluation_steps + self._max_steps_per_episode = max_steps_per_episode + self._base_dir = base_dir + self._create_directories() + self._summary_writer = tf.summary.FileWriter(self._base_dir) + self.average_reward_eval = -100 + self.game_name = game_name + self.agent_name = agent_name + + self._environment = create_environment_fn(game_name, sticky_actions) + # Set up a session and initialize variables. + tf.set_random_seed(random_seed) + tfconfig = tf.ConfigProto(allow_soft_placement=True) + tfconfig.gpu_options.allow_growth = True + self._sess = tf.Session('', + config=tfconfig) + # self._sess = tf.Session('', + # config=tf.ConfigProto(allow_soft_placement=True)) + self._agent = create_agent_fn(self._sess, self._environment, + summary_writer=self._summary_writer) + tf.logging.info('Running %s with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t random_seed: %s', random_seed) + tf.logging.info('\t num_iterations: %s', num_iterations) + tf.logging.info('\t training_steps: %s', training_steps) + tf.logging.info('\t sticky_actions: %s', sticky_actions) + tf.logging.info('\t game_name: %s', game_name) + # self._sess = tf.Session('', + # config=tf.ConfigProto(allow_soft_placement=True)) + # self._agent = create_agent_fn(self._sess, self._environment, + # summary_writer=self._summary_writer) + + self._summary_writer.add_graph(graph=tf.get_default_graph()) + self._sess.run(tf.global_variables_initializer()) + + self._initialize_checkpointer_and_maybe_resume(checkpoint_file_prefix) + + # # restore from pretained model a quick fix. + # base_restore_dir = "/mnt/research/linkaixi/AllData/pommerman/dopaminecheckpts" + # agent_name = "c51" + # restore_dir = base_restore_dir + "/{}/{}/1/tf_checkpoints".format(agent_name, game_name) + # self.restore_checkpoints(restore_dir, "tf_ckpt-199") + + def _create_directories(self): + """Create necessary sub-directories.""" + self._checkpoint_dir = os.path.join(self._base_dir, 'checkpoints') + self._logger = logger.Logger(os.path.join(self._base_dir, 'logs')) + + def _initialize_checkpointer_and_maybe_resume(self, checkpoint_file_prefix): + """Reloads the latest checkpoint if it exists. + + This method will first create a `Checkpointer` object and then call + `checkpointer.get_latest_checkpoint_number` to determine if there is a valid + checkpoint in self._checkpoint_dir, and what the largest file number is. + If a valid checkpoint file is found, it will load the bundled data from this + file and will pass it to the agent for it to reload its data. + If the agent is able to successfully unbundle, this method will verify that + the unbundled data contains the keys,'logs' and 'current_iteration'. It will + then load the `Logger`'s data from the bundle, and will return the iteration + number keyed by 'current_iteration' as one of the return values (along with + the `Checkpointer` object). 
+ + Args: + checkpoint_file_prefix: str, the checkpoint file prefix. + + Returns: + start_iteration: int, the iteration number to start the experiment from. + experiment_checkpointer: `Checkpointer` object for the experiment. + """ + # self._checkpoint_dir = base_dir + "/checkpoints" + self._checkpointer = checkpointer.Checkpointer(self._checkpoint_dir, + checkpoint_file_prefix) + self._start_iteration = 0 + # Check if checkpoint exists. Note that the existence of checkpoint 0 means + # that we have finished iteration 0 (so we will start from iteration 1). + latest_checkpoint_version = checkpointer.get_latest_checkpoint_number( + self._checkpoint_dir) + if latest_checkpoint_version >= 0: + experiment_data = self._checkpointer.load_checkpoint( + latest_checkpoint_version) + if self._agent.unbundle( + self._checkpoint_dir, latest_checkpoint_version, experiment_data): + assert 'logs' in experiment_data + assert 'current_iteration' in experiment_data + self._logger.data = experiment_data['logs'] + self._start_iteration = experiment_data['current_iteration'] + 1 + tf.logging.info('Reloaded checkpoint and will start from iteration %d', + self._start_iteration) + + def restore_checkpoints(self, restore_dir, filename): + saver = tf.train.Saver() + saver.restore(self._sess, os.path.join(restore_dir, filename)) + + def _initialize_episode(self): + """Initialization for a new episode. + + Returns: + action: int, the initial action chosen by the agent. + """ + initial_observation = self._environment.reset() + return self._agent.begin_episode(initial_observation) + + def _run_one_step(self, action): + """Executes a single step in the environment. + + Args: + action: int, the action to perform in the environment. + + Returns: + The observation, reward, and is_terminal values returned from the + environment. + """ + observation, reward, is_terminal, _ = self._environment.step(action) + return observation, reward, is_terminal + + def _end_episode(self, reward): + """Finalizes an episode run. + + Args: + reward: float, the last reward from the environment. + """ + self._agent.end_episode(reward) + + def _end_episode_store(self, reward, total_reward, step_number, is_opt): + """Finalizes an episode run and store optimal trajectories. + + Args: + reward: float, the last reward from the environment. + """ + if is_opt: # if it is optimal trajectories, then store it. + self._agent.end_episode_(reward, total_reward, step_number) + else: # else only store for DQN + self._agent.end_episode(reward) + + def _run_one_episode(self): + """Executes a full trajectory of the agent interacting with the environment. + + Returns: + The number of steps taken and the total reward. + """ + step_number = 0 + total_reward = 0. + + action = self._initialize_episode() + is_terminal = False + + # Keep interacting until we reach a terminal state. + while True: + observation, reward, is_terminal = self._run_one_step(action) + + total_reward += reward + step_number += 1 + + # Perform reward clipping. + # reward = np.clip(reward, -1, 1) todo + + if (self._environment.game_over or + step_number == self._max_steps_per_episode): + # Stop the run loop once we reach the true end of episode. + break + elif is_terminal: + # If we lose a life but the episode is not over, signal an artificial + # end of episode to the agent. 
+ self._agent.end_episode(reward) + action = self._agent.begin_episode(observation) + else: + action = self._agent.step(reward, observation) + if self.agent_name in RPG_AGENTS: + is_opt = False + if total_reward >= episodic_return[self.game_name]: + is_opt = True + self._end_episode_store(reward, total_reward, step_number, is_opt) + else: + self._end_episode(reward) + + return step_number, total_reward + + def _run_one_phase(self, min_steps, statistics, run_mode_str): + """Runs the agent/environment loop until a desired number of steps. + + We follow the Machado et al., 2017 convention of running full episodes, + and terminating once we've run a minimum number of steps. + + Args: + min_steps: int, minimum number of steps to generate in this phase. + statistics: `IterationStatistics` object which records the experimental + results. + run_mode_str: str, describes the run mode for this agent. + + Returns: + Tuple containing the number of steps taken in this phase (int), the sum of + returns (float), and the number of episodes performed (int). + """ + step_count = 0 + num_episodes = 0 + sum_returns = 0. + num_good_trajs = 0 + good_traj_label = 0 + while step_count < min_steps: + episode_length, episode_return = self._run_one_episode() + + good_traj_label = 0 + if episode_return >= episodic_return[self.game_name]: + good_traj_label = 1 + num_good_trajs += 1 + statistics.append({ + '{}_episode_lengths'.format(run_mode_str): episode_length, + '{}_episode_returns'.format(run_mode_str): episode_return, + '{}_episode_goodtraj'.format(run_mode_str): good_traj_label + }) + step_count += episode_length + sum_returns += episode_return + num_episodes += 1 + # We use sys.stdout.write instead of tf.logging so as to flush frequently + # without generating a line break. + + if self.agent_name in ['rpg', 'repg']: + sys.stdout.write('epsilon: {} '.format(self._agent.epsilon_current) + + 'replaysize {}\r'.format(self._agent.current_replay_size)) + elif self.agent_name in RPG_AGENTS: + sys.stdout.write('Opt replay size: {} '.format(self._agent._replay_opt.memory.add_count)) + + sys.stdout.write('Steps executed: {} '.format(step_count) + + 'Episode length: {} '.format(episode_length) + + 'Return: {}'.format(episode_return) + + 'Good traj?: {}\r'.format(good_traj_label)) + sys.stdout.flush() + return step_count, sum_returns, num_episodes, num_good_trajs + + def _run_train_phase(self, statistics, eval_mode=False): + """Run training phase. + + Args: + statistics: `IterationStatistics` object which records the experimental + results. Note - This object is modified by this method. + + Returns: + num_episodes: int, The number of episodes run in this phase. + average_reward: The average reward generated in this phase. + """ + # Perform the training phase, during which the agent learns. 
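+    # Note that eval_mode may be True here: _run_one_iteration() freezes
+    # learning (while still acting) once the evaluation return reaches the
+    # game-specific threshold in episodic_return_switch.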
+ self._agent.eval_mode = eval_mode + start_time = time.time() + number_steps, sum_returns, num_episodes, num_good_trajs = self._run_one_phase( + self._training_steps, statistics, 'train') + average_return = sum_returns / num_episodes if num_episodes > 0 else 0.0 + statistics.append({'train_average_return': average_return}) + average_good_trajs = num_good_trajs / num_episodes if num_episodes > 0 else 0.0 + statistics.append({'train_average_goodtraj': average_good_trajs}) + time_delta = time.time() - start_time + tf.logging.info('Average undiscounted return per training episode: %.2f', + average_return) + tf.logging.info('Average training steps per second: %.2f', + number_steps / time_delta) + return num_episodes, average_return + + def _run_eval_phase(self, statistics): + """Run evaluation phase. + + Args: + statistics: `IterationStatistics` object which records the experimental + results. Note - This object is modified by this method. + + Returns: + num_episodes: int, The number of episodes run in this phase. + average_reward: float, The average reward generated in this phase. + """ + # Perform the evaluation phase -- no learning. + self._agent.eval_mode = True + _, sum_returns, num_episodes, _ = self._run_one_phase( + self._evaluation_steps, statistics, 'eval') + average_return = sum_returns / num_episodes if num_episodes > 0 else 0.0 + tf.logging.info('Average undiscounted return per evaluation episode: %.2f', + average_return) + statistics.append({'eval_average_return': average_return}) + return num_episodes, average_return + + def _run_one_iteration(self, iteration): + """Runs one iteration of agent/environment interaction. + + An iteration involves running several episodes until a certain number of + steps are obtained. The interleaving of train/eval phases implemented here + are to match the implementation of (Mnih et al., 2015). + + Args: + iteration: int, current iteration number, used as a global_step for saving + Tensorboard summaries. + + Returns: + A dict containing summary statistics for this iteration. + """ + statistics = iteration_statistics.IterationStatistics() + tf.logging.info('Starting iteration %d', iteration) + + train_eval_mode = False + # if self.game_name == "Pong": + if self.average_reward_eval >= episodic_return_switch[self.game_name]: + train_eval_mode = True + print("Stop training at iteration {}".format(iteration)) + + num_episodes_train, average_reward_train = self._run_train_phase( + statistics, train_eval_mode) + # else: + # # don't train, only for evaluation. + # num_episodes_train, average_reward_train = 0, 0 + + if self.agent_name in RPG_AGENTS and self._agent._replay_opt.memory.add_count == 0: + num_episodes_eval, average_reward_eval = -10000, -10000 + # if we didn't train rpg, don't waste time evaluate it. + else: + num_episodes_eval, average_reward_eval = self._run_eval_phase( + statistics) + self.average_reward_eval = average_reward_eval + self._save_tensorboard_summaries(iteration, num_episodes_train, + average_reward_train, num_episodes_eval, + average_reward_eval) + return statistics.data_lists + + def _save_tensorboard_summaries(self, iteration, + num_episodes_train, + average_reward_train, + num_episodes_eval, + average_reward_eval): + """Save statistics as tensorboard summaries. + + Args: + iteration: int, The current iteration number. + num_episodes_train: int, number of training episodes run. + average_reward_train: float, The average training reward. + num_episodes_eval: int, number of evaluation episodes run. 
+ average_reward_eval: float, The average evaluation reward. + """ + summary = tf.Summary(value=[ + tf.Summary.Value(tag='Train/NumEpisodes', + simple_value=num_episodes_train), + tf.Summary.Value(tag='Train/AverageReturns', + simple_value=average_reward_train), + tf.Summary.Value(tag='Eval/NumEpisodes', + simple_value=num_episodes_eval), + tf.Summary.Value(tag='Eval/AverageReturns', + simple_value=average_reward_eval) + ]) + self._summary_writer.add_summary(summary, iteration) + + def _log_experiment(self, iteration, statistics): + """Records the results of the current iteration. + + Args: + iteration: int, iteration number. + statistics: `IterationStatistics` object containing statistics to log. + """ + self._logger['iteration_{:d}'.format(iteration)] = statistics + if iteration % self._log_every_n == 0: + self._logger.log_to_file(self._logging_file_prefix, iteration) + + def _checkpoint_experiment(self, iteration): + """Checkpoint experiment data. + + Args: + iteration: int, iteration number for checkpointing. + """ + experiment_data = self._agent.bundle_and_checkpoint(self._checkpoint_dir, + iteration) + if experiment_data: + experiment_data['current_iteration'] = iteration + experiment_data['logs'] = self._logger.data + self._checkpointer.save_checkpoint(iteration, experiment_data) + + def run_experiment(self): + """Runs a full experiment, spread over multiple iterations.""" + tf.logging.info('Beginning training...') + if self._num_iterations <= self._start_iteration: + tf.logging.warning('num_iterations (%d) < start_iteration(%d)', + self._num_iterations, self._start_iteration) + return + + for iteration in range(self._start_iteration, self._num_iterations): + statistics = self._run_one_iteration(iteration) + self._log_experiment(iteration, statistics) + self._checkpoint_experiment(iteration) + + +@gin.configurable +class TrainRunner(Runner): + """Object that handles running Atari 2600 experiments. + + The `TrainRunner` differs from the base `Runner` class in that it does not + the evaluation phase. Checkpointing and logging for the train phase are + preserved as before. + """ + + def __init__(self, base_dir, create_agent_fn): + """Initialize the TrainRunner object in charge of running a full experiment. + + Args: + base_dir: str, the base directory to host all required sub-directories. + create_agent_fn: A function that takes as args a Tensorflow session and an + Atari 2600 Gym environment, and returns an agent. + """ + tf.logging.info('Creating TrainRunner ...') + super(TrainRunner, self).__init__( + base_dir=base_dir, create_agent_fn=create_agent_fn) + self._agent.eval_mode = False + + def _run_one_iteration(self, iteration): + """Runs one iteration of agent/environment interaction. + + An iteration involves running several episodes until a certain number of + steps are obtained. This method differs from the `_run_one_iteration` method + in the base `Runner` class in that it only runs the train phase. + + Args: + iteration: int, current iteration number, used as a global_step for saving + Tensorboard summaries. + + Returns: + A dict containing summary statistics for this iteration. 
+ """ + statistics = iteration_statistics.IterationStatistics() + num_episodes_train, average_reward_train = self._run_train_phase( + statistics) + + self._save_tensorboard_summaries(iteration, num_episodes_train, + average_reward_train) + return statistics.data_lists + + def _save_tensorboard_summaries(self, iteration, num_episodes, + average_reward): + """Save statistics as tensorboard summaries.""" + summary = tf.Summary(value=[ + tf.Summary.Value(tag='Train/NumEpisodes', simple_value=num_episodes), + tf.Summary.Value( + tag='Train/AverageReturns', simple_value=average_reward), + ]) + self._summary_writer.add_summary(summary, iteration) diff --git a/dopamine/dopamine/atari/train.py b/dopamine/dopamine/atari/train.py new file mode 100644 index 0000000..970c8b0 --- /dev/null +++ b/dopamine/dopamine/atari/train.py @@ -0,0 +1,186 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +r"""The entry point for running an agent on an Atari 2600 domain. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + + +from absl import app +from absl import flags +from dopamine.agents.dqn import dqn_agent +from dopamine.agents.implicit_quantile import implicit_quantile_agent +from dopamine.agents.rainbow import rainbow_agent +from dopamine.atari import run_experiment +from dopamine.agents.rpg import rpg_agent +from dopamine.agents.epg import epg_agent +from dopamine.agents.lpg import lpg_agent +from dopamine.agents.repg import repg_agent +from dopamine.agents.dqnrpg import dqnrpg_agent +from dopamine.agents.rainbowrpg import rainbowrpg_agent +from dopamine.agents.implicit_quantilerpg import implicit_quantilerpg_agent +import tensorflow as tf +import time + +current_time = time.strftime("%Y%m%d_%H%M") +agentname = "implicit_quantilerpg" #"rainbowrpg" #"dqnrpg" +flags.DEFINE_bool('debug_mode', False, + 'If set to true, the agent will output in-episode statistics ' + 'to Tensorboard. Disabled by default as this results in ' + 'slower training.') +flags.DEFINE_string('agent_name', agentname, + 'Name of the agent. Must be one of ' + '(dqn, rainbow, implicit_quantile)') +flags.DEFINE_string('game_name', "Pong", + 'Name of game playing ' + 'Pong, Boxing, Bowling, etc.') +flags.DEFINE_integer('num_iterations', 35, + 'Number of training iterations') + +flags.DEFINE_string('base_dir', "/mnt/research/linkaixi/AllData/pommerman/tmp", #"/mnt/research/linkaixi/AllData/pommerman/{}".format(current_time), + 'Base directory to host all required sub-directories.') +flags.DEFINE_integer('random_seed', 1, + 'graph level random seed') +flags.DEFINE_multi_string( + 'gin_files', "../agents/{}/configs/{}.gin".format(agentname, agentname), 'List of paths to gin configuration files (e.g.' + '"../agents/dqn/configs/dqn.gin").') +flags.DEFINE_multi_string( + 'gin_bindings', [], + 'Gin bindings to override the values set in the config files ' + '(e.g. 
"DQNAgent.epsilon_train=0.1",' + ' "create_environment.game_name="Pong"").') +flags.DEFINE_string( + 'schedule', 'continuous_train_and_eval', + 'The schedule with which to run the experiment and choose an appropriate ' + 'Runner. Supported choices are ' + '{continuous_train, continuous_train_and_eval}.') + +FLAGS = flags.FLAGS + + + +def create_agent(sess, environment, summary_writer=None): + """Creates a DQN agent. + + Args: + sess: A `tf.Session` object for running associated ops. + environment: An Atari 2600 Gym environment. + summary_writer: A Tensorflow summary writer to pass to the agent + for in-agent training statistics in Tensorboard. + + Returns: + agent: An RL agent. + + Raises: + ValueError: If `agent_name` is not in supported list. + """ + if not FLAGS.debug_mode: + summary_writer = None + if FLAGS.agent_name == 'dqn': + return dqn_agent.DQNAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'rainbow': + return rainbow_agent.RainbowAgent( + sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'implicit_quantile': + return implicit_quantile_agent.ImplicitQuantileAgent( + sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'rpg': + return rpg_agent.RPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'epg': + return epg_agent.EPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'lpg': + return lpg_agent.LPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'repg': + return repg_agent.REPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'dqnrpg': + return dqnrpg_agent.DQNRPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'rainbowrpg': + return rainbowrpg_agent.RainbowRPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'implicit_quantilerpg': + return implicit_quantilerpg_agent.ImplicitQuantileRPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + else: + raise ValueError('Unknown agent: {}'.format(FLAGS.agent_name)) + + +def create_runner(base_dir, create_agent_fn, random_seed, agent_name, game_name, num_iterations): + """Creates an experiment Runner. + + Args: + base_dir: str, base directory for hosting all subdirectories. + create_agent_fn: A function that takes as args a Tensorflow session and an + Atari 2600 Gym environment, and returns an agent. + + Returns: + runner: A `run_experiment.Runner` like object. + + Raises: + ValueError: When an unknown schedule is encountered. + """ + assert base_dir is not None + # Continuously runs training and evaluation until max num_iterations is hit. + if FLAGS.schedule == 'continuous_train_and_eval': + return run_experiment.Runner(base_dir, create_agent_fn, random_seed, + agent_name, game_name, num_iterations) + # Continuously runs training until max num_iterations is hit. + elif FLAGS.schedule == 'continuous_train': + return run_experiment.TrainRunner(base_dir, create_agent_fn) + else: + raise ValueError('Unknown schedule: {}'.format(FLAGS.schedule)) + + +def launch_experiment(create_runner_fn, create_agent_fn): + """Launches the experiment. 
+ + Args: + create_runner_fn: A function that takes as args a base directory and a + function for creating an agent and returns a `Runner`-like object. + create_agent_fn: A function that takes as args a Tensorflow session and an + Atari 2600 Gym environment, and returns an agent. + """ + run_experiment.load_gin_configs(FLAGS.gin_files, FLAGS.gin_bindings) + runner = create_runner_fn(FLAGS.base_dir, create_agent_fn, + FLAGS.random_seed, FLAGS.agent_name, + FLAGS.game_name, FLAGS.num_iterations) + runner.run_experiment() + + +def main(unused_argv): + """Main method. + + Args: + unused_argv: Arguments (unused). + """ + tf.logging.set_verbosity(tf.logging.INFO) + launch_experiment(create_runner, create_agent) + + +if __name__ == '__main__': + # flags.mark_flag_as_required('agent_name') + # flags.mark_flag_as_required('base_dir') + app.run(main) diff --git a/dopamine/dopamine/common/__init__.py b/dopamine/dopamine/common/__init__.py new file mode 100644 index 0000000..f9bcb7c --- /dev/null +++ b/dopamine/dopamine/common/__init__.py @@ -0,0 +1,14 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/dopamine/dopamine/common/checkpointer.py b/dopamine/dopamine/common/checkpointer.py new file mode 100644 index 0000000..08a478a --- /dev/null +++ b/dopamine/dopamine/common/checkpointer.py @@ -0,0 +1,177 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A checkpointing mechanism for Dopamine agents. + +This Checkpointer expects a base directory where checkpoints for different +iterations are stored. Specifically, Checkpointer.save_checkpoint() takes in +as input a dictionary 'data' to be pickled to disk. At each iteration, we +write a file called 'cpkt.#', where # is the iteration number. The +Checkpointer also cleans up old files, maintaining up to the CHECKPOINT_DURATION +most recent iterations. + +The Checkpointer writes a sentinel file to indicate that checkpointing was +globally successful. This means that all other checkpointing activities +(saving the Tensorflow graph, the replay buffer) should be performed *prior* +to calling Checkpointer.save_checkpoint(). This allows the Checkpointer to +detect incomplete checkpoints. 
+ +#### Example + +After running 10 iterations (numbered 0...9) with base_directory='/checkpoint', +the following files will exist: +``` + /checkpoint/cpkt.6 + /checkpoint/cpkt.7 + /checkpoint/cpkt.8 + /checkpoint/cpkt.9 + /checkpoint/sentinel_checkpoint_complete.6 + /checkpoint/sentinel_checkpoint_complete.7 + /checkpoint/sentinel_checkpoint_complete.8 + /checkpoint/sentinel_checkpoint_complete.9 +``` +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pickle +import tensorflow as tf + +CHECKPOINT_DURATION = 4 + + +def get_latest_checkpoint_number(base_directory): + """Returns the version number of the latest completed checkpoint. + + Args: + base_directory: str, directory in which to look for checkpoint files. + + Returns: + int, the iteration number of the latest checkpoint, or -1 if none was found. + """ + glob = os.path.join(base_directory, 'sentinel_checkpoint_complete.*') + def extract_iteration(x): + return int(x[x.rfind('.') + 1:]) + try: + checkpoint_files = tf.gfile.Glob(glob) + except tf.errors.NotFoundError: + return -1 + try: + latest_iteration = max(extract_iteration(x) for x in checkpoint_files) + return latest_iteration + except ValueError: + return -1 + + +class Checkpointer(object): + """Class for managing checkpoints for Dopamine agents. + """ + + def __init__(self, base_directory, checkpoint_file_prefix='ckpt', + checkpoint_frequency=1): + """Initializes Checkpointer. + + Args: + base_directory: str, directory where all checkpoints are saved/loaded. + checkpoint_file_prefix: str, prefix to use for naming checkpoint files. + checkpoint_frequency: int, the frequency at which to checkpoint. + + Raises: + ValueError: if base_directory is empty, or not creatable. + """ + if not base_directory: + raise ValueError('No path provided to Checkpointer.') + self._checkpoint_file_prefix = checkpoint_file_prefix + self._checkpoint_frequency = checkpoint_frequency + self._base_directory = base_directory + try: + tf.gfile.MakeDirs(base_directory) + except tf.errors.PermissionDeniedError: + # We catch the PermissionDeniedError and issue a more useful exception. + raise ValueError('Unable to create checkpoint path: {}.'.format( + base_directory)) + + def _generate_filename(self, file_prefix, iteration_number): + """Returns a checkpoint filename from prefix and iteration number.""" + filename = '{}.{}'.format(file_prefix, iteration_number) + return os.path.join(self._base_directory, filename) + + def _save_data_to_file(self, data, filename): + """Saves the given 'data' object to a file.""" + with tf.gfile.GFile(filename, 'w') as fout: + pickle.dump(data, fout) + + def save_checkpoint(self, iteration_number, data): + """Saves a new checkpoint at the current iteration_number. + + Args: + iteration_number: int, the current iteration number for this checkpoint. + data: Any (picklable) python object containing the data to store in the + checkpoint. 
+ """ + if iteration_number % self._checkpoint_frequency != 0: + return + + filename = self._generate_filename(self._checkpoint_file_prefix, + iteration_number) + self._save_data_to_file(data, filename) + filename = self._generate_filename('sentinel_checkpoint_complete', + iteration_number) + with tf.gfile.GFile(filename, 'wb') as fout: + fout.write('done') + + self._clean_up_old_checkpoints(iteration_number) + + def _clean_up_old_checkpoints(self, iteration_number): + """Removes sufficiently old checkpoints.""" + # After writing a the checkpoint and sentinel file, we garbage collect files + # that are CHECKPOINT_DURATION * self._checkpoint_frequency versions old. + stale_iteration_number = iteration_number - (self._checkpoint_frequency * + CHECKPOINT_DURATION) + + if stale_iteration_number >= 0: + stale_file = self._generate_filename(self._checkpoint_file_prefix, + stale_iteration_number) + stale_sentinel = self._generate_filename('sentinel_checkpoint_complete', + stale_iteration_number) + try: + tf.gfile.Remove(stale_file) + tf.gfile.Remove(stale_sentinel) + except tf.errors.NotFoundError: + # Ignore if file not found. + tf.logging.info('Unable to remove {} or {}.'.format(stale_file, + stale_sentinel)) + + def _load_data_from_file(self, filename): + if not tf.gfile.Exists(filename): + return None + with tf.gfile.GFile(filename, 'rb') as fin: + return pickle.load(fin) + + def load_checkpoint(self, iteration_number): + """Tries to reload a checkpoint at the selected iteration number. + + Args: + iteration_number: The checkpoint iteration number to try to load. + + Returns: + If the checkpoint files exist, two unpickled objects that were passed in + as data to save_checkpoint; returns None if the files do not exist. + """ + checkpoint_file = self._generate_filename(self._checkpoint_file_prefix, + iteration_number) + return self._load_data_from_file(checkpoint_file) diff --git a/dopamine/dopamine/common/iteration_statistics.py b/dopamine/dopamine/common/iteration_statistics.py new file mode 100644 index 0000000..f47c575 --- /dev/null +++ b/dopamine/dopamine/common/iteration_statistics.py @@ -0,0 +1,49 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A class for storing iteration-specific metrics. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +class IterationStatistics(object): + """A class for storing iteration-specific metrics. + + The internal format is as follows: we maintain a mapping from keys to lists. + Each list contains all the values corresponding to the given key. + + For example, self.data_lists['train_episode_returns'] might contain the + per-episode returns achieved during this iteration. + + Attributes: + data_lists: dict mapping each metric_name (str) to a list of said metric + across episodes. 
+ """ + + def __init__(self): + self.data_lists = {} + + def append(self, data_pairs): + """Add the given values to their corresponding key-indexed lists. + + Args: + data_pairs: A dictionary of key-value pairs to be recorded. + """ + for key, value in data_pairs.items(): + if key not in self.data_lists: + self.data_lists[key] = [] + self.data_lists[key].append(value) diff --git a/dopamine/dopamine/common/logger.py b/dopamine/dopamine/common/logger.py new file mode 100644 index 0000000..8e1b51b --- /dev/null +++ b/dopamine/dopamine/common/logger.py @@ -0,0 +1,105 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A lightweight logging mechanism for dopamine agents.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pickle +import tensorflow as tf + + +CHECKPOINT_DURATION = 4 + + +class Logger(object): + """Class for maintaining a dictionary of data to log.""" + + def __init__(self, logging_dir): + """Initializes Logger. + + Args: + logging_dir: str, Directory to which logs are written. + """ + # Dict used by logger to store data. + self.data = {} + self._logging_enabled = True + + if not logging_dir: + tf.logging.info('Logging directory not specified, will not log.') + self._logging_enabled = False + return + # Try to create logging directory. + try: + tf.gfile.MakeDirs(logging_dir) + except tf.errors.PermissionDeniedError: + # If it already exists, ignore exception. + pass + if not tf.gfile.Exists(logging_dir): + tf.logging.warning( + 'Could not create directory %s, logging will be disabled.', + logging_dir) + self._logging_enabled = False + return + self._logging_dir = logging_dir + + def __setitem__(self, key, value): + """This method will set an entry at key with value in the dictionary. + + It will effectively overwrite any previous data at the same key. + + Args: + key: str, indicating key where to write the entry. + value: A python object to store. + """ + if self._logging_enabled: + self.data[key] = value + + def _generate_filename(self, filename_prefix, iteration_number): + filename = '{}_{}'.format(filename_prefix, iteration_number) + return os.path.join(self._logging_dir, filename) + + def log_to_file(self, filename_prefix, iteration_number): + """Save the pickled dictionary to a file. + + Args: + filename_prefix: str, name of the file to use (without iteration + number). + iteration_number: int, the iteration number, appended to the end of + filename_prefix. + """ + if not self._logging_enabled: + tf.logging.warning('Logging is disabled.') + return + log_file = self._generate_filename(filename_prefix, iteration_number) + with tf.gfile.GFile(log_file, 'w') as fout: + pickle.dump(self.data, fout, protocol=pickle.HIGHEST_PROTOCOL) + # After writing a checkpoint file, we garbage collect the log file + # that is CHECKPOINT_DURATION versions old. 
+ stale_iteration_number = iteration_number - CHECKPOINT_DURATION + if stale_iteration_number >= 0: + stale_file = self._generate_filename(filename_prefix, + stale_iteration_number) + try: + tf.gfile.Remove(stale_file) + except tf.errors.NotFoundError: + # Ignore if file not found. + pass + + def is_logging_enabled(self): + """Return if logging is enabled.""" + return self._logging_enabled diff --git a/dopamine/dopamine/replay_memory/__init__.py b/dopamine/dopamine/replay_memory/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/replay_memory/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/replay_memory/circular_replay_buffer.py b/dopamine/dopamine/replay_memory/circular_replay_buffer.py new file mode 100644 index 0000000..e10c03b --- /dev/null +++ b/dopamine/dopamine/replay_memory/circular_replay_buffer.py @@ -0,0 +1,835 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The standard DQN replay memory. + +This implementation is an out-of-graph replay memory + in-graph wrapper. It +supports vanilla n-step updates of the form typically found in the literature, +i.e. where rewards are accumulated for n steps and the intermediate trajectory +is not exposed to the agent. This does not allow, for example, performing +off-policy corrections. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import gzip +import math +import os +import pickle + +import numpy as np +import tensorflow as tf + +import gin.tf + +# Defines a type describing part of the tuple returned by the replay +# memory. Each element of the tuple is a tensor of shape [batch, ...] where +# ... is defined the 'shape' field of ReplayElement. The tensor type is +# given by the 'type' field. The 'name' field is for convenience and ease of +# debugging. +ReplayElement = ( + collections.namedtuple('shape_type', ['name', 'shape', 'type'])) + +# A prefix that can not collide with variable names for checkpoint files. +STORE_FILENAME_PREFIX = '$store$_' + +# This constant determines how many iterations a checkpoint is kept for. +CHECKPOINT_DURATION = 4 +MAX_SAMPLE_ATTEMPTS = 1000 + + +def invalid_range(cursor, replay_capacity, stack_size, update_horizon): + """Returns a array with the indices of cursor-related invalid transitions. 
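The Logger pairs with IterationStatistics in Runner._log_experiment: the statistics for iteration N are stored under an 'iteration_N' key and pickled to disk every _log_every_n iterations. A minimal standalone sketch; the directory and the 'log' file prefix are arbitrary choices for illustration.
```
# Illustrative Logger usage; '/tmp/log_demo' is an arbitrary directory.
from dopamine.common import logger

experiment_logger = logger.Logger('/tmp/log_demo')
experiment_logger['iteration_0'] = {'train_episode_returns': [1.0, -2.0, 0.5]}
if experiment_logger.is_logging_enabled():
  experiment_logger.log_to_file('log', 0)  # writes /tmp/log_demo/log_0
```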
+ + There are update_horizon + stack_size invalid indices: + - The update_horizon indices before the cursor, because we do not have a + valid N-step transition (including the next state). + - The stack_size indices on or immediately after the cursor. + If N = update_horizon, K = stack_size, and the cursor is at c, invalid + indices are: + c - N, c - N + 1, ..., c, c + 1, ..., c + K - 1. + + It handles special cases in a circular buffer in the beginning and the end. + + Args: + cursor: int, the position of the cursor. + replay_capacity: int, the size of the replay memory. + stack_size: int, the size of the stacks returned by the replay memory. + update_horizon: int, the agent's update horizon. + Returns: + np.array of size stack_size with the invalid indices. + """ + assert cursor < replay_capacity + return np.array( + [(cursor - update_horizon + i) % replay_capacity + for i in range(stack_size + update_horizon)]) + + +class OutOfGraphReplayBuffer(object): + """A simple out-of-graph Replay Buffer. + + Stores transitions, state, action, reward, next_state, terminal (and any + extra contents specified) in a circular buffer and provides a uniform + transition sampling function. + + When the states consist of stacks of observations storing the states is + inefficient. This class writes observations and constructs the stacked states + at sample time. + + Attributes: + add_count: int, counter of how many transitions have been added (including + the blank ones at the beginning of an episode). + """ + + def __init__(self, + observation_shape, + stack_size, + replay_capacity, + batch_size, + update_horizon=1, + gamma=0.99, + max_sample_attempts=MAX_SAMPLE_ATTEMPTS, + extra_storage_types=None, + observation_dtype=np.uint8): + """Initializes OutOfGraphReplayBuffer. + + Args: + observation_shape: tuple of ints. + stack_size: int, number of frames to use in state stack. + replay_capacity: int, number of transitions to keep in memory. + batch_size: int. + update_horizon: int, length of update ('n' in n-step update). + gamma: int, the discount factor. + max_sample_attempts: int, the maximum number of attempts allowed to + get a sample. + extra_storage_types: list of ReplayElements defining the type of the extra + contents that will be stored and returned by sample_transition_batch. + observation_dtype: np.dtype, type of the observations. Defaults to + np.uint8 for Atari 2600. + + Raises: + ValueError: If replay_capacity is too small to hold at least one + transition. 
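A small worked example makes the index arithmetic in invalid_range concrete: with update_horizon N and stack_size K there are N + K invalid indices, wrapping around the circular buffer when needed.
```
# Worked example of invalid_range() for a tiny buffer.
from dopamine.replay_memory.circular_replay_buffer import invalid_range

# stack_size + update_horizon = 7 invalid indices around a cursor at 1.
print(invalid_range(cursor=1, replay_capacity=10, stack_size=4, update_horizon=3))
# -> [8 9 0 1 2 3 4]: the 3 indices before the cursor (wrapping around) plus
#    the 4 indices starting at the cursor.
```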
+ """ + assert isinstance(observation_shape, tuple) + if replay_capacity < update_horizon + stack_size: + raise ValueError('There is not enough capacity to cover ' + 'update_horizon and stack_size.') + + tf.logging.info( + 'Creating a %s replay memory with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t observation_shape: %s', str(observation_shape)) + tf.logging.info('\t observation_dtype: %s', str(observation_dtype)) + tf.logging.info('\t stack_size: %d', stack_size) + tf.logging.info('\t replay_capacity: %d', replay_capacity) + tf.logging.info('\t batch_size: %d', batch_size) + tf.logging.info('\t update_horizon: %d', update_horizon) + tf.logging.info('\t gamma: %f', gamma) + + self._observation_shape = observation_shape + self._stack_size = stack_size + self._state_shape = self._observation_shape + (self._stack_size,) + self._replay_capacity = replay_capacity + self._batch_size = batch_size + self._update_horizon = update_horizon + self._gamma = gamma + self._observation_dtype = observation_dtype + self._max_sample_attempts = max_sample_attempts + if extra_storage_types: + self._extra_storage_types = extra_storage_types + else: + self._extra_storage_types = [] + self._create_storage() + self.add_count = np.array(0) + self.invalid_range = np.zeros((self._stack_size)) + # When the horizon is > 1, we compute the sum of discounted rewards as a dot + # product using the precomputed vector . + self._cumulative_discount_vector = np.array( + [math.pow(self._gamma, n) for n in range(update_horizon)], + dtype=np.float32) + + def _create_storage(self): + """Creates the numpy arrays used to store transitions. + """ + self._store = {} + for storage_element in self.get_storage_signature(): + array_shape = [self._replay_capacity] + list(storage_element.shape) + self._store[storage_element.name] = np.empty( + array_shape, dtype=storage_element.type) + + def get_add_args_signature(self): + """The signature of the add function. + + Note - Derived classes may return a different signature. + + Returns: + list of ReplayElements defining the type of the argument signature needed + by the add function. + """ + return self.get_storage_signature() + + def get_storage_signature(self): + """Returns a default list of elements to be stored in this replay memory. + + Note - Derived classes may return a different signature. + + Returns: + list of ReplayElements defining the type of the contents stored. + """ + storage_elements = [ + ReplayElement('observation', self._observation_shape, + self._observation_dtype), + ReplayElement('action', (), np.int32), + ReplayElement('reward', (), np.float32), + ReplayElement('terminal', (), np.uint8) + ] + + for extra_replay_element in self._extra_storage_types: + storage_elements.append(extra_replay_element) + return storage_elements + + def _add_zero_transition(self): + """Adds a padding transition filled with zeros (Used in episode beginnings). + """ + zero_transition = [] + for element_type in self.get_add_args_signature(): + zero_transition.append( + np.zeros(element_type.shape, dtype=element_type.type)) + self._add(*zero_transition) + + def add(self, observation, action, reward, terminal, *args): + """Adds a transition to the replay memory. + + This function checks the types and handles the padding at the beginning of + an episode. Then it calls the _add function. + + Since the next_observation in the transition will be the observation added + next there is no need to pass it. 
+ + If the replay memory is at capacity the oldest transition will be discarded. + + Args: + observation: np.array with shape observation_shape. + action: int, the action in the transition. + reward: float, the reward received in the transition. + terminal: A uint8 acting as a boolean indicating whether the transition + was terminal (1) or not (0). + *args: extra contents with shapes and dtypes according to + extra_storage_types. + """ + self._check_add_types(observation, action, reward, terminal, *args) + if self.is_empty() or self._store['terminal'][self.cursor() - 1] == 1: + for _ in range(self._stack_size - 1): + # Child classes can rely on the padding transitions being filled with + # zeros. This is useful when there is a priority argument. + self._add_zero_transition() + self._add(observation, action, reward, terminal, *args) + + def _add(self, *args): + """Internal add method to add to the storage arrays. + + Args: + *args: All the elements in a transition. + """ + cursor = self.cursor() + + arg_names = [e.name for e in self.get_add_args_signature()] + for arg_name, arg in zip(arg_names, args): + self._store[arg_name][cursor] = arg + + self.add_count += 1 + self.invalid_range = invalid_range( + self.cursor(), self._replay_capacity, self._stack_size, + self._update_horizon) + + def _check_add_types(self, *args): + """Checks if args passed to the add method match those of the storage. + + Args: + *args: Args whose types need to be validated. + + Raises: + ValueError: If args have wrong shape or dtype. + """ + if len(args) != len(self.get_add_args_signature()): + raise ValueError('Add expects {} elements, received {}'.format( + len(self.get_add_args_signature()), len(args))) + for arg_element, store_element in zip(args, self.get_add_args_signature()): + if isinstance(arg_element, np.ndarray): + arg_shape = arg_element.shape + elif isinstance(arg_element, tuple) or isinstance(arg_element, list): + # TODO(b/80536437). This is not efficient when arg_element is a list. + arg_shape = np.array(arg_element).shape + else: + # Assume it is scalar. + arg_shape = tuple() + store_element_shape = tuple(store_element.shape) + if arg_shape != store_element_shape: + raise ValueError('arg has shape {}, expected {}'.format( + arg_shape, store_element_shape)) + + def is_empty(self): + """Is the Replay Buffer empty?""" + return self.add_count == 0 + + def is_full(self): + """Is the Replay Buffer full?""" + return self.add_count >= self._replay_capacity + + def cursor(self): + """Index to the location where the next transition will be written.""" + return self.add_count % self._replay_capacity + + def get_range(self, array, start_index, end_index): + """Returns the range of array at the index handling wraparound if necessary. + + Args: + array: np.array, the array to get the stack from. + start_index: int, index to the start of the range to be returned. Range + will wraparound if start_index is smaller than 0. + end_index: int, exclusive end index. Range will wraparound if end_index + exceeds replay_capacity. + + Returns: + np.array, with shape [end_index - start_index, array.shape[1:]]. + """ + assert end_index > start_index, 'end_index must be larger than start_index' + assert end_index >= 0 + assert start_index < self._replay_capacity + if not self.is_full(): + assert end_index <= self.cursor(), ( + 'Index {} has not been added.'.format(start_index)) + + # Fast slice read when there is no wraparound. 
+ if start_index % self._replay_capacity < end_index % self._replay_capacity: + return_array = array[start_index:end_index, ...] + # Slow list read. + else: + indices = [(start_index + i) % self._replay_capacity + for i in range(end_index - start_index)] + return_array = array[indices, ...] + return return_array + + def get_observation_stack(self, index): + state = self.get_range(self._store['observation'], + index - self._stack_size + 1, index + 1) + # The stacking axis is 0 but the agent expects as the last axis. + return np.moveaxis(state, 0, -1) + + def get_terminal_stack(self, index): + return self.get_range(self._store['terminal'], index - self._stack_size + 1, + index + 1) + + def is_valid_transition(self, index): + """Checks if the index contains a valid transition. + + Checks for collisions with the end of episodes and the current position + of the cursor. + + Args: + index: int, the index to the state in the transition. + + Returns: + Is the index valid: Boolean. + + """ + # Check the index is in the valid range + if index < 0 or index >= self._replay_capacity: + return False + if not self.is_full(): + # The indices and next_indices must be smaller than the cursor. + if index >= self.cursor() - self._update_horizon: + return False + # The first few indices contain the padding states of the first episode. + if index < self._stack_size - 1: + return False + + # Skip transitions that straddle the cursor. + if index in set(self.invalid_range): + return False + + # If there are terminal flags in any other frame other than the last one + # the stack is not valid, so don't sample it. + if self.get_terminal_stack(index)[:-1].any(): + return False + + return True + + def _create_batch_arrays(self, batch_size): + """Create a tuple of arrays with the type of get_transition_elements. + + When using the WrappedReplayBuffer with staging enabled it is important to + create new arrays every sample because StaginArea keeps a pointer to the + returned arrays. + + Args: + batch_size: (int) number of transitions returned. If None the default + batch_size will be used. + + Returns: + Tuple of np.arrays with the shape and type of get_transition_elements. + """ + transition_elements = self.get_transition_elements(batch_size) + batch_arrays = [] + for element in transition_elements: + batch_arrays.append(np.empty(element.shape, dtype=element.type)) + return tuple(batch_arrays) + + def sample_index_batch(self, batch_size): + """Returns a batch of valid indices sampled uniformly. + + Args: + batch_size: int, number of indices returned. + + Returns: + list of ints, a batch of valid indices sampled uniformly. + + Raises: + RuntimeError: If the batch was not constructed after maximum number of + tries. + """ + if self.is_full(): + # add_count >= self._replay_capacity > self._stack_size + min_id = self.cursor() - self._replay_capacity + self._stack_size - 1 + max_id = self.cursor() - self._update_horizon + else: + # add_count < self._replay_capacity + min_id = self._stack_size - 1 + max_id = self.cursor() - self._update_horizon + if max_id <= min_id: + raise RuntimeError('Cannot sample a batch with fewer than stack size ' + '({}) + update_horizon ({}) transitions.'. 
+ format(self._stack_size, self._update_horizon)) + + indices = [] + attempt_count = 0 + while (len(indices) < batch_size and + attempt_count < self._max_sample_attempts): + attempt_count += 1 + index = np.random.randint(min_id, max_id) % self._replay_capacity + if self.is_valid_transition(index): + indices.append(index) + if len(indices) != batch_size: + raise RuntimeError( + 'Max sample attempts: Tried {} times but only sampled {}' + ' valid indices. Batch size is {}'. + format(self._max_sample_attempts, len(indices), batch_size)) + + return indices + + def sample_transition_batch(self, batch_size=None, indices=None): + """Returns a batch of transitions (including any extra contents). + + If get_transition_elements has been overridden and defines elements not + stored in self._store, an empty array will be returned and it will be + left to the child class to fill it. For example, for the child class + OutOfGraphPrioritizedReplayBuffer, the contents of the + sampling_probabilities are stored separately in a sum tree. + + When the transition is terminal next_state_batch has undefined contents. + + NOTE: This transition contains the indices of the sampled elements. These + are only valid during the call to sample_transition_batch, i.e. they may + be used by subclasses of this replay buffer but may point to different data + as soon as sampling is done. + + Args: + batch_size: int, number of transitions returned. If None, the default + batch_size will be used. + indices: None or list of ints, the indices of every transition in the + batch. If None, sample the indices uniformly. + + Returns: + transition_batch: tuple of np.arrays with the shape and type as in + get_transition_elements(). + + Raises: + ValueError: If an element to be sampled is missing from the replay buffer. + """ + if batch_size is None: + batch_size = self._batch_size + if indices is None: + indices = self.sample_index_batch(batch_size) + assert len(indices) == batch_size + + transition_elements = self.get_transition_elements(batch_size) + batch_arrays = self._create_batch_arrays(batch_size) + for batch_element, state_index in enumerate(indices): + trajectory_indices = [(state_index + j) % self._replay_capacity + for j in range(self._update_horizon)] + trajectory_terminals = self._store['terminal'][trajectory_indices] + is_terminal_transition = trajectory_terminals.any() + if not is_terminal_transition: + trajectory_length = self._update_horizon + else: + # np.argmax of a bool array returns the index of the first True. + trajectory_length = np.argmax(trajectory_terminals.astype(np.bool), + 0) + 1 + next_state_index = state_index + trajectory_length + trajectory_discount_vector = ( + self._cumulative_discount_vector[:trajectory_length]) + trajectory_rewards = self.get_range(self._store['reward'], state_index, + next_state_index) + + # Fill the contents of each array in the sampled batch. + assert len(transition_elements) == len(batch_arrays) + for element_array, element in zip(batch_arrays, transition_elements): + if element.name == 'state': + element_array[batch_element] = self.get_observation_stack(state_index) + elif element.name == 'reward': + # cumpute the discounted sum of rewards in the trajectory. 
+ element_array[batch_element] = trajectory_discount_vector.dot( + trajectory_rewards) + elif element.name == 'next_state': + element_array[batch_element] = self.get_observation_stack( + (next_state_index) % self._replay_capacity) + elif element.name == 'terminal': + element_array[batch_element] = is_terminal_transition + elif element.name == 'indices': + element_array[batch_element] = state_index + elif element.name in self._store.keys(): + element_array[batch_element] = ( + self._store[element.name][state_index]) + # We assume the other elements are filled in by the subclass. + + return batch_arrays + + def get_transition_elements(self, batch_size=None): + """Returns a 'type signature' for sample_transition_batch. + + Args: + batch_size: int, number of transitions returned. If None, the default + batch_size will be used. + Returns: + signature: A namedtuple describing the method's return type signature. + """ + batch_size = self._batch_size if batch_size is None else batch_size + + transition_elements = [ + ReplayElement('state', (batch_size,) + self._state_shape, + self._observation_dtype), + ReplayElement('action', (batch_size,), np.int32), + ReplayElement('reward', (batch_size,), np.float32), + ReplayElement('next_state', (batch_size,) + self._state_shape, + self._observation_dtype), + ReplayElement('terminal', (batch_size,), np.uint8), + ReplayElement('indices', (batch_size,), np.int32) + ] + for element in self._extra_storage_types: + transition_elements.append( + ReplayElement(element.name, (batch_size,) + tuple(element.shape), + element.type)) + return transition_elements + + def _generate_filename(self, checkpoint_dir, name, suffix): + return os.path.join(checkpoint_dir, '{}_ckpt.{}.gz'.format(name, suffix)) + + def _return_checkpointable_elements(self): + """Return the dict of elements of the class for checkpointing. + + Returns: + checkpointable_elements: dict containing all non private (starting with + _) members + all the arrays inside self._store. + """ + checkpointable_elements = {} + for member_name, member in self.__dict__.items(): + if member_name == '_store': + for array_name, array in self._store.items(): + checkpointable_elements[STORE_FILENAME_PREFIX + array_name] = array + elif not member_name.startswith('_'): + checkpointable_elements[member_name] = member + return checkpointable_elements + + def save(self, checkpoint_dir, iteration_number): + """Save the OutOfGraphReplayBuffer attributes into a file. + + This method will save all the replay buffer's state in a single file. + + Args: + checkpoint_dir: str, the directory where numpy checkpoint files should be + saved. + iteration_number: int, iteration_number to use as a suffix in naming + numpy checkpoint files. + """ + if not tf.gfile.Exists(checkpoint_dir): + return + + checkpointable_elements = self._return_checkpointable_elements() + + for attr in checkpointable_elements: + filename = self._generate_filename(checkpoint_dir, attr, iteration_number) + with tf.gfile.Open(filename, 'wb') as f: + with gzip.GzipFile(fileobj=f) as outfile: + # Checkpoint the np arrays in self._store with np.save instead of + # pickling the dictionary is critical for file size and performance. + # STORE_FILENAME_PREFIX indicates that the variable is contained in + # self._store. 
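Putting the pieces of OutOfGraphReplayBuffer together, a toy sketch with 2x2 "observations" instead of stacked Atari frames; all sizes are illustrative. The returned tuple follows the order of get_transition_elements(): state, action, reward, next_state, terminal, indices.
```
# Toy end-to-end sketch of the out-of-graph replay buffer.
import numpy as np
from dopamine.replay_memory import circular_replay_buffer

buf = circular_replay_buffer.OutOfGraphReplayBuffer(
    observation_shape=(2, 2), stack_size=1, replay_capacity=100,
    batch_size=4, update_horizon=1, gamma=0.99)

for step in range(20):
  observation = np.full((2, 2), step, dtype=np.uint8)
  terminal = 1 if (step + 1) % 10 == 0 else 0
  buf.add(observation, step % 3, 1.0, terminal)

(states, actions, rewards, next_states,
 terminals, indices) = buf.sample_transition_batch(batch_size=4)
print(states.shape)  # (4, 2, 2, 1): batch x observation_shape x stack_size
```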
+ if attr.startswith(STORE_FILENAME_PREFIX): + array_name = attr[len(STORE_FILENAME_PREFIX):] + np.save(outfile, self._store[array_name], allow_pickle=False) + # Some numpy arrays might not be part of storage + elif isinstance(self.__dict__[attr], np.ndarray): + np.save(outfile, self.__dict__[attr], allow_pickle=False) + else: + pickle.dump(self.__dict__[attr], outfile) + + # After writing a checkpoint file, we garbage collect the checkpoint file + # that is four versions old. + stale_iteration_number = iteration_number - CHECKPOINT_DURATION + if stale_iteration_number >= 0: + stale_filename = self._generate_filename(checkpoint_dir, attr, + stale_iteration_number) + try: + tf.gfile.Remove(stale_filename) + except tf.errors.NotFoundError: + pass + + def load(self, checkpoint_dir, suffix): + """Restores the object from bundle_dictionary and numpy checkpoints. + + Args: + checkpoint_dir: str, the directory where to read the numpy checkpointed + files from. + suffix: str, the suffix to use in numpy checkpoint files. + + Raises: + NotFoundError: If not all expected files are found in directory. + """ + save_elements = self._return_checkpointable_elements() + # We will first make sure we have all the necessary files available to avoid + # loading a partially-specified (i.e. corrupted) replay buffer. + for attr in save_elements: + filename = self._generate_filename(checkpoint_dir, attr, suffix) + if not tf.gfile.Exists(filename): + raise tf.errors.NotFoundError(None, None, + 'Missing file: {}'.format(filename)) + # If we've reached this point then we have verified that all expected files + # are available. + for attr in save_elements: + filename = self._generate_filename(checkpoint_dir, attr, suffix) + with tf.gfile.Open(filename, 'rb') as f: + with gzip.GzipFile(fileobj=f) as infile: + if attr.startswith(STORE_FILENAME_PREFIX): + array_name = attr[len(STORE_FILENAME_PREFIX):] + self._store[array_name] = np.load(infile, allow_pickle=False) + elif isinstance(self.__dict__[attr], np.ndarray): + self.__dict__[attr] = np.load(infile, allow_pickle=False) + else: + self.__dict__[attr] = pickle.load(infile) + + +@gin.configurable(blacklist=['observation_shape', 'stack_size', + 'update_horizon', 'gamma']) +class WrappedReplayBuffer(object): + """Wrapper of OutOfGraphReplayBuffer with an in graph sampling mechanism. + + Usage: + To add a transition: call the add function. + + To sample a batch: Construct operations that depend on any of the + tensors is the transition dictionary. Every sess.run + that requires any of these tensors will sample a new + transition. + """ + + def __init__(self, + observation_shape, + stack_size, + use_staging=True, + replay_capacity=1000000, + batch_size=32, + update_horizon=1, + gamma=0.99, + wrapped_memory=None, + max_sample_attempts=MAX_SAMPLE_ATTEMPTS, + extra_storage_types=None, + observation_dtype=np.uint8): + """Initializes WrappedReplayBuffer. + + Args: + observation_shape: tuple of ints. + stack_size: int, number of frames to use in state stack. + use_staging: bool, when True it would use a staging area to prefetch + the next sampling batch. + replay_capacity: int, number of transitions to keep in memory. + batch_size: int. + update_horizon: int, length of update ('n' in n-step update). + gamma: int, the discount factor. + wrapped_memory: The 'inner' memory data structure. If None, + it creates the standard DQN replay memory. + max_sample_attempts: int, the maximum number of attempts allowed to + get a sample. 
+ extra_storage_types: list of ReplayElements defining the type of the extra + contents that will be stored and returned by sample_transition_batch. + observation_dtype: np.dtype, type of the observations. Defaults to + np.uint8 for Atari 2600. + + Raises: + ValueError: If update_horizon is not positive. + ValueError: If discount factor is not in [0, 1]. + """ + if replay_capacity < update_horizon + 1: + raise ValueError( + 'Update horizon ({}) should be significantly smaller ' + 'than replay capacity ({}).'.format(update_horizon, replay_capacity)) + if not update_horizon >= 1: + raise ValueError('Update horizon must be positive.') + if not 0.0 <= gamma <= 1.0: + raise ValueError('Discount factor (gamma) must be in [0, 1].') + + self.batch_size = batch_size + + # Mainly used to allow subclasses to pass self.memory. + if wrapped_memory is not None: + self.memory = wrapped_memory + else: + self.memory = OutOfGraphReplayBuffer( + observation_shape, stack_size, replay_capacity, batch_size, + update_horizon, gamma, max_sample_attempts, + observation_dtype=observation_dtype, + extra_storage_types=extra_storage_types) + + self.create_sampling_ops(use_staging) + + def add(self, observation, action, reward, terminal, *args): + """Adds a transition to the replay memory. + + Since the next_observation in the transition will be the observation added + next there is no need to pass it. + + If the replay memory is at capacity the oldest transition will be discarded. + + Args: + observation: np.array with shape observation_shape. + action: int, the action in the transition. + reward: float, the reward received in the transition. + terminal: A uint8 acting as a boolean indicating whether the transition + was terminal (1) or not (0). + *args: extra contents with shapes and dtypes according to + extra_storage_types. + """ + self.memory.add(observation, action, reward, terminal, *args) + + def create_sampling_ops(self, use_staging): + """Creates the ops necessary to sample from the replay buffer. + + Creates the transition dictionary containing the sampling tensors. + + Args: + use_staging: bool, when True it would use a staging area to prefetch + the next sampling batch. + """ + with tf.name_scope('sample_replay'): + with tf.device('/cpu:*'): + transition_type = self.memory.get_transition_elements() + transition_tensors = tf.py_func( + self.memory.sample_transition_batch, [], + [return_entry.type for return_entry in transition_type], + name='replay_sample_py_func') + self._set_transition_shape(transition_tensors, transition_type) + if use_staging: + transition_tensors = self._set_up_staging(transition_tensors) + self._set_transition_shape(transition_tensors, transition_type) + + # Unpack sample transition into member variables. + self.unpack_transition(transition_tensors, transition_type) + + def _set_transition_shape(self, transition, transition_type): + """Set shape for each element in the transition. + + Args: + transition: tuple of tf.Tensors. + transition_type: tuple of ReplayElements descriving the shapes of the + respective tensors. + """ + for element, element_type in zip(transition, transition_type): + element.set_shape(element_type.shape) + + def _set_up_staging(self, transition): + """Sets up staging ops for prefetching the next transition. + + This allows us to hide the py_func latency. To do so we use a staging area + to pre-fetch the next batch of transitions. + + Args: + transition: tuple of tf.Tensors with shape + memory.get_transition_elements(). 
+ + Returns: + prefetched_transition: tuple of tf.Tensors with shape + memory.get_transition_elements() that have been previously prefetched. + """ + transition_type = self.memory.get_transition_elements() + + # Create the staging area in CPU. + prefetch_area = tf.contrib.staging.StagingArea( + [shape_with_type.type for shape_with_type in transition_type]) + + # Store prefetch op for tests, but keep it private -- users should not be + # calling _prefetch_batch. + self._prefetch_batch = prefetch_area.put(transition) + initial_prefetch = tf.cond( + tf.equal(prefetch_area.size(), 0), + lambda: prefetch_area.put(transition), tf.no_op) + + # Every time a transition is sampled self.prefetch_batch will be + # called. If the staging area is empty, two put ops will be called. + with tf.control_dependencies([self._prefetch_batch, initial_prefetch]): + prefetched_transition = prefetch_area.get() + + return prefetched_transition + + def unpack_transition(self, transition_tensors, transition_type): + """Unpacks the given transition into member variables. + + Args: + transition_tensors: tuple of tf.Tensors. + transition_type: tuple of ReplayElements matching transition_tensors. + """ + self.transition = collections.OrderedDict() + for element, element_type in zip(transition_tensors, transition_type): + self.transition[element_type.name] = element + + # TODO(bellemare): These are legacy and should probably be removed in + # future versions. + self.states = self.transition['state'] + self.actions = self.transition['action'] + self.rewards = self.transition['reward'] + self.next_states = self.transition['next_state'] + self.terminals = self.transition['terminal'] + self.indices = self.transition['indices'] + + def save(self, checkpoint_dir, iteration_number): + """Save the underlying replay buffer's contents in a file. + + Args: + checkpoint_dir: str, the directory where to read the numpy checkpointed + files from. + iteration_number: int, the iteration_number to use as a suffix in naming + numpy checkpoint files. + """ + self.memory.save(checkpoint_dir, iteration_number) + + def load(self, checkpoint_dir, suffix): + """Loads the replay buffer's state from a saved file. + + Args: + checkpoint_dir: str, the directory where to read the numpy checkpointed + files from. + suffix: str, the suffix to use in numpy checkpoint files. + """ + self.memory.load(checkpoint_dir, suffix) diff --git a/dopamine/dopamine/replay_memory/prioritized_replay_buffer.py b/dopamine/dopamine/replay_memory/prioritized_replay_buffer.py new file mode 100644 index 0000000..426cc1b --- /dev/null +++ b/dopamine/dopamine/replay_memory/prioritized_replay_buffer.py @@ -0,0 +1,327 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""An implementation of Prioritized Experience Replay (PER). + +This implementation is based on the paper "Prioritized Experience Replay" +by Tom Schaul et al. (2015). 
Many thanks to Tom Schaul, John Quan, and Matteo +Hessel for providing useful pointers on the algorithm and its implementation. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + + +from dopamine.replay_memory import circular_replay_buffer +from dopamine.replay_memory import sum_tree +from dopamine.replay_memory.circular_replay_buffer import ReplayElement +import numpy as np +import tensorflow as tf + +import gin.tf + + +class OutOfGraphPrioritizedReplayBuffer( + circular_replay_buffer.OutOfGraphReplayBuffer): + """An out-of-graph Replay Buffer for Prioritized Experience Replay. + + See circular_replay_buffer.py for details. + """ + + def __init__(self, + observation_shape, + stack_size, + replay_capacity, + batch_size, + update_horizon=1, + gamma=0.99, + max_sample_attempts=circular_replay_buffer.MAX_SAMPLE_ATTEMPTS, + extra_storage_types=None, + observation_dtype=np.uint8): + """Initializes OutOfGraphPrioritizedReplayBuffer. + + Args: + observation_shape: tuple of ints. + stack_size: int, number of frames to use in state stack. + replay_capacity: int, number of transitions to keep in memory. + batch_size: int. + update_horizon: int, length of update ('n' in n-step update). + gamma: int, the discount factor. + max_sample_attempts: int, the maximum number of attempts allowed to + get a sample. + extra_storage_types: list of ReplayElements defining the type of the extra + contents that will be stored and returned by sample_transition_batch. + observation_dtype: np.dtype, type of the observations. Defaults to + np.uint8 for Atari 2600. + """ + super(OutOfGraphPrioritizedReplayBuffer, self).__init__( + observation_shape=observation_shape, + stack_size=stack_size, + replay_capacity=replay_capacity, + batch_size=batch_size, + update_horizon=update_horizon, + gamma=gamma, + max_sample_attempts=max_sample_attempts, + extra_storage_types=extra_storage_types, + observation_dtype=observation_dtype) + + self.sum_tree = sum_tree.SumTree(replay_capacity) + + def get_add_args_signature(self): + """The signature of the add function. + + The signature is the same as the one for OutOfGraphReplayBuffer, with an + added priority. + + Returns: + list of ReplayElements defining the type of the argument signature needed + by the add function. + """ + parent_add_signature = super(OutOfGraphPrioritizedReplayBuffer, + self).get_add_args_signature() + add_signature = parent_add_signature + [ + ReplayElement('priority', (), np.float32) + ] + return add_signature + + def _add(self, *args): + """Internal add method to add to the underlying memory arrays. + + The arguments need to match add_arg_signature. + + If priority is none, it is set to the maximum priority ever seen. + + Args: + *args: All the elements in a transition. + """ + # Use Schaul et al.'s (2015) scheme of setting the priority of new elements + # to the maximum priority so far. + parent_add_args = [] + # Picks out 'priority' from arguments and passes the other arguments to the + # parent method. + for i, element in enumerate(self.get_add_args_signature()): + if element.name == 'priority': + priority = args[i] + else: + parent_add_args.append(args[i]) + + self.sum_tree.set(self.cursor(), priority) + + super(OutOfGraphPrioritizedReplayBuffer, self)._add(*parent_add_args) + + def sample_index_batch(self, batch_size): + """Returns a batch of valid indices sampled as in Schaul et al. (2015). + + Args: + batch_size: int, number of indices returned. 
+ + Returns: + list of ints, a batch of valid indices sampled as in Schaul et al. (2015). + + Raises: + Exception: If the batch was not constructed after maximum number of tries. + """ + # Sample stratified indices. Some of them might be invalid. + indices = self.sum_tree.stratified_sample(batch_size) + allowed_attempts = self._max_sample_attempts + for i in range(len(indices)): + if not self.is_valid_transition(indices[i]): + if allowed_attempts == 0: + raise RuntimeError( + 'Max sample attempts: Tried {} times but only sampled {}' + ' valid indices. Batch size is {}'. + format(self._max_sample_attempts, i, batch_size)) + index = indices[i] + while not self.is_valid_transition(index) and allowed_attempts > 0: + # If index i is not valid keep sampling others. Note that this + # is not stratified. + index = self.sum_tree.sample() + allowed_attempts -= 1 + indices[i] = index + return indices + + def sample_transition_batch(self, batch_size=None, indices=None): + """Returns a batch of transitions with extra storage and the priorities. + + The extra storage is defined through the extra_storage_types constructor + argument. + + When the transition is terminal next_state_batch has undefined contents. + + Args: + batch_size: int, number of transitions returned. If None, the default + batch_size will be used. + indices: None or list of ints, the indices of every transition in the + batch. If None, sample the indices uniformly. + + Returns: + transition_batch: tuple of np.arrays with the shape and type as in + get_transition_elements(). + """ + transition = (super(OutOfGraphPrioritizedReplayBuffer, self). + sample_transition_batch(batch_size, indices)) + transition_elements = self.get_transition_elements(batch_size) + transition_names = [e.name for e in transition_elements] + probabilities_index = transition_names.index('sampling_probabilities') + indices_index = transition_names.index('indices') + indices = transition[indices_index] + # The parent returned an empty array for the probabilities. Fill it with the + # contents of the sum tree. + transition[probabilities_index][:] = self.get_priority(indices) + return transition + + def set_priority(self, indices, priorities): + """Sets the priority of the given elements according to Schaul et al. + + Args: + indices: np.array with dtype int32, of indices in range + [0, replay_capacity). + priorities: float, the corresponding priorities. + """ + assert indices.dtype == np.int32, ('Indices must be integers, ' + 'given: {}'.format(indices.dtype)) + for index, priority in zip(indices, priorities): + self.sum_tree.set(index, priority) + + def get_priority(self, indices): + """Fetches the priorities corresponding to a batch of memory indices. + + For any memory location not yet used, the corresponding priority is 0. + + Args: + indices: np.array with dtype int32, of indices in range + [0, replay_capacity). + + Returns: + priorities: float, the corresponding priorities. + """ + assert indices.shape, 'Indices must be an array.' + assert indices.dtype == np.int32, ('Indices must be int32s, ' + 'given: {}'.format(indices.dtype)) + batch_size = len(indices) + priority_batch = np.empty((batch_size), dtype=np.float32) + for i, memory_index in enumerate(indices): + priority_batch[i] = self.sum_tree.get(memory_index) + return priority_batch + + def get_transition_elements(self, batch_size=None): + """Returns a 'type signature' for sample_transition_batch. + + Args: + batch_size: int, number of transitions returned. If None, the default + batch_size will be used.

+ Returns: + signature: A namedtuple describing the method's return type signature. + """ + parent_transition_type = ( + super(OutOfGraphPrioritizedReplayBuffer, + self).get_transition_elements(batch_size)) + probablilities_type = [ + ReplayElement('sampling_probabilities', (batch_size,), np.float32) + ] + return parent_transition_type + probablilities_type + + +@gin.configurable(blacklist=['observation_shape', 'stack_size', + 'update_horizon', 'gamma']) +class WrappedPrioritizedReplayBuffer( + circular_replay_buffer.WrappedReplayBuffer): + """Wrapper of OutOfGraphPrioritizedReplayBuffer with in-graph sampling. + + Usage: + + * To add a transition: Call the add function. + + * To sample a batch: Query any of the tensors in the transition dictionary. + Every sess.run that requires any of these tensors will + sample a new transition. + """ + + def __init__(self, + observation_shape, + stack_size, + use_staging=True, + replay_capacity=1000000, + batch_size=32, + update_horizon=1, + gamma=0.99, + max_sample_attempts=circular_replay_buffer.MAX_SAMPLE_ATTEMPTS, + extra_storage_types=None, + observation_dtype=np.uint8): + """Initializes WrappedPrioritizedReplayBuffer. + + Args: + observation_shape: tuple of ints. + stack_size: int, number of frames to use in state stack. + use_staging: bool, when True it would use a staging area to prefetch + the next sampling batch. + replay_capacity: int, number of transitions to keep in memory. + batch_size: int. + update_horizon: int, length of update ('n' in n-step update). + gamma: int, the discount factor. + max_sample_attempts: int, the maximum number of attempts allowed to + get a sample. + extra_storage_types: list of ReplayElements defining the type of the extra + contents that will be stored and returned by sample_transition_batch. + observation_dtype: np.dtype, type of the observations. Defaults to + np.uint8 for Atari 2600. + + Raises: + ValueError: If update_horizon is not positive. + ValueError: If discount factor is not in [0, 1]. + """ + memory = OutOfGraphPrioritizedReplayBuffer( + observation_shape, stack_size, replay_capacity, batch_size, + update_horizon, gamma, max_sample_attempts, + extra_storage_types=extra_storage_types) + super(WrappedPrioritizedReplayBuffer, self).__init__( + observation_shape, + stack_size, + use_staging, + replay_capacity, + batch_size, + update_horizon, + gamma, + wrapped_memory=memory, + extra_storage_types=extra_storage_types) + + def tf_set_priority(self, indices, priorities): + """Sets the priorities for the given indices. + + Args: + indices: tf.Tensor with dtype int32 and shape [n]. + priorities: tf.Tensor with dtype float and shape [n]. + + Returns: + A tf op setting the priorities for prioritized sampling. + """ + return tf.py_func( + self.memory.set_priority, [indices, priorities], [], + name='prioritized_replay_set_priority_py_func') + + def tf_get_priority(self, indices): + """Gets the priorities for the given indices. + + Args: + indices: tf.Tensor with dtype int32 and shape [n]. + + Returns: + priorities: tf.Tensor with dtype float and shape [n], the priorities at + the indices. + """ + return tf.py_func( + self.memory.get_priority, [indices], + tf.float32, + name='prioritized_replay_get_priority_py_func') diff --git a/dopamine/dopamine/replay_memory/sum_tree.py b/dopamine/dopamine/replay_memory/sum_tree.py new file mode 100644 index 0000000..406a491 --- /dev/null +++ b/dopamine/dopamine/replay_memory/sum_tree.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. 
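Before turning to the sum tree itself, a sketch of how the out-of-graph prioritized buffer above is fed and queried. Shapes are toy-sized, and the priority passed to add() is an illustrative choice; new transitions commonly reuse the largest priority recorded so far, matching the scheme described in _add().
```
# Toy sketch of adding prioritized transitions and updating their priorities.
import numpy as np
from dopamine.replay_memory import prioritized_replay_buffer

buf = prioritized_replay_buffer.OutOfGraphPrioritizedReplayBuffer(
    observation_shape=(2, 2), stack_size=1, replay_capacity=100, batch_size=4)

for step in range(20):
  observation = np.full((2, 2), step, dtype=np.uint8)
  # The trailing argument is the extra 'priority' element required by
  # get_add_args_signature().
  buf.add(observation, 0, 1.0, 0, buf.sum_tree.max_recorded_priority)

indices = np.array([5, 6, 7], dtype=np.int32)
buf.set_priority(indices, np.array([2.0, 0.5, 1.0], dtype=np.float32))
print(buf.get_priority(indices))  # [2.  0.5 1. ]
```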
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A sum tree data structure. + +Used for prioritized experience replay. See prioritized_replay_buffer.py +and Schaul et al. (2015). +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import random + +import numpy as np + + +class SumTree(object): + """A sum tree data structure for storing replay priorities. + + A sum tree is a complete binary tree whose leaves contain values called + priorities. Internal nodes maintain the sum of the priorities of all leaf + nodes in their subtree. + + For capacity = 4, the tree may look like this: + + +---+ + |2.5| + +-+-+ + | + +-------+--------+ + | | + +-+-+ +-+-+ + |1.5| |1.0| + +-+-+ +-+-+ + | | + +----+----+ +----+----+ + | | | | + +-+-+ +-+-+ +-+-+ +-+-+ + |0.5| |1.0| |0.5| |0.5| + +---+ +---+ +---+ +---+ + + This is stored in a list of numpy arrays: + self.nodes = [ [2.5], [1.5, 1], [0.5, 1, 0.5, 0.5] ] + + For conciseness, we allocate arrays as powers of two, and pad the excess + elements with zero values. + + This is similar to the usual array-based representation of a complete binary + tree, but is a little more user-friendly. + """ + + def __init__(self, capacity): + """Creates the sum tree data structure for the given replay capacity. + + Args: + capacity: int, the maximum number of elements that can be stored in this + data structure. + + Raises: + ValueError: If requested capacity is not positive. + """ + assert isinstance(capacity, int) + if capacity <= 0: + raise ValueError('Sum tree capacity should be positive. Got: {}'. + format(capacity)) + + self.nodes = [] + tree_depth = int(math.ceil(np.log2(capacity))) + level_size = 1 + for _ in range(tree_depth + 1): + nodes_at_this_depth = np.zeros(level_size) + self.nodes.append(nodes_at_this_depth) + + level_size *= 2 + + self.max_recorded_priority = 1.0 + + def _total_priority(self): + """Returns the sum of all priorities stored in this sum tree. + + Returns: + float, sum of priorities stored in this sum tree. + """ + return self.nodes[0][0] + + def sample(self, query_value=None): + """Samples an element from the sum tree. + + Each element has probability p_i / sum_j p_j of being picked, where p_i is + the (positive) value associated with node i (possibly unnormalized). + + Args: + query_value: float in [0, 1], used as the random value to select a + sample. If None, will select one randomly in [0, 1). + + Returns: + int, a random element from the sum tree. + + Raises: + Exception: If the sum tree is empty (i.e. its node values sum to 0), or if + the supplied query_value is larger than the total sum. + """ + if self._total_priority() == 0.0: + raise Exception('Cannot sample from an empty sum tree.') + + if query_value and (query_value < 0. or query_value > 1.): + raise ValueError('query_value must be in [0, 1].') + + # Sample a value in range [0, R), where R is the value stored at the root. 
+ query_value = random.random() if query_value is None else query_value + query_value *= self._total_priority() + + # Now traverse the sum tree. + node_index = 0 + for nodes_at_this_depth in self.nodes[1:]: + # Compute children of previous depth's node. + left_child = node_index * 2 + + left_sum = nodes_at_this_depth[left_child] + # Each subtree describes a range [0, a), where a is its value. + if query_value < left_sum: # Recurse into left subtree. + node_index = left_child + else: # Recurse into right subtree. + node_index = left_child + 1 + # Adjust query to be relative to right subtree. + query_value -= left_sum + + return node_index + + def stratified_sample(self, batch_size): + """Performs stratified sampling using the sum tree. + + Let R be the value at the root (total value of sum tree). This method will + divide [0, R) into batch_size segments, pick a random number from each of + those segments, and use that random number to sample from the sum_tree. This + is as specified in Schaul et al. (2015). + + Args: + batch_size: int, the number of strata to use. + Returns: + list of batch_size elements sampled from the sum tree. + + Raises: + Exception: If the sum tree is empty (i.e. its node values sum to 0). + """ + if self._total_priority() == 0.0: + raise Exception('Cannot sample from an empty sum tree.') + + bounds = np.linspace(0., 1., batch_size + 1) + assert len(bounds) == batch_size + 1 + segments = [(bounds[i], bounds[i+1]) for i in range(batch_size)] + query_values = [random.uniform(x[0], x[1]) for x in segments] + return [self.sample(query_value=x) for x in query_values] + + def get(self, node_index): + """Returns the value of the leaf node corresponding to the index. + + Args: + node_index: The index of the leaf node. + Returns: + The value of the leaf node. + """ + return self.nodes[-1][node_index] + + def set(self, node_index, value): + """Sets the value of a leaf node and updates internal nodes accordingly. + + This operation takes O(log(capacity)). + Args: + node_index: int, the index of the leaf node to be updated. + value: float, the value which we assign to the node. This value must be + nonnegative. Setting value = 0 will cause the element to never be + sampled. + + Raises: + ValueError: If the given value is negative. + """ + if value < 0.0: + raise ValueError('Sum tree values should be nonnegative. Got {}'. + format(value)) + self.max_recorded_priority = max(value, self.max_recorded_priority) + + delta_value = value - self.nodes[-1][node_index] + + # Now traverse back the tree, adjusting all sums along the way. + for nodes_at_this_depth in reversed(self.nodes): + # Note: Adding a delta leads to some tolerable numerical inaccuracies. + nodes_at_this_depth[node_index] += delta_value + node_index //= 2 + + assert node_index == 0, ('Sum tree traversal failed, final node index ' + 'is not 0.') diff --git a/dopamine/dopamine/utils/__init__.py b/dopamine/dopamine/utils/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/utils/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
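The SumTree above supports the three operations the prioritized buffer needs: set a leaf priority in O(log capacity), read it back, and draw leaves in proportion to their priority. A small standalone sketch (not part of the patch) using the same leaf values as the ASCII diagram in the class docstring:

from dopamine.replay_memory import sum_tree

tree = sum_tree.SumTree(capacity=4)
for index, priority in enumerate([0.5, 1.0, 0.5, 0.5]):
  tree.set(index, priority)

print(tree.get(1))                   # 1.0
print(tree._total_priority())        # 2.5, the value stored at the root

# Each leaf i is drawn with probability p_i / total. Passing a fixed
# query_value makes the draw deterministic for illustration.
print(tree.sample(query_value=0.0))  # 0: the scaled query falls in [0, 0.5)
print(tree.stratified_sample(4))     # one leaf index per quartile of the mass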
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/utils/test_utils.py b/dopamine/dopamine/utils/test_utils.py new file mode 100644 index 0000000..05ecd73 --- /dev/null +++ b/dopamine/dopamine/utils/test_utils.py @@ -0,0 +1,34 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Common testing utilities shared across agents.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + + +import mock +import tensorflow as tf + + +class MockReplayBuffer(object): + """Mock ReplayBuffer to verify the way the agent interacts with it.""" + + def __init__(self): + with tf.variable_scope('MockReplayBuffer', reuse=tf.AUTO_REUSE): + self.add = mock.Mock() + self.memory = mock.Mock() + self.memory.add_count = 0 diff --git a/dopamine/gym/preprocessing.py b/dopamine/gym/preprocessing.py new file mode 100644 index 0000000..80f7233 --- /dev/null +++ b/dopamine/gym/preprocessing.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A wrapper class around Gym environments. + +This class makes general Gym environments conformant with the API Dopamine is +expecting. 
+""" + +import gin.tf + + +@gin.configurable +class GymPreprocessing(object): + """A Wrapper class around Gym environments.""" + + def __init__(self, environment): + self.environment = environment + self.game_over = False + + @property + def observation_space(self): + return self.environment.observation_space + + @property + def action_space(self): + return self.environment.action_space + + @property + def reward_range(self): + return self.environment.reward_range + + @property + def metadata(self): + return self.environment.metadata + + def reset(self): + return self.environment.reset() + + def step(self, action): + observation, reward, game_over, info = self.environment.step(action) + self.game_over = game_over + return observation, reward, game_over, info diff --git a/dopamine/setup.py b/dopamine/setup.py new file mode 100644 index 0000000..adb82f4 --- /dev/null +++ b/dopamine/setup.py @@ -0,0 +1,92 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Setup script for Dopamine. + +This script will install Dopamine as a Python module. + +See: https://github.com/google/dopamine + +""" + +import codecs +from os import path +from setuptools import find_packages +from setuptools import setup + +here = path.abspath(path.dirname(__file__)) + +# Get the long description from the README file. +with codecs.open(path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +install_requires = ['gin-config >= 0.1.1', 'absl-py >= 0.2.2', + 'opencv-python >= 3.4.1.15', + 'gym >= 0.10.5'] +tests_require = ['gin-config >= 0.1.1', 'absl-py >= 0.2.2', + 'opencv-python >= 3.4.1.15', + 'gym >= 0.10.5', 'mock >= 1.0.0'] + +dopamine_description = ( + 'Dopamine: A framework for flexible Reinforcement Learning research') + +setup( + name='dopamine_rl', + version='1.0.5', + include_package_data=True, + packages=find_packages(exclude=['docs']), # Required + package_data={'testdata': ['testdata/*.gin']}, + install_requires=install_requires, + tests_require=tests_require, + description=dopamine_description, + long_description=long_description, + url='https://github.com/google/dopamine', # Optional + author='The Dopamine Team', # Optional + author_email='opensource@google.com', + classifiers=[ # Optional + 'Development Status :: 4 - Beta', + + # Indicate who your project is intended for + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + + # Pick your license as you wish + 'License :: OSI Approved :: Apache Software License', + + # Specify the Python versions you support here. In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. 
+ 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Mathematics', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Software Development', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules', + + ], + project_urls={ # Optional + 'Documentation': 'https://github.com/google/dopamine', + 'Bug Reports': 'https://github.com/google/dopamine/issues', + 'Source': 'https://github.com/google/dopamine', + }, + license='Apache 2.0', + keywords='dopamine reinforcement-learning python machine learning' +)
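The GymPreprocessing wrapper added in dopamine/gym/preprocessing.py simply forwards observation_space, action_space, reset and step to the underlying Gym environment while tracking game_over, which is the API Dopamine expects. A standalone sketch (not part of the patch) of wrapping an environment; the environment id and the import line are assumptions, since the patch places the file under dopamine/gym/ without an accompanying __init__.py:

from __future__ import print_function

import gym

from dopamine.gym.preprocessing import GymPreprocessing  # assumed import path

environment = GymPreprocessing(gym.make('CartPole-v0'))

observation = environment.reset()
episode_return = 0.0
while not environment.game_over:
  # A random policy is enough to exercise the pass-through step() contract.
  action = environment.action_space.sample()
  observation, reward, game_over, info = environment.step(action)
  episode_return += reward
print('episode return:', episode_return)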