From f216fdf1e0e2cd1a1450f9ce5621dd5af814ba89 Mon Sep 17 00:00:00 2001 From: Kaixiang Lin Date: Mon, 24 Jun 2019 18:22:46 -0700 Subject: [PATCH] first commit. --- .gitignore | 175 ++++ README.md | 48 + code.md | 65 ++ dopamine/dopamine/__init__.py | 15 + dopamine/dopamine/agents/__init__.py | 15 + dopamine/dopamine/agents/agent_utils.py | 81 ++ dopamine/dopamine/agents/dqn/__init__.py | 15 + dopamine/dopamine/agents/dqn/configs/dqn.gin | 35 + .../dopamine/agents/dqn/configs/dqn_icml.gin | 33 + .../agents/dqn/configs/dqn_nature.gin | 36 + dopamine/dopamine/agents/dqn/dqn_agent.py | 521 +++++++++++ dopamine/dopamine/agents/dqnrpg/__init__.py | 15 + .../dopamine/agents/dqnrpg/configs/dqnrpg.gin | 35 + .../dopamine/agents/dqnrpg/dqnrpg_agent.py | 585 ++++++++++++ dopamine/dopamine/agents/epg/__init__.py | 15 + dopamine/dopamine/agents/epg/configs/epg.gin | 36 + .../dopamine/agents/epg/configs/epg_pong.gin | 35 + dopamine/dopamine/agents/epg/epg_agent.py | 550 ++++++++++++ .../agents/implicit_quantile/__init__.py | 15 + .../configs/implicit_quantile.gin | 40 + .../configs/implicit_quantile_icml.gin | 37 + .../implicit_quantile_agent.py | 358 ++++++++ .../agents/implicit_quantilerpg/__init__.py | 15 + .../configs/implicit_quantilerpg.gin | 41 + .../implicit_quantilerpg_agent.py | 431 +++++++++ dopamine/dopamine/agents/lpg/__init__.py | 15 + dopamine/dopamine/agents/lpg/configs/lpg.gin | 36 + dopamine/dopamine/agents/lpg/lpg_agent.py | 590 +++++++++++++ dopamine/dopamine/agents/rainbow/__init__.py | 15 + .../dopamine/agents/rainbow/configs/c51.gin | 35 + .../agents/rainbow/configs/c51_icml.gin | 36 + .../agents/rainbow/configs/rainbow.gin | 35 + .../agents/rainbow/configs/rainbow_aaai.gin | 37 + .../dopamine/agents/rainbow/rainbow_agent.py | 504 +++++++++++ .../dopamine/agents/rainbowrpg/__init__.py | 15 + .../agents/rainbowrpg/configs/c51rpg.gin | 35 + .../agents/rainbowrpg/configs/rainbowrpg.gin | 35 + .../agents/rainbowrpg/rainbowrpg_agent.py | 699 +++++++++++++++ dopamine/dopamine/agents/repg/__init__.py | 15 + .../dopamine/agents/repg/configs/repg.gin | 36 + dopamine/dopamine/agents/repg/repg_agent.py | 607 +++++++++++++ dopamine/dopamine/agents/rpg/__init__.py | 15 + dopamine/dopamine/agents/rpg/configs/rpg.gin | 36 + .../dopamine/agents/rpg/configs/rpg_pong.gin | 36 + dopamine/dopamine/agents/rpg/rpg_agent.py | 613 +++++++++++++ dopamine/dopamine/atari/__init__.py | 15 + dopamine/dopamine/atari/preprocessing.py | 216 +++++ dopamine/dopamine/atari/run_experiment.py | 592 +++++++++++++ dopamine/dopamine/atari/train.py | 186 ++++ dopamine/dopamine/common/__init__.py | 14 + dopamine/dopamine/common/checkpointer.py | 177 ++++ .../dopamine/common/iteration_statistics.py | 49 + dopamine/dopamine/common/logger.py | 105 +++ dopamine/dopamine/replay_memory/__init__.py | 15 + .../replay_memory/circular_replay_buffer.py | 835 ++++++++++++++++++ .../prioritized_replay_buffer.py | 327 +++++++ dopamine/dopamine/replay_memory/sum_tree.py | 205 +++++ dopamine/dopamine/utils/__init__.py | 15 + dopamine/dopamine/utils/test_utils.py | 34 + dopamine/gym/preprocessing.py | 54 ++ dopamine/setup.py | 92 ++ 61 files changed, 9623 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 code.md create mode 100644 dopamine/dopamine/__init__.py create mode 100644 dopamine/dopamine/agents/__init__.py create mode 100644 dopamine/dopamine/agents/agent_utils.py create mode 100644 dopamine/dopamine/agents/dqn/__init__.py create mode 100644 dopamine/dopamine/agents/dqn/configs/dqn.gin 
create mode 100644 dopamine/dopamine/agents/dqn/configs/dqn_icml.gin create mode 100644 dopamine/dopamine/agents/dqn/configs/dqn_nature.gin create mode 100644 dopamine/dopamine/agents/dqn/dqn_agent.py create mode 100644 dopamine/dopamine/agents/dqnrpg/__init__.py create mode 100644 dopamine/dopamine/agents/dqnrpg/configs/dqnrpg.gin create mode 100644 dopamine/dopamine/agents/dqnrpg/dqnrpg_agent.py create mode 100644 dopamine/dopamine/agents/epg/__init__.py create mode 100644 dopamine/dopamine/agents/epg/configs/epg.gin create mode 100644 dopamine/dopamine/agents/epg/configs/epg_pong.gin create mode 100644 dopamine/dopamine/agents/epg/epg_agent.py create mode 100644 dopamine/dopamine/agents/implicit_quantile/__init__.py create mode 100644 dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile.gin create mode 100644 dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile_icml.gin create mode 100644 dopamine/dopamine/agents/implicit_quantile/implicit_quantile_agent.py create mode 100644 dopamine/dopamine/agents/implicit_quantilerpg/__init__.py create mode 100644 dopamine/dopamine/agents/implicit_quantilerpg/configs/implicit_quantilerpg.gin create mode 100644 dopamine/dopamine/agents/implicit_quantilerpg/implicit_quantilerpg_agent.py create mode 100644 dopamine/dopamine/agents/lpg/__init__.py create mode 100644 dopamine/dopamine/agents/lpg/configs/lpg.gin create mode 100644 dopamine/dopamine/agents/lpg/lpg_agent.py create mode 100644 dopamine/dopamine/agents/rainbow/__init__.py create mode 100644 dopamine/dopamine/agents/rainbow/configs/c51.gin create mode 100644 dopamine/dopamine/agents/rainbow/configs/c51_icml.gin create mode 100644 dopamine/dopamine/agents/rainbow/configs/rainbow.gin create mode 100644 dopamine/dopamine/agents/rainbow/configs/rainbow_aaai.gin create mode 100644 dopamine/dopamine/agents/rainbow/rainbow_agent.py create mode 100644 dopamine/dopamine/agents/rainbowrpg/__init__.py create mode 100644 dopamine/dopamine/agents/rainbowrpg/configs/c51rpg.gin create mode 100644 dopamine/dopamine/agents/rainbowrpg/configs/rainbowrpg.gin create mode 100644 dopamine/dopamine/agents/rainbowrpg/rainbowrpg_agent.py create mode 100644 dopamine/dopamine/agents/repg/__init__.py create mode 100644 dopamine/dopamine/agents/repg/configs/repg.gin create mode 100644 dopamine/dopamine/agents/repg/repg_agent.py create mode 100644 dopamine/dopamine/agents/rpg/__init__.py create mode 100644 dopamine/dopamine/agents/rpg/configs/rpg.gin create mode 100644 dopamine/dopamine/agents/rpg/configs/rpg_pong.gin create mode 100644 dopamine/dopamine/agents/rpg/rpg_agent.py create mode 100644 dopamine/dopamine/atari/__init__.py create mode 100644 dopamine/dopamine/atari/preprocessing.py create mode 100644 dopamine/dopamine/atari/run_experiment.py create mode 100644 dopamine/dopamine/atari/train.py create mode 100644 dopamine/dopamine/common/__init__.py create mode 100644 dopamine/dopamine/common/checkpointer.py create mode 100644 dopamine/dopamine/common/iteration_statistics.py create mode 100644 dopamine/dopamine/common/logger.py create mode 100644 dopamine/dopamine/replay_memory/__init__.py create mode 100644 dopamine/dopamine/replay_memory/circular_replay_buffer.py create mode 100644 dopamine/dopamine/replay_memory/prioritized_replay_buffer.py create mode 100644 dopamine/dopamine/replay_memory/sum_tree.py create mode 100644 dopamine/dopamine/utils/__init__.py create mode 100644 dopamine/dopamine/utils/test_utils.py create mode 100644 dopamine/gym/preprocessing.py create mode 100644 
dopamine/setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3c3d5de --- /dev/null +++ b/.gitignore @@ -0,0 +1,175 @@ +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +.DS_Store +*/temp/* +*.pyc +*./.idea/* +.idea/* +*.DS_Store* +*.ipynb_checkpoints/* +notebooks/.ipynb_checkpoints/* +*.dropbox* +*Icon* +*/__pycache__/* +*/.ipynb_checkpoints/* +## Intermediate documents: +*.dvi +*-converted-to.* +# these rules might exclude image files for figures etc. +# *.ps +# *.eps +# *.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.brf +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.snm +*.vrb + +# cprotect +*.cpt + +#(e)ledmac/(e)ledpar +*.end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls + +# gnuplottex +*-gnuplottex-* + +# hyperref +*.brf + +# knitr +*-concordance.tex +*.tikz +*-tikzDictionary + +# listings +*.lol + +# makeidx +*.idx +*.ilg +*.ind +*.ist + +# minitoc +*.maf +*.mtc +*.mtc[0-9] +*.mtc[1-9][0-9] + +# minted +_minted* +*.pyg +*.pyc +# morewrites +*.mw + +# mylatexformat +*.fmt + +# nomencl +*.nlo + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +#pythontex +*.pytxcode +pythontex-files-*/ + +# Texpad +.texpadtmp + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# todonotes +*.tdo + +# xindy +*.xdy + +# xypic precompiled matrices +*.xyc + +# WinEdt +*.bak +*.sav + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +main.pdf + +*.dropbox* + diff --git a/README.md b/README.md new file mode 100644 index 0000000..b5c107d --- /dev/null +++ b/README.md @@ -0,0 +1,48 @@ +# Ranking Policy Gradient +Ranking Policy Gradient (RPG) is a sample-efficienct policy gradient method +that learns optimal ranking of actions with respect to the long term reward. +This codebase contains the implementation of RPG using the +[dopamine](https://github.com/google/dopamine) framework. + + +## Instructions + + +### Install via source +#### Step 1. +Follow the install [instruction](https://github.com/KaixiangLin/dopamine/blob/master/README.md#install-via-source) of +dopamine framework for [Ubuntu](https://github.com/KaixiangLin/dopamine/blob/master/README.md#ubuntu) +or [Max OS X](https://github.com/KaixiangLin/dopamine/blob/master/README.md#mac-os-x). + +#### Step 2. +Download the RPG source, i.e. + +``` +git clone git@github.com:illidanlab/rpg.git +``` + + +## Running the tests + +``` +cd ./rpg/dopamine +python -um dopamine.atari.train \ + --agent_name=rpg \ + --base_dir=/tmp/dopamine \ + --random_seed 1 \ + --game_name=Pong \ + --gin_files='dopamine/agents/rpg/configs/rpg.gin' +``` + +## Reproduce +To reproduce the results in the paper, please refer to the instruction in [here](code.md). + +### Reference + +If you use this RPG implementation in your work, please consider citing the following papers: +``` +TODO(RPG): +``` + +## Acknowledgments +TODO(dopamine framework, fundings). 
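RPG, as described in the README above, learns an ordering of actions rather than their exact values: the supervision signal is a pairwise hinge (ranking) loss over per-action scores, minimized on near-optimal trajectories. A minimal sketch of that loss in the TF1 style used by these agents; the function and argument names are illustrative, not identifiers from this patch:

```python
import tensorflow as tf  # TensorFlow 1.x, as used throughout this patch


def ranking_hinge_loss(scores, actions, num_actions, margin=1.0):
  """Pairwise hinge loss over per-action scores (sketch).

  Every action other than the one taken on a near-optimal trajectory should
  score at least `margin` below the taken action.

  Args:
    scores: [batch, num_actions] float tensor of per-action scores (e.g. the
      RPG network's Q-values).
    actions: [batch] int tensor of actions from near-optimal trajectories.
    num_actions: int, number of discrete actions.
    margin: float, required gap between taken and non-taken action scores.

  Returns:
    A scalar loss tensor.
  """
  one_hot = tf.one_hot(actions, num_actions, 1.0, 0.0)
  chosen = tf.reduce_sum(scores * one_hot, axis=1)       # score of taken action
  gap = scores + margin - tf.expand_dims(chosen, 1)      # per-action violation
  hinge = tf.maximum(0.0, gap) * (1.0 - one_hot)         # ignore the taken action
  return tf.reduce_mean(hinge)
```

This mirrors the hinge term built in `_build_train_op` of the RPG-style agents below (e.g. `dqnrpg_agent.py`), where the scores are the RPG network's Q-values and the actions come from the near-optimal replay buffer.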
diff --git a/code.md b/code.md
new file mode 100644
index 0000000..8e2f717
--- /dev/null
+++ b/code.md
@@ -0,0 +1,65 @@
+# Overview
+
+This document explains the structure of this codebase and the hyperparameters of the experiments.
+
+
+## File organization
+
+### Step 1.
+Please refer to the description of the dopamine file organization [here](https://github.com/KaixiangLin/dopamine/blob/master/docs/README.md#file-organization).
+
+### Step 2.
+We add the RPG agent variants in [this folder](dopamine/dopamine/agents) and explain each of them below:
+
+
+| Folder | Exploration | Supervision |
+|---|---|---|
+| rpg | epsilon-greedy | RPG (Hinge loss) |
+| lpg | epsilon-greedy | LPG (Cross-Entropy) |
+| epg | EPG | LPG (Cross-Entropy) |
+| repg | EPG | RPG (Hinge loss) |
+| implicit_quantilerpg | implicit_quantile | RPG (Hinge loss) |
+
+
+* EPG: EPG is the stochastic listwise policy gradient with off-policy supervised learning,
+i.e., the vanilla policy gradient trained with off-policy supervised learning.
+The exploration and supervision agents are parameterized
+by the same neural network. The supervision agent minimizes the cross-entropy loss
+over the near-optimal trajectories collected in an online fashion.
+
+* LPG: LPG is the deterministic listwise policy gradient with off-policy supervised learning.
+During evaluation we choose an action greedily based on the logits, while during training it
+explores the environment stochastically, as EPG does.
+
+* RPG: RPG explores the environment using a separate agent: epsilon-greedy or EPG in Pong, and
+Implicit Quantile in the other games. RPG then conducts supervised
+learning by minimizing the hinge loss.
+
+In this codebase, the folder [rpg](dopamine/dopamine/agents/rpg)
+contains the implementation of RPG with epsilon-greedy exploration; similarly, [repg](dopamine/dopamine/agents/repg) uses EPG exploration and
+[implicit_quantilerpg](dopamine/dopamine/agents/implicit_quantilerpg)
+uses implicit quantile network exploration.
+
+The agents with relatively simple exploration strategies (rpg, lpg, epg, repg) perform well on Pong
+compared to the state of the art, since there is a higher chance of hitting good trajectories in Pong.
+For more complicated games, we adopt the implicit quantile network as the exploration agent.
+
+## Hyperparameters
+The hyperparameters of the networks, optimizers, etc. are the same as for the [baselines](https://github.com/KaixiangLin/dopamine/tree/master/baselines) in dopamine.
+The trajectory reward threshold c (see Def. 5 in the paper (TODO)) for each game is given below:
+
+| game | c |
+|---|---|
+| Boxing | 100 |
+| Breakout | 400 |
+| Bowling | 80 |
+| BankHeist | 1100 |
+| DoubleDunk | 18 |
+| Pitfall | 0 |
+| Pong | 1 |
+| Robotank | 65 |
+
+
+
+
diff --git a/dopamine/dopamine/__init__.py b/dopamine/dopamine/__init__.py
new file mode 100644
index 0000000..f2b5d90
--- /dev/null
+++ b/dopamine/dopamine/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2018 The Dopamine Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+name = 'dopamine' diff --git a/dopamine/dopamine/agents/__init__.py b/dopamine/dopamine/agents/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/agent_utils.py b/dopamine/dopamine/agents/agent_utils.py new file mode 100644 index 0000000..38b6d78 --- /dev/null +++ b/dopamine/dopamine/agents/agent_utils.py @@ -0,0 +1,81 @@ +import random +from collections import deque + +def collect_trajectory(agent, reward): + """for pong """ + if reward < 0: + agent.replay_buffer.clear() + elif reward > 0: + agent.replay_buffer.add(agent._last_observation, agent.action, reward, False) + while agent.replay_buffer.size() > 0: + experience = agent.replay_buffer.get_sample() + state, action, reward, _ = experience + agent._store_transition(state, action, reward, False) + else: + agent.replay_buffer.add(agent._last_observation, agent.action, reward, False) + + + +class ReplayBufferRegular(object): + """ for uniformly sampling. + + """ + + def __init__(self, buffer_size, random_seed=1234): + self.buffer_size = buffer_size + self.count = 0 + # Right side of deque contains newest experience + self.buffer = deque() + random.seed(random_seed) + self.ptr, self.path_start_idx = 0, 0 + + def add(self, state, action, reward, terminal): + experience = [state, action, reward, terminal] + assert self.count < self.buffer_size + self.buffer.append(experience) + self.count += 1 + self.ptr += 1 + # else: + # self.path_start_idx -= 1 + # self.ptr = self.buffer_size - 1 + # self.buffer.popleft() + # self.buffer.append(experience) + + def get_sample(self): + self.count -= 1 + return self.buffer.popleft() + + def size(self): + return self.count + + def clear(self): + self.buffer.clear() + self.count = 0 + self.ptr = 0 + self.path_start_idx = 0 + + + +""" Threshold of episodic return for each game """ +# we only collect trajectory that has return larger than following +episodic_return = {"Pong": 21, + "Breakout": 200, + "Bowling": 80, + "Boxing": 100, + "Freeway": 33, + "BankHeist": 1100, + "Robotank": 65, + "Pitfall": 0, + "DoubleDunk":18} + +# When we have the return on evaluation phase that is greater than following, +# we stop training +episodic_return_switch = {"Pong": 21, + "Breakout": 200, + "Bowling": 80, # maximum can be more than as 93 + "Boxing": 100, + "Freeway": 32, + "BankHeist": 1100, + "Robotank": 65, + "Pitfall": -0.1, + "DoubleDunk":18} diff --git a/dopamine/dopamine/agents/dqn/__init__.py b/dopamine/dopamine/agents/dqn/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/dqn/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/dqn/configs/dqn.gin b/dopamine/dopamine/agents/dqn/configs/dqn.gin new file mode 100644 index 0000000..06c3427 --- /dev/null +++ b/dopamine/dopamine/agents/dqn/configs/dqn.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.dqn.dqn_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +DQNAgent.gamma = 0.99 +DQNAgent.update_horizon = 1 +DQNAgent.min_replay_history = 20000 # agent steps +DQNAgent.update_period = 4 +DQNAgent.target_update_period = 8000 # agent steps +DQNAgent.epsilon_train = 0.01 +DQNAgent.epsilon_eval = 0.001 +DQNAgent.epsilon_decay_period = 250000 # agent steps +DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +DQNAgent.optimizer = @tf.train.RMSPropOptimizer() + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = True +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/dqn/configs/dqn_icml.gin b/dopamine/dopamine/agents/dqn/configs/dqn_icml.gin new file mode 100644 index 0000000..c4dcb24 --- /dev/null +++ b/dopamine/dopamine/agents/dqn/configs/dqn_icml.gin @@ -0,0 +1,33 @@ +# Hyperparameters used for reporting DQN results in Bellemare et al. (2017). +import dopamine.atari.run_experiment +import dopamine.agents.dqn.dqn_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +DQNAgent.gamma = 0.99 +DQNAgent.update_horizon = 1 +DQNAgent.min_replay_history = 50000 # agent steps +DQNAgent.update_period = 4 +DQNAgent.target_update_period = 10000 # agent steps +DQNAgent.epsilon_train = 0.01 +DQNAgent.epsilon_eval = 0.001 +DQNAgent.epsilon_decay_period = 1000000 # agent steps +DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +DQNAgent.optimizer = @tf.train.RMSPropOptimizer() + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015). 
+Runner.sticky_actions = False +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/dqn/configs/dqn_nature.gin b/dopamine/dopamine/agents/dqn/configs/dqn_nature.gin new file mode 100644 index 0000000..024bff4 --- /dev/null +++ b/dopamine/dopamine/agents/dqn/configs/dqn_nature.gin @@ -0,0 +1,36 @@ +# Hyperparameters used in Mnih et al. (2015). +import dopamine.atari.preprocessing +import dopamine.atari.run_experiment +import dopamine.agents.dqn.dqn_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +DQNAgent.gamma = 0.99 +DQNAgent.update_horizon = 1 +DQNAgent.min_replay_history = 50000 # agent steps +DQNAgent.update_period = 4 +DQNAgent.target_update_period = 10000 # agent steps +DQNAgent.epsilon_train = 0.1 +DQNAgent.epsilon_eval = 0.05 +DQNAgent.epsilon_decay_period = 1000000 # agent steps +DQNAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +DQNAgent.optimizer = @tf.train.RMSPropOptimizer() + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015). +Runner.sticky_actions = False +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +AtariPreprocessing.terminal_on_life_loss = True + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/dqn/dqn_agent.py b/dopamine/dopamine/agents/dqn/dqn_agent.py new file mode 100644 index 0000000..3509441 --- /dev/null +++ b/dopamine/dopamine/agents/dqn/dqn_agent.py @@ -0,0 +1,521 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a DQN agent.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random + + + +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf + +import gin.tf + +slim = tf.contrib.slim + + +NATURE_DQN_OBSERVATION_SHAPE = (84, 84) # Size of downscaled Atari 2600 frame. +NATURE_DQN_DTYPE = tf.uint8 # DType of Atari 2600 observations. +NATURE_DQN_STACK_SIZE = 4 # Number of frames in the state stack. + + +def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon): + """Returns the current epsilon for the agent's epsilon-greedy policy. 
+ + This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et + al., 2015). The schedule is as follows: + Begin at 1. until warmup_steps steps have been taken; then + Linearly decay epsilon from 1. to epsilon in decay_period steps; and then + Use epsilon from there on. + + Args: + decay_period: float, the period over which epsilon is decayed. + step: int, the number of training steps completed so far. + warmup_steps: int, the number of steps taken before epsilon is decayed. + epsilon: float, the final value to which to decay the epsilon parameter. + + Returns: + A float, the current epsilon value computed according to the schedule. + """ + steps_left = decay_period + warmup_steps - step + bonus = (1.0 - epsilon) * steps_left / decay_period + bonus = np.clip(bonus, 0., 1. - epsilon) + return epsilon + bonus + + +@gin.configurable +class DQNAgent(object): + """An implementation of the DQN agent.""" + + def __init__(self, + sess, + num_actions, + observation_shape=NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=NATURE_DQN_DTYPE, + stack_size=NATURE_DQN_STACK_SIZE, + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + target_update_period=8000, + epsilon_fn=linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + tf_device='/cpu:*', + use_staging=True, + max_tf_checkpoints_to_keep=3, + optimizer=tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints describing the observation shape. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to + keep. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. 
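+
+      Example (illustrative; in this codebase the agent is normally built and
+      initialized by the Runner in dopamine/atari/run_experiment.py from gin
+      bindings):
+
+        sess = tf.Session()
+        agent = DQNAgent(sess, num_actions=4)
+        sess.run(tf.global_variables_initializer())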
+ """ + assert isinstance(observation_shape, tuple) + tf.logging.info('Creating %s agent with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t gamma: %f', gamma) + tf.logging.info('\t update_horizon: %f', update_horizon) + tf.logging.info('\t min_replay_history: %d', min_replay_history) + tf.logging.info('\t update_period: %d', update_period) + tf.logging.info('\t target_update_period: %d', target_update_period) + tf.logging.info('\t epsilon_train: %f', epsilon_train) + tf.logging.info('\t epsilon_eval: %f', epsilon_eval) + tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period) + tf.logging.info('\t tf_device: %s', tf_device) + tf.logging.info('\t use_staging: %s', use_staging) + tf.logging.info('\t optimizer: %s', optimizer) + + self.num_actions = num_actions + self.observation_shape = tuple(observation_shape) + self.observation_dtype = observation_dtype + self.stack_size = stack_size + self.gamma = gamma + self.update_horizon = update_horizon + self.cumulative_gamma = math.pow(gamma, update_horizon) + self.min_replay_history = min_replay_history + self.target_update_period = target_update_period + self.epsilon_fn = epsilon_fn + self.epsilon_train = epsilon_train + self.epsilon_eval = epsilon_eval + self.epsilon_decay_period = epsilon_decay_period + self.update_period = update_period + self.eval_mode = False + self.training_steps = 0 + self.optimizer = optimizer + self.summary_writer = summary_writer + self.summary_writing_frequency = summary_writing_frequency + + with tf.device(tf_device): + # Create a placeholder for the state input to the DQN network. + # The last axis indicates the number of consecutive frames stacked. + state_shape = (1,) + self.observation_shape + (stack_size,) + self.state = np.zeros(state_shape) + self.state_ph = tf.placeholder(self.observation_dtype, state_shape, + name='state_ph') + self._replay = self._build_replay_buffer(use_staging) + + self._build_networks() + + self._train_op = self._build_train_op() + self._sync_qt_ops = self._build_sync_op() + + if self.summary_writer is not None: + # All tf.summaries should have been defined prior to running this. + self._merged_summaries = tf.summary.merge_all() + self._sess = sess + self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep) + + # Variables to be initialized by the agent once it interacts with the + # environment. + self._observation = None + self._last_observation = None + + def _get_network_type(self): + """Returns the type of the outputs of a Q value network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('DQN_network', ['q_values']) + + def _network_template(self, state): + """Builds the convolutional network used to compute the agent's Q-values. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) + net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. 
+ self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + self._net_outputs = self.online_convnet(self.state_ph) + # TODO(bellemare): Ties should be broken. They are unlikely to happen when + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + self._replay_next_target_net_outputs = self.target_convnet( + self._replay.next_states) + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_target_q_op(self): + """Build an op used as a target for the Q-value. + + Returns: + target_q_op: An op calculating the Q-value. + """ + # Get the maximum Q-value across the actions dimension. + replay_next_qt_max = tf.reduce_max( + self._replay_next_target_net_outputs.q_values, 1) + # Calculate the Bellman target value. + # Q_t = R_t + \gamma^N * Q'_t+1 + # where, + # Q'_t+1 = \argmax_a Q(S_t+1, a) + # (or) 0 if S_t is a terminal state, + # and + # N is the update horizon (by default, N=1). + return self._replay.rewards + self.cumulative_gamma * replay_next_qt_max * ( + 1. - tf.cast(self._replay.terminals, tf.float32)) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + replay_action_one_hot = tf.one_hot( + self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') + replay_chosen_q = tf.reduce_sum( + self._replay_net_outputs.q_values * replay_action_one_hot, + reduction_indices=1, + name='replay_chosen_q') + + target = tf.stop_gradient(self._build_target_q_op()) + loss = tf.losses.huber_loss( + target, replay_chosen_q, reduction=tf.losses.Reduction.NONE) + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('HuberLoss', tf.reduce_mean(loss)) + return self.optimizer.minimize(tf.reduce_mean(loss)) + + def _build_sync_op(self): + """Builds ops for assigning weights from online to target network. + + Returns: + ops: A list of ops assigning weights from online to target network. 
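+
+      Each op is a hard copy of an online variable into the corresponding
+      target variable (no Polyak averaging); _train_step runs these ops every
+      target_update_period training steps.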
+ """ + # Get trainable variables from online and target DQNs + sync_qt_ops = [] + trainables_online = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, scope='Online') + trainables_target = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, scope='Target') + for (w_online, w_target) in zip(trainables_online, trainables_target): + # Assign weights from online to target network. + sync_qt_ops.append(w_target.assign(w_online, use_locking=True)) + return sync_qt_ops + + def begin_episode(self, observation): + """Returns the agent's first action for this episode. + + Args: + observation: numpy array, the environment's initial observation. + + Returns: + int, the selected action. + """ + self._reset_state() + self._record_observation(observation) + + if not self.eval_mode: + self._train_step() + + self.action = self._select_action() + return self.action + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. + + Args: + reward: float, the reward received from the agent's most recent action. + observation: numpy array, the most recent observation. + + Returns: + int, the selected action. + """ + self._last_observation = self._observation + self._record_observation(observation) + + if not self.eval_mode: + self._store_transition(self._last_observation, self.action, reward, False) + self._train_step() + + self.action = self._select_action() + return self.action + + def end_episode(self, reward): + """Signals the end of the episode to the agent. + + We store the observation of the current time step, which is the last + observation of the episode. + + Args: + reward: float, the last reward from the environment. + """ + if not self.eval_mode: + self._store_transition(self._observation, self.action, reward, True) + + def _select_action(self): + """Select an action from the set of available actions. + + Chooses an action randomly with probability self._calculate_epsilon(), and + otherwise acts greedily according to the current Q-value estimates. + + Returns: + int, the selected action. + """ + epsilon = self.epsilon_eval if self.eval_mode else self.epsilon_fn( + self.epsilon_decay_period, + self.training_steps, + self.min_replay_history, + self.epsilon_train) + if random.random() <= epsilon: + # Choose a random action with probability epsilon. + return random.randint(0, self.num_actions - 1) + else: + # Choose the action with highest Q-value at the current state. + return self._sess.run(self._q_argmax, {self.state_ph: self.state}) + + def _train_step(self): + """Runs a single training step. + + Runs a training op if both: + (1) A minimum number of frames have been added to the replay buffer. + (2) `training_steps` is a multiple of `update_period`. + + Also, syncs weights from online to target network if training steps is a + multiple of target update period. + """ + # Run a train op at the rate of self.update_period if enough training steps + # have been run. This matches the Nature DQN behaviour. 
+ if self._replay.memory.add_count > self.min_replay_history: + if self.training_steps % self.update_period == 0: + self._sess.run(self._train_op) + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + if self.training_steps % self.target_update_period == 0: + self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + def _record_observation(self, observation): + """Records an observation and update state. + + Extracts a frame from the observation vector and overwrites the oldest + frame in the state buffer. + + Args: + observation: numpy array, an observation from the environment. + """ + # Set current observation. We do the reshaping to handle environments + # without frame stacking. + observation = np.reshape(observation, self.observation_shape) + self._observation = observation[..., 0] + self._observation = np.reshape(observation, self.observation_shape) + # Swap out the oldest frame with the current frame. + self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. + self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. 
If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True diff --git a/dopamine/dopamine/agents/dqnrpg/__init__.py b/dopamine/dopamine/agents/dqnrpg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/dqnrpg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/dqnrpg/configs/dqnrpg.gin b/dopamine/dopamine/agents/dqnrpg/configs/dqnrpg.gin new file mode 100644 index 0000000..da6d1bb --- /dev/null +++ b/dopamine/dopamine/agents/dqnrpg/configs/dqnrpg.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.dqnrpg.dqnrpg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +DQNRPGAgent.gamma = 0.99 +DQNRPGAgent.update_horizon = 1 +DQNRPGAgent.min_replay_history = 20000 # agent steps +DQNRPGAgent.update_period = 4 +DQNRPGAgent.target_update_period = 8000 # agent steps +DQNRPGAgent.epsilon_train = 0.01 +DQNRPGAgent.epsilon_eval = 0 +DQNRPGAgent.epsilon_decay_period = 250000 # agent steps +DQNRPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +DQNRPGAgent.optimizer = @tf.train.RMSPropOptimizer() + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Breakout' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
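+# This configuration, however, disables sticky actions (deterministic ALE).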
+Runner.sticky_actions = False +Runner.num_iterations = 30 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/dqnrpg/dqnrpg_agent.py b/dopamine/dopamine/agents/dqnrpg/dqnrpg_agent.py new file mode 100644 index 0000000..6291d8c --- /dev/null +++ b/dopamine/dopamine/agents/dqnrpg/dqnrpg_agent.py @@ -0,0 +1,585 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a DQN agent.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random + +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf + +import gin.tf +from dopamine.agents.agent_utils import * +slim = tf.contrib.slim + +NATURE_DQN_OBSERVATION_SHAPE = (84, 84) # Size of downscaled Atari 2600 frame. +NATURE_DQN_DTYPE = tf.uint8 # DType of Atari 2600 observations. +NATURE_DQN_STACK_SIZE = 4 # Number of frames in the state stack. + + +def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon): + """Returns the current epsilon for the agent's epsilon-greedy policy. + + This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et + al., 2015). The schedule is as follows: + Begin at 1. until warmup_steps steps have been taken; then + Linearly decay epsilon from 1. to epsilon in decay_period steps; and then + Use epsilon from there on. + + Args: + decay_period: float, the period over which epsilon is decayed. + step: int, the number of training steps completed so far. + warmup_steps: int, the number of steps taken before epsilon is decayed. + epsilon: float, the final value to which to decay the epsilon parameter. + + Returns: + A float, the current epsilon value computed according to the schedule. + """ + steps_left = decay_period + warmup_steps - step + bonus = (1.0 - epsilon) * steps_left / decay_period + bonus = np.clip(bonus, 0., 1. 
- epsilon) + return epsilon + bonus + + +@gin.configurable +class DQNRPGAgent(object): + """An implementation of the DQN agent.""" + + def __init__(self, + sess, + num_actions, + observation_shape=NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=NATURE_DQN_DTYPE, + stack_size=NATURE_DQN_STACK_SIZE, + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + target_update_period=8000, + epsilon_fn=linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + tf_device='/cpu:*', + use_staging=True, + max_tf_checkpoints_to_keep=3, + optimizer=tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints describing the observation shape. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to + keep. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. 
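+
+    Compared to DQNAgent, this agent additionally builds an RPG network
+    (variable scope 'RPG') with its own RMSProp optimizer and a second
+    replay buffer (self._replay_opt) that only receives near-optimal
+    trajectories. The RPG network is trained with a hinge loss and is the
+    network used for action selection in eval mode.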
+ """ + assert isinstance(observation_shape, tuple) + tf.logging.info('Creating %s agent with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t gamma: %f', gamma) + tf.logging.info('\t update_horizon: %f', update_horizon) + tf.logging.info('\t min_replay_history: %d', min_replay_history) + tf.logging.info('\t update_period: %d', update_period) + tf.logging.info('\t target_update_period: %d', target_update_period) + tf.logging.info('\t epsilon_train: %f', epsilon_train) + tf.logging.info('\t epsilon_eval: %f', epsilon_eval) + tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period) + tf.logging.info('\t tf_device: %s', tf_device) + tf.logging.info('\t use_staging: %s', use_staging) + tf.logging.info('\t optimizer: %s', optimizer) + + self.num_actions = num_actions + self.observation_shape = tuple(observation_shape) + self.observation_dtype = observation_dtype + self.stack_size = stack_size + self.gamma = gamma + self.update_horizon = update_horizon + self.cumulative_gamma = math.pow(gamma, update_horizon) + self.min_replay_history = min_replay_history + self.target_update_period = target_update_period + self.epsilon_fn = epsilon_fn + self.epsilon_train = epsilon_train + self.epsilon_eval = epsilon_eval + self.epsilon_decay_period = epsilon_decay_period + self.update_period = update_period + self.eval_mode = False + self.training_steps = 0 + self.optimizer = optimizer # DQN optimizer. + self.optimizer_rpg = tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True) # optimizer for RPG + self.summary_writer = summary_writer + self.summary_writing_frequency = summary_writing_frequency + self.start_training = 1000 # todo task specific + + with tf.device(tf_device): + # Create a placeholder for the state input to the DQN network. + # The last axis indicates the number of consecutive frames stacked. + state_shape = (1,) + self.observation_shape + (stack_size,) + self.state = np.zeros(state_shape) + self.state_ph = tf.placeholder(self.observation_dtype, state_shape, + name='state_ph') + self._replay = self._build_replay_buffer(use_staging) + self._replay_opt = self._build_replay_buffer(use_staging) # store optimal trajectory + self._build_networks() + + self._train_op, self._train_op_rpg = self._build_train_op() + self._sync_qt_ops = self._build_sync_op() + + # replay buffer for rpg. only store good trajectories. + self.replay_buffer_temp = ReplayBufferRegular(100000) # temporarily + + + if self.summary_writer is not None: + # All tf.summaries should have been defined prior to running this. + self._merged_summaries = tf.summary.merge_all() + self._sess = sess + self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep) + + # Variables to be initialized by the agent once it interacts with the + # environment. + self._observation = None + self._last_observation = None + + def _get_network_type(self): + """Returns the type of the outputs of a Q value network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('DQN_network', ['q_values']) + + def _network_template(self, state): + """Builds the convolutional network used to compute the agent's Q-values. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) 
+ net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + # DQN explore net. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + + self._net_outputs = self.online_convnet(self.state_ph) + # TODO(bellemare): Ties should be broken. They are unlikely to happen when + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + self._replay_next_target_net_outputs = self.target_convnet( + self._replay.next_states) + + # RPG learning net. + self.rpg_convnet = tf.make_template('RPG', self._network_template) + self._rpg_net_outputs = self.rpg_convnet(self.state_ph) + self._q_argmax_rpg = tf.argmax(self._rpg_net_outputs.q_values, axis=1)[0] + self._replay_rpg_net_outputs = self.rpg_convnet(self._replay_opt.states) + + + + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_target_q_op(self): + """Build an op used as a target for the Q-value. + + Returns: + target_q_op: An op calculating the Q-value. + """ + # Get the maximum Q-value across the actions dimension. + replay_next_qt_max = tf.reduce_max( + self._replay_next_target_net_outputs.q_values, 1) + # Calculate the Bellman target value. + # Q_t = R_t + \gamma^N * Q'_t+1 + # where, + # Q'_t+1 = \argmax_a Q(S_t+1, a) + # (or) 0 if S_t is a terminal state, + # and + # N is the update horizon (by default, N=1). + return self._replay.rewards + self.cumulative_gamma * replay_next_qt_max * ( + 1. - tf.cast(self._replay.terminals, tf.float32)) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. 
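+
+      In this agent, two ops are built and returned: a Huber-loss op that
+      trains the exploration DQN on samples from self._replay, and a
+      hinge-loss op that trains the RPG network on the near-optimal
+      trajectories stored in self._replay_opt. For each stored action a, the
+      hinge loss penalizes every other action whose score is not at least
+      `margin` below the score of a.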
+ """ + + # RPG loss + replay_opt_action_one_hot = tf.one_hot( + self._replay_opt.actions, self.num_actions, 1., 0., name='action_one_hot_rpg') + replay_chosen_q_rpg = tf.reduce_sum( + self._replay_rpg_net_outputs.q_values * replay_opt_action_one_hot, + reduction_indices=1, + name='replay_chosen_q_rpg') + margin = 1 + qvalue = self._replay_rpg_net_outputs.q_values + # debug self.temp_action_one_hot = replay_action_one_hot + # self.temp_qvalue = qvalue + self.temp1 = (qvalue + margin) * (1 - replay_opt_action_one_hot) + qvalue * replay_opt_action_one_hot + self.temp2 = -(tf.reshape(replay_chosen_q_rpg, [-1, 1]) * tf.ones([1, self.num_actions])) \ + * ((1 - replay_opt_action_one_hot) + (replay_opt_action_one_hot)) + self.hingeloss = tf.maximum(0.0, self.temp1 + self.temp2) + rpg_loss = tf.reduce_mean(self.hingeloss) + + # DQN loss + replay_action_one_hot = tf.one_hot( + self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') + replay_chosen_q = tf.reduce_sum( + self._replay_net_outputs.q_values * replay_action_one_hot, + reduction_indices=1, + name='replay_chosen_q') + target = tf.stop_gradient(self._build_target_q_op()) + loss = tf.losses.huber_loss( + target, replay_chosen_q, reduction=tf.losses.Reduction.NONE) + mean_loss = tf.reduce_mean(loss) + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('HuberLoss', mean_loss) + tf.summary.scalar("hingeLossRPG", rpg_loss) + return self.optimizer.minimize(mean_loss), self.optimizer_rpg.minimize(rpg_loss) + + def _build_sync_op(self): + """Builds ops for assigning weights from online to target network. + + Returns: + ops: A list of ops assigning weights from online to target network. + """ + # Get trainable variables from online and target DQNs + sync_qt_ops = [] + trainables_online = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, scope='Online') + trainables_target = tf.get_collection( + tf.GraphKeys.TRAINABLE_VARIABLES, scope='Target') + for (w_online, w_target) in zip(trainables_online, trainables_target): + # Assign weights from online to target network. + sync_qt_ops.append(w_target.assign(w_online, use_locking=True)) + return sync_qt_ops + + def begin_episode(self, observation): + """Returns the agent's first action for this episode. + + Args: + observation: numpy array, the environment's initial observation. + + Returns: + int, the selected action. + """ + self._reset_state() + self._record_observation(observation) + + if not self.eval_mode: + self._train_step() + + self.action = self._select_action() + return self.action + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. + + Args: + reward: float, the reward received from the agent's most recent action. + observation: numpy array, the most recent observation. + + Returns: + int, the selected action. + """ + self._last_observation = self._observation + self._record_observation(observation) + + if not self.eval_mode: + self._store_transition(self._last_observation, self.action, reward, False) + self.replay_buffer_temp.add(self._last_observation, self.action, reward, False) + self._train_step() + + self.action = self._select_action() + return self.action + + def end_episode(self, reward): + """Signals the end of the episode to the agent. + + We store the observation of the current time step, which is the last + observation of the episode. 
+ + Args: + reward: float, the last reward from the environment. + """ + if not self.eval_mode: + self.replay_buffer_temp.clear() # this episode is not optimal + self._store_transition(self._observation, self.action, reward, True) + + def end_episode_(self, reward, total_reward, step_number): + """ This episodes is optimal trajectory """ + if not self.eval_mode: + # for DQN + self._store_transition(self._observation, self.action, reward, True) + + # replay buffer for RPG. + self.replay_buffer_temp.add(self._observation, self.action, reward, True) + count = step_number + while count > 0: + experience = self.replay_buffer_temp.get_sample() + state, action, reward, _ = experience + count -= 1 + # self.replay_buffer_opt.add(state, action, reward, False) + self._replay_opt.add(state, action, reward, False) + + def _select_action(self): + """Select an action from the set of available actions. + + Chooses an action randomly with probability self._calculate_epsilon(), and + otherwise acts greedily according to the current Q-value estimates. + + Returns: + int, the selected action. + """ + if self.eval_mode is not True: + epsilon = self.epsilon_fn( + self.epsilon_decay_period, + self.training_steps, + self.min_replay_history, + self.epsilon_train) + if random.random() <= epsilon: + # Choose a random action with probability epsilon. + return random.randint(0, self.num_actions - 1) + else: + # Choose the action with highest Q-value at the current state. + return self._sess.run(self._q_argmax, {self.state_ph: self.state}) + else: + # evaluation mode: use rpg. + return self._sess.run(self._q_argmax_rpg, {self.state_ph: self.state}) + + def _train_step(self): + """Runs a single training step. + + Runs a training op if both: + (1) A minimum number of frames have been added to the replay buffer. + (2) `training_steps` is a multiple of `update_period`. + + Also, syncs weights from online to target network if training steps is a + multiple of target update period. + """ + # Run a train op at the rate of self.update_period if enough training steps + # have been run. This matches the Nature DQN behaviour. + if self._replay.memory.add_count > self.min_replay_history: + if self.training_steps % self.update_period == 0: + self._sess.run(self._train_op) + if self._replay_opt.memory.add_count > self.start_training: + self._sess.run(self._train_op_rpg) + + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + if self.training_steps % self.target_update_period == 0: + self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + def _record_observation(self, observation): + """Records an observation and update state. + + Extracts a frame from the observation vector and overwrites the oldest + frame in the state buffer. + + Args: + observation: numpy array, an observation from the environment. + """ + # Set current observation. We do the reshaping to handle environments + # without frame stacking. + observation = np.reshape(observation, self.observation_shape) + self._observation = observation[..., 0] + self._observation = np.reshape(observation, self.observation_shape) + # Swap out the oldest frame with the current frame. 
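The `replay_buffer_temp` / `_replay_opt` pattern used in `step`, `end_episode`, and `end_episode_` above stages each episode and promotes its transitions to the RPG buffer only when the episode is flagged as (near-)optimal. A self-contained sketch of that pattern; the class and argument names here are hypothetical, not the patch's API:

from collections import deque

class EpisodeStagingBuffer(object):
    """Stages one episode's transitions; they reach the RPG replay buffer
    only if the episode turns out to be (near-)optimal."""

    def __init__(self):
        self._staged = deque()

    def add(self, observation, action, reward, terminal):
        self._staged.append((observation, action, reward, terminal))

    def flush(self, rpg_replay, episode_was_optimal):
        # Drain the staged transitions; copy them only for optimal episodes.
        while self._staged:
            observation, action, reward, _ = self._staged.popleft()
            if episode_was_optimal:
                rpg_replay.add(observation, action, reward, False)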
+ self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. + self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True diff --git a/dopamine/dopamine/agents/epg/__init__.py b/dopamine/dopamine/agents/epg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/epg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/epg/configs/epg.gin b/dopamine/dopamine/agents/epg/configs/epg.gin new file mode 100644 index 0000000..1188e19 --- /dev/null +++ b/dopamine/dopamine/agents/epg/configs/epg.gin @@ -0,0 +1,36 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.epg.epg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +EPGAgent.gamma = 0.99 +EPGAgent.game_name = 'Pong' # Boxing, Pong +EPGAgent.update_horizon = 1 +EPGAgent.min_replay_history = 20000 # agent steps, step more than this, stop exploration. +EPGAgent.update_period = 4 +EPGAgent.epsilon_train = 0 +EPGAgent.epsilon_eval = 0 +EPGAgent.epsilon_decay_period = 250000 # agent steps +EPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +EPGAgent.optimizer = @tf.train.RMSPropOptimizer() +EPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' # Boxing, Pong +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = True +Runner.num_iterations = 15 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 10000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/epg/configs/epg_pong.gin b/dopamine/dopamine/agents/epg/configs/epg_pong.gin new file mode 100644 index 0000000..c8c6445 --- /dev/null +++ b/dopamine/dopamine/agents/epg/configs/epg_pong.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.epg.epg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +EPGAgent.gamma = 0.99 +EPGAgent.update_horizon = 1 +EPGAgent.min_replay_history = 20000 # agent steps, step more than this, stop exploration. 
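The gin files above and below only bind constructor arguments; nothing in them runs by itself. A rough sketch of how such a file could be consumed, assuming the package layout in this patch and gin-config installed (the path, the session handling, and the num_actions value are illustrative):

import gin.tf
import tensorflow as tf
from dopamine.agents.epg import epg_agent

gin.parse_config_file('dopamine/agents/epg/configs/epg.gin')
sess = tf.Session()
# Arguments bound in the gin file (gamma, margin, optimizer, ...) are injected
# automatically; only unbound arguments are passed explicitly.
agent = epg_agent.EPGAgent(sess, num_actions=6)   # e.g. 6 actions for Pong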
+EPGAgent.update_period = 4 +EPGAgent.epsilon_train = 0.0001 +EPGAgent.epsilon_eval = 0 +EPGAgent.epsilon_decay_period = 250000 # agent steps +EPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +EPGAgent.optimizer = @tf.train.RMSPropOptimizer() +EPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = False +Runner.num_iterations = 15 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 10000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/epg/epg_agent.py b/dopamine/dopamine/agents/epg/epg_agent.py new file mode 100644 index 0000000..6a17c8f --- /dev/null +++ b/dopamine/dopamine/agents/epg/epg_agent.py @@ -0,0 +1,550 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a DQN agent.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random +from dopamine.agents.agent_utils import * +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf + +import gin.tf +from collections import deque +import tensorflow +from tensorflow.distributions import Categorical + +slim = tf.contrib.slim + +NATURE_DQN_OBSERVATION_SHAPE = (84, 84) # Size of downscaled Atari 2600 frame. +NATURE_DQN_DTYPE = tf.uint8 # DType of Atari 2600 observations. +NATURE_DQN_STACK_SIZE = 4 # Number of frames in the state stack. + + +def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon): + """Returns the current epsilon for the agent's epsilon-greedy policy. + + This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et + al., 2015). The schedule is as follows: + Begin at 1. until warmup_steps steps have been taken; then + Linearly decay epsilon from 1. to epsilon in decay_period steps; and then + Use epsilon from there on. + + Args: + decay_period: float, the period over which epsilon is decayed. + step: int, the number of training steps completed so far. + warmup_steps: int, the number of steps taken before epsilon is decayed. + epsilon: float, the final value to which to decay the epsilon parameter. + + Returns: + A float, the current epsilon value computed according to the schedule. + """ + steps_left = decay_period + warmup_steps - step + bonus = (1.0 - epsilon) * steps_left / decay_period + bonus = np.clip(bonus, 0., 1. 
- epsilon) + return epsilon + bonus + + +@gin.configurable +class EPGAgent(object): + """An implementation of the DQN agent.""" + + def __init__(self, + sess, + num_actions, + game_name="Pong", + observation_shape=NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=NATURE_DQN_DTYPE, + stack_size=NATURE_DQN_STACK_SIZE, + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + epsilon_fn=linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + margin=1, + tf_device='/cpu:*', + use_staging=True, + max_tf_checkpoints_to_keep=3, + optimizer=tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints describing the observation shape. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to + keep. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. 
+ """ + assert isinstance(observation_shape, tuple) + tf.logging.info('Creating %s agent with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t gamma: %f', gamma) + tf.logging.info('\t update_horizon: %f', update_horizon) + tf.logging.info('\t min_replay_history: %d', min_replay_history) + tf.logging.info('\t update_period: %d', update_period) + # tf.logging.info('\t random_seed: %d', random_seed) + tf.logging.info('\t epsilon_train: %f', epsilon_train) + tf.logging.info('\t epsilon_eval: %f', epsilon_eval) + tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period) + tf.logging.info('\t tf_device: %s', tf_device) + tf.logging.info('\t use_staging: %s', use_staging) + tf.logging.info('\t optimizer: %s', optimizer) + tf.logging.info('\t game: %s', game_name) + + self.game_name = game_name + self.num_actions = num_actions + self.observation_shape = tuple(observation_shape) + self.observation_dtype = observation_dtype + self.stack_size = stack_size + self.gamma = gamma + self.update_horizon = update_horizon + self.cumulative_gamma = math.pow(gamma, update_horizon) + self.min_replay_history = min_replay_history + self.epsilon_fn = epsilon_fn + self.epsilon_train = epsilon_train + self.epsilon_eval = epsilon_eval + self.epsilon_decay_period = epsilon_decay_period + self.update_period = update_period + self.eval_mode = False + self.training_steps = 0 + self.optimizer = optimizer + self.summary_writer = summary_writer + self.summary_writing_frequency = summary_writing_frequency + self.margin = margin + self.start_training = 1000 # todo task specific PONG IS 1000 + self.highest_reward = 6 # todo task specific + self.isPrinted = False + self.current_replay_size = 0 + self.epsilon_current = 1 + + with tf.device(tf_device): + # Create a placeholder for the state input to the DQN network. + # The last axis indicates the number of consecutive frames stacked. + state_shape = (1,) + self.observation_shape + (stack_size,) + self.state = np.zeros(state_shape) + self.state_ph = tf.placeholder(self.observation_dtype, state_shape, + name='state_ph') + self._replay = self._build_replay_buffer(use_staging) + + self._build_networks() + + self._train_op = self._build_train_op() + + self.replay_buffer = ReplayBufferRegular(100000) + + if self.summary_writer is not None: + # All tf.summaries should have been defined prior to running this. + self._merged_summaries = tf.summary.merge_all() + self._sess = sess + self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep) + + # Variables to be initialized by the agent once it interacts with the + # environment. + self._observation = None + self._last_observation = None + + def _get_network_type(self): + """Returns the type of the outputs of a Q value network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('DQN_network', ['q_values']) + + def _network_template(self, state): + """Builds the convolutional network used to compute the agent's Q-values. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) 
+ net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + + self._net_outputs = self.online_convnet(self.state_ph) + # using a deep network, but may affect performance with a linear + # approximation scheme. + # self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + # treat self._net_outputs.q_values as logits + self.logsoftmaxprob = tf.nn.log_softmax(self._net_outputs.q_values) + self.sample = Categorical(logits=self.logsoftmaxprob).sample(1) + + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + + replay_action_one_hot = tf.one_hot( + self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') + logits = self._replay_net_outputs.q_values + self.logsoftmaxprob = tf.nn.log_softmax(logits) + self.neglogprob = - tf.reduce_sum(self.logsoftmaxprob * replay_action_one_hot, axis=1) + # self.temp_loss = self.neglogprob # * self.y_pl + loss = self.actor_loss = tf.reduce_mean(self.neglogprob) + self.replay_action_one_hot = replay_action_one_hot + + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('hingeLoss', loss) + return self.optimizer.minimize(loss) + + def begin_episode(self, observation): + """Returns the agent's first action for this episode. + + Args: + observation: numpy array, the environment's initial observation. + + Returns: + int, the selected action. + """ + self._reset_state() + self._record_observation(observation) + + if not self.eval_mode: + self._train_step() + + self.action = self._select_action() + return self.action + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. 
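The loss in `EPGAgent._build_train_op` above is the mean negative log-probability of the replayed actions under a softmax over the Q-head outputs, with no return weighting (the `* self.y_pl` factor is commented out); since the surrounding logic controls which trajectories enter the buffer, it acts like supervised imitation of the stored actions. A NumPy sketch; the helper name is illustrative:

import numpy as np

def epg_loss(logits, actions):
    # Numerically stable log-softmax, then the mean negative log-probability
    # of the actions actually stored in the replay buffer.
    z = logits - logits.max(axis=1, keepdims=True)
    log_softmax = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    batch = np.arange(logits.shape[0])
    return -log_softmax[batch, actions].mean()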
+ + Args: + reward: float, the reward received from the agent's most recent action. + observation: numpy array, the most recent observation. + + Returns: + int, the selected action. + """ + self._last_observation = self._observation + self._record_observation(observation) + + if not self.eval_mode: + if self.game_name in ["Pong"]: + collect_trajectory(self, reward) + else: + raise ValueError("collection wrong trajectory") + + self._train_step() + + self.action = self._select_action() + if isinstance(self.action, np.ndarray): + pass + return self.action + + def end_episode(self, reward): + """Signals the end of the episode to the agent. + + We store the observation of the current time step, which is the last + observation of the episode. + + Args: + reward: float, the last reward from the environment. + """ + if not self.eval_mode: + if self.game_name in ["Pong"]: + collect_trajectory(self, reward) + else: + raise ValueError("collection wrong trajectory") + + def _select_action(self): + """Select an action from the set of available actions. + + Chooses an action randomly with probability self._calculate_epsilon(), and + otherwise acts greedily according to the current Q-value estimates. + + Returns: + int, the selected action. + """ + + self.epsilon_current = self.training_steps + self.current_replay_size = self._replay.memory.add_count + return self._sess.run(self.sample, {self.state_ph: self.state})[0][0] + + + def _train_step(self): + """Runs a single training step. + + Runs a training op if both: + (1) A minimum number of frames have been added to the replay buffer. + (2) `training_steps` is a multiple of `update_period`. + + Also, syncs weights from online to target network if training steps is a + multiple of target update period. + """ + # Run a train op at the rate of self.update_period if enough training steps + # have been run. This matches the Nature DQN behaviour. + + if self._replay.memory.add_count > self.start_training: + if self.training_steps % self.update_period == 0: + + # debug checked. + # _, neglogprob, logsoftmaxprob, \ + # actor_loss, replay_action_one_hot = self._sess.run([self._train_op, + # self.neglogprob, + # self.logsoftmaxprob, + # self.actor_loss, + # self.replay_action_one_hot]) + self._sess.run(self._train_op) + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + # if self.training_steps % self.target_update_period == 0: + # self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + if (self._replay.memory.add_count > self.start_training) and self.isPrinted is False: + print("start training at {}".format(self.training_steps)) + self.isPrinted = True + + def _record_observation(self, observation): + """Records an observation and update state. + + Extracts a frame from the observation vector and overwrites the oldest + frame in the state buffer. + + Args: + observation: numpy array, an observation from the environment. + """ + # Set current observation. We do the reshaping to handle environments + # without frame stacking. + observation = np.reshape(observation, self.observation_shape) + self._observation = observation[..., 0] + self._observation = np.reshape(observation, self.observation_shape) + # Swap out the oldest frame with the current frame. 
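`_select_action` above runs the Categorical sampling op built in `_build_networks`, so the agent acts stochastically from a softmax over the Q-head outputs rather than greedily. A NumPy sketch of the same policy; the helper name is illustrative. (Feeding log-softmax values to Categorical as logits, as the code does, yields the same distribution, because a per-row constant shift of logits does not change the softmax.)

import numpy as np

def sample_action(q_values, rng=np.random):
    # Treat the Q-head outputs as logits and sample from the implied softmax.
    z = q_values - q_values.max()
    probs = np.exp(z) / np.exp(z).sum()
    return int(rng.choice(len(probs), p=probs))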
+ self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. + self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True + + +class ReplayBufferRegular(object): + """ for uniformly sampling. 
+ + """ + + def __init__(self, buffer_size, random_seed=1234): + self.buffer_size = buffer_size + self.count = 0 + # Right side of deque contains newest experience + self.buffer = deque() + random.seed(random_seed) + self.ptr, self.path_start_idx = 0, 0 + + def add(self, state, action, reward, terminal): + experience = [state, action, reward, terminal] + assert self.count < self.buffer_size + self.buffer.append(experience) + self.count += 1 + self.ptr += 1 + # else: + # self.path_start_idx -= 1 + # self.ptr = self.buffer_size - 1 + # self.buffer.popleft() + # self.buffer.append(experience) + + def get_sample(self): + self.count -= 1 + return self.buffer.popleft() + + def size(self): + return self.count + + def clear(self): + self.buffer.clear() + self.count = 0 + self.ptr = 0 + self.path_start_idx = 0 diff --git a/dopamine/dopamine/agents/implicit_quantile/__init__.py b/dopamine/dopamine/agents/implicit_quantile/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantile/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile.gin b/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile.gin new file mode 100644 index 0000000..4719e02 --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile.gin @@ -0,0 +1,40 @@ +# Hyperparameters follow Dabney et al. (2018), but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. + +import dopamine.agents.implicit_quantile.implicit_quantile_agent +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +ImplicitQuantileAgent.kappa = 1.0 +ImplicitQuantileAgent.num_tau_samples = 64 +ImplicitQuantileAgent.num_tau_prime_samples = 64 +ImplicitQuantileAgent.num_quantile_samples = 32 +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 3 +RainbowAgent.min_replay_history = 20000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 250000 # agent steps +# IQN currently does not support prioritized replay. +RainbowAgent.replay_scheme = 'uniform' +RainbowAgent.tf_device = '/gpu:0' # '/cpu:*' use for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +Runner.game_name = 'Breakout' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False +Runner.num_iterations = 30 +Runner.training_steps = 250000 +Runner.evaluation_steps = 125000 +Runner.max_steps_per_episode = 27000 + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile_icml.gin b/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile_icml.gin new file mode 100644 index 0000000..265dde1 --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantile/configs/implicit_quantile_icml.gin @@ -0,0 +1,37 @@ +# Hyperparameters follow Dabney et al. (2018) +import dopamine.agents.implicit_quantile.implicit_quantile_agent +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +ImplicitQuantileAgent.kappa = 1.0 +ImplicitQuantileAgent.num_tau_samples = 64 +ImplicitQuantileAgent.num_tau_prime_samples = 64 +ImplicitQuantileAgent.num_quantile_samples = 32 +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 1 +RainbowAgent.min_replay_history = 50000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 10000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 1000000 # agent steps +RainbowAgent.replay_scheme = 'uniform' +RainbowAgent.tf_device = '/gpu:0' # '/cpu:*' use for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.00005 +tf.train.AdamOptimizer.epsilon = 0.0003125 + +Runner.game_name = 'Pong' +Runner.sticky_actions = False +Runner.num_iterations = 200 +Runner.training_steps = 250000 +Runner.evaluation_steps = 125000 +Runner.max_steps_per_episode = 27000 + +AtariPreprocessing.terminal_on_life_loss = True + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/implicit_quantile/implicit_quantile_agent.py b/dopamine/dopamine/agents/implicit_quantile/implicit_quantile_agent.py new file mode 100644 index 0000000..1b689bc --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantile/implicit_quantile_agent.py @@ -0,0 +1,358 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The implicit quantile networks (IQN) agent. + +The agent follows the description given in "Implicit Quantile Networks for +Distributional RL" (Dabney et. al, 2018). 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +import math + + +from dopamine.agents.rainbow import rainbow_agent +import numpy as np +import tensorflow as tf + +import gin.tf + +slim = tf.contrib.slim + + +@gin.configurable +class ImplicitQuantileAgent(rainbow_agent.RainbowAgent): + """An extension of Rainbow to perform implicit quantile regression.""" + + def __init__(self, + sess, + num_actions, + kappa=1.0, + num_tau_samples=32, + num_tau_prime_samples=32, + num_quantile_samples=32, + quantile_embedding_dim=64, + double_dqn=False, + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the Graph. + + Most of this constructor's parameters are IQN-specific hyperparameters whose + values are taken from Dabney et al. (2018). + + Args: + sess: `tf.Session` object for running associated ops. + num_actions: int, number of actions the agent can take at any state. + kappa: float, Huber loss cutoff. + num_tau_samples: int, number of online quantile samples for loss + estimation. + num_tau_prime_samples: int, number of target quantile samples for loss + estimation. + num_quantile_samples: int, number of quantile samples for computing + Q-values. + quantile_embedding_dim: int, embedding dimension for the quantile input. + double_dqn: boolean, whether to perform double DQN style learning + as described in Van Hasselt et al.: https://arxiv.org/abs/1509.06461. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. + """ + self.kappa = kappa + # num_tau_samples = N below equation (3) in the paper. + self.num_tau_samples = num_tau_samples + # num_tau_prime_samples = N' below equation (3) in the paper. + self.num_tau_prime_samples = num_tau_prime_samples + # num_quantile_samples = k below equation (3) in the paper. + self.num_quantile_samples = num_quantile_samples + # quantile_embedding_dim = n above equation (4) in the paper. + self.quantile_embedding_dim = quantile_embedding_dim + # option to perform double dqn. + self.double_dqn = double_dqn + + super(ImplicitQuantileAgent, self).__init__( + sess=sess, + num_actions=num_actions, + summary_writer=summary_writer, + summary_writing_frequency=summary_writing_frequency) + + def _get_network_type(self): + """Returns the type of the outputs of the implicit quantile network. + + Returns: + _network_type object defining the outputs of the network. + """ + return collections.namedtuple( + 'iqn_network', ['quantile_values', 'quantiles']) + + def _network_template(self, state, num_quantiles): + r"""Builds an Implicit Quantile ConvNet. + + Takes state and quantile as inputs and outputs state-action quantile values. + + Args: + state: A `tf.placeholder` for the RL state. + num_quantiles: int, number of quantile inputs. + + Returns: + _network_type object containing quantile value outputs of the network. + """ + + weights_initializer = slim.variance_scaling_initializer( + factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True) + + state_net = tf.cast(state, tf.float32) + state_net = tf.div(state_net, 255.) 
+ state_net = slim.conv2d( + state_net, 32, [8, 8], stride=4, + weights_initializer=weights_initializer) + state_net = slim.conv2d( + state_net, 64, [4, 4], stride=2, + weights_initializer=weights_initializer) + state_net = slim.conv2d( + state_net, 64, [3, 3], stride=1, + weights_initializer=weights_initializer) + state_net = slim.flatten(state_net) + state_net_size = state_net.get_shape().as_list()[-1] + state_net_tiled = tf.tile(state_net, [num_quantiles, 1]) + + batch_size = state_net.get_shape().as_list()[0] + quantiles_shape = [num_quantiles * batch_size, 1] + quantiles = tf.random_uniform( + quantiles_shape, minval=0, maxval=1, dtype=tf.float32) + + quantile_net = tf.tile(quantiles, [1, self.quantile_embedding_dim]) + pi = tf.constant(math.pi) + quantile_net = tf.cast(tf.range( + 1, self.quantile_embedding_dim + 1, 1), tf.float32) * pi * quantile_net + quantile_net = tf.cos(quantile_net) + quantile_net = slim.fully_connected(quantile_net, state_net_size, + weights_initializer=weights_initializer) + # Hadamard product. + net = tf.multiply(state_net_tiled, quantile_net) + + net = slim.fully_connected( + net, 512, weights_initializer=weights_initializer) + quantile_values = slim.fully_connected( + net, + self.num_actions, + activation_fn=None, + weights_initializer=weights_initializer) + + return self._get_network_type()(quantile_values=quantile_values, + quantiles=quantiles) + + def _build_networks(self): + """Builds the IQN computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's quantile values. + self.target_convnet: For computing the next state's target quantile + values. + self._net_outputs: The actual quantile values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' quantile values. + self._replay_next_target_net_outputs: The replayed next states' target + quantile values. + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + + # Compute the Q-values which are used for action selection in the current + # state. + self._net_outputs = self.online_convnet(self.state_ph, + self.num_quantile_samples) + # Shape of self._net_outputs.quantile_values: + # num_quantile_samples x num_actions. + # e.g. if num_actions is 2, it might look something like this: + # Vals for Quantile .2 Vals for Quantile .4 Vals for Quantile .6 + # [[0.1, 0.5], [0.15, -0.3], [0.15, -0.2]] + # Q-values = [(0.1 + 0.15 + 0.15)/3, (0.5 + 0.15 + -0.2)/3]. + self._q_values = tf.reduce_mean(self._net_outputs.quantile_values, axis=0) + self._q_argmax = tf.argmax(self._q_values, axis=0) + + self._replay_net_outputs = self.online_convnet(self._replay.states, + self.num_tau_samples) + # Shape: (num_tau_samples x batch_size) x num_actions. + self._replay_net_quantile_values = self._replay_net_outputs.quantile_values + self._replay_net_quantiles = self._replay_net_outputs.quantiles + + # Do the same for next states in the replay buffer. + self._replay_net_target_outputs = self.target_convnet( + self._replay.next_states, self.num_tau_prime_samples) + # Shape: (num_tau_prime_samples x batch_size) x num_actions. 
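A NumPy sketch of the cosine quantile embedding built in `_network_template` above, where each sampled quantile tau is expanded to cos(pi * j * tau) for j = 1..quantile_embedding_dim before the dense layer and the Hadamard product with the state features; the names here are illustrative:

import numpy as np

def quantile_embedding(taus, embedding_dim=64):
    # taus: [num_taus] values in [0, 1). Returns [num_taus, embedding_dim].
    j = np.arange(1, embedding_dim + 1, dtype=np.float32)
    return np.cos(np.pi * j[None, :] * taus[:, None])

taus = np.random.uniform(size=8).astype(np.float32)
phi = quantile_embedding(taus)   # then: dense layer to the conv-feature width,
                                 # elementwise product with the tiled state features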
+ vals = self._replay_net_target_outputs.quantile_values + self._replay_net_target_quantile_values = vals + + # Compute Q-values which are used for action selection for the next states + # in the replay buffer. Compute the argmax over the Q-values. + if self.double_dqn: + outputs_action = self.online_convnet(self._replay.next_states, + self.num_quantile_samples) + else: + outputs_action = self.target_convnet(self._replay.next_states, + self.num_quantile_samples) + + # Shape: (num_quantile_samples x batch_size) x num_actions. + target_quantile_values_action = outputs_action.quantile_values + # Shape: num_quantile_samples x batch_size x num_actions. + target_quantile_values_action = tf.reshape(target_quantile_values_action, + [self.num_quantile_samples, + self._replay.batch_size, + self.num_actions]) + # Shape: batch_size x num_actions. + self._replay_net_target_q_values = tf.squeeze(tf.reduce_mean( + target_quantile_values_action, axis=0)) + self._replay_next_qt_argmax = tf.argmax( + self._replay_net_target_q_values, axis=1) + + def _build_target_quantile_values_op(self): + """Build an op used as a target for return values at given quantiles. + + Returns: + An op calculating the target quantile return. + """ + batch_size = tf.shape(self._replay.rewards)[0] + # Shape of rewards: (num_tau_prime_samples x batch_size) x 1. + rewards = self._replay.rewards[:, None] + rewards = tf.tile(rewards, [self.num_tau_prime_samples, 1]) + + is_terminal_multiplier = 1. - tf.to_float(self._replay.terminals) + # Incorporate terminal state to discount factor. + # size of gamma_with_terminal: (num_tau_prime_samples x batch_size) x 1. + gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier + gamma_with_terminal = tf.tile(gamma_with_terminal[:, None], + [self.num_tau_prime_samples, 1]) + + # Get the indices of the maximium Q-value across the action dimension. + # Shape of replay_next_qt_argmax: (num_tau_prime_samples x batch_size) x 1. + + replay_next_qt_argmax = tf.tile( + self._replay_next_qt_argmax[:, None], [self.num_tau_prime_samples, 1]) + + # Shape of batch_indices: (num_tau_prime_samples x batch_size) x 1. + batch_indices = tf.cast(tf.range( + self.num_tau_prime_samples * batch_size)[:, None], tf.int64) + + # Shape of batch_indexed_target_values: + # (num_tau_prime_samples x batch_size) x 2. + batch_indexed_target_values = tf.concat( + [batch_indices, replay_next_qt_argmax], axis=1) + + # Shape of next_target_values: (num_tau_prime_samples x batch_size) x 1. + target_quantile_values = tf.gather_nd( + self._replay_net_target_quantile_values, + batch_indexed_target_values)[:, None] + + return rewards + gamma_with_terminal * target_quantile_values + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + batch_size = tf.shape(self._replay.rewards)[0] + + target_quantile_values = tf.stop_gradient( + self._build_target_quantile_values_op()) + # Reshape to self.num_tau_prime_samples x batch_size x 1 since this is + # the manner in which the target_quantile_values are tiled. + target_quantile_values = tf.reshape(target_quantile_values, + [self.num_tau_prime_samples, + batch_size, 1]) + # Transpose dimensions so that the dimensionality is batch_size x + # self.num_tau_prime_samples x 1 to prepare for computation of + # Bellman errors. + # Final shape of target_quantile_values: + # batch_size x num_tau_prime_samples x 1. 
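A NumPy sketch of `_build_target_quantile_values_op` above: rewards and discounts are tiled across the num_tau_prime_samples target quantile samples, and the target network's quantile values are gathered at the greedy next action. Argument names and shapes follow the shape comments above, but the helper itself is illustrative:

import numpy as np

def target_quantile_values(rewards, terminals, target_quantiles, next_q_argmax,
                           cumulative_gamma, num_tau_prime_samples):
    # rewards, terminals, next_q_argmax: [batch];
    # target_quantiles: [(num_tau_prime_samples * batch), num_actions].
    batch_size = rewards.shape[0]
    r = np.tile(rewards[:, None], (num_tau_prime_samples, 1))            # [N'*B, 1]
    gamma = cumulative_gamma * (1.0 - terminals.astype(np.float32))
    gamma = np.tile(gamma[:, None], (num_tau_prime_samples, 1))          # [N'*B, 1]
    actions = np.tile(next_q_argmax, num_tau_prime_samples)              # [N'*B]
    rows = np.arange(num_tau_prime_samples * batch_size)
    picked = target_quantiles[rows, actions][:, None]                    # [N'*B, 1]
    return r + gamma * picked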
+ target_quantile_values = tf.transpose(target_quantile_values, [1, 0, 2]) + + # Shape of indices: (num_tau_samples x batch_size) x 1. + # Expand dimension by one so that it can be used to index into all the + # quantiles when using the tf.gather_nd function (see below). + indices = tf.range(self.num_tau_samples * batch_size)[:, None] + + # Expand the dimension by one so that it can be used to index into all the + # quantiles when using the tf.gather_nd function (see below). + reshaped_actions = self._replay.actions[:, None] + reshaped_actions = tf.tile(reshaped_actions, [self.num_tau_samples, 1]) + # Shape of reshaped_actions: (num_tau_samples x batch_size) x 2. + reshaped_actions = tf.concat([indices, reshaped_actions], axis=1) + + chosen_action_quantile_values = tf.gather_nd( + self._replay_net_quantile_values, reshaped_actions) + # Reshape to self.num_tau_samples x batch_size x 1 since this is the manner + # in which the quantile values are tiled. + chosen_action_quantile_values = tf.reshape(chosen_action_quantile_values, + [self.num_tau_samples, + batch_size, 1]) + # Transpose dimensions so that the dimensionality is batch_size x + # self.num_tau_samples x 1 to prepare for computation of + # Bellman errors. + # Final shape of chosen_action_quantile_values: + # batch_size x num_tau_samples x 1. + chosen_action_quantile_values = tf.transpose( + chosen_action_quantile_values, [1, 0, 2]) + + # Shape of bellman_erors and huber_loss: + # batch_size x num_tau_prime_samples x num_tau_samples x 1. + bellman_errors = target_quantile_values[ + :, :, None, :] - chosen_action_quantile_values[:, None, :, :] + # The huber loss (see Section 2.3 of the paper) is defined via two cases: + # case_one: |bellman_errors| <= kappa + # case_two: |bellman_errors| > kappa + huber_loss_case_one = tf.to_float( + tf.abs(bellman_errors) <= self.kappa) * 0.5 * bellman_errors ** 2 + huber_loss_case_two = tf.to_float( + tf.abs(bellman_errors) > self.kappa) * self.kappa * ( + tf.abs(bellman_errors) - 0.5 * self.kappa) + huber_loss = huber_loss_case_one + huber_loss_case_two + + # Reshape replay_quantiles to batch_size x num_tau_samples x 1 + replay_quantiles = tf.reshape( + self._replay_net_quantiles, [self.num_tau_samples, batch_size, 1]) + replay_quantiles = tf.transpose(replay_quantiles, [1, 0, 2]) + + # Tile by num_tau_prime_samples along a new dimension. Shape is now + # batch_size x num_tau_prime_samples x num_tau_samples x 1. + # These quantiles will be used for computation of the quantile huber loss + # below (see section 2.3 of the paper). + replay_quantiles = tf.to_float(tf.tile( + replay_quantiles[:, None, :, :], [1, self.num_tau_prime_samples, 1, 1])) + # Shape: batch_size x num_tau_prime_samples x num_tau_samples x 1. + quantile_huber_loss = (tf.abs(replay_quantiles - tf.stop_gradient( + tf.to_float(bellman_errors < 0))) * huber_loss) / self.kappa + # Sum over current quantile value (num_tau_samples) dimension, + # average over target quantile value (num_tau_prime_samples) dimension. + # Shape: batch_size x num_tau_prime_samples x 1. + loss = tf.reduce_sum(quantile_huber_loss, axis=2) + # Shape: batch_size x 1. + loss = tf.reduce_mean(loss, axis=1) + + # TODO(kumasaurabh): Add prioritized replay functionality here. 
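The loss assembled above is the quantile Huber loss of Dabney et al. (2018): Huber errors between every pair of target and online quantile samples, weighted by |tau - 1{error < 0}|, summed over the online samples and averaged over the target samples and the batch. A NumPy sketch with dense arrays; the names are illustrative:

import numpy as np

def quantile_huber_loss(target_q, chosen_q, taus, kappa=1.0):
    # target_q: [batch, N'], chosen_q: [batch, N], taus: [batch, N].
    u = target_q[:, :, None] - chosen_q[:, None, :]          # [batch, N', N]
    abs_u = np.abs(u)
    huber = np.where(abs_u <= kappa,
                     0.5 * u ** 2,
                     kappa * (abs_u - 0.5 * kappa))
    weight = np.abs(taus[:, None, :] - (u < 0).astype(np.float32))
    per_example = (weight * huber / kappa).sum(axis=2).mean(axis=1)
    return per_example.mean()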
+ update_priorities_op = tf.no_op() + with tf.control_dependencies([update_priorities_op]): + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss)) + return self.optimizer.minimize(tf.reduce_mean(loss)), tf.reduce_mean(loss) diff --git a/dopamine/dopamine/agents/implicit_quantilerpg/__init__.py b/dopamine/dopamine/agents/implicit_quantilerpg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantilerpg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/implicit_quantilerpg/configs/implicit_quantilerpg.gin b/dopamine/dopamine/agents/implicit_quantilerpg/configs/implicit_quantilerpg.gin new file mode 100644 index 0000000..06ebcdf --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantilerpg/configs/implicit_quantilerpg.gin @@ -0,0 +1,41 @@ +# Hyperparameters follow Dabney et al. (2018), but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. + +import dopamine.agents.implicit_quantilerpg.implicit_quantilerpg_agent +import dopamine.agents.rainbow.rainbow_agent +import dopamine.agents.rainbowrpg.rainbowrpg_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +ImplicitQuantileRPGAgent.kappa = 1.0 +ImplicitQuantileRPGAgent.num_tau_samples = 64 +ImplicitQuantileRPGAgent.num_tau_prime_samples = 64 +ImplicitQuantileRPGAgent.num_quantile_samples = 32 +RainbowRPGAgent.gamma = 0.99 +RainbowRPGAgent.update_horizon = 3 +RainbowRPGAgent.min_replay_history = 20000 # agent steps +RainbowRPGAgent.update_period = 4 +RainbowRPGAgent.target_update_period = 8000 # agent steps +RainbowRPGAgent.epsilon_train = 0.01 +RainbowRPGAgent.epsilon_eval = 0.001 +RainbowRPGAgent.epsilon_decay_period = 250000 # agent steps +# IQN currently does not support prioritized replay. +RainbowRPGAgent.replay_scheme = 'uniform' +RainbowRPGAgent.tf_device = '/gpu:0' # '/cpu:*' use for non-GPU version +RainbowRPGAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +Runner.game_name = 'Breakout' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False +Runner.num_iterations = 15 +Runner.training_steps = 250000 +Runner.evaluation_steps = 125000 +Runner.max_steps_per_episode = 27000 + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/implicit_quantilerpg/implicit_quantilerpg_agent.py b/dopamine/dopamine/agents/implicit_quantilerpg/implicit_quantilerpg_agent.py new file mode 100644 index 0000000..2e8140a --- /dev/null +++ b/dopamine/dopamine/agents/implicit_quantilerpg/implicit_quantilerpg_agent.py @@ -0,0 +1,431 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The implicit quantile networks (IQN) agent. + +The agent follows the description given in "Implicit Quantile Networks for +Distributional RL" (Dabney et. al, 2018). +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +import math +from dopamine.replay_memory import circular_replay_buffer +from dopamine.agents.rainbow import rainbow_agent +from dopamine.agents.rainbowrpg import rainbowrpg_agent +import numpy as np +import tensorflow as tf + +import gin.tf +from dopamine.agents.agent_utils import * + +slim = tf.contrib.slim + + +@gin.configurable +class ImplicitQuantileRPGAgent(rainbowrpg_agent.RainbowRPGAgent): + """An extension of Rainbow to perform implicit quantile regression.""" + + def __init__(self, + sess, + num_actions, + kappa=1.0, + num_tau_samples=32, + num_tau_prime_samples=32, + num_quantile_samples=32, + quantile_embedding_dim=64, + double_dqn=False, + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the Graph. + + Most of this constructor's parameters are IQN-specific hyperparameters whose + values are taken from Dabney et al. (2018). + + Args: + sess: `tf.Session` object for running associated ops. + num_actions: int, number of actions the agent can take at any state. + kappa: float, Huber loss cutoff. + num_tau_samples: int, number of online quantile samples for loss + estimation. + num_tau_prime_samples: int, number of target quantile samples for loss + estimation. + num_quantile_samples: int, number of quantile samples for computing + Q-values. + quantile_embedding_dim: int, embedding dimension for the quantile input. + double_dqn: boolean, whether to perform double DQN style learning + as described in Van Hasselt et al.: https://arxiv.org/abs/1509.06461. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. + """ + self.kappa = kappa + # num_tau_samples = N below equation (3) in the paper. + self.num_tau_samples = num_tau_samples + # num_tau_prime_samples = N' below equation (3) in the paper. 
+ self.num_tau_prime_samples = num_tau_prime_samples + # num_quantile_samples = k below equation (3) in the paper. + self.num_quantile_samples = num_quantile_samples + # quantile_embedding_dim = n above equation (4) in the paper. + self.quantile_embedding_dim = quantile_embedding_dim + # option to perform double dqn. + self.double_dqn = double_dqn + + super(ImplicitQuantileRPGAgent, self).__init__( + sess=sess, + num_actions=num_actions, + summary_writer=summary_writer, + summary_writing_frequency=summary_writing_frequency) + + self.start_training = 1000 + + + def _get_network_type_rpg(self): + """Returns the type of the outputs of a value distribution network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('c51_network', + ['q_values', 'logits', 'probabilities']) + + def _network_template_rpg(self, state): + """Builds a convolutional network that outputs Q-value distributions. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + weights_initializer = slim.variance_scaling_initializer( + factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True) + + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) + net = slim.conv2d( + net, 32, [8, 8], stride=4, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [4, 4], stride=2, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [3, 3], stride=1, weights_initializer=weights_initializer) + net = slim.flatten(net) + net = slim.fully_connected( + net, 512, weights_initializer=weights_initializer) + net = slim.fully_connected( + net, + self.num_actions * self._num_atoms, + activation_fn=None, + weights_initializer=weights_initializer) + + logits = tf.reshape(net, [-1, self.num_actions, self._num_atoms]) + probabilities = tf.contrib.layers.softmax(logits) + q_values = tf.reduce_sum(self._support * probabilities, axis=2) + return self._get_network_type_rpg()(q_values, logits, probabilities) + + def _build_networks_rpg(self): + # RPG learning net. + self.rpg_convnet = tf.make_template('RPG', self._network_template_rpg) + self._rpg_net_outputs = self.rpg_convnet(self.state_ph) + self._q_argmax_rpg = tf.argmax(self._rpg_net_outputs.q_values, axis=1)[0] + self._replay_rpg_net_outputs = self.rpg_convnet(self._replay_opt.states) + + def _build_train_op_rpg(self): + # RPG loss + replay_action_one_hot = tf.one_hot( + self._replay_opt.actions, self.num_actions, 1., 0., name='action_one_hot_rpg') + replay_chosen_q = tf.reduce_sum( + self._replay_rpg_net_outputs.q_values * replay_action_one_hot, + reduction_indices=1, + name='replay_chosen_q_rpg') + margin = 1 + qvalue = self._replay_rpg_net_outputs.q_values + # debug self.temp_action_one_hot = replay_action_one_hot + self.temp_qvalue = qvalue + self.temp1 = (qvalue + margin) * (1 - replay_action_one_hot) + qvalue * replay_action_one_hot + self.temp2 = -(tf.reshape(replay_chosen_q, [-1, 1]) * tf.ones([1, self.num_actions])) \ + * ((1 - replay_action_one_hot) + (replay_action_one_hot)) + self.hingeloss = tf.maximum(0.0, self.temp1 + self.temp2) + rpg_loss = tf.reduce_mean(self.hingeloss) + return self.optimizer_rpg.minimize(rpg_loss) + + def _get_network_type(self): + """Returns the type of the outputs of the implicit quantile network. + + Returns: + _network_type object defining the outputs of the network. 
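The RPG head defined in `_network_template_rpg` above is C51-style: per-action logits over `_num_atoms` support points are softmaxed into a distribution and collapsed to a Q-value as the expectation over the fixed support. A NumPy sketch of that final reduction; the helper name is illustrative:

import numpy as np

def c51_q_values(logits, support):
    # logits: [batch, num_actions, num_atoms]; support: [num_atoms].
    z = logits - logits.max(axis=-1, keepdims=True)
    probs = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
    return (support * probs).sum(axis=-1)    # [batch, num_actions]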
+ """ + return collections.namedtuple( + 'iqn_network', ['quantile_values', 'quantiles']) + + def _network_template(self, state, num_quantiles): + r"""Builds an Implicit Quantile ConvNet. + + Takes state and quantile as inputs and outputs state-action quantile values. + + Args: + state: A `tf.placeholder` for the RL state. + num_quantiles: int, number of quantile inputs. + + Returns: + _network_type object containing quantile value outputs of the network. + """ + + weights_initializer = slim.variance_scaling_initializer( + factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True) + + state_net = tf.cast(state, tf.float32) + state_net = tf.div(state_net, 255.) + state_net = slim.conv2d( + state_net, 32, [8, 8], stride=4, + weights_initializer=weights_initializer) + state_net = slim.conv2d( + state_net, 64, [4, 4], stride=2, + weights_initializer=weights_initializer) + state_net = slim.conv2d( + state_net, 64, [3, 3], stride=1, + weights_initializer=weights_initializer) + state_net = slim.flatten(state_net) + state_net_size = state_net.get_shape().as_list()[-1] + state_net_tiled = tf.tile(state_net, [num_quantiles, 1]) + + batch_size = state_net.get_shape().as_list()[0] + quantiles_shape = [num_quantiles * batch_size, 1] + quantiles = tf.random_uniform( + quantiles_shape, minval=0, maxval=1, dtype=tf.float32) + + quantile_net = tf.tile(quantiles, [1, self.quantile_embedding_dim]) + pi = tf.constant(math.pi) + quantile_net = tf.cast(tf.range( + 1, self.quantile_embedding_dim + 1, 1), tf.float32) * pi * quantile_net + quantile_net = tf.cos(quantile_net) + quantile_net = slim.fully_connected(quantile_net, state_net_size, + weights_initializer=weights_initializer) + # Hadamard product. + net = tf.multiply(state_net_tiled, quantile_net) + + net = slim.fully_connected( + net, 512, weights_initializer=weights_initializer) + quantile_values = slim.fully_connected( + net, + self.num_actions, + activation_fn=None, + weights_initializer=weights_initializer) + + return self._get_network_type()(quantile_values=quantile_values, + quantiles=quantiles) + + def _build_networks(self): + """Builds the IQN computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's quantile values. + self.target_convnet: For computing the next state's target quantile + values. + self._net_outputs: The actual quantile values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' quantile values. + self._replay_next_target_net_outputs: The replayed next states' target + quantile values. + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + + # Compute the Q-values which are used for action selection in the current + # state. + self._net_outputs = self.online_convnet(self.state_ph, + self.num_quantile_samples) + # Shape of self._net_outputs.quantile_values: + # num_quantile_samples x num_actions. + # e.g. if num_actions is 2, it might look something like this: + # Vals for Quantile .2 Vals for Quantile .4 Vals for Quantile .6 + # [[0.1, 0.5], [0.15, -0.3], [0.15, -0.2]] + # Q-values = [(0.1 + 0.15 + 0.15)/3, (0.5 + 0.15 + -0.2)/3]. 
+ self._q_values = tf.reduce_mean(self._net_outputs.quantile_values, axis=0) + self._q_argmax = tf.argmax(self._q_values, axis=0) + + self._replay_net_outputs = self.online_convnet(self._replay.states, + self.num_tau_samples) + # Shape: (num_tau_samples x batch_size) x num_actions. + self._replay_net_quantile_values = self._replay_net_outputs.quantile_values + self._replay_net_quantiles = self._replay_net_outputs.quantiles + + # Do the same for next states in the replay buffer. + self._replay_net_target_outputs = self.target_convnet( + self._replay.next_states, self.num_tau_prime_samples) + # Shape: (num_tau_prime_samples x batch_size) x num_actions. + vals = self._replay_net_target_outputs.quantile_values + self._replay_net_target_quantile_values = vals + + # Compute Q-values which are used for action selection for the next states + # in the replay buffer. Compute the argmax over the Q-values. + if self.double_dqn: + outputs_action = self.online_convnet(self._replay.next_states, + self.num_quantile_samples) + else: + outputs_action = self.target_convnet(self._replay.next_states, + self.num_quantile_samples) + + # Shape: (num_quantile_samples x batch_size) x num_actions. + target_quantile_values_action = outputs_action.quantile_values + # Shape: num_quantile_samples x batch_size x num_actions. + target_quantile_values_action = tf.reshape(target_quantile_values_action, + [self.num_quantile_samples, + self._replay.batch_size, + self.num_actions]) + # Shape: batch_size x num_actions. + self._replay_net_target_q_values = tf.squeeze(tf.reduce_mean( + target_quantile_values_action, axis=0)) + self._replay_next_qt_argmax = tf.argmax( + self._replay_net_target_q_values, axis=1) + + def _build_target_quantile_values_op(self): + """Build an op used as a target for return values at given quantiles. + + Returns: + An op calculating the target quantile return. + """ + batch_size = tf.shape(self._replay.rewards)[0] + # Shape of rewards: (num_tau_prime_samples x batch_size) x 1. + rewards = self._replay.rewards[:, None] + rewards = tf.tile(rewards, [self.num_tau_prime_samples, 1]) + + is_terminal_multiplier = 1. - tf.to_float(self._replay.terminals) + # Incorporate terminal state to discount factor. + # size of gamma_with_terminal: (num_tau_prime_samples x batch_size) x 1. + gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier + gamma_with_terminal = tf.tile(gamma_with_terminal[:, None], + [self.num_tau_prime_samples, 1]) + + # Get the indices of the maximium Q-value across the action dimension. + # Shape of replay_next_qt_argmax: (num_tau_prime_samples x batch_size) x 1. + + replay_next_qt_argmax = tf.tile( + self._replay_next_qt_argmax[:, None], [self.num_tau_prime_samples, 1]) + + # Shape of batch_indices: (num_tau_prime_samples x batch_size) x 1. + batch_indices = tf.cast(tf.range( + self.num_tau_prime_samples * batch_size)[:, None], tf.int64) + + # Shape of batch_indexed_target_values: + # (num_tau_prime_samples x batch_size) x 2. + batch_indexed_target_values = tf.concat( + [batch_indices, replay_next_qt_argmax], axis=1) + + # Shape of next_target_values: (num_tau_prime_samples x batch_size) x 1. + target_quantile_values = tf.gather_nd( + self._replay_net_target_quantile_values, + batch_indexed_target_values)[:, None] + + return rewards + gamma_with_terminal * target_quantile_values + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. 
+ """ + batch_size = tf.shape(self._replay.rewards)[0] + + target_quantile_values = tf.stop_gradient( + self._build_target_quantile_values_op()) + # Reshape to self.num_tau_prime_samples x batch_size x 1 since this is + # the manner in which the target_quantile_values are tiled. + target_quantile_values = tf.reshape(target_quantile_values, + [self.num_tau_prime_samples, + batch_size, 1]) + # Transpose dimensions so that the dimensionality is batch_size x + # self.num_tau_prime_samples x 1 to prepare for computation of + # Bellman errors. + # Final shape of target_quantile_values: + # batch_size x num_tau_prime_samples x 1. + target_quantile_values = tf.transpose(target_quantile_values, [1, 0, 2]) + + # Shape of indices: (num_tau_samples x batch_size) x 1. + # Expand dimension by one so that it can be used to index into all the + # quantiles when using the tf.gather_nd function (see below). + indices = tf.range(self.num_tau_samples * batch_size)[:, None] + + # Expand the dimension by one so that it can be used to index into all the + # quantiles when using the tf.gather_nd function (see below). + reshaped_actions = self._replay.actions[:, None] + reshaped_actions = tf.tile(reshaped_actions, [self.num_tau_samples, 1]) + # Shape of reshaped_actions: (num_tau_samples x batch_size) x 2. + reshaped_actions = tf.concat([indices, reshaped_actions], axis=1) + + chosen_action_quantile_values = tf.gather_nd( + self._replay_net_quantile_values, reshaped_actions) + # Reshape to self.num_tau_samples x batch_size x 1 since this is the manner + # in which the quantile values are tiled. + chosen_action_quantile_values = tf.reshape(chosen_action_quantile_values, + [self.num_tau_samples, + batch_size, 1]) + # Transpose dimensions so that the dimensionality is batch_size x + # self.num_tau_samples x 1 to prepare for computation of + # Bellman errors. + # Final shape of chosen_action_quantile_values: + # batch_size x num_tau_samples x 1. + chosen_action_quantile_values = tf.transpose( + chosen_action_quantile_values, [1, 0, 2]) + + # Shape of bellman_erors and huber_loss: + # batch_size x num_tau_prime_samples x num_tau_samples x 1. + bellman_errors = target_quantile_values[ + :, :, None, :] - chosen_action_quantile_values[:, None, :, :] + # The huber loss (see Section 2.3 of the paper) is defined via two cases: + # case_one: |bellman_errors| <= kappa + # case_two: |bellman_errors| > kappa + huber_loss_case_one = tf.to_float( + tf.abs(bellman_errors) <= self.kappa) * 0.5 * bellman_errors ** 2 + huber_loss_case_two = tf.to_float( + tf.abs(bellman_errors) > self.kappa) * self.kappa * ( + tf.abs(bellman_errors) - 0.5 * self.kappa) + huber_loss = huber_loss_case_one + huber_loss_case_two + + # Reshape replay_quantiles to batch_size x num_tau_samples x 1 + replay_quantiles = tf.reshape( + self._replay_net_quantiles, [self.num_tau_samples, batch_size, 1]) + replay_quantiles = tf.transpose(replay_quantiles, [1, 0, 2]) + + # Tile by num_tau_prime_samples along a new dimension. Shape is now + # batch_size x num_tau_prime_samples x num_tau_samples x 1. + # These quantiles will be used for computation of the quantile huber loss + # below (see section 2.3 of the paper). + replay_quantiles = tf.to_float(tf.tile( + replay_quantiles[:, None, :, :], [1, self.num_tau_prime_samples, 1, 1])) + # Shape: batch_size x num_tau_prime_samples x num_tau_samples x 1. 
+ quantile_huber_loss = (tf.abs(replay_quantiles - tf.stop_gradient( + tf.to_float(bellman_errors < 0))) * huber_loss) / self.kappa + # Sum over current quantile value (num_tau_samples) dimension, + # average over target quantile value (num_tau_prime_samples) dimension. + # Shape: batch_size x num_tau_prime_samples x 1. + loss = tf.reduce_sum(quantile_huber_loss, axis=2) + # Shape: batch_size x 1. + loss = tf.reduce_mean(loss, axis=1) + + # TODO(kumasaurabh): Add prioritized replay functionality here. + update_priorities_op = tf.no_op() + with tf.control_dependencies([update_priorities_op]): + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss)) + return self.optimizer.minimize(tf.reduce_mean(loss)), tf.reduce_mean(loss) diff --git a/dopamine/dopamine/agents/lpg/__init__.py b/dopamine/dopamine/agents/lpg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/lpg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/lpg/configs/lpg.gin b/dopamine/dopamine/agents/lpg/configs/lpg.gin new file mode 100644 index 0000000..532856e --- /dev/null +++ b/dopamine/dopamine/agents/lpg/configs/lpg.gin @@ -0,0 +1,36 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.lpg.lpg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +LPGAgent.gamma = 0.99 +LPGAgent.game_name = 'Pong' # Boxing, Pong +LPGAgent.update_horizon = 1 +LPGAgent.min_replay_history = 200000 # agent steps, step more than this, stop exploration. +LPGAgent.update_period = 4 +LPGAgent.epsilon_train = 0.0001 +LPGAgent.epsilon_eval = 0 +LPGAgent.epsilon_decay_period = 250000 # agent steps +LPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +LPGAgent.optimizer = @tf.train.RMSPropOptimizer() +LPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' # Boxing, Pong +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False
+Runner.num_iterations = 15
+Runner.training_steps = 250000 # agent steps
+Runner.evaluation_steps = 10000 # agent steps
+Runner.max_steps_per_episode = 27000 # agent steps
+
+WrappedReplayBuffer.replay_capacity = 1000000
+WrappedReplayBuffer.batch_size = 512
diff --git a/dopamine/dopamine/agents/lpg/lpg_agent.py b/dopamine/dopamine/agents/lpg/lpg_agent.py
new file mode 100644
index 0000000..f96d8e2
--- /dev/null
+++ b/dopamine/dopamine/agents/lpg/lpg_agent.py
@@ -0,0 +1,590 @@
+# coding=utf-8
+# Copyright 2018 The Dopamine Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Compact implementation of an LPG agent (adapted from the DQN agent)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+import os
+import random
+from dopamine.agents.agent_utils import *
+from dopamine.replay_memory import circular_replay_buffer
+import numpy as np
+import tensorflow as tf
+from tensorflow.distributions import Categorical
+
+import gin.tf
+from collections import deque
+
+slim = tf.contrib.slim
+
+NATURE_DQN_OBSERVATION_SHAPE = (84, 84)  # Size of downscaled Atari 2600 frame.
+NATURE_DQN_DTYPE = tf.uint8  # DType of Atari 2600 observations.
+NATURE_DQN_STACK_SIZE = 4  # Number of frames in the state stack.
+
+
+def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
+  """Returns the current epsilon for the agent's epsilon-greedy policy.
+
+  This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et
+  al., 2015). The schedule is as follows:
+    Begin at 1. until warmup_steps steps have been taken; then
+    Linearly decay epsilon from 1. to epsilon in decay_period steps; and then
+    Use epsilon from there on.
+
+  Args:
+    decay_period: float, the period over which epsilon is decayed.
+    step: int, the number of training steps completed so far.
+    warmup_steps: int, the number of steps taken before epsilon is decayed.
+    epsilon: float, the final value to which to decay the epsilon parameter.
+
+  Returns:
+    A float, the current epsilon value computed according to the schedule.
+  """
+  steps_left = decay_period + warmup_steps - step
+  bonus = (1.0 - epsilon) * steps_left / decay_period
+  bonus = np.clip(bonus, 0., 1.
- epsilon) + return epsilon + bonus + + +@gin.configurable +class LPGAgent(object): + """An implementation of the DQN agent.""" + + def __init__(self, + sess, + num_actions, + game_name="Pong", + observation_shape=NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=NATURE_DQN_DTYPE, + stack_size=NATURE_DQN_STACK_SIZE, + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + epsilon_fn=linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + margin=1, + tf_device='/cpu:*', + use_staging=True, + max_tf_checkpoints_to_keep=3, + optimizer=tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints describing the observation shape. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to + keep. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. 
+ """ + assert isinstance(observation_shape, tuple) + tf.logging.info('Creating %s agent with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t gamma: %f', gamma) + tf.logging.info('\t update_horizon: %f', update_horizon) + tf.logging.info('\t min_replay_history: %d', min_replay_history) + tf.logging.info('\t update_period: %d', update_period) + # tf.logging.info('\t random_seed: %d', random_seed) + tf.logging.info('\t epsilon_train: %f', epsilon_train) + tf.logging.info('\t epsilon_eval: %f', epsilon_eval) + tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period) + tf.logging.info('\t tf_device: %s', tf_device) + tf.logging.info('\t use_staging: %s', use_staging) + tf.logging.info('\t optimizer: %s', optimizer) + tf.logging.info('\t game: %s', game_name) + + self.game_name = game_name + self.num_actions = num_actions + self.observation_shape = tuple(observation_shape) + self.observation_dtype = observation_dtype + self.stack_size = stack_size + self.gamma = gamma + self.update_horizon = update_horizon + self.cumulative_gamma = math.pow(gamma, update_horizon) + self.min_replay_history = min_replay_history + self.epsilon_fn = epsilon_fn + self.epsilon_train = epsilon_train + self.epsilon_eval = epsilon_eval + self.epsilon_decay_period = epsilon_decay_period + self.update_period = update_period + self.eval_mode = False + self.training_steps = 0 + self.optimizer = optimizer + self.summary_writer = summary_writer + self.summary_writing_frequency = summary_writing_frequency + self.margin = margin + self.start_training = 1000 + # todo task specific FOR PONG IS 1000 IF THIS IS TOO SMALL WE END UP WITH A DETERMINISTIC POLICY QUCKKLY + self.highest_reward = 6 # todo task specific + self.isPrinted = False + self.current_replay_size = 0 + self.epsilon_current = 1 + + with tf.device(tf_device): + # Create a placeholder for the state input to the DQN network. + # The last axis indicates the number of consecutive frames stacked. + state_shape = (1,) + self.observation_shape + (stack_size,) + self.state = np.zeros(state_shape) + self.state_ph = tf.placeholder(self.observation_dtype, state_shape, + name='state_ph') + self._replay = self._build_replay_buffer(use_staging) + + self._build_networks() + + self._train_op = self._build_train_op() + + self.replay_buffer = ReplayBufferRegular(100000) + + if self.summary_writer is not None: + # All tf.summaries should have been defined prior to running this. + self._merged_summaries = tf.summary.merge_all() + self._sess = sess + self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep) + + # Variables to be initialized by the agent once it interacts with the + # environment. + self._observation = None + self._last_observation = None + + def _get_network_type(self): + """Returns the type of the outputs of a Q value network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('DQN_network', ['q_values']) + + def _network_template(self, state): + """Builds the convolutional network used to compute the agent's Q-values. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) 
+ net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + + self._net_outputs = self.online_convnet(self.state_ph) + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + # treat self._net_outputs.q_values as logits + self.logsoftmaxprob = tf.nn.log_softmax(self._net_outputs.q_values) + self.sample = Categorical(logits=self.logsoftmaxprob).sample(1) + + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + + replay_action_one_hot = tf.one_hot( + self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') + logits = self._replay_net_outputs.q_values + self.logsoftmaxprob = tf.nn.log_softmax(logits) + self.neglogprob = - tf.reduce_sum(self.logsoftmaxprob * replay_action_one_hot, axis=1) + # self.temp_loss = self.neglogprob # * self.y_pl + loss = self.actor_loss = tf.reduce_mean(self.neglogprob) + self.replay_action_one_hot = replay_action_one_hot + + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('hingeLoss', loss) + return self.optimizer.minimize(loss) + + def begin_episode(self, observation): + """Returns the agent's first action for this episode. + + Args: + observation: numpy array, the environment's initial observation. + + Returns: + int, the selected action. + """ + self._reset_state() + self._record_observation(observation) + + if not self.eval_mode: + self._train_step() + + self.action = self._select_action() + return self.action + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. 
+
+    Args:
+      reward: float, the reward received from the agent's most recent action.
+      observation: numpy array, the most recent observation.
+
+    Returns:
+      int, the selected action.
+    """
+    self._last_observation = self._observation
+    self._record_observation(observation)
+
+    if not self.eval_mode:
+      # if reward < 0:
+      #   self.replay_buffer.clear()
+      # elif reward > 0:
+      #   self.replay_buffer.add(self._last_observation, self.action, reward, False)
+      #   while self.replay_buffer.size() > 0:
+      #     experience = self.replay_buffer.get_sample()
+      #     state, action, reward, _ = experience
+      #     self._store_transition(state, action, reward, False)
+      # else:
+      #   self.replay_buffer.add(self._last_observation, self.action, reward, False)
+      if self.game_name in ["Pong"]:
+        collect_trajectory(self, reward)
+      else:
+        raise ValueError("Trajectory collection is only implemented for Pong.")
+
+      self._train_step()
+
+    self.action = self._select_action()
+    if isinstance(self.action, np.ndarray):
+      pass
+    return self.action
+
+  def end_episode(self, reward):
+    """Signals the end of the episode to the agent.
+
+    We store the observation of the current time step, which is the last
+    observation of the episode.
+
+    Args:
+      reward: float, the last reward from the environment.
+    """
+    if not self.eval_mode:
+      if self.game_name in ["Pong"]:
+        collect_trajectory(self, reward)
+      else:
+        raise ValueError("Trajectory collection is only implemented for Pong.")
+
+  def _select_action_training(self):
+    """Samples an action from the softmax over Q-values (EPG-style) during training."""
+    return self._sess.run(self.sample, {self.state_ph: self.state})[0][0]
+
+  def _select_action(self):
+    """Select an action from the set of available actions.
+
+    Chooses an action randomly with probability self._calculate_epsilon(), and
+    otherwise acts greedily according to the current Q-value estimates.
+
+    Returns:
+      int, the selected action.
+    """
+    exploration = "Randomexplore"
+    if exploration == "EPG":
+      self.epsilon_current = 0
+      self.current_replay_size = self._replay.memory.add_count
+      if self.eval_mode:
+        return self._sess.run(self._q_argmax, {self.state_ph: self.state})
+      return self._select_action_training()
+
+    elif exploration == "Randomexplore":
+      # epsilon greedy explore.
+      # epsilon = self.epsilon_eval if self.eval_mode else self.epsilon_fn(
+      #     self.epsilon_decay_period,
+      #     self.training_steps,
+      #     self.min_replay_history,
+      #     self.epsilon_train)
+      if self.training_steps < self.min_replay_history:
+        epsilon = 1
+      else:
+        epsilon = self.epsilon_train
+      if self.eval_mode:
+        epsilon = self.epsilon_eval
+      self.epsilon_current = epsilon
+      self.current_replay_size = self._replay.memory.add_count
+      #
+      if random.random() <= epsilon:
+        # Choose a random action with probability epsilon.
+        return random.randint(0, self.num_actions - 1)
+      else:
+        # Choose the action with highest Q-value at the current state.
+        return self._sess.run(self._q_argmax, {self.state_ph: self.state})
+
+
+  def _train_step(self):
+    """Runs a single training step.
+
+    Runs a training op if both:
+      (1) A minimum number of frames have been added to the replay buffer.
+      (2) `training_steps` is a multiple of `update_period`.
+
+    Also, syncs weights from online to target network if training steps is a
+    multiple of target update period.
+    """
+    # Run a train op at the rate of self.update_period if enough training steps
+    # have been run. This matches the Nature DQN behaviour.
+
+    if self._replay.memory.add_count > self.start_training:
+      if self.training_steps % self.update_period == 0:
+
+        # debug checked.
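For intuition, the train op executed below minimizes the actor loss built in _build_train_op above: plain cross-entropy on the replayed actions, with the Q-head outputs treated as policy logits. A scalar sketch (illustrative only, not part of the committed file):

    import numpy as np
    logits = np.array([2.0, 0.0])                      # one state, two actions
    log_probs = logits - np.log(np.exp(logits).sum())  # log-softmax: [-0.127, -2.127]
    neg_log_prob = -log_probs[0]                       # replayed action was 0: ~0.127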
+ # _, neglogprob, logsoftmaxprob, \ + # actor_loss, replay_action_one_hot = self._sess.run([self._train_op, + # self.neglogprob, + # self.logsoftmaxprob, + # self.actor_loss, + # self.replay_action_one_hot]) + self._sess.run(self._train_op) + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + # if self.training_steps % self.target_update_period == 0: + # self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + if (self._replay.memory.add_count > self.start_training) and self.isPrinted is False: + print("start training at {}".format(self.training_steps)) + self.isPrinted = True + + def _record_observation(self, observation): + """Records an observation and update state. + + Extracts a frame from the observation vector and overwrites the oldest + frame in the state buffer. + + Args: + observation: numpy array, an observation from the environment. + """ + # Set current observation. We do the reshaping to handle environments + # without frame stacking. + observation = np.reshape(observation, self.observation_shape) + self._observation = observation[..., 0] + self._observation = np.reshape(observation, self.observation_shape) + # Swap out the oldest frame with the current frame. + self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. 
+ self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True + + +class ReplayBufferRegular(object): + """ for uniformly sampling. + + """ + + def __init__(self, buffer_size, random_seed=1234): + self.buffer_size = buffer_size + self.count = 0 + # Right side of deque contains newest experience + self.buffer = deque() + random.seed(random_seed) + self.ptr, self.path_start_idx = 0, 0 + + def add(self, state, action, reward, terminal): + experience = [state, action, reward, terminal] + assert self.count < self.buffer_size + self.buffer.append(experience) + self.count += 1 + self.ptr += 1 + # else: + # self.path_start_idx -= 1 + # self.ptr = self.buffer_size - 1 + # self.buffer.popleft() + # self.buffer.append(experience) + + def get_sample(self): + self.count -= 1 + return self.buffer.popleft() + + def size(self): + return self.count + + def clear(self): + self.buffer.clear() + self.count = 0 + self.ptr = 0 + self.path_start_idx = 0 diff --git a/dopamine/dopamine/agents/rainbow/__init__.py b/dopamine/dopamine/agents/rainbow/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/rainbow/configs/c51.gin b/dopamine/dopamine/agents/rainbow/configs/c51.gin new file mode 100644 index 0000000..a73d4ed --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/configs/c51.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow the settings from Bellemare et al. 
(2017), but we +# modify as necessary to match those used in Rainbow (Hessel et al., 2018), to +# ensure apples-to-apples comparison. +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 1 +RainbowAgent.min_replay_history = 20000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 250000 # agent steps +RainbowAgent.replay_scheme = 'uniform' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.00025 +tf.train.AdamOptimizer.epsilon = 0.0003125 + +Runner.game_name = 'Breakout' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = False +Runner.num_iterations = 30 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rainbow/configs/c51_icml.gin b/dopamine/dopamine/agents/rainbow/configs/c51_icml.gin new file mode 100644 index 0000000..b06aa7d --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/configs/c51_icml.gin @@ -0,0 +1,36 @@ +# Hyperparameters used in Bellemare et al. (2017). +import dopamine.atari.preprocessing +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 1 +RainbowAgent.min_replay_history = 50000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 10000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 1000000 # agent steps +RainbowAgent.replay_scheme = 'uniform' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.00025 +tf.train.AdamOptimizer.epsilon = 0.0003125 + +Runner.game_name = 'Pong' +# Deterministic ALE version used in the DQN Nature paper (Mnih et al., 2015). +Runner.sticky_actions = False +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +AtariPreprocessing.terminal_on_life_loss = True + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rainbow/configs/rainbow.gin b/dopamine/dopamine/agents/rainbow/configs/rainbow.gin new file mode 100644 index 0000000..6a4d92b --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/configs/rainbow.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow Hessel et al. (2018), except for sticky_actions, +# which was False (not using sticky actions) in the original paper. 
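Each binding in this file is applied by gin to the matching constructor argument when the config is parsed; a minimal sketch of loading it directly, assuming gin-config's standard gin.parse_config_file API (the repo's runner normally handles this step):

    import gin.tf
    gin.parse_config_file('dopamine/dopamine/agents/rainbow/configs/rainbow.gin')
    # After parsing, RainbowAgent(...) picks up num_atoms=51, update_horizon=3, etc.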
+import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 3 +RainbowAgent.min_replay_history = 20000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0 # changed by lkx +RainbowAgent.epsilon_decay_period = 250000 # agent steps +RainbowAgent.replay_scheme = 'prioritized' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +# Note these parameters are different from C51's. +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +Runner.game_name = 'Bowling' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = False # changed by lkx +Runner.num_iterations = 15 # changed by lkx +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rainbow/configs/rainbow_aaai.gin b/dopamine/dopamine/agents/rainbow/configs/rainbow_aaai.gin new file mode 100644 index 0000000..48be0f6 --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/configs/rainbow_aaai.gin @@ -0,0 +1,37 @@ +# Hyperparameters follow Hessel et al. (2018). +import dopamine.atari.preprocessing +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 3 +RainbowAgent.min_replay_history = 20000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 250000 # agent steps +RainbowAgent.replay_scheme = 'prioritized' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +# Note these parameters are different from C51's. +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +Runner.game_name = 'Pong' +# Deterministic ALE version used in the AAAI paper. +Runner.sticky_actions = False +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +AtariPreprocessing.terminal_on_life_loss = True + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rainbow/rainbow_agent.py b/dopamine/dopamine/agents/rainbow/rainbow_agent.py new file mode 100644 index 0000000..67ec08a --- /dev/null +++ b/dopamine/dopamine/agents/rainbow/rainbow_agent.py @@ -0,0 +1,504 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a simplified Rainbow agent. + +Specifically, we implement the following components from Rainbow: + + * n-step updates; + * prioritized replay; and + * distributional RL. + +These three components were found to significantly impact the performance of +the Atari game-playing agent. + +Furthermore, our implementation does away with some minor hyperparameter +choices. Specifically, we + + * keep the beta exponent fixed at beta=0.5, rather than increase it linearly; + * remove the alpha parameter, which was set to alpha=0.5 throughout the paper. + +Details in "Rainbow: Combining Improvements in Deep Reinforcement Learning" by +Hessel et al. (2018). +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + + + +from dopamine.agents.dqn import dqn_agent +from dopamine.replay_memory import prioritized_replay_buffer +import numpy as np +import tensorflow as tf + +import gin.tf + +slim = tf.contrib.slim + + +@gin.configurable +class RainbowAgent(dqn_agent.DQNAgent): + """A compact implementation of a simplified Rainbow agent.""" + + def __init__(self, + sess, + num_actions, + observation_shape=dqn_agent.NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=dqn_agent.NATURE_DQN_DTYPE, + stack_size=dqn_agent.NATURE_DQN_STACK_SIZE, + num_atoms=51, + vmax=10., + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + target_update_period=8000, + epsilon_fn=dqn_agent.linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + replay_scheme='prioritized', + tf_device='/cpu:*', + use_staging=True, + optimizer=tf.train.AdamOptimizer( + learning_rate=0.00025, epsilon=0.0003125), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints or an int. If single int, the observation + is assumed to be a 2D square. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + num_atoms: int, the number of buckets of the value function distribution. + vmax: float, the value distribution support is [-vmax, vmax]. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. 
+ epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + replay_scheme: str, 'prioritized' or 'uniform', the sampling scheme of the + replay memory. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. + """ + # We need this because some tools convert round floats into ints. + vmax = float(vmax) + self._num_atoms = num_atoms + self._support = tf.linspace(-vmax, vmax, num_atoms) + self._replay_scheme = replay_scheme + # TODO(b/110897128): Make agent optimizer attribute private. + self.optimizer = optimizer + + super(RainbowAgent, self).__init__( + sess=sess, + num_actions=num_actions, + observation_shape=observation_shape, + observation_dtype=observation_dtype, + stack_size=stack_size, + gamma=gamma, + update_horizon=update_horizon, + min_replay_history=min_replay_history, + update_period=update_period, + target_update_period=target_update_period, + epsilon_fn=epsilon_fn, + epsilon_train=epsilon_train, + epsilon_eval=epsilon_eval, + epsilon_decay_period=epsilon_decay_period, + tf_device=tf_device, + use_staging=use_staging, + optimizer=self.optimizer, + summary_writer=summary_writer, + summary_writing_frequency=summary_writing_frequency) + + def _get_network_type(self): + """Returns the type of the outputs of a value distribution network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('c51_network', + ['q_values', 'logits', 'probabilities']) + + def _network_template(self, state): + """Builds a convolutional network that outputs Q-value distributions. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + weights_initializer = slim.variance_scaling_initializer( + factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True) + + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) + net = slim.conv2d( + net, 32, [8, 8], stride=4, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [4, 4], stride=2, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [3, 3], stride=1, weights_initializer=weights_initializer) + net = slim.flatten(net) + net = slim.fully_connected( + net, 512, weights_initializer=weights_initializer) + net = slim.fully_connected( + net, + self.num_actions * self._num_atoms, + activation_fn=None, + weights_initializer=weights_initializer) + + logits = tf.reshape(net, [-1, self.num_actions, self._num_atoms]) + probabilities = tf.contrib.layers.softmax(logits) + q_values = tf.reduce_sum(self._support * probabilities, axis=2) + return self._get_network_type()(q_values, logits, probabilities) + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. 
+ + Returns: + A `WrappedPrioritizedReplayBuffer` object. + + Raises: + ValueError: if given an invalid replay scheme. + """ + if self._replay_scheme not in ['uniform', 'prioritized']: + raise ValueError('Invalid replay scheme: {}'.format(self._replay_scheme)) + return prioritized_replay_buffer.WrappedPrioritizedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma) + + def _build_target_distribution(self): + """Builds the C51 target distribution as per Bellemare et al. (2017). + + First, we compute the support of the Bellman target, r + gamma Z'. Where Z' + is the support of the next state distribution: + + * Evenly spaced in [-vmax, vmax] if the current state is nonterminal; + * 0 otherwise (duplicated num_atoms times). + + Second, we compute the next-state probabilities, corresponding to the action + with highest expected value. + + Finally we project the Bellman target (support + probabilities) onto the + original support. + + Returns: + target_distribution: tf.tensor, the target distribution from the replay. + """ + batch_size = self._replay.batch_size + + # size of rewards: batch_size x 1 + rewards = self._replay.rewards[:, None] + + # size of tiled_support: batch_size x num_atoms + tiled_support = tf.tile(self._support, [batch_size]) + tiled_support = tf.reshape(tiled_support, [batch_size, self._num_atoms]) + + # size of target_support: batch_size x num_atoms + + is_terminal_multiplier = 1. - tf.cast(self._replay.terminals, tf.float32) + # Incorporate terminal state to discount factor. + # size of gamma_with_terminal: batch_size x 1 + gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier + gamma_with_terminal = gamma_with_terminal[:, None] + + target_support = rewards + gamma_with_terminal * tiled_support + + # size of next_qt_argmax: 1 x batch_size + next_qt_argmax = tf.argmax( + self._replay_next_target_net_outputs.q_values, axis=1)[:, None] + batch_indices = tf.range(tf.to_int64(batch_size))[:, None] + # size of next_qt_argmax: batch_size x 2 + batch_indexed_next_qt_argmax = tf.concat( + [batch_indices, next_qt_argmax], axis=1) + + # size of next_probabilities: batch_size x num_atoms + next_probabilities = tf.gather_nd( + self._replay_next_target_net_outputs.probabilities, + batch_indexed_next_qt_argmax) + + return project_distribution(target_support, next_probabilities, + self._support) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + target_distribution = tf.stop_gradient(self._build_target_distribution()) + + # size of indices: batch_size x 1. + indices = tf.range(tf.shape(self._replay_net_outputs.logits)[0])[:, None] + # size of reshaped_actions: batch_size x 2. + reshaped_actions = tf.concat([indices, self._replay.actions[:, None]], 1) + # For each element of the batch, fetch the logits for its selected action. + chosen_action_logits = tf.gather_nd(self._replay_net_outputs.logits, + reshaped_actions) + + loss = tf.nn.softmax_cross_entropy_with_logits( + labels=target_distribution, + logits=chosen_action_logits) + + if self._replay_scheme == 'prioritized': + # The original prioritized experience replay uses a linear exponent + # schedule 0.4 -> 1.0. Comparing the schedule to a fixed exponent of 0.5 + # on 5 games (Asterix, Pong, Q*Bert, Seaquest, Space Invaders) suggested + # a fixed exponent actually performs better, except on Pong. 
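Concretely, the importance weighting applied on the next lines scales each sampled transition's loss by the inverse square root of its sampling probability and renormalizes so the largest weight is 1; a small numeric example (illustrative only, not part of the committed file):

    import numpy as np
    probs = np.array([0.5, 0.25, 0.125, 0.125])
    loss_weights = 1.0 / np.sqrt(probs + 1e-10)  # [1.414, 2.0, 2.828, 2.828]
    loss_weights /= loss_weights.max()           # [0.5, 0.707, 1.0, 1.0]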
+ probs = self._replay.transition['sampling_probabilities'] + loss_weights = 1.0 / tf.sqrt(probs + 1e-10) + loss_weights /= tf.reduce_max(loss_weights) + + # Rainbow and prioritized replay are parametrized by an exponent alpha, + # but in both cases it is set to 0.5 - for simplicity's sake we leave it + # as is here, using the more direct tf.sqrt(). Taking the square root + # "makes sense", as we are dealing with a squared loss. + # Add a small nonzero value to the loss to avoid 0 priority items. While + # technically this may be okay, setting all items to 0 priority will cause + # troubles, and also result in 1.0 / 0.0 = NaN correction terms. + update_priorities_op = self._replay.tf_set_priority( + self._replay.indices, tf.sqrt(loss + 1e-10)) + + # Weight the loss by the inverse priorities. + loss = loss_weights * loss + else: + update_priorities_op = tf.no_op() + + with tf.control_dependencies([update_priorities_op]): + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('CrossEntropyLoss', tf.reduce_mean(loss)) + # Schaul et al. reports a slightly different rule, where 1/N is also + # exponentiated by beta. Not doing so seems more reasonable, and did not + # impact performance in our experiments. + return self.optimizer.minimize(tf.reduce_mean(loss)), loss + + def _store_transition(self, + last_observation, + action, + reward, + is_terminal, + priority=None): + """Stores a transition when in training mode. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer (last_observation, action, reward, + is_terminal, priority). + + Args: + last_observation: Last observation, type determined via observation_type + parameter in the replay_memory constructor. + action: An integer, the action taken. + reward: A float, the reward. + is_terminal: Boolean indicating if the current state is a terminal state. + priority: Float. Priority of sampling the transition. If None, the default + priority will be used. If replay scheme is uniform, the default priority + is 1. If the replay scheme is prioritized, the default priority is the + maximum ever seen [Schaul et al., 2015]. + """ + if priority is None: + priority = (1. if self._replay_scheme == 'uniform' else + self._replay.memory.sum_tree.max_recorded_priority) + + if not self.eval_mode: + self._replay.add(last_observation, action, reward, is_terminal, priority) + + +def project_distribution(supports, weights, target_support, + validate_args=False): + """Projects a batch of (support, weights) onto target_support. + + Based on equation (7) in (Bellemare et al., 2017): + https://arxiv.org/abs/1707.06887 + In the rest of the comments we will refer to this equation simply as Eq7. + + This code is not easy to digest, so we will use a running example to clarify + what is going on, with the following sample inputs: + + * supports = [[0, 2, 4, 6, 8], + [1, 3, 4, 5, 6]] + * weights = [[0.1, 0.6, 0.1, 0.1, 0.1], + [0.1, 0.2, 0.5, 0.1, 0.1]] + * target_support = [4, 5, 6, 7, 8] + + In the code below, comments preceded with 'Ex:' will be referencing the above + values. + + Args: + supports: Tensor of shape (batch_size, num_dims) defining supports for the + distribution. + weights: Tensor of shape (batch_size, num_dims) defining weights on the + original support points. Although for the CategoricalDQN agent these + weights are probabilities, it is not required that they are. 
+ target_support: Tensor of shape (num_dims) defining support of the projected + distribution. The values must be monotonically increasing. Vmin and Vmax + will be inferred from the first and last elements of this tensor, + respectively. The values in this tensor must be equally spaced. + validate_args: Whether we will verify the contents of the + target_support parameter. + + Returns: + A Tensor of shape (batch_size, num_dims) with the projection of a batch of + (support, weights) onto target_support. + + Raises: + ValueError: If target_support has no dimensions, or if shapes of supports, + weights, and target_support are incompatible. + """ + target_support_deltas = target_support[1:] - target_support[:-1] + # delta_z = `\Delta z` in Eq7. + delta_z = target_support_deltas[0] + validate_deps = [] + supports.shape.assert_is_compatible_with(weights.shape) + supports[0].shape.assert_is_compatible_with(target_support.shape) + target_support.shape.assert_has_rank(1) + if validate_args: + # Assert that supports and weights have the same shapes. + validate_deps.append( + tf.Assert( + tf.reduce_all(tf.equal(tf.shape(supports), tf.shape(weights))), + [supports, weights])) + # Assert that elements of supports and target_support have the same shape. + validate_deps.append( + tf.Assert( + tf.reduce_all( + tf.equal(tf.shape(supports)[1], tf.shape(target_support))), + [supports, target_support])) + # Assert that target_support has a single dimension. + validate_deps.append( + tf.Assert( + tf.equal(tf.size(tf.shape(target_support)), 1), [target_support])) + # Assert that the target_support is monotonically increasing. + validate_deps.append( + tf.Assert(tf.reduce_all(target_support_deltas > 0), [target_support])) + # Assert that the values in target_support are equally spaced. + validate_deps.append( + tf.Assert( + tf.reduce_all(tf.equal(target_support_deltas, delta_z)), + [target_support])) + + with tf.control_dependencies(validate_deps): + # Ex: `v_min, v_max = 4, 8`. + v_min, v_max = target_support[0], target_support[-1] + # Ex: `batch_size = 2`. + batch_size = tf.shape(supports)[0] + # `N` in Eq7. + # Ex: `num_dims = 5`. + num_dims = tf.shape(target_support)[0] + # clipped_support = `[\hat{T}_{z_j}]^{V_max}_{V_min}` in Eq7. + # Ex: `clipped_support = [[[ 4. 4. 4. 6. 8.]] + # [[ 4. 4. 4. 5. 6.]]]`. + clipped_support = tf.clip_by_value(supports, v_min, v_max)[:, None, :] + # Ex: `tiled_support = [[[[ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.]] + # [[ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.]]]]`. + tiled_support = tf.tile([clipped_support], [1, 1, num_dims, 1]) + # Ex: `reshaped_target_support = [[[ 4.] + # [ 5.] + # [ 6.] + # [ 7.] + # [ 8.]] + # [[ 4.] + # [ 5.] + # [ 6.] + # [ 7.] + # [ 8.]]]`. + reshaped_target_support = tf.tile(target_support[:, None], [batch_size, 1]) + reshaped_target_support = tf.reshape(reshaped_target_support, + [batch_size, num_dims, 1]) + # numerator = `|clipped_support - z_i|` in Eq7. + # Ex: `numerator = [[[[ 0. 0. 0. 2. 4.] + # [ 1. 1. 1. 1. 3.] + # [ 2. 2. 2. 0. 2.] + # [ 3. 3. 3. 1. 1.] + # [ 4. 4. 4. 2. 0.]] + # [[ 0. 0. 0. 1. 2.] + # [ 1. 1. 1. 0. 1.] + # [ 2. 2. 2. 1. 0.] + # [ 3. 3. 3. 2. 1.] + # [ 4. 4. 4. 3. 2.]]]]`. + numerator = tf.abs(tiled_support - reshaped_target_support) + quotient = 1 - (numerator / delta_z) + # clipped_quotient = `[1 - numerator / (\Delta z)]_0^1` in Eq7. + # Ex: `clipped_quotient = [[[[ 1. 1. 1. 0. 0.] + # [ 0. 0. 
0. 0. 0.] + # [ 0. 0. 0. 1. 0.] + # [ 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 1.]] + # [[ 1. 1. 1. 0. 0.] + # [ 0. 0. 0. 1. 0.] + # [ 0. 0. 0. 0. 1.] + # [ 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 0.]]]]`. + clipped_quotient = tf.clip_by_value(quotient, 0, 1) + # Ex: `weights = [[ 0.1 0.6 0.1 0.1 0.1] + # [ 0.1 0.2 0.5 0.1 0.1]]`. + weights = weights[:, None, :] + # inner_prod = `\sum_{j=0}^{N-1} clipped_quotient * p_j(x', \pi(x'))` + # in Eq7. + # Ex: `inner_prod = [[[[ 0.1 0.6 0.1 0. 0. ] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0.1 0. ] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0. 0.1]] + # [[ 0.1 0.2 0.5 0. 0. ] + # [ 0. 0. 0. 0.1 0. ] + # [ 0. 0. 0. 0. 0.1] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0. 0. ]]]]`. + inner_prod = clipped_quotient * weights + # Ex: `projection = [[ 0.8 0.0 0.1 0.0 0.1] + # [ 0.8 0.1 0.1 0.0 0.0]]`. + projection = tf.reduce_sum(inner_prod, 3) + projection = tf.reshape(projection, [batch_size, num_dims]) + return projection diff --git a/dopamine/dopamine/agents/rainbowrpg/__init__.py b/dopamine/dopamine/agents/rainbowrpg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/rainbowrpg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/rainbowrpg/configs/c51rpg.gin b/dopamine/dopamine/agents/rainbowrpg/configs/c51rpg.gin new file mode 100644 index 0000000..052cd13 --- /dev/null +++ b/dopamine/dopamine/agents/rainbowrpg/configs/c51rpg.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow the settings from Bellemare et al. (2017), but we +# modify as necessary to match those used in Rainbow (Hessel et al., 2018), to +# ensure apples-to-apples comparison. +import dopamine.agents.rainbow.rainbow_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 1 +RainbowAgent.min_replay_history = 2000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0 +RainbowAgent.epsilon_decay_period = 250000 # agent steps +RainbowAgent.replay_scheme = 'uniform' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() + +tf.train.AdamOptimizer.learning_rate = 0.00025 +tf.train.AdamOptimizer.epsilon = 0.0003125 + +Runner.game_name = 'Breakout' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False +Runner.num_iterations = 30 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rainbowrpg/configs/rainbowrpg.gin b/dopamine/dopamine/agents/rainbowrpg/configs/rainbowrpg.gin new file mode 100644 index 0000000..d88945b --- /dev/null +++ b/dopamine/dopamine/agents/rainbowrpg/configs/rainbowrpg.gin @@ -0,0 +1,35 @@ +# Hyperparameters follow Hessel et al. (2018), except for sticky_actions, +# which was False (not using sticky actions) in the original paper. +import dopamine.agents.rainbowrpg.rainbowrpg_agent +import dopamine.atari.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowRPGAgent.num_atoms = 51 +RainbowRPGAgent.vmax = 10. +RainbowRPGAgent.gamma = 0.99 +RainbowRPGAgent.update_horizon = 3 +RainbowRPGAgent.min_replay_history = 20000 # agent steps lkx +RainbowRPGAgent.update_period = 4 +RainbowRPGAgent.target_update_period = 8000 # agent steps +RainbowRPGAgent.epsilon_train = 0.01 +RainbowRPGAgent.epsilon_eval = 0 # changed by lkx +RainbowRPGAgent.epsilon_decay_period = 250000 # agent steps +RainbowRPGAgent.replay_scheme = 'prioritized' +RainbowRPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowRPGAgent.optimizer = @tf.train.AdamOptimizer() + +# Note these parameters are different from C51's. +tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +Runner.game_name = 'Pong' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = False # changed by lkx +Runner.num_iterations = 15 # changed by lkx +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 # changed by lkx diff --git a/dopamine/dopamine/agents/rainbowrpg/rainbowrpg_agent.py b/dopamine/dopamine/agents/rainbowrpg/rainbowrpg_agent.py new file mode 100644 index 0000000..a25a3c0 --- /dev/null +++ b/dopamine/dopamine/agents/rainbowrpg/rainbowrpg_agent.py @@ -0,0 +1,699 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a simplified Rainbow agent. + +Specifically, we implement the following components from Rainbow: + + * n-step updates; + * prioritized replay; and + * distributional RL. + +These three components were found to significantly impact the performance of +the Atari game-playing agent. + +Furthermore, our implementation does away with some minor hyperparameter +choices. 
Specifically, we + + * keep the beta exponent fixed at beta=0.5, rather than increase it linearly; + * remove the alpha parameter, which was set to alpha=0.5 throughout the paper. + +Details in "Rainbow: Combining Improvements in Deep Reinforcement Learning" by +Hessel et al. (2018). +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from dopamine.agents.dqn import dqn_agent +from dopamine.replay_memory import prioritized_replay_buffer +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf + +import gin.tf +from dopamine.agents.agent_utils import * + +slim = tf.contrib.slim + + +@gin.configurable +class RainbowRPGAgent(dqn_agent.DQNAgent): + """A compact implementation of a simplified Rainbow agent.""" + + def __init__(self, + sess, + num_actions, + observation_shape=dqn_agent.NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=dqn_agent.NATURE_DQN_DTYPE, + stack_size=dqn_agent.NATURE_DQN_STACK_SIZE, + num_atoms=51, + vmax=10., + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + target_update_period=8000, + epsilon_fn=dqn_agent.linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + replay_scheme='prioritized', + tf_device='/gpu:*', + use_staging=True, + optimizer=tf.train.AdamOptimizer( + learning_rate=0.00025, epsilon=0.0003125), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints or an int. If single int, the observation + is assumed to be a 2D square. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + num_atoms: int, the number of buckets of the value function distribution. + vmax: float, the value distribution support is [-vmax, vmax]. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + replay_scheme: str, 'prioritized' or 'uniform', the sampling scheme of the + replay memory. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. 
Lower values will result in slower training. + """ + # We need this because some tools convert round floats into ints. + vmax = float(vmax) + self._num_atoms = num_atoms + self._support = tf.linspace(-vmax, vmax, num_atoms) + self._replay_scheme = replay_scheme + # TODO(b/110897128): Make agent optimizer attribute private. + self.optimizer = optimizer + self.optimizer_rpg = tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True) # optimizer for RPG'= + self.start_training = 1000 + + super(RainbowRPGAgent, self).__init__( + sess=sess, + num_actions=num_actions, + observation_shape=observation_shape, + observation_dtype=observation_dtype, + stack_size=stack_size, + gamma=gamma, + update_horizon=update_horizon, + min_replay_history=min_replay_history, + update_period=update_period, + target_update_period=target_update_period, + epsilon_fn=epsilon_fn, + epsilon_train=epsilon_train, + epsilon_eval=epsilon_eval, + epsilon_decay_period=epsilon_decay_period, + tf_device=tf_device, + use_staging=use_staging, + optimizer=self.optimizer, + summary_writer=summary_writer, + summary_writing_frequency=summary_writing_frequency) + + with tf.device(tf_device): + self._replay_opt = self._build_replay_buffer_opt(use_staging) + self._build_networks_rpg() + self._train_op_rpg = self._build_train_op_rpg() + + # replay buffer for rpg. only store good trajectories. + self.replay_buffer_temp = ReplayBufferRegular(100000) # temporarily + + def _get_network_type(self): + """Returns the type of the outputs of a value distribution network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('c51_network', + ['q_values', 'logits', 'probabilities']) + + def _network_template(self, state): + """Builds a convolutional network that outputs Q-value distributions. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + weights_initializer = slim.variance_scaling_initializer( + factor=1.0 / np.sqrt(3.0), mode='FAN_IN', uniform=True) + + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) + net = slim.conv2d( + net, 32, [8, 8], stride=4, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [4, 4], stride=2, weights_initializer=weights_initializer) + net = slim.conv2d( + net, 64, [3, 3], stride=1, weights_initializer=weights_initializer) + net = slim.flatten(net) + net = slim.fully_connected( + net, 512, weights_initializer=weights_initializer) + net = slim.fully_connected( + net, + self.num_actions * self._num_atoms, + activation_fn=None, + weights_initializer=weights_initializer) + + logits = tf.reshape(net, [-1, self.num_actions, self._num_atoms]) + probabilities = tf.contrib.layers.softmax(logits) + q_values = tf.reduce_sum(self._support * probabilities, axis=2) + return self._get_network_type()(q_values, logits, probabilities) + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A `WrappedPrioritizedReplayBuffer` object. + + Raises: + ValueError: if given an invalid replay scheme. 
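The `_network_template` above keeps the C51 head: `num_atoms` logits per action, with Q-values taken as the expectation over the fixed support `tf.linspace(-vmax, vmax, num_atoms)`. A minimal NumPy sketch of that reduction (names and shapes here are illustrative only, not part of the patch):

    import numpy as np

    num_atoms, vmax = 51, 10.0
    support = np.linspace(-vmax, vmax, num_atoms)            # z_i, shape (num_atoms,)

    def expected_q(logits):
        # logits: (num_actions, num_atoms) -> per-action softmax over atoms,
        # then the expectation of the support under those probabilities.
        shifted = logits - logits.max(axis=1, keepdims=True)  # numerical stability
        probs = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)
        return (probs * support).sum(axis=1)                   # shape (num_actions,)

    # Uniform atom probabilities give Q ~= 0 on the symmetric support.
    print(expected_q(np.zeros((4, num_atoms))))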
+ """ + if self._replay_scheme not in ['uniform', 'prioritized']: + raise ValueError('Invalid replay scheme: {}'.format(self._replay_scheme)) + return prioritized_replay_buffer.WrappedPrioritizedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma) + + def _build_replay_buffer_opt(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_target_distribution(self): + """Builds the C51 target distribution as per Bellemare et al. (2017). + + First, we compute the support of the Bellman target, r + gamma Z'. Where Z' + is the support of the next state distribution: + + * Evenly spaced in [-vmax, vmax] if the current state is nonterminal; + * 0 otherwise (duplicated num_atoms times). + + Second, we compute the next-state probabilities, corresponding to the action + with highest expected value. + + Finally we project the Bellman target (support + probabilities) onto the + original support. + + Returns: + target_distribution: tf.tensor, the target distribution from the replay. + """ + batch_size = self._replay.batch_size + + # size of rewards: batch_size x 1 + rewards = self._replay.rewards[:, None] + + # size of tiled_support: batch_size x num_atoms + tiled_support = tf.tile(self._support, [batch_size]) + tiled_support = tf.reshape(tiled_support, [batch_size, self._num_atoms]) + + # size of target_support: batch_size x num_atoms + + is_terminal_multiplier = 1. - tf.cast(self._replay.terminals, tf.float32) + # Incorporate terminal state to discount factor. + # size of gamma_with_terminal: batch_size x 1 + gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier + gamma_with_terminal = gamma_with_terminal[:, None] + + target_support = rewards + gamma_with_terminal * tiled_support + + # size of next_qt_argmax: 1 x batch_size + next_qt_argmax = tf.argmax( + self._replay_next_target_net_outputs.q_values, axis=1)[:, None] + batch_indices = tf.range(tf.to_int64(batch_size))[:, None] + # size of next_qt_argmax: batch_size x 2 + batch_indexed_next_qt_argmax = tf.concat( + [batch_indices, next_qt_argmax], axis=1) + + # size of next_probabilities: batch_size x num_atoms + next_probabilities = tf.gather_nd( + self._replay_next_target_net_outputs.probabilities, + batch_indexed_next_qt_argmax) + + return project_distribution(target_support, next_probabilities, + self._support) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). 
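`_build_target_distribution` above shifts and scales the fixed support before projecting: each atom z_i becomes r + gamma^n * z_i, and the discount is zeroed at terminal states so the target collapses onto the reward. A tiny NumPy sketch of that shift, with made-up numbers and a small support for readability:

    import numpy as np

    support = np.linspace(-10.0, 10.0, 5)          # 5 atoms instead of 51, for readability
    rewards = np.array([[1.0], [2.0]])             # (batch, 1)
    terminals = np.array([[0.0], [1.0]])           # 1.0 marks a terminal transition
    cumulative_gamma = 0.99 ** 3                   # gamma^n for n-step updates

    gamma_with_terminal = cumulative_gamma * (1.0 - terminals)
    target_support = rewards + gamma_with_terminal * support   # (batch, num_atoms)
    print(target_support)
    # Row 0 is a shifted/scaled copy of the support; row 1 collapses to the reward 2.0.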
+ """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + # DQN explore net. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + + self._net_outputs = self.online_convnet(self.state_ph) + # TODO(bellemare): Ties should be broken. They are unlikely to happen when + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + self._replay_next_target_net_outputs = self.target_convnet( + self._replay.next_states) + + def _build_networks_rpg(self): + # RPG learning net. + self.rpg_convnet = tf.make_template('RPG', self._network_template) + self._rpg_net_outputs = self.rpg_convnet(self.state_ph) + self._q_argmax_rpg = tf.argmax(self._rpg_net_outputs.q_values, axis=1)[0] + self._replay_rpg_net_outputs = self.rpg_convnet(self._replay_opt.states) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. + """ + target_distribution = tf.stop_gradient(self._build_target_distribution()) + + # size of indices: batch_size x 1. + indices = tf.range(tf.shape(self._replay_net_outputs.logits)[0])[:, None] + # size of reshaped_actions: batch_size x 2. + reshaped_actions = tf.concat([indices, self._replay.actions[:, None]], 1) + # For each element of the batch, fetch the logits for its selected action. + chosen_action_logits = tf.gather_nd(self._replay_net_outputs.logits, + reshaped_actions) + + loss = tf.nn.softmax_cross_entropy_with_logits( + labels=target_distribution, + logits=chosen_action_logits) + + if self._replay_scheme == 'prioritized': + # The original prioritized experience replay uses a linear exponent + # schedule 0.4 -> 1.0. Comparing the schedule to a fixed exponent of 0.5 + # on 5 games (Asterix, Pong, Q*Bert, Seaquest, Space Invaders) suggested + # a fixed exponent actually performs better, except on Pong. + probs = self._replay.transition['sampling_probabilities'] + loss_weights = 1.0 / tf.sqrt(probs + 1e-10) + loss_weights /= tf.reduce_max(loss_weights) + + # Rainbow and prioritized replay are parametrized by an exponent alpha, + # but in both cases it is set to 0.5 - for simplicity's sake we leave it + # as is here, using the more direct tf.sqrt(). Taking the square root + # "makes sense", as we are dealing with a squared loss. + # Add a small nonzero value to the loss to avoid 0 priority items. While + # technically this may be okay, setting all items to 0 priority will cause + # troubles, and also result in 1.0 / 0.0 = NaN correction terms. + update_priorities_op = self._replay.tf_set_priority( + self._replay.indices, tf.sqrt(loss + 1e-10)) + + # Weight the loss by the inverse priorities. + loss = loss_weights * loss + else: + update_priorities_op = tf.no_op() + + with tf.control_dependencies([update_priorities_op]): + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('CrossEntropyLoss', tf.reduce_mean(loss)) + # Schaul et al. reports a slightly different rule, where 1/N is also + # exponentiated by beta. Not doing so seems more reasonable, and did not + # impact performance in our experiments. 
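The prioritized-replay correction above is easier to see outside the graph. A small NumPy sketch of the same arithmetic, assuming made-up per-example sampling probabilities and cross-entropy losses:

    import numpy as np

    # Hypothetical per-example sampling probabilities and cross-entropy losses.
    probs = np.array([0.9, 0.1, 0.4])
    ce_loss = np.array([0.2, 1.5, 0.7])

    # Importance weights with a fixed exponent of 0.5, normalized to at most 1.
    loss_weights = 1.0 / np.sqrt(probs + 1e-10)
    loss_weights /= loss_weights.max()

    weighted_loss = loss_weights * ce_loss       # what the optimizer minimizes (mean)
    new_priorities = np.sqrt(ce_loss + 1e-10)    # written back into the sum tree

    print(weighted_loss.mean(), new_priorities)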
+ return self.optimizer.minimize(tf.reduce_mean(loss)), loss + + def _build_train_op_rpg(self): + # RPG loss + replay_action_one_hot = tf.one_hot( + self._replay_opt.actions, self.num_actions, 1., 0., name='action_one_hot_rpg') + replay_chosen_q = tf.reduce_sum( + self._replay_rpg_net_outputs.q_values * replay_action_one_hot, + reduction_indices=1, + name='replay_chosen_q_rpg') + margin = 1 + qvalue = self._replay_rpg_net_outputs.q_values + # debug self.temp_action_one_hot = replay_action_one_hot + self.temp_qvalue = qvalue + self.temp1 = (qvalue + margin) * (1 - replay_action_one_hot) + qvalue * replay_action_one_hot + self.temp2 = -(tf.reshape(replay_chosen_q, [-1, 1]) * tf.ones([1, self.num_actions])) \ + * ((1 - replay_action_one_hot) + (replay_action_one_hot)) + self.hingeloss = tf.maximum(0.0, self.temp1 + self.temp2) + rpg_loss = tf.reduce_mean(self.hingeloss) + return self.optimizer_rpg.minimize(rpg_loss) + + def _store_transition(self, + last_observation, + action, + reward, + is_terminal, + priority=None): + """Stores a transition when in training mode. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer (last_observation, action, reward, + is_terminal, priority). + + Args: + last_observation: Last observation, type determined via observation_type + parameter in the replay_memory constructor. + action: An integer, the action taken. + reward: A float, the reward. + is_terminal: Boolean indicating if the current state is a terminal state. + priority: Float. Priority of sampling the transition. If None, the default + priority will be used. If replay scheme is uniform, the default priority + is 1. If the replay scheme is prioritized, the default priority is the + maximum ever seen [Schaul et al., 2015]. + """ + if priority is None: + priority = (1. if self._replay_scheme == 'uniform' else + self._replay.memory.sum_tree.max_recorded_priority) + + if not self.eval_mode: + self._replay.add(last_observation, action, reward, is_terminal, priority) + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. + + Args: + reward: float, the reward received from the agent's most recent action. + observation: numpy array, the most recent observation. + + Returns: + int, the selected action. + """ + self._last_observation = self._observation + self._record_observation(observation) + + if not self.eval_mode: + self._store_transition(self._last_observation, self.action, reward, False) + self.replay_buffer_temp.add(self._last_observation, self.action, reward, False) + self._train_step() + + self.action = self._select_action() + return self.action + + def end_episode(self, reward): + """Signals the end of the episode to the agent. + + We store the observation of the current time step, which is the last + observation of the episode. + + Args: + reward: float, the last reward from the environment. + """ + if not self.eval_mode: + self.replay_buffer_temp.clear() # this episode is not optimal + self._store_transition(self._observation, self.action, reward, True) + + def end_episode_(self, reward, total_reward, step_number): + """ This episodes is optimal trajectory """ + if not self.eval_mode: + # for DQN + self._store_transition(self._observation, self.action, reward, True) + + # replay buffer for RPG. 
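The RPG loss built in `_build_train_op_rpg` above is a one-vs-all margin (hinge) loss: every non-chosen action is pushed at least `margin` below the value of the action taken on the near-optimal trajectory, while the chosen action itself contributes zero. A NumPy sketch with made-up Q-values (for illustration only):

    import numpy as np

    def rpg_hinge_loss(q_values, actions, margin=1.0):
        # q_values: (batch, num_actions); actions: (batch,) chosen actions.
        batch, num_actions = q_values.shape
        one_hot = np.eye(num_actions)[actions]
        chosen_q = (q_values * one_hot).sum(axis=1, keepdims=True)
        # max(0, Q_j + margin - Q_a*) for j != a*, and exactly 0 for j == a*.
        hinge = np.maximum(
            0.0, (q_values + margin) * (1 - one_hot) + q_values * one_hot - chosen_q)
        return hinge.mean()

    q = np.array([[1.0, 3.0, 2.5]])
    print(rpg_hinge_loss(q, np.array([1])))  # (0 + 0 + max(0, 2.5 + 1 - 3)) / 3 = 0.1667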
+ self.replay_buffer_temp.add(self._observation, self.action, reward, True) + count = step_number + while count > 0: + experience = self.replay_buffer_temp.get_sample() + state, action, reward, _ = experience + count -= 1 + # self.replay_buffer_opt.add(state, action, reward, False) + self._replay_opt.add(state, action, reward, False) + + def _select_action(self): + """Select an action from the set of available actions. + + Chooses an action randomly with probability self._calculate_epsilon(), and + otherwise acts greedily according to the current Q-value estimates. + + Returns: + int, the selected action. + """ + if self.eval_mode is not True: + epsilon = self.epsilon_fn( + self.epsilon_decay_period, + self.training_steps, + self.min_replay_history, + self.epsilon_train) + if random.random() <= epsilon: + # Choose a random action with probability epsilon. + return random.randint(0, self.num_actions - 1) + else: + # Choose the action with highest Q-value at the current state. + return self._sess.run(self._q_argmax, {self.state_ph: self.state}) + else: + # evaluation mode: use rpg. + return self._sess.run(self._q_argmax_rpg, {self.state_ph: self.state}) + + def _train_step(self): + """Runs a single training step. + + Runs a training op if both: + (1) A minimum number of frames have been added to the replay buffer. + (2) `training_steps` is a multiple of `update_period`. + + Also, syncs weights from online to target network if training steps is a + multiple of target update period. + """ + # Run a train op at the rate of self.update_period if enough training steps + # have been run. This matches the Nature DQN behaviour. + if self._replay.memory.add_count > self.min_replay_history: + if self.training_steps % self.update_period == 0: + self._sess.run(self._train_op) + if self._replay_opt.memory.add_count > self.start_training: + self._sess.run(self._train_op_rpg) + + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + if self.training_steps % self.target_update_period == 0: + self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + +def project_distribution(supports, weights, target_support, + validate_args=False): + """Projects a batch of (support, weights) onto target_support. + + Based on equation (7) in (Bellemare et al., 2017): + https://arxiv.org/abs/1707.06887 + In the rest of the comments we will refer to this equation simply as Eq7. + + This code is not easy to digest, so we will use a running example to clarify + what is going on, with the following sample inputs: + + * supports = [[0, 2, 4, 6, 8], + [1, 3, 4, 5, 6]] + * weights = [[0.1, 0.6, 0.1, 0.1, 0.1], + [0.1, 0.2, 0.5, 0.1, 0.1]] + * target_support = [4, 5, 6, 7, 8] + + In the code below, comments preceded with 'Ex:' will be referencing the above + values. + + Args: + supports: Tensor of shape (batch_size, num_dims) defining supports for the + distribution. + weights: Tensor of shape (batch_size, num_dims) defining weights on the + original support points. Although for the CategoricalDQN agent these + weights are probabilities, it is not required that they are. + target_support: Tensor of shape (num_dims) defining support of the projected + distribution. The values must be monotonically increasing. Vmin and Vmax + will be inferred from the first and last elements of this tensor, + respectively. 
The values in this tensor must be equally spaced. + validate_args: Whether we will verify the contents of the + target_support parameter. + + Returns: + A Tensor of shape (batch_size, num_dims) with the projection of a batch of + (support, weights) onto target_support. + + Raises: + ValueError: If target_support has no dimensions, or if shapes of supports, + weights, and target_support are incompatible. + """ + target_support_deltas = target_support[1:] - target_support[:-1] + # delta_z = `\Delta z` in Eq7. + delta_z = target_support_deltas[0] + validate_deps = [] + supports.shape.assert_is_compatible_with(weights.shape) + supports[0].shape.assert_is_compatible_with(target_support.shape) + target_support.shape.assert_has_rank(1) + if validate_args: + # Assert that supports and weights have the same shapes. + validate_deps.append( + tf.Assert( + tf.reduce_all(tf.equal(tf.shape(supports), tf.shape(weights))), + [supports, weights])) + # Assert that elements of supports and target_support have the same shape. + validate_deps.append( + tf.Assert( + tf.reduce_all( + tf.equal(tf.shape(supports)[1], tf.shape(target_support))), + [supports, target_support])) + # Assert that target_support has a single dimension. + validate_deps.append( + tf.Assert( + tf.equal(tf.size(tf.shape(target_support)), 1), [target_support])) + # Assert that the target_support is monotonically increasing. + validate_deps.append( + tf.Assert(tf.reduce_all(target_support_deltas > 0), [target_support])) + # Assert that the values in target_support are equally spaced. + validate_deps.append( + tf.Assert( + tf.reduce_all(tf.equal(target_support_deltas, delta_z)), + [target_support])) + + with tf.control_dependencies(validate_deps): + # Ex: `v_min, v_max = 4, 8`. + v_min, v_max = target_support[0], target_support[-1] + # Ex: `batch_size = 2`. + batch_size = tf.shape(supports)[0] + # `N` in Eq7. + # Ex: `num_dims = 5`. + num_dims = tf.shape(target_support)[0] + # clipped_support = `[\hat{T}_{z_j}]^{V_max}_{V_min}` in Eq7. + # Ex: `clipped_support = [[[ 4. 4. 4. 6. 8.]] + # [[ 4. 4. 4. 5. 6.]]]`. + clipped_support = tf.clip_by_value(supports, v_min, v_max)[:, None, :] + # Ex: `tiled_support = [[[[ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.] + # [ 4. 4. 4. 6. 8.]] + # [[ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.] + # [ 4. 4. 4. 5. 6.]]]]`. + tiled_support = tf.tile([clipped_support], [1, 1, num_dims, 1]) + # Ex: `reshaped_target_support = [[[ 4.] + # [ 5.] + # [ 6.] + # [ 7.] + # [ 8.]] + # [[ 4.] + # [ 5.] + # [ 6.] + # [ 7.] + # [ 8.]]]`. + reshaped_target_support = tf.tile(target_support[:, None], [batch_size, 1]) + reshaped_target_support = tf.reshape(reshaped_target_support, + [batch_size, num_dims, 1]) + # numerator = `|clipped_support - z_i|` in Eq7. + # Ex: `numerator = [[[[ 0. 0. 0. 2. 4.] + # [ 1. 1. 1. 1. 3.] + # [ 2. 2. 2. 0. 2.] + # [ 3. 3. 3. 1. 1.] + # [ 4. 4. 4. 2. 0.]] + # [[ 0. 0. 0. 1. 2.] + # [ 1. 1. 1. 0. 1.] + # [ 2. 2. 2. 1. 0.] + # [ 3. 3. 3. 2. 1.] + # [ 4. 4. 4. 3. 2.]]]]`. + numerator = tf.abs(tiled_support - reshaped_target_support) + quotient = 1 - (numerator / delta_z) + # clipped_quotient = `[1 - numerator / (\Delta z)]_0^1` in Eq7. + # Ex: `clipped_quotient = [[[[ 1. 1. 1. 0. 0.] + # [ 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 1. 0.] + # [ 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 1.]] + # [[ 1. 1. 1. 0. 0.] + # [ 0. 0. 0. 1. 0.] + # [ 0. 0. 0. 0. 1.] + # [ 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 0.]]]]`. 
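As a quick way to verify the running example end to end, here is a compact NumPy re-implementation of the whole projection (a standalone sketch for checking values by hand, not part of the agent code):

    import numpy as np

    def project_distribution_np(supports, weights, target_support):
        # NumPy version of the Eq7 projection.
        delta_z = target_support[1] - target_support[0]
        v_min, v_max = target_support[0], target_support[-1]
        clipped = np.clip(supports, v_min, v_max)                    # (batch, num_dims)
        # |clipped - z_i| for every (target atom i, source atom j) pair.
        distance = np.abs(clipped[:, None, :] - target_support[None, :, None])
        quotient = np.clip(1.0 - distance / delta_z, 0.0, 1.0)
        return (quotient * weights[:, None, :]).sum(axis=2)          # (batch, num_dims)

    supports = np.array([[0., 2., 4., 6., 8.], [1., 3., 4., 5., 6.]])
    weights = np.array([[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.2, 0.5, 0.1, 0.1]])
    target_support = np.array([4., 5., 6., 7., 8.])
    print(project_distribution_np(supports, weights, target_support))
    # [[0.8 0.  0.1 0.  0.1]
    #  [0.8 0.1 0.1 0.  0. ]]   (matching the Ex values in the comments above)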
+ clipped_quotient = tf.clip_by_value(quotient, 0, 1) + # Ex: `weights = [[ 0.1 0.6 0.1 0.1 0.1] + # [ 0.1 0.2 0.5 0.1 0.1]]`. + weights = weights[:, None, :] + # inner_prod = `\sum_{j=0}^{N-1} clipped_quotient * p_j(x', \pi(x'))` + # in Eq7. + # Ex: `inner_prod = [[[[ 0.1 0.6 0.1 0. 0. ] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0.1 0. ] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0. 0.1]] + # [[ 0.1 0.2 0.5 0. 0. ] + # [ 0. 0. 0. 0.1 0. ] + # [ 0. 0. 0. 0. 0.1] + # [ 0. 0. 0. 0. 0. ] + # [ 0. 0. 0. 0. 0. ]]]]`. + inner_prod = clipped_quotient * weights + # Ex: `projection = [[ 0.8 0.0 0.1 0.0 0.1] + # [ 0.8 0.1 0.1 0.0 0.0]]`. + projection = tf.reduce_sum(inner_prod, 3) + projection = tf.reshape(projection, [batch_size, num_dims]) + return projection diff --git a/dopamine/dopamine/agents/repg/__init__.py b/dopamine/dopamine/agents/repg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/repg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/repg/configs/repg.gin b/dopamine/dopamine/agents/repg/configs/repg.gin new file mode 100644 index 0000000..a0dad17 --- /dev/null +++ b/dopamine/dopamine/agents/repg/configs/repg.gin @@ -0,0 +1,36 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.repg.repg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +REPGAgent.gamma = 0.99 +REPGAgent.game_name = 'Pong' # Boxing, Pong +REPGAgent.update_horizon = 1 +REPGAgent.min_replay_history = 200000 # agent steps, step more than this, stop exploration. +REPGAgent.update_period = 4 +REPGAgent.epsilon_train = 0.0001 +REPGAgent.epsilon_eval = 0 +REPGAgent.epsilon_decay_period = 250000 # agent steps +REPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +REPGAgent.optimizer = @tf.train.RMSPropOptimizer() +REPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' # Pong +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False +Runner.num_iterations = 15 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 10000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/repg/repg_agent.py b/dopamine/dopamine/agents/repg/repg_agent.py new file mode 100644 index 0000000..69e0e49 --- /dev/null +++ b/dopamine/dopamine/agents/repg/repg_agent.py @@ -0,0 +1,607 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a DQN agent.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random + +from dopamine.agents.agent_utils import * +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf +from tensorflow.distributions import Categorical +import gin.tf +from collections import deque + +slim = tf.contrib.slim + +NATURE_DQN_OBSERVATION_SHAPE = (84, 84) # Size of downscaled Atari 2600 frame. +NATURE_DQN_DTYPE = tf.uint8 # DType of Atari 2600 observations. +NATURE_DQN_STACK_SIZE = 4 # Number of frames in the state stack. + + +def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon): + """Returns the current epsilon for the agent's epsilon-greedy policy. + + This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et + al., 2015). The schedule is as follows: + Begin at 1. until warmup_steps steps have been taken; then + Linearly decay epsilon from 1. to epsilon in decay_period steps; and then + Use epsilon from there on. + + Args: + decay_period: float, the period over which epsilon is decayed. + step: int, the number of training steps completed so far. + warmup_steps: int, the number of steps taken before epsilon is decayed. + epsilon: float, the final value to which to decay the epsilon parameter. + + Returns: + A float, the current epsilon value computed according to the schedule. + """ + steps_left = decay_period + warmup_steps - step + bonus = (1.0 - epsilon) * steps_left / decay_period + bonus = np.clip(bonus, 0., 1. 
- epsilon) + return epsilon + bonus + + +@gin.configurable +class REPGAgent(object): + """An implementation of the RPG with EPG exploration agent.""" + + def __init__(self, + sess, + num_actions, + game_name="Pong", + observation_shape=NATURE_DQN_OBSERVATION_SHAPE, + observation_dtype=NATURE_DQN_DTYPE, + stack_size=NATURE_DQN_STACK_SIZE, + gamma=0.99, + update_horizon=1, + min_replay_history=20000, + update_period=4, + epsilon_fn=linearly_decaying_epsilon, + epsilon_train=0.01, + epsilon_eval=0.001, + epsilon_decay_period=250000, + margin=1, + tf_device='/cpu:*', + use_staging=True, + max_tf_checkpoints_to_keep=3, + optimizer=tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True), + summary_writer=None, + summary_writing_frequency=500): + """Initializes the agent and constructs the components of its graph. + + Args: + sess: `tf.Session`, for executing ops. + num_actions: int, number of actions the agent can take at any state. + observation_shape: tuple of ints describing the observation shape. + observation_dtype: tf.DType, specifies the type of the observations. Note + that if your inputs are continuous, you should set this to tf.float32. + stack_size: int, number of frames to use in state stack. + gamma: float, discount factor with the usual RL meaning. + update_horizon: int, horizon at which updates are performed, the 'n' in + n-step update. + min_replay_history: int, number of transitions that should be experienced + before the agent begins training its value function. + update_period: int, period between DQN updates. + target_update_period: int, update period for the target network. + epsilon_fn: function expecting 4 parameters: + (decay_period, step, warmup_steps, epsilon). This function should return + the epsilon value used for exploration during training. + epsilon_train: float, the value to which the agent's epsilon is eventually + decayed during training. + epsilon_eval: float, epsilon used when evaluating the agent. + epsilon_decay_period: int, length of the epsilon decay schedule. + tf_device: str, Tensorflow device on which the agent's graph is executed. + use_staging: bool, when True use a staging area to prefetch the next + training batch, speeding training up by about 30%. + max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to + keep. + optimizer: `tf.train.Optimizer`, for training the value function. + summary_writer: SummaryWriter object for outputting training statistics. + Summary writing disabled if set to None. + summary_writing_frequency: int, frequency with which summaries will be + written. Lower values will result in slower training. 
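As a sanity check of `linearly_decaying_epsilon` above, with the default arguments of this agent (decay_period=250000, warmup_steps=min_replay_history=20000, final epsilon=0.01) the schedule stays at 1.0 through warmup and bottoms out at 0.01 once decay finishes. A short self-contained sketch:

    import numpy as np

    def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
        steps_left = decay_period + warmup_steps - step
        bonus = (1.0 - epsilon) * steps_left / decay_period
        return epsilon + np.clip(bonus, 0.0, 1.0 - epsilon)

    for step in (0, 20000, 145000, 270000, 500000):
        print(step, round(linearly_decaying_epsilon(250000, step, 20000, 0.01), 3))
    # 0 -> 1.0, 20000 -> 1.0, 145000 -> 0.505, 270000 -> 0.01, 500000 -> 0.01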
+ """ + assert isinstance(observation_shape, tuple) + tf.logging.info('Creating %s agent with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t gamma: %f', gamma) + tf.logging.info('\t update_horizon: %f', update_horizon) + tf.logging.info('\t min_replay_history: %d', min_replay_history) + tf.logging.info('\t update_period: %d', update_period) + # tf.logging.info('\t random_seed: %d', random_seed) + tf.logging.info('\t epsilon_train: %f', epsilon_train) + tf.logging.info('\t epsilon_eval: %f', epsilon_eval) + tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period) + tf.logging.info('\t tf_device: %s', tf_device) + tf.logging.info('\t use_staging: %s', use_staging) + tf.logging.info('\t optimizer: %s', optimizer) + tf.logging.info('\t game: %s', game_name) + + self.game_name = game_name + self.num_actions = num_actions + self.observation_shape = tuple(observation_shape) + self.observation_dtype = observation_dtype + self.stack_size = stack_size + self.gamma = gamma + self.update_horizon = update_horizon + self.cumulative_gamma = math.pow(gamma, update_horizon) + self.min_replay_history = min_replay_history + self.epsilon_fn = epsilon_fn + self.epsilon_train = epsilon_train + self.epsilon_eval = epsilon_eval + self.epsilon_decay_period = epsilon_decay_period + self.update_period = update_period + self.eval_mode = False + self.training_steps = 0 + self.optimizer = optimizer # optimizer for RPG + self.optimizer_exp = tf.train.RMSPropOptimizer( + learning_rate=0.00025, + decay=0.95, + momentum=0.0, + epsilon=0.00001, + centered=True) # optimizer for EPG + self.summary_writer = summary_writer + self.summary_writing_frequency = summary_writing_frequency + self.margin = margin + self.start_training = 1000 # todo task specific + self.highest_reward = 6 # todo task specific + # which deviate the hing loss. + self.isPrinted = False + self.current_replay_size = 0 + self.epsilon_current = 1 + + with tf.device(tf_device): + # Create a placeholder for the state input to the DQN network. + # The last axis indicates the number of consecutive frames stacked. + state_shape = (1,) + self.observation_shape + (stack_size,) + self.state = np.zeros(state_shape) + self.state_ph = tf.placeholder(self.observation_dtype, state_shape, + name='state_ph') + self._replay = self._build_replay_buffer(use_staging) + + self._build_networks() + + self._train_op, self._train_exp_op = self._build_train_op() + + self.replay_buffer = ReplayBufferRegular(100000) + + if self.summary_writer is not None: + # All tf.summaries should have been defined prior to running this. + self._merged_summaries = tf.summary.merge_all() + self._sess = sess + self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep) + + # Variables to be initialized by the agent once it interacts with the + # environment. + self._observation = None + self._last_observation = None + + def _get_network_type(self): + """Returns the type of the outputs of a Q value network. + + Returns: + net_type: _network_type object defining the outputs of the network. + """ + return collections.namedtuple('DQN_network', ['q_values']) + + def _network_template(self, state): + """Builds the convolutional network used to compute the agent's Q-values. + + Args: + state: `tf.Tensor`, contains the agent's current state. + + Returns: + net: _network_type object containing the tensors output by the network. + """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) 
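    # With slim's default 'SAME' padding, the spatial sizes below are
    # 84x84 -> 21x21 (stride 4) -> 11x11 (stride 2) -> 11x11 (stride 1),
    # so the flattened feature has 11 * 11 * 64 = 7744 units before the
    # 512-unit fully connected layer and the final num_actions head.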
+ net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + self.explore_convnet = tf.make_template('Explore', self._network_template) + + self._net_outputs = self.online_convnet(self.state_ph) + self._exp_net_outputs = self.explore_convnet(self.state_ph) + + # TODO(bellemare): Ties should be broken. They are unlikely to happen when + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + self._replay_exp_net_outputs = self.explore_convnet( + self._replay.states) + + self.logsoftmaxprob = tf.nn.log_softmax(self._exp_net_outputs.q_values) + self.sample = Categorical(logits=self.logsoftmaxprob).sample(1) + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. 
+ """ + + # for hinge loss + margin = 1 + + replay_action_one_hot = tf.one_hot( + self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') + qvalue = self._replay_net_outputs.q_values + replay_chosen_q = tf.reduce_sum( + self._replay_net_outputs.q_values * replay_action_one_hot, + reduction_indices=1, + name='replay_chosen_q') + self.temp_action_one_hot = replay_action_one_hot + self.temp_qvalue = qvalue + + # Q_j + c - Q_* = temp1 + temp2 + # temp1 = [Q_j + c, Q_*] + # temp2 = [-Q_*, -Q_*] + self.temp1 = (qvalue + margin) * (1 - replay_action_one_hot) + qvalue * replay_action_one_hot + self.temp2 = -(tf.reshape(replay_chosen_q, [-1, 1]) * tf.ones([1, self.num_actions])) \ + * ((1 - replay_action_one_hot) + (replay_action_one_hot)) + self.hingeloss = tf.maximum(0.0, self.temp1 + self.temp2) + loss = tf.reduce_mean(self.hingeloss) + + # for cross entropy loss + logits = self._replay_exp_net_outputs.q_values + self.logsoftmaxprob = tf.nn.log_softmax(logits) + self.neglogprob = - tf.reduce_sum(self.logsoftmaxprob * replay_action_one_hot, axis=1) + # self.temp_loss = self.neglogprob # * self.y_pl + self.actor_loss = tf.reduce_mean(self.neglogprob) + self.replay_action_one_hot = replay_action_one_hot + + # target = tf.stop_gradient(self._build_target_q_op()) + # loss = tf.losses.huber_loss( + # target, replay_chosen_q, reduction=tf.losses.Reduction.NONE) + if self.summary_writer is not None: + with tf.variable_scope('Losses'): + tf.summary.scalar('hingeLoss', loss) + if self.exploration_strategy == "EPG": + tf.summary.scalar('actorloss', self.actor_loss) + return self.optimizer.minimize(loss), self.optimizer_exp.minimize(self.actor_loss) + + def begin_episode(self, observation): + """Returns the agent's first action for this episode. + + Args: + observation: numpy array, the environment's initial observation. + + Returns: + int, the selected action. + """ + self._reset_state() + self._record_observation(observation) + + if not self.eval_mode: + self._train_step() + + self.action = self._select_action() + return self.action + + def step(self, reward, observation): + """Records the most recent transition and returns the agent's next action. + + We store the observation of the last time step since we want to store it + with the reward. + + Args: + reward: float, the reward received from the agent's most recent action. + observation: numpy array, the most recent observation. + + Returns: + int, the selected action. + """ + self._last_observation = self._observation + self._record_observation(observation) + + if not self.eval_mode: + if self.game_name in ["Pong"]: + collect_trajectory(self, reward) + else: + raise ValueError("collection wrong trajectory") + + self._train_step() + + self.action = self._select_action() + return self.action + + def end_episode(self, reward): + """Signals the end of the episode to the agent. + + We store the observation of the current time step, which is the last + observation of the episode. + + Args: + reward: float, the last reward from the environment. 
+ """ + if not self.eval_mode: + if self.game_name in ["Pong"]: + collect_trajectory(self, reward) + else: + raise ValueError("collection wrong trajectory") + # if reward < 0: + # self.replay_buffer.clear() + # elif reward > 0: + # self.replay_buffer.add(self._last_observation, self.action, reward, True) + # while self.replay_buffer.size() > 0: + # experience = self.replay_buffer.get_sample() + # state, action, reward, _ = experience + # self._store_transition(state, action, reward, True) + # # there is zero transition padding to the memory in self._replay. + # else: + # self.replay_buffer.add(self._last_observation, self.action, reward, True) + + def _select_action_training(self): + """Use EPG to select action during training, """ + return self._sess.run(self.sample, {self.state_ph: self.state})[0][0] + + def _select_action(self): + """Select an action from the set of available actions. + + Chooses an action randomly with probability self._calculate_epsilon(), and + otherwise acts greedily according to the current Q-value estimates. + + Returns: + int, the selected action. + """ + # epsilon = self.epsilon_eval if self.eval_mode else self.epsilon_fn( + # self.epsilon_decay_period, + # self.training_steps, + # self.min_replay_history, + # self.epsilon_train) + + self.epsilon_current = 0 + self.current_replay_size = self._replay.memory.add_count + if self.eval_mode: + return self._sess.run(self._q_argmax, {self.state_ph: self.state}) + return self._select_action_training() + + def _train_step(self): + """Runs a single training step. + + Runs a training op if both: + (1) A minimum number of frames have been added to the replay buffer. + (2) `training_steps` is a multiple of `update_period`. + + Also, syncs weights from online to target network if training steps is a + multiple of target update period. + """ + # Run a train op at the rate of self.update_period if enough training steps + # have been run. This matches the Nature DQN behaviour. + + if self._replay.memory.add_count > self.start_training: + if self.training_steps % self.update_period == 0: + + # # debug checked. + # _, temp1, temp2, taction, tqvalue, hingloss = self._sess.run([self._train_op, + # self.temp1, + # self.temp2, + # self.temp_action_one_hot, + # self.temp_qvalue, + # self.hingeloss]) + self._sess.run(self._train_op) + self._sess.run(self._train_exp_op) + if (self.summary_writer is not None and + self.training_steps > 0 and + self.training_steps % self.summary_writing_frequency == 0): + summary = self._sess.run(self._merged_summaries) + self.summary_writer.add_summary(summary, self.training_steps) + + # if self.training_steps % self.target_update_period == 0: + # self._sess.run(self._sync_qt_ops) + + self.training_steps += 1 + + if (self._replay.memory.add_count > self.start_training) and self.isPrinted is False: + print("start training at {}".format(self.training_steps)) + self.isPrinted = True + + def _record_observation(self, observation): + """Records an observation and update state. + + Extracts a frame from the observation vector and overwrites the oldest + frame in the state buffer. + + Args: + observation: numpy array, an observation from the environment. + """ + # Set current observation. We do the reshaping to handle environments + # without frame stacking. + observation = np.reshape(observation, self.observation_shape) + self._observation = observation[..., 0] + self._observation = np.reshape(observation, self.observation_shape) + # Swap out the oldest frame with the current frame. 
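During training, `_select_action` above defers to `_select_action_training`, which samples from a softmax policy over the explorer head (the `Categorical` op built in `_build_networks`) rather than using epsilon-greedy. A NumPy sketch of that sampling rule with made-up logits (illustrative only):

    import numpy as np

    def sample_softmax_action(logits, rng):
        # Softmax over the explorer head's outputs; feeding log-softmax values
        # into Categorical changes only the numerical form, not the distribution.
        shifted = logits - logits.max()
        probs = np.exp(shifted) / np.exp(shifted).sum()
        return rng.choice(len(logits), p=probs)

    rng = np.random.default_rng(0)
    print(sample_softmax_action(np.array([0.5, 2.0, 1.0, -1.0]), rng))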
+ self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. + self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True + + +class ReplayBufferRegular(object): + """ for uniformly sampling. 
+ + """ + + def __init__(self, buffer_size, random_seed=1234): + self.buffer_size = buffer_size + self.count = 0 + # Right side of deque contains newest experience + self.buffer = deque() + random.seed(random_seed) + self.ptr, self.path_start_idx = 0, 0 + + def add(self, state, action, reward, terminal): + experience = [state, action, reward, terminal] + assert self.count < self.buffer_size + self.buffer.append(experience) + self.count += 1 + self.ptr += 1 + # else: + # self.path_start_idx -= 1 + # self.ptr = self.buffer_size - 1 + # self.buffer.popleft() + # self.buffer.append(experience) + + def get_sample(self): + self.count -= 1 + return self.buffer.popleft() + + def size(self): + return self.count + + def clear(self): + self.buffer.clear() + self.count = 0 + self.ptr = 0 + self.path_start_idx = 0 diff --git a/dopamine/dopamine/agents/rpg/__init__.py b/dopamine/dopamine/agents/rpg/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/agents/rpg/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/agents/rpg/configs/rpg.gin b/dopamine/dopamine/agents/rpg/configs/rpg.gin new file mode 100644 index 0000000..96a69fd --- /dev/null +++ b/dopamine/dopamine/agents/rpg/configs/rpg.gin @@ -0,0 +1,36 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.rpg.rpg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +RPGAgent.gamma = 0.99 +RPGAgent.game_name = 'Pong' +RPGAgent.update_horizon = 1 +RPGAgent.min_replay_history = 200000 # agent steps, step more than this, stop exploration. +RPGAgent.update_period = 4 +RPGAgent.epsilon_train = 0.0001 +RPGAgent.epsilon_eval = 0 +RPGAgent.epsilon_decay_period = 250000 # agent steps +RPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RPGAgent.optimizer = @tf.train.RMSPropOptimizer() +RPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). 
+Runner.sticky_actions = False +Runner.num_iterations = 15 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 10000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 32 diff --git a/dopamine/dopamine/agents/rpg/configs/rpg_pong.gin b/dopamine/dopamine/agents/rpg/configs/rpg_pong.gin new file mode 100644 index 0000000..e155ad4 --- /dev/null +++ b/dopamine/dopamine/agents/rpg/configs/rpg_pong.gin @@ -0,0 +1,36 @@ +# Hyperparameters follow the classic Nature DQN, but we modify as necessary to +# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples +# comparison. +import dopamine.atari.run_experiment +import dopamine.agents.rpg.rpg_agent +import dopamine.replay_memory.circular_replay_buffer +import gin.tf.external_configurables + +RPGAgent.gamma = 0.99 +RPGAgent.game_name = 'Pong' +RPGAgent.update_horizon = 1 +RPGAgent.min_replay_history = 200000 # agent steps, step more than this, stop exploration. +RPGAgent.update_period = 4 +RPGAgent.epsilon_train = 0.0001 +RPGAgent.epsilon_eval = 0 +RPGAgent.epsilon_decay_period = 250000 # agent steps +RPGAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RPGAgent.optimizer = @tf.train.RMSPropOptimizer() +RPGAgent.margin = 1 + +tf.train.RMSPropOptimizer.learning_rate = 0.00025 +tf.train.RMSPropOptimizer.decay = 0.95 +tf.train.RMSPropOptimizer.momentum = 0.0 +tf.train.RMSPropOptimizer.epsilon = 0.00001 +tf.train.RMSPropOptimizer.centered = True + +Runner.game_name = 'Pong' +# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017). +Runner.sticky_actions = False +Runner.num_iterations = 15 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 10000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedReplayBuffer.replay_capacity = 1000000 +WrappedReplayBuffer.batch_size = 256 diff --git a/dopamine/dopamine/agents/rpg/rpg_agent.py b/dopamine/dopamine/agents/rpg/rpg_agent.py new file mode 100644 index 0000000..e7ddbd5 --- /dev/null +++ b/dopamine/dopamine/agents/rpg/rpg_agent.py @@ -0,0 +1,613 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compact implementation of a DQN agent.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math +import os +import random + +from dopamine.agents.agent_utils import * +from dopamine.replay_memory import circular_replay_buffer +import numpy as np +import tensorflow as tf +from tensorflow.distributions import Categorical +import gin.tf +from collections import deque + +slim = tf.contrib.slim + +NATURE_DQN_OBSERVATION_SHAPE = (84, 84) # Size of downscaled Atari 2600 frame. +NATURE_DQN_DTYPE = tf.uint8 # DType of Atari 2600 observations. +NATURE_DQN_STACK_SIZE = 4 # Number of frames in the state stack. 
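+
+# A worked example (numbers are illustrative only) of the schedule computed by
+# linearly_decaying_epsilon() below, with decay_period=250000,
+# warmup_steps=20000 and a final epsilon of 0.01:
+#   step <= 20000  -> bonus clipped to 0.99           -> epsilon = 1.0
+#   step = 145000  -> bonus = 0.99 * 125000 / 250000  -> epsilon ~= 0.505
+#   step >= 270000 -> bonus clipped to 0.0            -> epsilon = 0.01
+# Note that, as written, RPGAgent._select_action below uses a simpler
+# two-phase schedule and only stores this helper as `epsilon_fn`.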
+
+
+def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
+  """Returns the current epsilon for the agent's epsilon-greedy policy.
+
+  This follows the Nature DQN schedule of a linearly decaying epsilon (Mnih et
+  al., 2015). The schedule is as follows:
+    Begin at 1. until warmup_steps steps have been taken; then
+    Linearly decay epsilon from 1. to epsilon in decay_period steps; and then
+    Use epsilon from there on.
+
+  Args:
+    decay_period: float, the period over which epsilon is decayed.
+    step: int, the number of training steps completed so far.
+    warmup_steps: int, the number of steps taken before epsilon is decayed.
+    epsilon: float, the final value to which to decay the epsilon parameter.
+
+  Returns:
+    A float, the current epsilon value computed according to the schedule.
+  """
+  steps_left = decay_period + warmup_steps - step
+  bonus = (1.0 - epsilon) * steps_left / decay_period
+  bonus = np.clip(bonus, 0., 1. - epsilon)
+  return epsilon + bonus
+
+
+@gin.configurable
+class RPGAgent(object):
+  """An implementation of the RPG agent, adapted from the DQN agent."""
+
+  def __init__(self,
+               sess,
+               num_actions,
+               game_name="Pong",
+               observation_shape=NATURE_DQN_OBSERVATION_SHAPE,
+               observation_dtype=NATURE_DQN_DTYPE,
+               stack_size=NATURE_DQN_STACK_SIZE,
+               gamma=0.99,
+               update_horizon=1,
+               min_replay_history=20000,
+               update_period=4,
+               epsilon_fn=linearly_decaying_epsilon,
+               epsilon_train=0.01,
+               epsilon_eval=0.001,
+               epsilon_decay_period=250000,
+               margin=1,
+               tf_device='/cpu:*',
+               use_staging=True,
+               max_tf_checkpoints_to_keep=3,
+               optimizer=tf.train.RMSPropOptimizer(
+                   learning_rate=0.00025,
+                   decay=0.95,
+                   momentum=0.0,
+                   epsilon=0.00001,
+                   centered=True),
+               summary_writer=None,
+               summary_writing_frequency=500):
+    """Initializes the agent and constructs the components of its graph.
+
+    Args:
+      sess: `tf.Session`, for executing ops.
+      num_actions: int, number of actions the agent can take at any state.
+      game_name: str, name of the Atari 2600 game being played.
+      observation_shape: tuple of ints describing the observation shape.
+      observation_dtype: tf.DType, specifies the type of the observations. Note
+        that if your inputs are continuous, you should set this to tf.float32.
+      stack_size: int, number of frames to use in state stack.
+      gamma: float, discount factor with the usual RL meaning.
+      update_horizon: int, horizon at which updates are performed, the 'n' in
+        n-step update.
+      min_replay_history: int, number of transitions that should be experienced
+        before the agent begins training its value function.
+      update_period: int, period between DQN updates.
+      epsilon_fn: function expecting 4 parameters:
+        (decay_period, step, warmup_steps, epsilon). This function should return
+        the epsilon value used for exploration during training.
+      epsilon_train: float, the value to which the agent's epsilon is eventually
+        decayed during training.
+      epsilon_eval: float, epsilon used when evaluating the agent.
+      epsilon_decay_period: int, length of the epsilon decay schedule.
+      margin: float, the margin used in the hinge (ranking) loss.
+      tf_device: str, Tensorflow device on which the agent's graph is executed.
+      use_staging: bool, when True use a staging area to prefetch the next
+        training batch, speeding training up by about 30%.
+      max_tf_checkpoints_to_keep: int, the number of TensorFlow checkpoints to
+        keep.
+      optimizer: `tf.train.Optimizer`, for training the value function.
+      summary_writer: SummaryWriter object for outputting training statistics.
+        Summary writing disabled if set to None.
+      summary_writing_frequency: int, frequency with which summaries will be
+        written. Lower values will result in slower training.
+    """
+    assert isinstance(observation_shape, tuple)
+    tf.logging.info('Creating %s agent with the following parameters:',
+                    self.__class__.__name__)
+    tf.logging.info('\t gamma: %f', gamma)
+    tf.logging.info('\t update_horizon: %f', update_horizon)
+    tf.logging.info('\t min_replay_history: %d', min_replay_history)
+    tf.logging.info('\t update_period: %d', update_period)
+    # tf.logging.info('\t random_seed: %d', random_seed)
+    tf.logging.info('\t epsilon_train: %f', epsilon_train)
+    tf.logging.info('\t epsilon_eval: %f', epsilon_eval)
+    tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period)
+    tf.logging.info('\t tf_device: %s', tf_device)
+    tf.logging.info('\t use_staging: %s', use_staging)
+    tf.logging.info('\t optimizer: %s', optimizer)
+    tf.logging.info('\t game: %s', game_name)
+
+    self.game_name = game_name
+    self.num_actions = num_actions
+    self.observation_shape = tuple(observation_shape)
+    self.observation_dtype = observation_dtype
+    self.stack_size = stack_size
+    self.gamma = gamma
+    self.update_horizon = update_horizon
+    self.cumulative_gamma = math.pow(gamma, update_horizon)
+    self.min_replay_history = min_replay_history
+    self.epsilon_fn = epsilon_fn
+    self.epsilon_train = epsilon_train
+    self.epsilon_eval = epsilon_eval
+    self.epsilon_decay_period = epsilon_decay_period
+    self.update_period = update_period
+    self.eval_mode = False
+    self.training_steps = 0
+    self.optimizer = optimizer
+    self.summary_writer = summary_writer
+    self.summary_writing_frequency = summary_writing_frequency
+    self.margin = margin
+    self.start_training = 1000  # TODO: task-specific; tune per game.
+    self.highest_reward = 6  # TODO: task-specific; tune per game.
+    # Any value other than "EPG" falls back to epsilon-greedy exploration.
+    self.exploration_strategy = "NonEPG"
+    # TODO: EPG exploration did not work here: once the hinge loss becomes
+    # small, the optimizer focuses on the cross-entropy term, which pulls the
+    # parameters away from the hinge-loss objective.
+    self.isPrinted = False
+    self.current_replay_size = 0
+    self.epsilon_current = 1
+
+    with tf.device(tf_device):
+      # Create a placeholder for the state input to the DQN network.
+      # The last axis indicates the number of consecutive frames stacked.
+      state_shape = (1,) + self.observation_shape + (stack_size,)
+      self.state = np.zeros(state_shape)
+      self.state_ph = tf.placeholder(self.observation_dtype, state_shape,
+                                     name='state_ph')
+      self._replay = self._build_replay_buffer(use_staging)
+
+      self._build_networks()
+
+      self._train_op = self._build_train_op()
+
+      self.replay_buffer = ReplayBufferRegular(100000)
+
+    if self.summary_writer is not None:
+      # All tf.summaries should have been defined prior to running this.
+      self._merged_summaries = tf.summary.merge_all()
+    self._sess = sess
+    self._saver = tf.train.Saver(max_to_keep=max_tf_checkpoints_to_keep)
+
+    # Variables to be initialized by the agent once it interacts with the
+    # environment.
+    self._observation = None
+    self._last_observation = None
+
+  def _get_network_type(self):
+    """Returns the type of the outputs of a Q value network.
+
+    Returns:
+      net_type: _network_type object defining the outputs of the network.
+    """
+    return collections.namedtuple('DQN_network', ['q_values'])
+
+  def _network_template(self, state):
+    """Builds the convolutional network used to compute the agent's Q-values.
+
+    Args:
+      state: `tf.Tensor`, contains the agent's current state.
+
+    Returns:
+      net: _network_type object containing the tensors output by the network.
+ """ + net = tf.cast(state, tf.float32) + net = tf.div(net, 255.) + net = slim.conv2d(net, 32, [8, 8], stride=4) + net = slim.conv2d(net, 64, [4, 4], stride=2) + net = slim.conv2d(net, 64, [3, 3], stride=1) + net = slim.flatten(net) + net = slim.fully_connected(net, 512) + q_values = slim.fully_connected(net, self.num_actions, activation_fn=None) + return self._get_network_type()(q_values) + + def _build_networks(self): + """Builds the Q-value network computations needed for acting and training. + + These are: + self.online_convnet: For computing the current state's Q-values. + self.target_convnet: For computing the next state's target Q-values. + self._net_outputs: The actual Q-values. + self._q_argmax: The action maximizing the current state's Q-values. + self._replay_net_outputs: The replayed states' Q-values. + self._replay_next_target_net_outputs: The replayed next states' target + Q-values (see Mnih et al., 2015 for details). + """ + # Calling online_convnet will generate a new graph as defined in + # self._get_network_template using whatever input is passed, but will always + # share the same weights. + self.online_convnet = tf.make_template('Online', self._network_template) + self.target_convnet = tf.make_template('Target', self._network_template) + self._net_outputs = self.online_convnet(self.state_ph) + # TODO(bellemare): Ties should be broken. They are unlikely to happen when + # using a deep network, but may affect performance with a linear + # approximation scheme. + self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0] + + self._replay_net_outputs = self.online_convnet(self._replay.states) + self._replay_next_target_net_outputs = self.target_convnet( + self._replay.next_states) + self.logsoftmaxprob = tf.nn.log_softmax(self._net_outputs.q_values) + self.sample = Categorical(logits=self.logsoftmaxprob).sample(1) + + def _build_replay_buffer(self, use_staging): + """Creates the replay buffer used by the agent. + + Args: + use_staging: bool, if True, uses a staging area to prefetch data for + faster training. + + Returns: + A WrapperReplayBuffer object. + """ + return circular_replay_buffer.WrappedReplayBuffer( + observation_shape=self.observation_shape, + stack_size=self.stack_size, + use_staging=use_staging, + update_horizon=self.update_horizon, + gamma=self.gamma, + observation_dtype=self.observation_dtype.as_numpy_dtype) + + def _build_train_op(self): + """Builds a training op. + + Returns: + train_op: An op performing one step of training from replay data. 
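+
+    The loss is a margin (hinge) loss over actions stored in the replay
+    buffer: for a stored action a* with value Q_*, every other action j is
+    pushed to satisfy Q_* >= Q_j + margin. As a small numeric illustration
+    (the numbers are made up), with Q = [0.2, 0.5, -0.1], a* = 1 and
+    margin = 1, the per-action hinge terms are
+    max(0, [1.2, 0.5, 0.9] - 0.5) = [0.7, 0.0, 0.4], and the loss is their
+    mean.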
+    """
+    margin = self.margin
+
+    replay_action_one_hot = tf.one_hot(
+        self._replay.actions, self.num_actions, 1., 0., name='action_one_hot')
+    qvalue = self._replay_net_outputs.q_values
+    replay_chosen_q = tf.reduce_sum(
+        self._replay_net_outputs.q_values * replay_action_one_hot,
+        reduction_indices=1,
+        name='replay_chosen_q')
+    self.temp_action_one_hot = replay_action_one_hot
+    self.temp_qvalue = qvalue
+
+    # Hinge (margin) loss: the stored action a* with value Q_* should dominate
+    # every other action by at least `margin`, i.e. max(0, Q_j + margin - Q_*).
+    # temp1 holds Q_j + margin for non-chosen actions and Q_* for the chosen
+    # one; temp2 broadcasts -Q_* across all actions, so temp1 + temp2 is zero
+    # for the chosen action and Q_j + margin - Q_* otherwise.
+    self.temp1 = (qvalue + margin) * (1 - replay_action_one_hot) + qvalue * replay_action_one_hot
+    self.temp2 = -(tf.reshape(replay_chosen_q, [-1, 1]) * tf.ones([1, self.num_actions])) \
+        * ((1 - replay_action_one_hot) + (replay_action_one_hot))
+    self.hingeloss = tf.maximum(0.0, self.temp1 + self.temp2)
+    loss = tf.reduce_mean(self.hingeloss)
+
+    if self.exploration_strategy == "EPG":
+      logits = qvalue
+      self.logsoftmaxprob = tf.nn.log_softmax(logits)
+      self.neglogprob = - tf.reduce_sum(self.logsoftmaxprob * replay_action_one_hot, axis=1)
+      self.actor_loss = tf.reduce_mean(self.neglogprob)
+      loss = self.actor_loss + loss
+
+    # target = tf.stop_gradient(self._build_target_q_op())
+    # loss = tf.losses.huber_loss(
+    #     target, replay_chosen_q, reduction=tf.losses.Reduction.NONE)
+    if self.summary_writer is not None:
+      with tf.variable_scope('Losses'):
+        tf.summary.scalar('hingeLoss', loss)
+        if self.exploration_strategy == "EPG":
+          tf.summary.scalar('actorloss', self.actor_loss)
+    return self.optimizer.minimize(loss)
+
+  def begin_episode(self, observation):
+    """Returns the agent's first action for this episode.
+
+    Args:
+      observation: numpy array, the environment's initial observation.
+
+    Returns:
+      int, the selected action.
+    """
+    self._reset_state()
+    self._record_observation(observation)
+
+    if not self.eval_mode:
+      self._train_step()
+
+    self.action = self._select_action()
+    return self.action
+
+  def step(self, reward, observation):
+    """Records the most recent transition and returns the agent's next action.
+
+    We store the observation of the last time step since we want to store it
+    with the reward.
+
+    Args:
+      reward: float, the reward received from the agent's most recent action.
+      observation: numpy array, the most recent observation.
+
+    Returns:
+      int, the selected action.
+    """
+    self._last_observation = self._observation
+    self._record_observation(observation)
+
+    if not self.eval_mode:
+      if self.game_name in ["Pong"]:
+        collect_trajectory(self, reward)
+      else:
+        raise ValueError("Trajectory collection is only implemented for Pong.")
+
+      self._train_step()
+
+    self.action = self._select_action()
+    return self.action
+
+  def end_episode(self, reward):
+    """Signals the end of the episode to the agent.
+
+    We store the observation of the current time step, which is the last
+    observation of the episode.
+
+    Args:
+      reward: float, the last reward from the environment.
+    """
+    if not self.eval_mode:
+      if self.game_name in ["Pong"]:
+        collect_trajectory(self, reward)
+      else:
+        raise ValueError("Trajectory collection is only implemented for Pong.")
+      # if reward < 0:
+      #   self.replay_buffer.clear()
+      # elif reward > 0:
+      #   self.replay_buffer.add(self._last_observation, self.action, reward, True)
+      #   while self.replay_buffer.size() > 0:
+      #     experience = self.replay_buffer.get_sample()
+      #     state, action, reward, _ = experience
+      #     self._store_transition(state, action, reward, True)
+      #     # there is zero transition padding to the memory in self._replay.
+      # else:
+      #   self.replay_buffer.add(self._last_observation, self.action, reward, True)
+
+  def _select_action_training(self):
+    """Samples an action from the softmax over Q-values (EPG) during training."""
+    return self._sess.run(self.sample, {self.state_ph: self.state})[0][0]
+
+  def _select_action(self):
+    """Selects an action from the set of available actions.
+
+    With the "EPG" exploration strategy, the action is sampled from the softmax
+    over Q-values during training and chosen greedily during evaluation.
+    Otherwise the agent is epsilon-greedy: epsilon stays at 1 until
+    min_replay_history training steps have elapsed and is epsilon_train
+    afterwards (epsilon_eval in evaluation mode).
+
+    Returns:
+      int, the selected action.
+    """
+    # epsilon = self.epsilon_eval if self.eval_mode else self.epsilon_fn(
+    #     self.epsilon_decay_period,
+    #     self.training_steps,
+    #     self.min_replay_history,
+    #     self.epsilon_train)
+
+    exploration = self.exploration_strategy  # anything other than "EPG" means epsilon-greedy
+    if exploration == "EPG":
+      self.epsilon_current = 0
+      self.current_replay_size = self._replay.memory.add_count
+      if self.eval_mode:
+        return self._sess.run(self._q_argmax, {self.state_ph: self.state})
+      return self._select_action_training()
+    else:
+      if self.training_steps < self.min_replay_history:
+        epsilon = 1
+      else:
+        epsilon = self.epsilon_train
+      if self.eval_mode:
+        epsilon = self.epsilon_eval
+
+      self.epsilon_current = epsilon
+      self.current_replay_size = self._replay.memory.add_count
+      if random.random() <= epsilon:
+        # Choose a random action with probability epsilon.
+        return random.randint(0, self.num_actions - 1)
+      else:
+        # Choose the action with highest Q-value at the current state.
+        return self._sess.run(self._q_argmax, {self.state_ph: self.state})
+
+  def _train_step(self):
+    """Runs a single training step.
+
+    Runs a training op if both:
+      (1) A minimum number of transitions have been added to the replay buffer.
+      (2) `training_steps` is a multiple of `update_period`.
+
+    Unlike DQN, no target-network synchronization is performed here.
+    """
+    # Run a train op at the rate of self.update_period if enough training steps
+    # have been run. This matches the Nature DQN behaviour.
+
+    if self._replay.memory.add_count > self.start_training:
+      if self.training_steps % self.update_period == 0:
+
+        # # debug checked.
+        # _, temp1, temp2, taction, tqvalue, hingloss = self._sess.run([self._train_op,
+        #                                                               self.temp1,
+        #                                                               self.temp2,
+        #                                                               self.temp_action_one_hot,
+        #                                                               self.temp_qvalue,
+        #                                                               self.hingeloss])
+        self._sess.run(self._train_op)
+        if (self.summary_writer is not None and
+            self.training_steps > 0 and
+            self.training_steps % self.summary_writing_frequency == 0):
+          summary = self._sess.run(self._merged_summaries)
+          self.summary_writer.add_summary(summary, self.training_steps)
+
+      # if self.training_steps % self.target_update_period == 0:
+      #   self._sess.run(self._sync_qt_ops)
+
+    self.training_steps += 1
+
+    if (self._replay.memory.add_count > self.start_training) and self.isPrinted is False:
+      print("Start training at step {}.".format(self.training_steps))
+      self.isPrinted = True
+
+  def _record_observation(self, observation):
+    """Records an observation and updates the agent's state.
+
+    Extracts a frame from the observation vector and overwrites the oldest
+    frame in the state buffer.
+
+    Args:
+      observation: numpy array, an observation from the environment.
+    """
+    # Set current observation. We do the reshaping to handle environments
+    # without frame stacking.
+    self._observation = np.reshape(observation, self.observation_shape)
+    # Swap out the oldest frame with the current frame.
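+    # For the default (1, 84, 84, 4) state this rolls the stack along the last
+    # axis so the oldest frame wraps into slot -1, where it is immediately
+    # overwritten by the new observation.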
+ self.state = np.roll(self.state, -1, axis=-1) + self.state[0, ..., -1] = self._observation + + def _store_transition(self, last_observation, action, reward, is_terminal): + """Stores an experienced transition. + + Executes a tf session and executes replay buffer ops in order to store the + following tuple in the replay buffer: + (last_observation, action, reward, is_terminal). + + Pedantically speaking, this does not actually store an entire transition + since the next state is recorded on the following time step. + + Args: + last_observation: numpy array, last observation. + action: int, the action taken. + reward: float, the reward. + is_terminal: bool, indicating if the current state is a terminal state. + """ + self._replay.add(last_observation, action, reward, is_terminal) + + def _reset_state(self): + """Resets the agent state by filling it with zeros.""" + self.state.fill(0) + + def bundle_and_checkpoint(self, checkpoint_dir, iteration_number): + """Returns a self-contained bundle of the agent's state. + + This is used for checkpointing. It will return a dictionary containing all + non-TensorFlow objects (to be saved into a file by the caller), and it saves + all TensorFlow objects into a checkpoint file. + + Args: + checkpoint_dir: str, directory where TensorFlow objects will be saved. + iteration_number: int, iteration number to use for naming the checkpoint + file. + + Returns: + A dict containing additional Python objects to be checkpointed by the + experiment. If the checkpoint directory does not exist, returns None. + """ + if not tf.gfile.Exists(checkpoint_dir): + return None + # Call the Tensorflow saver to checkpoint the graph. + self._saver.save( + self._sess, + os.path.join(checkpoint_dir, 'tf_ckpt'), + global_step=iteration_number) + # Checkpoint the out-of-graph replay buffer. + self._replay.save(checkpoint_dir, iteration_number) + bundle_dictionary = {} + bundle_dictionary['state'] = self.state + bundle_dictionary['eval_mode'] = self.eval_mode + bundle_dictionary['training_steps'] = self.training_steps + return bundle_dictionary + + def unbundle(self, checkpoint_dir, iteration_number, bundle_dictionary): + """Restores the agent from a checkpoint. + + Restores the agent's Python objects to those specified in bundle_dictionary, + and restores the TensorFlow objects to those specified in the + checkpoint_dir. If the checkpoint_dir does not exist, will not reset the + agent's state. + + Args: + checkpoint_dir: str, path to the checkpoint saved by tf.Save. + iteration_number: int, checkpoint version, used when restoring replay + buffer. + bundle_dictionary: dict, containing additional Python objects owned by + the agent. + + Returns: + bool, True if unbundling was successful. + """ + try: + # self._replay.load() will throw a NotFoundError if it does not find all + # the necessary files, in which case we abort the process & return False. + self._replay.load(checkpoint_dir, iteration_number) + except tf.errors.NotFoundError: + return False + for key in self.__dict__: + if key in bundle_dictionary: + self.__dict__[key] = bundle_dictionary[key] + # Restore the agent's TensorFlow graph. + self._saver.restore(self._sess, + os.path.join(checkpoint_dir, + 'tf_ckpt-{}'.format(iteration_number))) + return True + + +class ReplayBufferRegular(object): + """ for uniformly sampling. 
+ + """ + + def __init__(self, buffer_size, random_seed=1234): + self.buffer_size = buffer_size + self.count = 0 + # Right side of deque contains newest experience + self.buffer = deque() + random.seed(random_seed) + self.ptr, self.path_start_idx = 0, 0 + + def add(self, state, action, reward, terminal): + experience = [state, action, reward, terminal] + assert self.count < self.buffer_size + self.buffer.append(experience) + self.count += 1 + self.ptr += 1 + # else: + # self.path_start_idx -= 1 + # self.ptr = self.buffer_size - 1 + # self.buffer.popleft() + # self.buffer.append(experience) + + def get_sample(self): + self.count -= 1 + return self.buffer.popleft() + + def size(self): + return self.count + + def clear(self): + self.buffer.clear() + self.count = 0 + self.ptr = 0 + self.path_start_idx = 0 diff --git a/dopamine/dopamine/atari/__init__.py b/dopamine/dopamine/atari/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/atari/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/atari/preprocessing.py b/dopamine/dopamine/atari/preprocessing.py new file mode 100644 index 0000000..861c544 --- /dev/null +++ b/dopamine/dopamine/atari/preprocessing.py @@ -0,0 +1,216 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A class implementing minimal Atari 2600 preprocessing. + +This includes: + . Emitting a terminal signal when losing a life (optional). + . Frame skipping and color pooling. + . Resizing the image before it is provided to the agent. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from gym.spaces.box import Box +import numpy as np +import gin.tf +import cv2 + + +@gin.configurable +class AtariPreprocessing(object): + """A class implementing image preprocessing for Atari 2600 agents. + + Specifically, this provides the following subset from the JAIR paper + (Bellemare et al., 2013) and Nature DQN paper (Mnih et al., 2015): + + * Frame skipping (defaults to 4). + * Terminal signal when a life is lost (off by default). + * Grayscale and max-pooling of the last two frames. + * Downsample the screen to a square image (defaults to 84x84). + + More generally, this class follows the preprocessing guidelines set down in + Machado et al. 
(2018), "Revisiting the Arcade Learning Environment: + Evaluation Protocols and Open Problems for General Agents". + """ + + def __init__(self, environment, frame_skip=4, terminal_on_life_loss=False, + screen_size=84): + """Constructor for an Atari 2600 preprocessor. + + Args: + environment: Gym environment whose observations are preprocessed. + frame_skip: int, the frequency at which the agent experiences the game. + terminal_on_life_loss: bool, If True, the step() method returns + is_terminal=True whenever a life is lost. See Mnih et al. 2015. + screen_size: int, size of a resized Atari 2600 frame. + + Raises: + ValueError: if frame_skip or screen_size are not strictly positive. + """ + if frame_skip <= 0: + raise ValueError('Frame skip should be strictly positive, got {}'. + format(frame_skip)) + if screen_size <= 0: + raise ValueError('Target screen size should be strictly positive, got {}'. + format(screen_size)) + + self.environment = environment + self.terminal_on_life_loss = terminal_on_life_loss + self.frame_skip = frame_skip + self.screen_size = screen_size + + obs_dims = self.environment.observation_space + # Stores temporary observations used for pooling over two successive + # frames. + self.screen_buffer = [ + np.empty((obs_dims.shape[0], obs_dims.shape[1]), dtype=np.uint8), + np.empty((obs_dims.shape[0], obs_dims.shape[1]), dtype=np.uint8) + ] + + self.game_over = False + self.lives = 0 # Will need to be set by reset(). + + @property + def observation_space(self): + # Return the observation space adjusted to match the shape of the processed + # observations. + return Box(low=0, high=255, shape=(self.screen_size, self.screen_size, 1), + dtype=np.uint8) + + @property + def action_space(self): + return self.environment.action_space + + @property + def reward_range(self): + return self.environment.reward_range + + @property + def metadata(self): + return self.environment.metadata + + def reset(self): + """Resets the environment. + + Returns: + observation: numpy array, the initial observation emitted by the + environment. + """ + self.environment.reset() + self.lives = self.environment.ale.lives() + self._fetch_grayscale_observation(self.screen_buffer[0]) + self.screen_buffer[1].fill(0) + return self._pool_and_resize() + + def render(self, mode): + """Renders the current screen, before preprocessing. + + This calls the Gym API's render() method. + + Args: + mode: Mode argument for the environment's render() method. + Valid values (str) are: + 'rgb_array': returns the raw ALE image. + 'human': renders to display via the Gym renderer. + + Returns: + if mode='rgb_array': numpy array, the most recent screen. + if mode='human': bool, whether the rendering was successful. + """ + return self.environment.render(mode) + + def step(self, action): + """Applies the given action in the environment. + + Remarks: + + * If a terminal state (from life loss or episode end) is reached, this may + execute fewer than self.frame_skip steps in the environment. + * Furthermore, in this case the returned observation may not contain valid + image data and should be ignored. + + Args: + action: The action to be executed. + + Returns: + observation: numpy array, the observation following the action. + reward: float, the reward following the action. + is_terminal: bool, whether the environment has reached a terminal state. + This is true when a life is lost and terminal_on_life_loss, or when the + episode is over. + info: Gym API's info data structure. + """ + accumulated_reward = 0. 
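+    # With the default frame_skip of 4, the loop below repeats `action` four
+    # times, sums the four rewards, and fetches grayscale screens only on the
+    # last two repeats so that _pool_and_resize() can max-pool them into a
+    # single resized observation.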
+ + for time_step in range(self.frame_skip): + # We bypass the Gym observation altogether and directly fetch the + # grayscale image from the ALE. This is a little faster. + _, reward, game_over, info = self.environment.step(action) + accumulated_reward += reward + + if self.terminal_on_life_loss: + new_lives = self.environment.ale.lives() + is_terminal = game_over or new_lives < self.lives + self.lives = new_lives + else: + is_terminal = game_over + + if is_terminal: + break + # We max-pool over the last two frames, in grayscale. + elif time_step >= self.frame_skip - 2: + t = time_step - (self.frame_skip - 2) + self._fetch_grayscale_observation(self.screen_buffer[t]) + + # Pool the last two observations. + observation = self._pool_and_resize() + + self.game_over = game_over + return observation, accumulated_reward, is_terminal, info + + def _fetch_grayscale_observation(self, output): + """Returns the current observation in grayscale. + + The returned observation is stored in 'output'. + + Args: + output: numpy array, screen buffer to hold the returned observation. + + Returns: + observation: numpy array, the current observation in grayscale. + """ + self.environment.ale.getScreenGrayscale(output) + return output + + def _pool_and_resize(self): + """Transforms two frames into a Nature DQN observation. + + For efficiency, the transformation is done in-place in self.screen_buffer. + + Returns: + transformed_screen: numpy array, pooled, resized screen. + """ + # Pool if there are enough screens to do so. + if self.frame_skip > 1: + np.maximum(self.screen_buffer[0], self.screen_buffer[1], + out=self.screen_buffer[0]) + + transformed_image = cv2.resize(self.screen_buffer[0], + (self.screen_size, self.screen_size), + interpolation=cv2.INTER_AREA) + int_image = np.asarray(transformed_image, dtype=np.uint8) + return np.expand_dims(int_image, axis=2) diff --git a/dopamine/dopamine/atari/run_experiment.py b/dopamine/dopamine/atari/run_experiment.py new file mode 100644 index 0000000..e82758c --- /dev/null +++ b/dopamine/dopamine/atari/run_experiment.py @@ -0,0 +1,592 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module defining classes and helper methods for running Atari 2600 agents.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import time + + +import atari_py +from dopamine.atari import preprocessing +from dopamine.common import checkpointer +from dopamine.common import iteration_statistics +from dopamine.common import logger +from dopamine.agents.agent_utils import * +import gym +import numpy as np +import tensorflow as tf + +import gin.tf + +RPG_AGENTS = ['dqnrpg', 'rainbowrpg', 'implicit_quantilerpg', 'c51rpg'] + + +def load_gin_configs(gin_files, gin_bindings): + """Loads gin configuration files. + + Args: + gin_files: list, of paths to the gin configuration files for this + experiment. 
+ gin_bindings: list, of gin parameter bindings to override the values in + the config files. + """ + gin.parse_config_files_and_bindings(gin_files, + bindings=gin_bindings, + skip_unknown=False) + + +def create_atari_environment(game_name, sticky_actions=True): + """Wraps an Atari 2600 Gym environment with some basic preprocessing. + + This preprocessing matches the guidelines proposed in Machado et al. (2017), + "Revisiting the Arcade Learning Environment: Evaluation Protocols and Open + Problems for General Agents". + + The created environment is the Gym wrapper around the Arcade Learning + Environment. + + The main choice available to the user is whether to use sticky actions or not. + Sticky actions, as prescribed by Machado et al., cause actions to persist + with some probability (0.25) when a new command is sent to the ALE. This + can be viewed as introducing a mild form of stochasticity in the environment. + We use them by default. + + Args: + game_name: str, the name of the Atari 2600 domain. + sticky_actions: bool, whether to use sticky_actions as per Machado et al. + + Returns: + An Atari 2600 environment with some standard preprocessing. + """ + game_version = 'v0' if sticky_actions else 'v4' + full_game_name = '{}NoFrameskip-{}'.format(game_name, game_version) + env = gym.make(full_game_name) + # Strip out the TimeLimit wrapper from Gym, which caps us at 100k frames. We + # handle this time limit internally instead, which lets us cap at 108k frames + # (30 minutes). The TimeLimit wrapper also plays poorly with saving and + # restoring states. + env = env.env + env = preprocessing.AtariPreprocessing(env) + return env + + +@gin.configurable +class Runner(object): + """Object that handles running Atari 2600 experiments. + + Here we use the term 'experiment' to mean simulating interactions between the + agent and the environment and reporting some statistics pertaining to these + interactions. + + A simple scenario to train a DQN agent is as follows: + + ```python + base_dir = '/tmp/simple_example' + def create_agent(sess, environment): + return dqn_agent.DQNAgent(sess, num_actions=environment.action_space.n) + runner = Runner(base_dir, create_agent, game_name='Pong') + runner.run() + ``` + """ + + def __init__(self, + base_dir, + create_agent_fn, + random_seed, + agent_name, + game_name, + num_iterations, + create_environment_fn=create_atari_environment, + sticky_actions=True, + checkpoint_file_prefix='ckpt', + logging_file_prefix='log', + log_every_n=1, + training_steps=250000, + evaluation_steps=125000, + max_steps_per_episode=27000): + """Initialize the Runner object in charge of running a full experiment. + + Args: + base_dir: str, the base directory to host all required sub-directories. + create_agent_fn: A function that takes as args a Tensorflow session and an + Atari 2600 Gym environment, and returns an agent. + create_environment_fn: A function which receives a game name and creates + an Atari 2600 Gym environment. + game_name: str, name of the Atari 2600 domain to run. + sticky_actions: bool, whether to enable sticky actions in the environment. + checkpoint_file_prefix: str, the prefix to use for checkpoint files. + logging_file_prefix: str, prefix to use for the log files. + log_every_n: int, the frequency for writing logs. + num_iterations: int, the iteration number threshold (must be greater than + start_iteration). + training_steps: int, the number of training steps to perform. + evaluation_steps: int, the number of evaluation steps to perform. 
+ max_steps_per_episode: int, maximum number of steps after which an episode + terminates. + + This constructor will take the following actions: + - Initialize an environment. + - Initialize a `tf.Session`. + - Initialize a logger. + - Initialize an agent. + - Reload from the latest checkpoint, if available, and initialize the + Checkpointer object. + """ + assert base_dir is not None + assert game_name is not None + self._logging_file_prefix = logging_file_prefix + self._log_every_n = log_every_n + self._num_iterations = num_iterations + self._training_steps = training_steps + self._evaluation_steps = evaluation_steps + self._max_steps_per_episode = max_steps_per_episode + self._base_dir = base_dir + self._create_directories() + self._summary_writer = tf.summary.FileWriter(self._base_dir) + self.average_reward_eval = -100 + self.game_name = game_name + self.agent_name = agent_name + + self._environment = create_environment_fn(game_name, sticky_actions) + # Set up a session and initialize variables. + tf.set_random_seed(random_seed) + tfconfig = tf.ConfigProto(allow_soft_placement=True) + tfconfig.gpu_options.allow_growth = True + self._sess = tf.Session('', + config=tfconfig) + # self._sess = tf.Session('', + # config=tf.ConfigProto(allow_soft_placement=True)) + self._agent = create_agent_fn(self._sess, self._environment, + summary_writer=self._summary_writer) + tf.logging.info('Running %s with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t random_seed: %s', random_seed) + tf.logging.info('\t num_iterations: %s', num_iterations) + tf.logging.info('\t training_steps: %s', training_steps) + tf.logging.info('\t sticky_actions: %s', sticky_actions) + tf.logging.info('\t game_name: %s', game_name) + # self._sess = tf.Session('', + # config=tf.ConfigProto(allow_soft_placement=True)) + # self._agent = create_agent_fn(self._sess, self._environment, + # summary_writer=self._summary_writer) + + self._summary_writer.add_graph(graph=tf.get_default_graph()) + self._sess.run(tf.global_variables_initializer()) + + self._initialize_checkpointer_and_maybe_resume(checkpoint_file_prefix) + + # # restore from pretained model a quick fix. + # base_restore_dir = "/mnt/research/linkaixi/AllData/pommerman/dopaminecheckpts" + # agent_name = "c51" + # restore_dir = base_restore_dir + "/{}/{}/1/tf_checkpoints".format(agent_name, game_name) + # self.restore_checkpoints(restore_dir, "tf_ckpt-199") + + def _create_directories(self): + """Create necessary sub-directories.""" + self._checkpoint_dir = os.path.join(self._base_dir, 'checkpoints') + self._logger = logger.Logger(os.path.join(self._base_dir, 'logs')) + + def _initialize_checkpointer_and_maybe_resume(self, checkpoint_file_prefix): + """Reloads the latest checkpoint if it exists. + + This method will first create a `Checkpointer` object and then call + `checkpointer.get_latest_checkpoint_number` to determine if there is a valid + checkpoint in self._checkpoint_dir, and what the largest file number is. + If a valid checkpoint file is found, it will load the bundled data from this + file and will pass it to the agent for it to reload its data. + If the agent is able to successfully unbundle, this method will verify that + the unbundled data contains the keys,'logs' and 'current_iteration'. It will + then load the `Logger`'s data from the bundle, and will return the iteration + number keyed by 'current_iteration' as one of the return values (along with + the `Checkpointer` object). 
+ + Args: + checkpoint_file_prefix: str, the checkpoint file prefix. + + Returns: + start_iteration: int, the iteration number to start the experiment from. + experiment_checkpointer: `Checkpointer` object for the experiment. + """ + # self._checkpoint_dir = base_dir + "/checkpoints" + self._checkpointer = checkpointer.Checkpointer(self._checkpoint_dir, + checkpoint_file_prefix) + self._start_iteration = 0 + # Check if checkpoint exists. Note that the existence of checkpoint 0 means + # that we have finished iteration 0 (so we will start from iteration 1). + latest_checkpoint_version = checkpointer.get_latest_checkpoint_number( + self._checkpoint_dir) + if latest_checkpoint_version >= 0: + experiment_data = self._checkpointer.load_checkpoint( + latest_checkpoint_version) + if self._agent.unbundle( + self._checkpoint_dir, latest_checkpoint_version, experiment_data): + assert 'logs' in experiment_data + assert 'current_iteration' in experiment_data + self._logger.data = experiment_data['logs'] + self._start_iteration = experiment_data['current_iteration'] + 1 + tf.logging.info('Reloaded checkpoint and will start from iteration %d', + self._start_iteration) + + def restore_checkpoints(self, restore_dir, filename): + saver = tf.train.Saver() + saver.restore(self._sess, os.path.join(restore_dir, filename)) + + def _initialize_episode(self): + """Initialization for a new episode. + + Returns: + action: int, the initial action chosen by the agent. + """ + initial_observation = self._environment.reset() + return self._agent.begin_episode(initial_observation) + + def _run_one_step(self, action): + """Executes a single step in the environment. + + Args: + action: int, the action to perform in the environment. + + Returns: + The observation, reward, and is_terminal values returned from the + environment. + """ + observation, reward, is_terminal, _ = self._environment.step(action) + return observation, reward, is_terminal + + def _end_episode(self, reward): + """Finalizes an episode run. + + Args: + reward: float, the last reward from the environment. + """ + self._agent.end_episode(reward) + + def _end_episode_store(self, reward, total_reward, step_number, is_opt): + """Finalizes an episode run and store optimal trajectories. + + Args: + reward: float, the last reward from the environment. + """ + if is_opt: # if it is optimal trajectories, then store it. + self._agent.end_episode_(reward, total_reward, step_number) + else: # else only store for DQN + self._agent.end_episode(reward) + + def _run_one_episode(self): + """Executes a full trajectory of the agent interacting with the environment. + + Returns: + The number of steps taken and the total reward. + """ + step_number = 0 + total_reward = 0. + + action = self._initialize_episode() + is_terminal = False + + # Keep interacting until we reach a terminal state. + while True: + observation, reward, is_terminal = self._run_one_step(action) + + total_reward += reward + step_number += 1 + + # Perform reward clipping. + # reward = np.clip(reward, -1, 1) todo + + if (self._environment.game_over or + step_number == self._max_steps_per_episode): + # Stop the run loop once we reach the true end of episode. + break + elif is_terminal: + # If we lose a life but the episode is not over, signal an artificial + # end of episode to the agent. 
+ self._agent.end_episode(reward) + action = self._agent.begin_episode(observation) + else: + action = self._agent.step(reward, observation) + if self.agent_name in RPG_AGENTS: + is_opt = False + if total_reward >= episodic_return[self.game_name]: + is_opt = True + self._end_episode_store(reward, total_reward, step_number, is_opt) + else: + self._end_episode(reward) + + return step_number, total_reward + + def _run_one_phase(self, min_steps, statistics, run_mode_str): + """Runs the agent/environment loop until a desired number of steps. + + We follow the Machado et al., 2017 convention of running full episodes, + and terminating once we've run a minimum number of steps. + + Args: + min_steps: int, minimum number of steps to generate in this phase. + statistics: `IterationStatistics` object which records the experimental + results. + run_mode_str: str, describes the run mode for this agent. + + Returns: + Tuple containing the number of steps taken in this phase (int), the sum of + returns (float), and the number of episodes performed (int). + """ + step_count = 0 + num_episodes = 0 + sum_returns = 0. + num_good_trajs = 0 + good_traj_label = 0 + while step_count < min_steps: + episode_length, episode_return = self._run_one_episode() + + good_traj_label = 0 + if episode_return >= episodic_return[self.game_name]: + good_traj_label = 1 + num_good_trajs += 1 + statistics.append({ + '{}_episode_lengths'.format(run_mode_str): episode_length, + '{}_episode_returns'.format(run_mode_str): episode_return, + '{}_episode_goodtraj'.format(run_mode_str): good_traj_label + }) + step_count += episode_length + sum_returns += episode_return + num_episodes += 1 + # We use sys.stdout.write instead of tf.logging so as to flush frequently + # without generating a line break. + + if self.agent_name in ['rpg', 'repg']: + sys.stdout.write('epsilon: {} '.format(self._agent.epsilon_current) + + 'replaysize {}\r'.format(self._agent.current_replay_size)) + elif self.agent_name in RPG_AGENTS: + sys.stdout.write('Opt replay size: {} '.format(self._agent._replay_opt.memory.add_count)) + + sys.stdout.write('Steps executed: {} '.format(step_count) + + 'Episode length: {} '.format(episode_length) + + 'Return: {}'.format(episode_return) + + 'Good traj?: {}\r'.format(good_traj_label)) + sys.stdout.flush() + return step_count, sum_returns, num_episodes, num_good_trajs + + def _run_train_phase(self, statistics, eval_mode=False): + """Run training phase. + + Args: + statistics: `IterationStatistics` object which records the experimental + results. Note - This object is modified by this method. + + Returns: + num_episodes: int, The number of episodes run in this phase. + average_reward: The average reward generated in this phase. + """ + # Perform the training phase, during which the agent learns. 
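+    # Note that eval_mode may be True here: _run_one_iteration() freezes
+    # learning (while still acting) once the evaluation return reaches the
+    # game-specific threshold in episodic_return_switch.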
+ self._agent.eval_mode = eval_mode + start_time = time.time() + number_steps, sum_returns, num_episodes, num_good_trajs = self._run_one_phase( + self._training_steps, statistics, 'train') + average_return = sum_returns / num_episodes if num_episodes > 0 else 0.0 + statistics.append({'train_average_return': average_return}) + average_good_trajs = num_good_trajs / num_episodes if num_episodes > 0 else 0.0 + statistics.append({'train_average_goodtraj': average_good_trajs}) + time_delta = time.time() - start_time + tf.logging.info('Average undiscounted return per training episode: %.2f', + average_return) + tf.logging.info('Average training steps per second: %.2f', + number_steps / time_delta) + return num_episodes, average_return + + def _run_eval_phase(self, statistics): + """Run evaluation phase. + + Args: + statistics: `IterationStatistics` object which records the experimental + results. Note - This object is modified by this method. + + Returns: + num_episodes: int, The number of episodes run in this phase. + average_reward: float, The average reward generated in this phase. + """ + # Perform the evaluation phase -- no learning. + self._agent.eval_mode = True + _, sum_returns, num_episodes, _ = self._run_one_phase( + self._evaluation_steps, statistics, 'eval') + average_return = sum_returns / num_episodes if num_episodes > 0 else 0.0 + tf.logging.info('Average undiscounted return per evaluation episode: %.2f', + average_return) + statistics.append({'eval_average_return': average_return}) + return num_episodes, average_return + + def _run_one_iteration(self, iteration): + """Runs one iteration of agent/environment interaction. + + An iteration involves running several episodes until a certain number of + steps are obtained. The interleaving of train/eval phases implemented here + are to match the implementation of (Mnih et al., 2015). + + Args: + iteration: int, current iteration number, used as a global_step for saving + Tensorboard summaries. + + Returns: + A dict containing summary statistics for this iteration. + """ + statistics = iteration_statistics.IterationStatistics() + tf.logging.info('Starting iteration %d', iteration) + + train_eval_mode = False + # if self.game_name == "Pong": + if self.average_reward_eval >= episodic_return_switch[self.game_name]: + train_eval_mode = True + print("Stop training at iteration {}".format(iteration)) + + num_episodes_train, average_reward_train = self._run_train_phase( + statistics, train_eval_mode) + # else: + # # don't train, only for evaluation. + # num_episodes_train, average_reward_train = 0, 0 + + if self.agent_name in RPG_AGENTS and self._agent._replay_opt.memory.add_count == 0: + num_episodes_eval, average_reward_eval = -10000, -10000 + # if we didn't train rpg, don't waste time evaluate it. + else: + num_episodes_eval, average_reward_eval = self._run_eval_phase( + statistics) + self.average_reward_eval = average_reward_eval + self._save_tensorboard_summaries(iteration, num_episodes_train, + average_reward_train, num_episodes_eval, + average_reward_eval) + return statistics.data_lists + + def _save_tensorboard_summaries(self, iteration, + num_episodes_train, + average_reward_train, + num_episodes_eval, + average_reward_eval): + """Save statistics as tensorboard summaries. + + Args: + iteration: int, The current iteration number. + num_episodes_train: int, number of training episodes run. + average_reward_train: float, The average training reward. + num_episodes_eval: int, number of evaluation episodes run. 
+ average_reward_eval: float, The average evaluation reward. + """ + summary = tf.Summary(value=[ + tf.Summary.Value(tag='Train/NumEpisodes', + simple_value=num_episodes_train), + tf.Summary.Value(tag='Train/AverageReturns', + simple_value=average_reward_train), + tf.Summary.Value(tag='Eval/NumEpisodes', + simple_value=num_episodes_eval), + tf.Summary.Value(tag='Eval/AverageReturns', + simple_value=average_reward_eval) + ]) + self._summary_writer.add_summary(summary, iteration) + + def _log_experiment(self, iteration, statistics): + """Records the results of the current iteration. + + Args: + iteration: int, iteration number. + statistics: `IterationStatistics` object containing statistics to log. + """ + self._logger['iteration_{:d}'.format(iteration)] = statistics + if iteration % self._log_every_n == 0: + self._logger.log_to_file(self._logging_file_prefix, iteration) + + def _checkpoint_experiment(self, iteration): + """Checkpoint experiment data. + + Args: + iteration: int, iteration number for checkpointing. + """ + experiment_data = self._agent.bundle_and_checkpoint(self._checkpoint_dir, + iteration) + if experiment_data: + experiment_data['current_iteration'] = iteration + experiment_data['logs'] = self._logger.data + self._checkpointer.save_checkpoint(iteration, experiment_data) + + def run_experiment(self): + """Runs a full experiment, spread over multiple iterations.""" + tf.logging.info('Beginning training...') + if self._num_iterations <= self._start_iteration: + tf.logging.warning('num_iterations (%d) < start_iteration(%d)', + self._num_iterations, self._start_iteration) + return + + for iteration in range(self._start_iteration, self._num_iterations): + statistics = self._run_one_iteration(iteration) + self._log_experiment(iteration, statistics) + self._checkpoint_experiment(iteration) + + +@gin.configurable +class TrainRunner(Runner): + """Object that handles running Atari 2600 experiments. + + The `TrainRunner` differs from the base `Runner` class in that it does not + the evaluation phase. Checkpointing and logging for the train phase are + preserved as before. + """ + + def __init__(self, base_dir, create_agent_fn): + """Initialize the TrainRunner object in charge of running a full experiment. + + Args: + base_dir: str, the base directory to host all required sub-directories. + create_agent_fn: A function that takes as args a Tensorflow session and an + Atari 2600 Gym environment, and returns an agent. + """ + tf.logging.info('Creating TrainRunner ...') + super(TrainRunner, self).__init__( + base_dir=base_dir, create_agent_fn=create_agent_fn) + self._agent.eval_mode = False + + def _run_one_iteration(self, iteration): + """Runs one iteration of agent/environment interaction. + + An iteration involves running several episodes until a certain number of + steps are obtained. This method differs from the `_run_one_iteration` method + in the base `Runner` class in that it only runs the train phase. + + Args: + iteration: int, current iteration number, used as a global_step for saving + Tensorboard summaries. + + Returns: + A dict containing summary statistics for this iteration. 
+ """ + statistics = iteration_statistics.IterationStatistics() + num_episodes_train, average_reward_train = self._run_train_phase( + statistics) + + self._save_tensorboard_summaries(iteration, num_episodes_train, + average_reward_train) + return statistics.data_lists + + def _save_tensorboard_summaries(self, iteration, num_episodes, + average_reward): + """Save statistics as tensorboard summaries.""" + summary = tf.Summary(value=[ + tf.Summary.Value(tag='Train/NumEpisodes', simple_value=num_episodes), + tf.Summary.Value( + tag='Train/AverageReturns', simple_value=average_reward), + ]) + self._summary_writer.add_summary(summary, iteration) diff --git a/dopamine/dopamine/atari/train.py b/dopamine/dopamine/atari/train.py new file mode 100644 index 0000000..970c8b0 --- /dev/null +++ b/dopamine/dopamine/atari/train.py @@ -0,0 +1,186 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +r"""The entry point for running an agent on an Atari 2600 domain. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + + +from absl import app +from absl import flags +from dopamine.agents.dqn import dqn_agent +from dopamine.agents.implicit_quantile import implicit_quantile_agent +from dopamine.agents.rainbow import rainbow_agent +from dopamine.atari import run_experiment +from dopamine.agents.rpg import rpg_agent +from dopamine.agents.epg import epg_agent +from dopamine.agents.lpg import lpg_agent +from dopamine.agents.repg import repg_agent +from dopamine.agents.dqnrpg import dqnrpg_agent +from dopamine.agents.rainbowrpg import rainbowrpg_agent +from dopamine.agents.implicit_quantilerpg import implicit_quantilerpg_agent +import tensorflow as tf +import time + +current_time = time.strftime("%Y%m%d_%H%M") +agentname = "implicit_quantilerpg" #"rainbowrpg" #"dqnrpg" +flags.DEFINE_bool('debug_mode', False, + 'If set to true, the agent will output in-episode statistics ' + 'to Tensorboard. Disabled by default as this results in ' + 'slower training.') +flags.DEFINE_string('agent_name', agentname, + 'Name of the agent. Must be one of ' + '(dqn, rainbow, implicit_quantile)') +flags.DEFINE_string('game_name', "Pong", + 'Name of game playing ' + 'Pong, Boxing, Bowling, etc.') +flags.DEFINE_integer('num_iterations', 35, + 'Number of training iterations') + +flags.DEFINE_string('base_dir', "/mnt/research/linkaixi/AllData/pommerman/tmp", #"/mnt/research/linkaixi/AllData/pommerman/{}".format(current_time), + 'Base directory to host all required sub-directories.') +flags.DEFINE_integer('random_seed', 1, + 'graph level random seed') +flags.DEFINE_multi_string( + 'gin_files', "../agents/{}/configs/{}.gin".format(agentname, agentname), 'List of paths to gin configuration files (e.g.' + '"../agents/dqn/configs/dqn.gin").') +flags.DEFINE_multi_string( + 'gin_bindings', [], + 'Gin bindings to override the values set in the config files ' + '(e.g. 
"DQNAgent.epsilon_train=0.1",' + ' "create_environment.game_name="Pong"").') +flags.DEFINE_string( + 'schedule', 'continuous_train_and_eval', + 'The schedule with which to run the experiment and choose an appropriate ' + 'Runner. Supported choices are ' + '{continuous_train, continuous_train_and_eval}.') + +FLAGS = flags.FLAGS + + + +def create_agent(sess, environment, summary_writer=None): + """Creates a DQN agent. + + Args: + sess: A `tf.Session` object for running associated ops. + environment: An Atari 2600 Gym environment. + summary_writer: A Tensorflow summary writer to pass to the agent + for in-agent training statistics in Tensorboard. + + Returns: + agent: An RL agent. + + Raises: + ValueError: If `agent_name` is not in supported list. + """ + if not FLAGS.debug_mode: + summary_writer = None + if FLAGS.agent_name == 'dqn': + return dqn_agent.DQNAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'rainbow': + return rainbow_agent.RainbowAgent( + sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'implicit_quantile': + return implicit_quantile_agent.ImplicitQuantileAgent( + sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'rpg': + return rpg_agent.RPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'epg': + return epg_agent.EPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'lpg': + return lpg_agent.LPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'repg': + return repg_agent.REPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'dqnrpg': + return dqnrpg_agent.DQNRPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'rainbowrpg': + return rainbowrpg_agent.RainbowRPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + elif FLAGS.agent_name == 'implicit_quantilerpg': + return implicit_quantilerpg_agent.ImplicitQuantileRPGAgent(sess, num_actions=environment.action_space.n, + summary_writer=summary_writer) + else: + raise ValueError('Unknown agent: {}'.format(FLAGS.agent_name)) + + +def create_runner(base_dir, create_agent_fn, random_seed, agent_name, game_name, num_iterations): + """Creates an experiment Runner. + + Args: + base_dir: str, base directory for hosting all subdirectories. + create_agent_fn: A function that takes as args a Tensorflow session and an + Atari 2600 Gym environment, and returns an agent. + + Returns: + runner: A `run_experiment.Runner` like object. + + Raises: + ValueError: When an unknown schedule is encountered. + """ + assert base_dir is not None + # Continuously runs training and evaluation until max num_iterations is hit. + if FLAGS.schedule == 'continuous_train_and_eval': + return run_experiment.Runner(base_dir, create_agent_fn, random_seed, + agent_name, game_name, num_iterations) + # Continuously runs training until max num_iterations is hit. + elif FLAGS.schedule == 'continuous_train': + return run_experiment.TrainRunner(base_dir, create_agent_fn) + else: + raise ValueError('Unknown schedule: {}'.format(FLAGS.schedule)) + + +def launch_experiment(create_runner_fn, create_agent_fn): + """Launches the experiment. 
+ + Args: + create_runner_fn: A function that takes as args a base directory and a + function for creating an agent and returns a `Runner`-like object. + create_agent_fn: A function that takes as args a Tensorflow session and an + Atari 2600 Gym environment, and returns an agent. + """ + run_experiment.load_gin_configs(FLAGS.gin_files, FLAGS.gin_bindings) + runner = create_runner_fn(FLAGS.base_dir, create_agent_fn, + FLAGS.random_seed, FLAGS.agent_name, + FLAGS.game_name, FLAGS.num_iterations) + runner.run_experiment() + + +def main(unused_argv): + """Main method. + + Args: + unused_argv: Arguments (unused). + """ + tf.logging.set_verbosity(tf.logging.INFO) + launch_experiment(create_runner, create_agent) + + +if __name__ == '__main__': + # flags.mark_flag_as_required('agent_name') + # flags.mark_flag_as_required('base_dir') + app.run(main) diff --git a/dopamine/dopamine/common/__init__.py b/dopamine/dopamine/common/__init__.py new file mode 100644 index 0000000..f9bcb7c --- /dev/null +++ b/dopamine/dopamine/common/__init__.py @@ -0,0 +1,14 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/dopamine/dopamine/common/checkpointer.py b/dopamine/dopamine/common/checkpointer.py new file mode 100644 index 0000000..08a478a --- /dev/null +++ b/dopamine/dopamine/common/checkpointer.py @@ -0,0 +1,177 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A checkpointing mechanism for Dopamine agents. + +This Checkpointer expects a base directory where checkpoints for different +iterations are stored. Specifically, Checkpointer.save_checkpoint() takes in +as input a dictionary 'data' to be pickled to disk. At each iteration, we +write a file called 'cpkt.#', where # is the iteration number. The +Checkpointer also cleans up old files, maintaining up to the CHECKPOINT_DURATION +most recent iterations. + +The Checkpointer writes a sentinel file to indicate that checkpointing was +globally successful. This means that all other checkpointing activities +(saving the Tensorflow graph, the replay buffer) should be performed *prior* +to calling Checkpointer.save_checkpoint(). This allows the Checkpointer to +detect incomplete checkpoints. 
+ +#### Example + +After running 10 iterations (numbered 0...9) with base_directory='/checkpoint', +the following files will exist: +``` + /checkpoint/cpkt.6 + /checkpoint/cpkt.7 + /checkpoint/cpkt.8 + /checkpoint/cpkt.9 + /checkpoint/sentinel_checkpoint_complete.6 + /checkpoint/sentinel_checkpoint_complete.7 + /checkpoint/sentinel_checkpoint_complete.8 + /checkpoint/sentinel_checkpoint_complete.9 +``` +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pickle +import tensorflow as tf + +CHECKPOINT_DURATION = 4 + + +def get_latest_checkpoint_number(base_directory): + """Returns the version number of the latest completed checkpoint. + + Args: + base_directory: str, directory in which to look for checkpoint files. + + Returns: + int, the iteration number of the latest checkpoint, or -1 if none was found. + """ + glob = os.path.join(base_directory, 'sentinel_checkpoint_complete.*') + def extract_iteration(x): + return int(x[x.rfind('.') + 1:]) + try: + checkpoint_files = tf.gfile.Glob(glob) + except tf.errors.NotFoundError: + return -1 + try: + latest_iteration = max(extract_iteration(x) for x in checkpoint_files) + return latest_iteration + except ValueError: + return -1 + + +class Checkpointer(object): + """Class for managing checkpoints for Dopamine agents. + """ + + def __init__(self, base_directory, checkpoint_file_prefix='ckpt', + checkpoint_frequency=1): + """Initializes Checkpointer. + + Args: + base_directory: str, directory where all checkpoints are saved/loaded. + checkpoint_file_prefix: str, prefix to use for naming checkpoint files. + checkpoint_frequency: int, the frequency at which to checkpoint. + + Raises: + ValueError: if base_directory is empty, or not creatable. + """ + if not base_directory: + raise ValueError('No path provided to Checkpointer.') + self._checkpoint_file_prefix = checkpoint_file_prefix + self._checkpoint_frequency = checkpoint_frequency + self._base_directory = base_directory + try: + tf.gfile.MakeDirs(base_directory) + except tf.errors.PermissionDeniedError: + # We catch the PermissionDeniedError and issue a more useful exception. + raise ValueError('Unable to create checkpoint path: {}.'.format( + base_directory)) + + def _generate_filename(self, file_prefix, iteration_number): + """Returns a checkpoint filename from prefix and iteration number.""" + filename = '{}.{}'.format(file_prefix, iteration_number) + return os.path.join(self._base_directory, filename) + + def _save_data_to_file(self, data, filename): + """Saves the given 'data' object to a file.""" + with tf.gfile.GFile(filename, 'w') as fout: + pickle.dump(data, fout) + + def save_checkpoint(self, iteration_number, data): + """Saves a new checkpoint at the current iteration_number. + + Args: + iteration_number: int, the current iteration number for this checkpoint. + data: Any (picklable) python object containing the data to store in the + checkpoint. 
+ """ + if iteration_number % self._checkpoint_frequency != 0: + return + + filename = self._generate_filename(self._checkpoint_file_prefix, + iteration_number) + self._save_data_to_file(data, filename) + filename = self._generate_filename('sentinel_checkpoint_complete', + iteration_number) + with tf.gfile.GFile(filename, 'wb') as fout: + fout.write('done') + + self._clean_up_old_checkpoints(iteration_number) + + def _clean_up_old_checkpoints(self, iteration_number): + """Removes sufficiently old checkpoints.""" + # After writing a the checkpoint and sentinel file, we garbage collect files + # that are CHECKPOINT_DURATION * self._checkpoint_frequency versions old. + stale_iteration_number = iteration_number - (self._checkpoint_frequency * + CHECKPOINT_DURATION) + + if stale_iteration_number >= 0: + stale_file = self._generate_filename(self._checkpoint_file_prefix, + stale_iteration_number) + stale_sentinel = self._generate_filename('sentinel_checkpoint_complete', + stale_iteration_number) + try: + tf.gfile.Remove(stale_file) + tf.gfile.Remove(stale_sentinel) + except tf.errors.NotFoundError: + # Ignore if file not found. + tf.logging.info('Unable to remove {} or {}.'.format(stale_file, + stale_sentinel)) + + def _load_data_from_file(self, filename): + if not tf.gfile.Exists(filename): + return None + with tf.gfile.GFile(filename, 'rb') as fin: + return pickle.load(fin) + + def load_checkpoint(self, iteration_number): + """Tries to reload a checkpoint at the selected iteration number. + + Args: + iteration_number: The checkpoint iteration number to try to load. + + Returns: + If the checkpoint files exist, two unpickled objects that were passed in + as data to save_checkpoint; returns None if the files do not exist. + """ + checkpoint_file = self._generate_filename(self._checkpoint_file_prefix, + iteration_number) + return self._load_data_from_file(checkpoint_file) diff --git a/dopamine/dopamine/common/iteration_statistics.py b/dopamine/dopamine/common/iteration_statistics.py new file mode 100644 index 0000000..f47c575 --- /dev/null +++ b/dopamine/dopamine/common/iteration_statistics.py @@ -0,0 +1,49 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A class for storing iteration-specific metrics. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +class IterationStatistics(object): + """A class for storing iteration-specific metrics. + + The internal format is as follows: we maintain a mapping from keys to lists. + Each list contains all the values corresponding to the given key. + + For example, self.data_lists['train_episode_returns'] might contain the + per-episode returns achieved during this iteration. + + Attributes: + data_lists: dict mapping each metric_name (str) to a list of said metric + across episodes. 
+ """ + + def __init__(self): + self.data_lists = {} + + def append(self, data_pairs): + """Add the given values to their corresponding key-indexed lists. + + Args: + data_pairs: A dictionary of key-value pairs to be recorded. + """ + for key, value in data_pairs.items(): + if key not in self.data_lists: + self.data_lists[key] = [] + self.data_lists[key].append(value) diff --git a/dopamine/dopamine/common/logger.py b/dopamine/dopamine/common/logger.py new file mode 100644 index 0000000..8e1b51b --- /dev/null +++ b/dopamine/dopamine/common/logger.py @@ -0,0 +1,105 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A lightweight logging mechanism for dopamine agents.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pickle +import tensorflow as tf + + +CHECKPOINT_DURATION = 4 + + +class Logger(object): + """Class for maintaining a dictionary of data to log.""" + + def __init__(self, logging_dir): + """Initializes Logger. + + Args: + logging_dir: str, Directory to which logs are written. + """ + # Dict used by logger to store data. + self.data = {} + self._logging_enabled = True + + if not logging_dir: + tf.logging.info('Logging directory not specified, will not log.') + self._logging_enabled = False + return + # Try to create logging directory. + try: + tf.gfile.MakeDirs(logging_dir) + except tf.errors.PermissionDeniedError: + # If it already exists, ignore exception. + pass + if not tf.gfile.Exists(logging_dir): + tf.logging.warning( + 'Could not create directory %s, logging will be disabled.', + logging_dir) + self._logging_enabled = False + return + self._logging_dir = logging_dir + + def __setitem__(self, key, value): + """This method will set an entry at key with value in the dictionary. + + It will effectively overwrite any previous data at the same key. + + Args: + key: str, indicating key where to write the entry. + value: A python object to store. + """ + if self._logging_enabled: + self.data[key] = value + + def _generate_filename(self, filename_prefix, iteration_number): + filename = '{}_{}'.format(filename_prefix, iteration_number) + return os.path.join(self._logging_dir, filename) + + def log_to_file(self, filename_prefix, iteration_number): + """Save the pickled dictionary to a file. + + Args: + filename_prefix: str, name of the file to use (without iteration + number). + iteration_number: int, the iteration number, appended to the end of + filename_prefix. + """ + if not self._logging_enabled: + tf.logging.warning('Logging is disabled.') + return + log_file = self._generate_filename(filename_prefix, iteration_number) + with tf.gfile.GFile(log_file, 'w') as fout: + pickle.dump(self.data, fout, protocol=pickle.HIGHEST_PROTOCOL) + # After writing a checkpoint file, we garbage collect the log file + # that is CHECKPOINT_DURATION versions old. 
+ stale_iteration_number = iteration_number - CHECKPOINT_DURATION + if stale_iteration_number >= 0: + stale_file = self._generate_filename(filename_prefix, + stale_iteration_number) + try: + tf.gfile.Remove(stale_file) + except tf.errors.NotFoundError: + # Ignore if file not found. + pass + + def is_logging_enabled(self): + """Return if logging is enabled.""" + return self._logging_enabled diff --git a/dopamine/dopamine/replay_memory/__init__.py b/dopamine/dopamine/replay_memory/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/replay_memory/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/replay_memory/circular_replay_buffer.py b/dopamine/dopamine/replay_memory/circular_replay_buffer.py new file mode 100644 index 0000000..e10c03b --- /dev/null +++ b/dopamine/dopamine/replay_memory/circular_replay_buffer.py @@ -0,0 +1,835 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The standard DQN replay memory. + +This implementation is an out-of-graph replay memory + in-graph wrapper. It +supports vanilla n-step updates of the form typically found in the literature, +i.e. where rewards are accumulated for n steps and the intermediate trajectory +is not exposed to the agent. This does not allow, for example, performing +off-policy corrections. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import gzip +import math +import os +import pickle + +import numpy as np +import tensorflow as tf + +import gin.tf + +# Defines a type describing part of the tuple returned by the replay +# memory. Each element of the tuple is a tensor of shape [batch, ...] where +# ... is defined the 'shape' field of ReplayElement. The tensor type is +# given by the 'type' field. The 'name' field is for convenience and ease of +# debugging. +ReplayElement = ( + collections.namedtuple('shape_type', ['name', 'shape', 'type'])) + +# A prefix that can not collide with variable names for checkpoint files. +STORE_FILENAME_PREFIX = '$store$_' + +# This constant determines how many iterations a checkpoint is kept for. +CHECKPOINT_DURATION = 4 +MAX_SAMPLE_ATTEMPTS = 1000 + + +def invalid_range(cursor, replay_capacity, stack_size, update_horizon): + """Returns a array with the indices of cursor-related invalid transitions. 
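The Logger pairs with IterationStatistics in Runner._log_experiment: the statistics for iteration N are stored under an 'iteration_N' key and pickled to disk every _log_every_n iterations. A minimal standalone sketch; the directory and the 'log' file prefix are arbitrary choices for illustration.
```
# Illustrative Logger usage; '/tmp/log_demo' is an arbitrary directory.
from dopamine.common import logger

experiment_logger = logger.Logger('/tmp/log_demo')
experiment_logger['iteration_0'] = {'train_episode_returns': [1.0, -2.0, 0.5]}
if experiment_logger.is_logging_enabled():
  experiment_logger.log_to_file('log', 0)  # writes /tmp/log_demo/log_0
```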
+ + There are update_horizon + stack_size invalid indices: + - The update_horizon indices before the cursor, because we do not have a + valid N-step transition (including the next state). + - The stack_size indices on or immediately after the cursor. + If N = update_horizon, K = stack_size, and the cursor is at c, invalid + indices are: + c - N, c - N + 1, ..., c, c + 1, ..., c + K - 1. + + It handles special cases in a circular buffer in the beginning and the end. + + Args: + cursor: int, the position of the cursor. + replay_capacity: int, the size of the replay memory. + stack_size: int, the size of the stacks returned by the replay memory. + update_horizon: int, the agent's update horizon. + Returns: + np.array of size stack_size with the invalid indices. + """ + assert cursor < replay_capacity + return np.array( + [(cursor - update_horizon + i) % replay_capacity + for i in range(stack_size + update_horizon)]) + + +class OutOfGraphReplayBuffer(object): + """A simple out-of-graph Replay Buffer. + + Stores transitions, state, action, reward, next_state, terminal (and any + extra contents specified) in a circular buffer and provides a uniform + transition sampling function. + + When the states consist of stacks of observations storing the states is + inefficient. This class writes observations and constructs the stacked states + at sample time. + + Attributes: + add_count: int, counter of how many transitions have been added (including + the blank ones at the beginning of an episode). + """ + + def __init__(self, + observation_shape, + stack_size, + replay_capacity, + batch_size, + update_horizon=1, + gamma=0.99, + max_sample_attempts=MAX_SAMPLE_ATTEMPTS, + extra_storage_types=None, + observation_dtype=np.uint8): + """Initializes OutOfGraphReplayBuffer. + + Args: + observation_shape: tuple of ints. + stack_size: int, number of frames to use in state stack. + replay_capacity: int, number of transitions to keep in memory. + batch_size: int. + update_horizon: int, length of update ('n' in n-step update). + gamma: int, the discount factor. + max_sample_attempts: int, the maximum number of attempts allowed to + get a sample. + extra_storage_types: list of ReplayElements defining the type of the extra + contents that will be stored and returned by sample_transition_batch. + observation_dtype: np.dtype, type of the observations. Defaults to + np.uint8 for Atari 2600. + + Raises: + ValueError: If replay_capacity is too small to hold at least one + transition. 
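A small worked example makes the index arithmetic in invalid_range concrete: with update_horizon N and stack_size K there are N + K invalid indices, wrapping around the circular buffer when needed.
```
# Worked example of invalid_range() for a tiny buffer.
from dopamine.replay_memory.circular_replay_buffer import invalid_range

# stack_size + update_horizon = 7 invalid indices around a cursor at 1.
print(invalid_range(cursor=1, replay_capacity=10, stack_size=4, update_horizon=3))
# -> [8 9 0 1 2 3 4]: the 3 indices before the cursor (wrapping around) plus
#    the 4 indices starting at the cursor.
```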
+ """ + assert isinstance(observation_shape, tuple) + if replay_capacity < update_horizon + stack_size: + raise ValueError('There is not enough capacity to cover ' + 'update_horizon and stack_size.') + + tf.logging.info( + 'Creating a %s replay memory with the following parameters:', + self.__class__.__name__) + tf.logging.info('\t observation_shape: %s', str(observation_shape)) + tf.logging.info('\t observation_dtype: %s', str(observation_dtype)) + tf.logging.info('\t stack_size: %d', stack_size) + tf.logging.info('\t replay_capacity: %d', replay_capacity) + tf.logging.info('\t batch_size: %d', batch_size) + tf.logging.info('\t update_horizon: %d', update_horizon) + tf.logging.info('\t gamma: %f', gamma) + + self._observation_shape = observation_shape + self._stack_size = stack_size + self._state_shape = self._observation_shape + (self._stack_size,) + self._replay_capacity = replay_capacity + self._batch_size = batch_size + self._update_horizon = update_horizon + self._gamma = gamma + self._observation_dtype = observation_dtype + self._max_sample_attempts = max_sample_attempts + if extra_storage_types: + self._extra_storage_types = extra_storage_types + else: + self._extra_storage_types = [] + self._create_storage() + self.add_count = np.array(0) + self.invalid_range = np.zeros((self._stack_size)) + # When the horizon is > 1, we compute the sum of discounted rewards as a dot + # product using the precomputed vector . + self._cumulative_discount_vector = np.array( + [math.pow(self._gamma, n) for n in range(update_horizon)], + dtype=np.float32) + + def _create_storage(self): + """Creates the numpy arrays used to store transitions. + """ + self._store = {} + for storage_element in self.get_storage_signature(): + array_shape = [self._replay_capacity] + list(storage_element.shape) + self._store[storage_element.name] = np.empty( + array_shape, dtype=storage_element.type) + + def get_add_args_signature(self): + """The signature of the add function. + + Note - Derived classes may return a different signature. + + Returns: + list of ReplayElements defining the type of the argument signature needed + by the add function. + """ + return self.get_storage_signature() + + def get_storage_signature(self): + """Returns a default list of elements to be stored in this replay memory. + + Note - Derived classes may return a different signature. + + Returns: + list of ReplayElements defining the type of the contents stored. + """ + storage_elements = [ + ReplayElement('observation', self._observation_shape, + self._observation_dtype), + ReplayElement('action', (), np.int32), + ReplayElement('reward', (), np.float32), + ReplayElement('terminal', (), np.uint8) + ] + + for extra_replay_element in self._extra_storage_types: + storage_elements.append(extra_replay_element) + return storage_elements + + def _add_zero_transition(self): + """Adds a padding transition filled with zeros (Used in episode beginnings). + """ + zero_transition = [] + for element_type in self.get_add_args_signature(): + zero_transition.append( + np.zeros(element_type.shape, dtype=element_type.type)) + self._add(*zero_transition) + + def add(self, observation, action, reward, terminal, *args): + """Adds a transition to the replay memory. + + This function checks the types and handles the padding at the beginning of + an episode. Then it calls the _add function. + + Since the next_observation in the transition will be the observation added + next there is no need to pass it. 
+ + If the replay memory is at capacity the oldest transition will be discarded. + + Args: + observation: np.array with shape observation_shape. + action: int, the action in the transition. + reward: float, the reward received in the transition. + terminal: A uint8 acting as a boolean indicating whether the transition + was terminal (1) or not (0). + *args: extra contents with shapes and dtypes according to + extra_storage_types. + """ + self._check_add_types(observation, action, reward, terminal, *args) + if self.is_empty() or self._store['terminal'][self.cursor() - 1] == 1: + for _ in range(self._stack_size - 1): + # Child classes can rely on the padding transitions being filled with + # zeros. This is useful when there is a priority argument. + self._add_zero_transition() + self._add(observation, action, reward, terminal, *args) + + def _add(self, *args): + """Internal add method to add to the storage arrays. + + Args: + *args: All the elements in a transition. + """ + cursor = self.cursor() + + arg_names = [e.name for e in self.get_add_args_signature()] + for arg_name, arg in zip(arg_names, args): + self._store[arg_name][cursor] = arg + + self.add_count += 1 + self.invalid_range = invalid_range( + self.cursor(), self._replay_capacity, self._stack_size, + self._update_horizon) + + def _check_add_types(self, *args): + """Checks if args passed to the add method match those of the storage. + + Args: + *args: Args whose types need to be validated. + + Raises: + ValueError: If args have wrong shape or dtype. + """ + if len(args) != len(self.get_add_args_signature()): + raise ValueError('Add expects {} elements, received {}'.format( + len(self.get_add_args_signature()), len(args))) + for arg_element, store_element in zip(args, self.get_add_args_signature()): + if isinstance(arg_element, np.ndarray): + arg_shape = arg_element.shape + elif isinstance(arg_element, tuple) or isinstance(arg_element, list): + # TODO(b/80536437). This is not efficient when arg_element is a list. + arg_shape = np.array(arg_element).shape + else: + # Assume it is scalar. + arg_shape = tuple() + store_element_shape = tuple(store_element.shape) + if arg_shape != store_element_shape: + raise ValueError('arg has shape {}, expected {}'.format( + arg_shape, store_element_shape)) + + def is_empty(self): + """Is the Replay Buffer empty?""" + return self.add_count == 0 + + def is_full(self): + """Is the Replay Buffer full?""" + return self.add_count >= self._replay_capacity + + def cursor(self): + """Index to the location where the next transition will be written.""" + return self.add_count % self._replay_capacity + + def get_range(self, array, start_index, end_index): + """Returns the range of array at the index handling wraparound if necessary. + + Args: + array: np.array, the array to get the stack from. + start_index: int, index to the start of the range to be returned. Range + will wraparound if start_index is smaller than 0. + end_index: int, exclusive end index. Range will wraparound if end_index + exceeds replay_capacity. + + Returns: + np.array, with shape [end_index - start_index, array.shape[1:]]. + """ + assert end_index > start_index, 'end_index must be larger than start_index' + assert end_index >= 0 + assert start_index < self._replay_capacity + if not self.is_full(): + assert end_index <= self.cursor(), ( + 'Index {} has not been added.'.format(start_index)) + + # Fast slice read when there is no wraparound. 
+ if start_index % self._replay_capacity < end_index % self._replay_capacity: + return_array = array[start_index:end_index, ...] + # Slow list read. + else: + indices = [(start_index + i) % self._replay_capacity + for i in range(end_index - start_index)] + return_array = array[indices, ...] + return return_array + + def get_observation_stack(self, index): + state = self.get_range(self._store['observation'], + index - self._stack_size + 1, index + 1) + # The stacking axis is 0 but the agent expects as the last axis. + return np.moveaxis(state, 0, -1) + + def get_terminal_stack(self, index): + return self.get_range(self._store['terminal'], index - self._stack_size + 1, + index + 1) + + def is_valid_transition(self, index): + """Checks if the index contains a valid transition. + + Checks for collisions with the end of episodes and the current position + of the cursor. + + Args: + index: int, the index to the state in the transition. + + Returns: + Is the index valid: Boolean. + + """ + # Check the index is in the valid range + if index < 0 or index >= self._replay_capacity: + return False + if not self.is_full(): + # The indices and next_indices must be smaller than the cursor. + if index >= self.cursor() - self._update_horizon: + return False + # The first few indices contain the padding states of the first episode. + if index < self._stack_size - 1: + return False + + # Skip transitions that straddle the cursor. + if index in set(self.invalid_range): + return False + + # If there are terminal flags in any other frame other than the last one + # the stack is not valid, so don't sample it. + if self.get_terminal_stack(index)[:-1].any(): + return False + + return True + + def _create_batch_arrays(self, batch_size): + """Create a tuple of arrays with the type of get_transition_elements. + + When using the WrappedReplayBuffer with staging enabled it is important to + create new arrays every sample because StaginArea keeps a pointer to the + returned arrays. + + Args: + batch_size: (int) number of transitions returned. If None the default + batch_size will be used. + + Returns: + Tuple of np.arrays with the shape and type of get_transition_elements. + """ + transition_elements = self.get_transition_elements(batch_size) + batch_arrays = [] + for element in transition_elements: + batch_arrays.append(np.empty(element.shape, dtype=element.type)) + return tuple(batch_arrays) + + def sample_index_batch(self, batch_size): + """Returns a batch of valid indices sampled uniformly. + + Args: + batch_size: int, number of indices returned. + + Returns: + list of ints, a batch of valid indices sampled uniformly. + + Raises: + RuntimeError: If the batch was not constructed after maximum number of + tries. + """ + if self.is_full(): + # add_count >= self._replay_capacity > self._stack_size + min_id = self.cursor() - self._replay_capacity + self._stack_size - 1 + max_id = self.cursor() - self._update_horizon + else: + # add_count < self._replay_capacity + min_id = self._stack_size - 1 + max_id = self.cursor() - self._update_horizon + if max_id <= min_id: + raise RuntimeError('Cannot sample a batch with fewer than stack size ' + '({}) + update_horizon ({}) transitions.'. 
+ format(self._stack_size, self._update_horizon)) + + indices = [] + attempt_count = 0 + while (len(indices) < batch_size and + attempt_count < self._max_sample_attempts): + attempt_count += 1 + index = np.random.randint(min_id, max_id) % self._replay_capacity + if self.is_valid_transition(index): + indices.append(index) + if len(indices) != batch_size: + raise RuntimeError( + 'Max sample attempts: Tried {} times but only sampled {}' + ' valid indices. Batch size is {}'. + format(self._max_sample_attempts, len(indices), batch_size)) + + return indices + + def sample_transition_batch(self, batch_size=None, indices=None): + """Returns a batch of transitions (including any extra contents). + + If get_transition_elements has been overridden and defines elements not + stored in self._store, an empty array will be returned and it will be + left to the child class to fill it. For example, for the child class + OutOfGraphPrioritizedReplayBuffer, the contents of the + sampling_probabilities are stored separately in a sum tree. + + When the transition is terminal next_state_batch has undefined contents. + + NOTE: This transition contains the indices of the sampled elements. These + are only valid during the call to sample_transition_batch, i.e. they may + be used by subclasses of this replay buffer but may point to different data + as soon as sampling is done. + + Args: + batch_size: int, number of transitions returned. If None, the default + batch_size will be used. + indices: None or list of ints, the indices of every transition in the + batch. If None, sample the indices uniformly. + + Returns: + transition_batch: tuple of np.arrays with the shape and type as in + get_transition_elements(). + + Raises: + ValueError: If an element to be sampled is missing from the replay buffer. + """ + if batch_size is None: + batch_size = self._batch_size + if indices is None: + indices = self.sample_index_batch(batch_size) + assert len(indices) == batch_size + + transition_elements = self.get_transition_elements(batch_size) + batch_arrays = self._create_batch_arrays(batch_size) + for batch_element, state_index in enumerate(indices): + trajectory_indices = [(state_index + j) % self._replay_capacity + for j in range(self._update_horizon)] + trajectory_terminals = self._store['terminal'][trajectory_indices] + is_terminal_transition = trajectory_terminals.any() + if not is_terminal_transition: + trajectory_length = self._update_horizon + else: + # np.argmax of a bool array returns the index of the first True. + trajectory_length = np.argmax(trajectory_terminals.astype(np.bool), + 0) + 1 + next_state_index = state_index + trajectory_length + trajectory_discount_vector = ( + self._cumulative_discount_vector[:trajectory_length]) + trajectory_rewards = self.get_range(self._store['reward'], state_index, + next_state_index) + + # Fill the contents of each array in the sampled batch. + assert len(transition_elements) == len(batch_arrays) + for element_array, element in zip(batch_arrays, transition_elements): + if element.name == 'state': + element_array[batch_element] = self.get_observation_stack(state_index) + elif element.name == 'reward': + # cumpute the discounted sum of rewards in the trajectory. 
+ element_array[batch_element] = trajectory_discount_vector.dot( + trajectory_rewards) + elif element.name == 'next_state': + element_array[batch_element] = self.get_observation_stack( + (next_state_index) % self._replay_capacity) + elif element.name == 'terminal': + element_array[batch_element] = is_terminal_transition + elif element.name == 'indices': + element_array[batch_element] = state_index + elif element.name in self._store.keys(): + element_array[batch_element] = ( + self._store[element.name][state_index]) + # We assume the other elements are filled in by the subclass. + + return batch_arrays + + def get_transition_elements(self, batch_size=None): + """Returns a 'type signature' for sample_transition_batch. + + Args: + batch_size: int, number of transitions returned. If None, the default + batch_size will be used. + Returns: + signature: A namedtuple describing the method's return type signature. + """ + batch_size = self._batch_size if batch_size is None else batch_size + + transition_elements = [ + ReplayElement('state', (batch_size,) + self._state_shape, + self._observation_dtype), + ReplayElement('action', (batch_size,), np.int32), + ReplayElement('reward', (batch_size,), np.float32), + ReplayElement('next_state', (batch_size,) + self._state_shape, + self._observation_dtype), + ReplayElement('terminal', (batch_size,), np.uint8), + ReplayElement('indices', (batch_size,), np.int32) + ] + for element in self._extra_storage_types: + transition_elements.append( + ReplayElement(element.name, (batch_size,) + tuple(element.shape), + element.type)) + return transition_elements + + def _generate_filename(self, checkpoint_dir, name, suffix): + return os.path.join(checkpoint_dir, '{}_ckpt.{}.gz'.format(name, suffix)) + + def _return_checkpointable_elements(self): + """Return the dict of elements of the class for checkpointing. + + Returns: + checkpointable_elements: dict containing all non private (starting with + _) members + all the arrays inside self._store. + """ + checkpointable_elements = {} + for member_name, member in self.__dict__.items(): + if member_name == '_store': + for array_name, array in self._store.items(): + checkpointable_elements[STORE_FILENAME_PREFIX + array_name] = array + elif not member_name.startswith('_'): + checkpointable_elements[member_name] = member + return checkpointable_elements + + def save(self, checkpoint_dir, iteration_number): + """Save the OutOfGraphReplayBuffer attributes into a file. + + This method will save all the replay buffer's state in a single file. + + Args: + checkpoint_dir: str, the directory where numpy checkpoint files should be + saved. + iteration_number: int, iteration_number to use as a suffix in naming + numpy checkpoint files. + """ + if not tf.gfile.Exists(checkpoint_dir): + return + + checkpointable_elements = self._return_checkpointable_elements() + + for attr in checkpointable_elements: + filename = self._generate_filename(checkpoint_dir, attr, iteration_number) + with tf.gfile.Open(filename, 'wb') as f: + with gzip.GzipFile(fileobj=f) as outfile: + # Checkpoint the np arrays in self._store with np.save instead of + # pickling the dictionary is critical for file size and performance. + # STORE_FILENAME_PREFIX indicates that the variable is contained in + # self._store. 
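Putting the pieces of OutOfGraphReplayBuffer together, a toy sketch with 2x2 "observations" instead of stacked Atari frames; all sizes are illustrative. The returned tuple follows the order of get_transition_elements(): state, action, reward, next_state, terminal, indices.
```
# Toy end-to-end sketch of the out-of-graph replay buffer.
import numpy as np
from dopamine.replay_memory import circular_replay_buffer

buf = circular_replay_buffer.OutOfGraphReplayBuffer(
    observation_shape=(2, 2), stack_size=1, replay_capacity=100,
    batch_size=4, update_horizon=1, gamma=0.99)

for step in range(20):
  observation = np.full((2, 2), step, dtype=np.uint8)
  terminal = 1 if (step + 1) % 10 == 0 else 0
  buf.add(observation, step % 3, 1.0, terminal)

(states, actions, rewards, next_states,
 terminals, indices) = buf.sample_transition_batch(batch_size=4)
print(states.shape)  # (4, 2, 2, 1): batch x observation_shape x stack_size
```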
+ if attr.startswith(STORE_FILENAME_PREFIX): + array_name = attr[len(STORE_FILENAME_PREFIX):] + np.save(outfile, self._store[array_name], allow_pickle=False) + # Some numpy arrays might not be part of storage + elif isinstance(self.__dict__[attr], np.ndarray): + np.save(outfile, self.__dict__[attr], allow_pickle=False) + else: + pickle.dump(self.__dict__[attr], outfile) + + # After writing a checkpoint file, we garbage collect the checkpoint file + # that is four versions old. + stale_iteration_number = iteration_number - CHECKPOINT_DURATION + if stale_iteration_number >= 0: + stale_filename = self._generate_filename(checkpoint_dir, attr, + stale_iteration_number) + try: + tf.gfile.Remove(stale_filename) + except tf.errors.NotFoundError: + pass + + def load(self, checkpoint_dir, suffix): + """Restores the object from bundle_dictionary and numpy checkpoints. + + Args: + checkpoint_dir: str, the directory where to read the numpy checkpointed + files from. + suffix: str, the suffix to use in numpy checkpoint files. + + Raises: + NotFoundError: If not all expected files are found in directory. + """ + save_elements = self._return_checkpointable_elements() + # We will first make sure we have all the necessary files available to avoid + # loading a partially-specified (i.e. corrupted) replay buffer. + for attr in save_elements: + filename = self._generate_filename(checkpoint_dir, attr, suffix) + if not tf.gfile.Exists(filename): + raise tf.errors.NotFoundError(None, None, + 'Missing file: {}'.format(filename)) + # If we've reached this point then we have verified that all expected files + # are available. + for attr in save_elements: + filename = self._generate_filename(checkpoint_dir, attr, suffix) + with tf.gfile.Open(filename, 'rb') as f: + with gzip.GzipFile(fileobj=f) as infile: + if attr.startswith(STORE_FILENAME_PREFIX): + array_name = attr[len(STORE_FILENAME_PREFIX):] + self._store[array_name] = np.load(infile, allow_pickle=False) + elif isinstance(self.__dict__[attr], np.ndarray): + self.__dict__[attr] = np.load(infile, allow_pickle=False) + else: + self.__dict__[attr] = pickle.load(infile) + + +@gin.configurable(blacklist=['observation_shape', 'stack_size', + 'update_horizon', 'gamma']) +class WrappedReplayBuffer(object): + """Wrapper of OutOfGraphReplayBuffer with an in graph sampling mechanism. + + Usage: + To add a transition: call the add function. + + To sample a batch: Construct operations that depend on any of the + tensors is the transition dictionary. Every sess.run + that requires any of these tensors will sample a new + transition. + """ + + def __init__(self, + observation_shape, + stack_size, + use_staging=True, + replay_capacity=1000000, + batch_size=32, + update_horizon=1, + gamma=0.99, + wrapped_memory=None, + max_sample_attempts=MAX_SAMPLE_ATTEMPTS, + extra_storage_types=None, + observation_dtype=np.uint8): + """Initializes WrappedReplayBuffer. + + Args: + observation_shape: tuple of ints. + stack_size: int, number of frames to use in state stack. + use_staging: bool, when True it would use a staging area to prefetch + the next sampling batch. + replay_capacity: int, number of transitions to keep in memory. + batch_size: int. + update_horizon: int, length of update ('n' in n-step update). + gamma: int, the discount factor. + wrapped_memory: The 'inner' memory data structure. If None, + it creates the standard DQN replay memory. + max_sample_attempts: int, the maximum number of attempts allowed to + get a sample. 
+ extra_storage_types: list of ReplayElements defining the type of the extra + contents that will be stored and returned by sample_transition_batch. + observation_dtype: np.dtype, type of the observations. Defaults to + np.uint8 for Atari 2600. + + Raises: + ValueError: If update_horizon is not positive. + ValueError: If discount factor is not in [0, 1]. + """ + if replay_capacity < update_horizon + 1: + raise ValueError( + 'Update horizon ({}) should be significantly smaller ' + 'than replay capacity ({}).'.format(update_horizon, replay_capacity)) + if not update_horizon >= 1: + raise ValueError('Update horizon must be positive.') + if not 0.0 <= gamma <= 1.0: + raise ValueError('Discount factor (gamma) must be in [0, 1].') + + self.batch_size = batch_size + + # Mainly used to allow subclasses to pass self.memory. + if wrapped_memory is not None: + self.memory = wrapped_memory + else: + self.memory = OutOfGraphReplayBuffer( + observation_shape, stack_size, replay_capacity, batch_size, + update_horizon, gamma, max_sample_attempts, + observation_dtype=observation_dtype, + extra_storage_types=extra_storage_types) + + self.create_sampling_ops(use_staging) + + def add(self, observation, action, reward, terminal, *args): + """Adds a transition to the replay memory. + + Since the next_observation in the transition will be the observation added + next there is no need to pass it. + + If the replay memory is at capacity the oldest transition will be discarded. + + Args: + observation: np.array with shape observation_shape. + action: int, the action in the transition. + reward: float, the reward received in the transition. + terminal: A uint8 acting as a boolean indicating whether the transition + was terminal (1) or not (0). + *args: extra contents with shapes and dtypes according to + extra_storage_types. + """ + self.memory.add(observation, action, reward, terminal, *args) + + def create_sampling_ops(self, use_staging): + """Creates the ops necessary to sample from the replay buffer. + + Creates the transition dictionary containing the sampling tensors. + + Args: + use_staging: bool, when True it would use a staging area to prefetch + the next sampling batch. + """ + with tf.name_scope('sample_replay'): + with tf.device('/cpu:*'): + transition_type = self.memory.get_transition_elements() + transition_tensors = tf.py_func( + self.memory.sample_transition_batch, [], + [return_entry.type for return_entry in transition_type], + name='replay_sample_py_func') + self._set_transition_shape(transition_tensors, transition_type) + if use_staging: + transition_tensors = self._set_up_staging(transition_tensors) + self._set_transition_shape(transition_tensors, transition_type) + + # Unpack sample transition into member variables. + self.unpack_transition(transition_tensors, transition_type) + + def _set_transition_shape(self, transition, transition_type): + """Set shape for each element in the transition. + + Args: + transition: tuple of tf.Tensors. + transition_type: tuple of ReplayElements descriving the shapes of the + respective tensors. + """ + for element, element_type in zip(transition, transition_type): + element.set_shape(element_type.shape) + + def _set_up_staging(self, transition): + """Sets up staging ops for prefetching the next transition. + + This allows us to hide the py_func latency. To do so we use a staging area + to pre-fetch the next batch of transitions. + + Args: + transition: tuple of tf.Tensors with shape + memory.get_transition_elements(). 
+ + Returns: + prefetched_transition: tuple of tf.Tensors with shape + memory.get_transition_elements() that have been previously prefetched. + """ + transition_type = self.memory.get_transition_elements() + + # Create the staging area in CPU. + prefetch_area = tf.contrib.staging.StagingArea( + [shape_with_type.type for shape_with_type in transition_type]) + + # Store prefetch op for tests, but keep it private -- users should not be + # calling _prefetch_batch. + self._prefetch_batch = prefetch_area.put(transition) + initial_prefetch = tf.cond( + tf.equal(prefetch_area.size(), 0), + lambda: prefetch_area.put(transition), tf.no_op) + + # Every time a transition is sampled self.prefetch_batch will be + # called. If the staging area is empty, two put ops will be called. + with tf.control_dependencies([self._prefetch_batch, initial_prefetch]): + prefetched_transition = prefetch_area.get() + + return prefetched_transition + + def unpack_transition(self, transition_tensors, transition_type): + """Unpacks the given transition into member variables. + + Args: + transition_tensors: tuple of tf.Tensors. + transition_type: tuple of ReplayElements matching transition_tensors. + """ + self.transition = collections.OrderedDict() + for element, element_type in zip(transition_tensors, transition_type): + self.transition[element_type.name] = element + + # TODO(bellemare): These are legacy and should probably be removed in + # future versions. + self.states = self.transition['state'] + self.actions = self.transition['action'] + self.rewards = self.transition['reward'] + self.next_states = self.transition['next_state'] + self.terminals = self.transition['terminal'] + self.indices = self.transition['indices'] + + def save(self, checkpoint_dir, iteration_number): + """Save the underlying replay buffer's contents in a file. + + Args: + checkpoint_dir: str, the directory where to read the numpy checkpointed + files from. + iteration_number: int, the iteration_number to use as a suffix in naming + numpy checkpoint files. + """ + self.memory.save(checkpoint_dir, iteration_number) + + def load(self, checkpoint_dir, suffix): + """Loads the replay buffer's state from a saved file. + + Args: + checkpoint_dir: str, the directory where to read the numpy checkpointed + files from. + suffix: str, the suffix to use in numpy checkpoint files. + """ + self.memory.load(checkpoint_dir, suffix) diff --git a/dopamine/dopamine/replay_memory/prioritized_replay_buffer.py b/dopamine/dopamine/replay_memory/prioritized_replay_buffer.py new file mode 100644 index 0000000..426cc1b --- /dev/null +++ b/dopamine/dopamine/replay_memory/prioritized_replay_buffer.py @@ -0,0 +1,327 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""An implementation of Prioritized Experience Replay (PER). + +This implementation is based on the paper "Prioritized Experience Replay" +by Tom Schaul et al. (2015). 
Many thanks to Tom Schaul, John Quan, and Matteo +Hessel for providing useful pointers on the algorithm and its implementation. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + + +from dopamine.replay_memory import circular_replay_buffer +from dopamine.replay_memory import sum_tree +from dopamine.replay_memory.circular_replay_buffer import ReplayElement +import numpy as np +import tensorflow as tf + +import gin.tf + + +class OutOfGraphPrioritizedReplayBuffer( + circular_replay_buffer.OutOfGraphReplayBuffer): + """An out-of-graph Replay Buffer for Prioritized Experience Replay. + + See circular_replay_buffer.py for details. + """ + + def __init__(self, + observation_shape, + stack_size, + replay_capacity, + batch_size, + update_horizon=1, + gamma=0.99, + max_sample_attempts=circular_replay_buffer.MAX_SAMPLE_ATTEMPTS, + extra_storage_types=None, + observation_dtype=np.uint8): + """Initializes OutOfGraphPrioritizedReplayBuffer. + + Args: + observation_shape: tuple of ints. + stack_size: int, number of frames to use in state stack. + replay_capacity: int, number of transitions to keep in memory. + batch_size: int. + update_horizon: int, length of update ('n' in n-step update). + gamma: int, the discount factor. + max_sample_attempts: int, the maximum number of attempts allowed to + get a sample. + extra_storage_types: list of ReplayElements defining the type of the extra + contents that will be stored and returned by sample_transition_batch. + observation_dtype: np.dtype, type of the observations. Defaults to + np.uint8 for Atari 2600. + """ + super(OutOfGraphPrioritizedReplayBuffer, self).__init__( + observation_shape=observation_shape, + stack_size=stack_size, + replay_capacity=replay_capacity, + batch_size=batch_size, + update_horizon=update_horizon, + gamma=gamma, + max_sample_attempts=max_sample_attempts, + extra_storage_types=extra_storage_types, + observation_dtype=observation_dtype) + + self.sum_tree = sum_tree.SumTree(replay_capacity) + + def get_add_args_signature(self): + """The signature of the add function. + + The signature is the same as the one for OutOfGraphReplayBuffer, with an + added priority. + + Returns: + list of ReplayElements defining the type of the argument signature needed + by the add function. + """ + parent_add_signature = super(OutOfGraphPrioritizedReplayBuffer, + self).get_add_args_signature() + add_signature = parent_add_signature + [ + ReplayElement('priority', (), np.float32) + ] + return add_signature + + def _add(self, *args): + """Internal add method to add to the underlying memory arrays. + + The arguments need to match add_arg_signature. + + If priority is none, it is set to the maximum priority ever seen. + + Args: + *args: All the elements in a transition. + """ + # Use Schaul et al.'s (2015) scheme of setting the priority of new elements + # to the maximum priority so far. + parent_add_args = [] + # Picks out 'priority' from arguments and passes the other arguments to the + # parent method. + for i, element in enumerate(self.get_add_args_signature()): + if element.name == 'priority': + priority = args[i] + else: + parent_add_args.append(args[i]) + + self.sum_tree.set(self.cursor(), priority) + + super(OutOfGraphPrioritizedReplayBuffer, self)._add(*parent_add_args) + + def sample_index_batch(self, batch_size): + """Returns a batch of valid indices sampled as in Schaul et al. (2015). + + Args: + batch_size: int, number of indices returned. 
+ + Returns: + list of ints, a batch of valid indices sampled as in Schaul et al. (2015). + + Raises: + Exception: If the batch was not constructed after maximum number of tries. + """ + # Sample stratified indices. Some of them might be invalid. + indices = self.sum_tree.stratified_sample(batch_size) + allowed_attempts = self._max_sample_attempts + for i in range(len(indices)): + if not self.is_valid_transition(indices[i]): + if allowed_attempts == 0: + raise RuntimeError( + 'Max sample attempts: Tried {} times but only sampled {}' + ' valid indices. Batch size is {}'. + format(self._max_sample_attempts, i, batch_size)) + index = indices[i] + while not self.is_valid_transition(index) and allowed_attempts > 0: + # If index i is not valid keep sampling others. Note that this + # is not stratified. + index = self.sum_tree.sample() + allowed_attempts -= 1 + indices[i] = index + return indices + + def sample_transition_batch(self, batch_size=None, indices=None): + """Returns a batch of transitions with extra storage and the priorities. + + The extra storage is defined through the extra_storage_types constructor + argument. + + When the transition is terminal next_state_batch has undefined contents. + + Args: + batch_size: int, number of transitions returned. If None, the default + batch_size will be used. + indices: None or list of ints, the indices of every transition in the + batch. If None, sample the indices uniformly. + + Returns: + transition_batch: tuple of np.arrays with the shape and type as in + get_transition_elements(). + """ + transition = (super(OutOfGraphPrioritizedReplayBuffer, self). + sample_transition_batch(batch_size, indices)) + transition_elements = self.get_transition_elements(batch_size) + transition_names = [e.name for e in transition_elements] + probabilities_index = transition_names.index('sampling_probabilities') + indices_index = transition_names.index('indices') + indices = transition[indices_index] + # The parent returned an empty array for the probabilities. Fill it with the + # contents of the sum tree. + transition[probabilities_index][:] = self.get_priority(indices) + return transition + + def set_priority(self, indices, priorities): + """Sets the priority of the given elements according to Schaul et al. + + Args: + indices: np.array with dtype int32, of indices in range + [0, replay_capacity). + priorities: float, the corresponding priorities. + """ + assert indices.dtype == np.int32, ('Indices must be integers, ' + 'given: {}'.format(indices.dtype)) + for index, priority in zip(indices, priorities): + self.sum_tree.set(index, priority) + + def get_priority(self, indices): + """Fetches the priorities corresponding to a batch of memory indices. + + For any memory location not yet used, the corresponding priority is 0. + + Args: + indices: np.array with dtype int32, of indices in range + [0, replay_capacity). + + Returns: + priorities: float, the corresponding priorities. + """ + assert indices.shape, 'Indices must be an array.' + assert indices.dtype == np.int32, ('Indices must be int32s, ' + 'given: {}'.format(indices.dtype)) + batch_size = len(indices) + priority_batch = np.empty((batch_size), dtype=np.float32) + for i, memory_index in enumerate(indices): + priority_batch[i] = self.sum_tree.get(memory_index) + return priority_batch + + def get_transition_elements(self, batch_size=None): + """Returns a 'type signature' for sample_transition_batch. + + Args: + batch_size: int, number of transitions returned. If None, the default + batch_size will be used.

+ Returns: + signature: A namedtuple describing the method's return type signature. + """ + parent_transition_type = ( + super(OutOfGraphPrioritizedReplayBuffer, + self).get_transition_elements(batch_size)) + probablilities_type = [ + ReplayElement('sampling_probabilities', (batch_size,), np.float32) + ] + return parent_transition_type + probablilities_type + + +@gin.configurable(blacklist=['observation_shape', 'stack_size', + 'update_horizon', 'gamma']) +class WrappedPrioritizedReplayBuffer( + circular_replay_buffer.WrappedReplayBuffer): + """Wrapper of OutOfGraphPrioritizedReplayBuffer with in-graph sampling. + + Usage: + + * To add a transition: Call the add function. + + * To sample a batch: Query any of the tensors in the transition dictionary. + Every sess.run that requires any of these tensors will + sample a new transition. + """ + + def __init__(self, + observation_shape, + stack_size, + use_staging=True, + replay_capacity=1000000, + batch_size=32, + update_horizon=1, + gamma=0.99, + max_sample_attempts=circular_replay_buffer.MAX_SAMPLE_ATTEMPTS, + extra_storage_types=None, + observation_dtype=np.uint8): + """Initializes WrappedPrioritizedReplayBuffer. + + Args: + observation_shape: tuple of ints. + stack_size: int, number of frames to use in state stack. + use_staging: bool, when True it would use a staging area to prefetch + the next sampling batch. + replay_capacity: int, number of transitions to keep in memory. + batch_size: int. + update_horizon: int, length of update ('n' in n-step update). + gamma: int, the discount factor. + max_sample_attempts: int, the maximum number of attempts allowed to + get a sample. + extra_storage_types: list of ReplayElements defining the type of the extra + contents that will be stored and returned by sample_transition_batch. + observation_dtype: np.dtype, type of the observations. Defaults to + np.uint8 for Atari 2600. + + Raises: + ValueError: If update_horizon is not positive. + ValueError: If discount factor is not in [0, 1]. + """ + memory = OutOfGraphPrioritizedReplayBuffer( + observation_shape, stack_size, replay_capacity, batch_size, + update_horizon, gamma, max_sample_attempts, + extra_storage_types=extra_storage_types) + super(WrappedPrioritizedReplayBuffer, self).__init__( + observation_shape, + stack_size, + use_staging, + replay_capacity, + batch_size, + update_horizon, + gamma, + wrapped_memory=memory, + extra_storage_types=extra_storage_types) + + def tf_set_priority(self, indices, priorities): + """Sets the priorities for the given indices. + + Args: + indices: tf.Tensor with dtype int32 and shape [n]. + priorities: tf.Tensor with dtype float and shape [n]. + + Returns: + A tf op setting the priorities for prioritized sampling. + """ + return tf.py_func( + self.memory.set_priority, [indices, priorities], [], + name='prioritized_replay_set_priority_py_func') + + def tf_get_priority(self, indices): + """Gets the priorities for the given indices. + + Args: + indices: tf.Tensor with dtype int32 and shape [n]. + + Returns: + priorities: tf.Tensor with dtype float and shape [n], the priorities at + the indices. + """ + return tf.py_func( + self.memory.get_priority, [indices], + tf.float32, + name='prioritized_replay_get_priority_py_func') diff --git a/dopamine/dopamine/replay_memory/sum_tree.py b/dopamine/dopamine/replay_memory/sum_tree.py new file mode 100644 index 0000000..406a491 --- /dev/null +++ b/dopamine/dopamine/replay_memory/sum_tree.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. 
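Before turning to the sum tree itself, a sketch of how the out-of-graph prioritized buffer above is fed and queried. Shapes are toy-sized, and the priority passed to add() is an illustrative choice; new transitions commonly reuse the largest priority recorded so far, matching the scheme described in _add().
```
# Toy sketch of adding prioritized transitions and updating their priorities.
import numpy as np
from dopamine.replay_memory import prioritized_replay_buffer

buf = prioritized_replay_buffer.OutOfGraphPrioritizedReplayBuffer(
    observation_shape=(2, 2), stack_size=1, replay_capacity=100, batch_size=4)

for step in range(20):
  observation = np.full((2, 2), step, dtype=np.uint8)
  # The trailing argument is the extra 'priority' element required by
  # get_add_args_signature().
  buf.add(observation, 0, 1.0, 0, buf.sum_tree.max_recorded_priority)

indices = np.array([5, 6, 7], dtype=np.int32)
buf.set_priority(indices, np.array([2.0, 0.5, 1.0], dtype=np.float32))
print(buf.get_priority(indices))  # [2.  0.5 1. ]
```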
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A sum tree data structure. + +Used for prioritized experience replay. See prioritized_replay_buffer.py +and Schaul et al. (2015). +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import random + +import numpy as np + + +class SumTree(object): + """A sum tree data structure for storing replay priorities. + + A sum tree is a complete binary tree whose leaves contain values called + priorities. Internal nodes maintain the sum of the priorities of all leaf + nodes in their subtree. + + For capacity = 4, the tree may look like this: + + +---+ + |2.5| + +-+-+ + | + +-------+--------+ + | | + +-+-+ +-+-+ + |1.5| |1.0| + +-+-+ +-+-+ + | | + +----+----+ +----+----+ + | | | | + +-+-+ +-+-+ +-+-+ +-+-+ + |0.5| |1.0| |0.5| |0.5| + +---+ +---+ +---+ +---+ + + This is stored in a list of numpy arrays: + self.nodes = [ [2.5], [1.5, 1], [0.5, 1, 0.5, 0.5] ] + + For conciseness, we allocate arrays as powers of two, and pad the excess + elements with zero values. + + This is similar to the usual array-based representation of a complete binary + tree, but is a little more user-friendly. + """ + + def __init__(self, capacity): + """Creates the sum tree data structure for the given replay capacity. + + Args: + capacity: int, the maximum number of elements that can be stored in this + data structure. + + Raises: + ValueError: If requested capacity is not positive. + """ + assert isinstance(capacity, int) + if capacity <= 0: + raise ValueError('Sum tree capacity should be positive. Got: {}'. + format(capacity)) + + self.nodes = [] + tree_depth = int(math.ceil(np.log2(capacity))) + level_size = 1 + for _ in range(tree_depth + 1): + nodes_at_this_depth = np.zeros(level_size) + self.nodes.append(nodes_at_this_depth) + + level_size *= 2 + + self.max_recorded_priority = 1.0 + + def _total_priority(self): + """Returns the sum of all priorities stored in this sum tree. + + Returns: + float, sum of priorities stored in this sum tree. + """ + return self.nodes[0][0] + + def sample(self, query_value=None): + """Samples an element from the sum tree. + + Each element has probability p_i / sum_j p_j of being picked, where p_i is + the (positive) value associated with node i (possibly unnormalized). + + Args: + query_value: float in [0, 1], used as the random value to select a + sample. If None, will select one randomly in [0, 1). + + Returns: + int, a random element from the sum tree. + + Raises: + Exception: If the sum tree is empty (i.e. its node values sum to 0), or if + the supplied query_value is larger than the total sum. + """ + if self._total_priority() == 0.0: + raise Exception('Cannot sample from an empty sum tree.') + + if query_value and (query_value < 0. or query_value > 1.): + raise ValueError('query_value must be in [0, 1].') + + # Sample a value in range [0, R), where R is the value stored at the root. 
+ query_value = random.random() if query_value is None else query_value + query_value *= self._total_priority() + + # Now traverse the sum tree. + node_index = 0 + for nodes_at_this_depth in self.nodes[1:]: + # Compute children of previous depth's node. + left_child = node_index * 2 + + left_sum = nodes_at_this_depth[left_child] + # Each subtree describes a range [0, a), where a is its value. + if query_value < left_sum: # Recurse into left subtree. + node_index = left_child + else: # Recurse into right subtree. + node_index = left_child + 1 + # Adjust query to be relative to right subtree. + query_value -= left_sum + + return node_index + + def stratified_sample(self, batch_size): + """Performs stratified sampling using the sum tree. + + Let R be the value at the root (total value of sum tree). This method will + divide [0, R) into batch_size segments, pick a random number from each of + those segments, and use that random number to sample from the sum_tree. This + is as specified in Schaul et al. (2015). + + Args: + batch_size: int, the number of strata to use. + Returns: + list of batch_size elements sampled from the sum tree. + + Raises: + Exception: If the sum tree is empty (i.e. its node values sum to 0). + """ + if self._total_priority() == 0.0: + raise Exception('Cannot sample from an empty sum tree.') + + bounds = np.linspace(0., 1., batch_size + 1) + assert len(bounds) == batch_size + 1 + segments = [(bounds[i], bounds[i+1]) for i in range(batch_size)] + query_values = [random.uniform(x[0], x[1]) for x in segments] + return [self.sample(query_value=x) for x in query_values] + + def get(self, node_index): + """Returns the value of the leaf node corresponding to the index. + + Args: + node_index: The index of the leaf node. + Returns: + The value of the leaf node. + """ + return self.nodes[-1][node_index] + + def set(self, node_index, value): + """Sets the value of a leaf node and updates internal nodes accordingly. + + This operation takes O(log(capacity)). + Args: + node_index: int, the index of the leaf node to be updated. + value: float, the value which we assign to the node. This value must be + nonnegative. Setting value = 0 will cause the element to never be + sampled. + + Raises: + ValueError: If the given value is negative. + """ + if value < 0.0: + raise ValueError('Sum tree values should be nonnegative. Got {}'. + format(value)) + self.max_recorded_priority = max(value, self.max_recorded_priority) + + delta_value = value - self.nodes[-1][node_index] + + # Now traverse back the tree, adjusting all sums along the way. + for nodes_at_this_depth in reversed(self.nodes): + # Note: Adding a delta leads to some tolerable numerical inaccuracies. + nodes_at_this_depth[node_index] += delta_value + node_index //= 2 + + assert node_index == 0, ('Sum tree traversal failed, final node index ' + 'is not 0.') diff --git a/dopamine/dopamine/utils/__init__.py b/dopamine/dopamine/utils/__init__.py new file mode 100644 index 0000000..920cbb5 --- /dev/null +++ b/dopamine/dopamine/utils/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
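The SumTree above supports the three operations the prioritized buffer needs: set a leaf priority in O(log capacity), read it back, and draw leaves in proportion to their priority. A small standalone sketch (not part of the patch) using the same leaf values as the ASCII diagram in the class docstring:

from dopamine.replay_memory import sum_tree

tree = sum_tree.SumTree(capacity=4)
for index, priority in enumerate([0.5, 1.0, 0.5, 0.5]):
  tree.set(index, priority)

print(tree.get(1))                   # 1.0
print(tree._total_priority())        # 2.5, the value stored at the root

# Each leaf i is drawn with probability p_i / total. Passing a fixed
# query_value makes the draw deterministic for illustration.
print(tree.sample(query_value=0.0))  # 0: the scaled query falls in [0, 0.5)
print(tree.stratified_sample(4))     # one leaf index per quartile of the mass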
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/dopamine/dopamine/utils/test_utils.py b/dopamine/dopamine/utils/test_utils.py new file mode 100644 index 0000000..05ecd73 --- /dev/null +++ b/dopamine/dopamine/utils/test_utils.py @@ -0,0 +1,34 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Common testing utilities shared across agents.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + + +import mock +import tensorflow as tf + + +class MockReplayBuffer(object): + """Mock ReplayBuffer to verify the way the agent interacts with it.""" + + def __init__(self): + with tf.variable_scope('MockReplayBuffer', reuse=tf.AUTO_REUSE): + self.add = mock.Mock() + self.memory = mock.Mock() + self.memory.add_count = 0 diff --git a/dopamine/gym/preprocessing.py b/dopamine/gym/preprocessing.py new file mode 100644 index 0000000..80f7233 --- /dev/null +++ b/dopamine/gym/preprocessing.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A wrapper class around Gym environments. + +This class makes general Gym environments conformant with the API Dopamine is +expecting. 
+""" + +import gin.tf + + +@gin.configurable +class GymPreprocessing(object): + """A Wrapper class around Gym environments.""" + + def __init__(self, environment): + self.environment = environment + self.game_over = False + + @property + def observation_space(self): + return self.environment.observation_space + + @property + def action_space(self): + return self.environment.action_space + + @property + def reward_range(self): + return self.environment.reward_range + + @property + def metadata(self): + return self.environment.metadata + + def reset(self): + return self.environment.reset() + + def step(self, action): + observation, reward, game_over, info = self.environment.step(action) + self.game_over = game_over + return observation, reward, game_over, info diff --git a/dopamine/setup.py b/dopamine/setup.py new file mode 100644 index 0000000..adb82f4 --- /dev/null +++ b/dopamine/setup.py @@ -0,0 +1,92 @@ +# coding=utf-8 +# Copyright 2018 The Dopamine Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Setup script for Dopamine. + +This script will install Dopamine as a Python module. + +See: https://github.com/google/dopamine + +""" + +import codecs +from os import path +from setuptools import find_packages +from setuptools import setup + +here = path.abspath(path.dirname(__file__)) + +# Get the long description from the README file. +with codecs.open(path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +install_requires = ['gin-config >= 0.1.1', 'absl-py >= 0.2.2', + 'opencv-python >= 3.4.1.15', + 'gym >= 0.10.5'] +tests_require = ['gin-config >= 0.1.1', 'absl-py >= 0.2.2', + 'opencv-python >= 3.4.1.15', + 'gym >= 0.10.5', 'mock >= 1.0.0'] + +dopamine_description = ( + 'Dopamine: A framework for flexible Reinforcement Learning research') + +setup( + name='dopamine_rl', + version='1.0.5', + include_package_data=True, + packages=find_packages(exclude=['docs']), # Required + package_data={'testdata': ['testdata/*.gin']}, + install_requires=install_requires, + tests_require=tests_require, + description=dopamine_description, + long_description=long_description, + url='https://github.com/google/dopamine', # Optional + author='The Dopamine Team', # Optional + author_email='opensource@google.com', + classifiers=[ # Optional + 'Development Status :: 4 - Beta', + + # Indicate who your project is intended for + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + + # Pick your license as you wish + 'License :: OSI Approved :: Apache Software License', + + # Specify the Python versions you support here. In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. 
+ 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Mathematics', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Software Development', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules', + + ], + project_urls={ # Optional + 'Documentation': 'https://github.com/google/dopamine', + 'Bug Reports': 'https://github.com/google/dopamine/issues', + 'Source': 'https://github.com/google/dopamine', + }, + license='Apache 2.0', + keywords='dopamine reinforcement-learning python machine learning' +)
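The GymPreprocessing wrapper added in dopamine/gym/preprocessing.py simply forwards observation_space, action_space, reset and step to the underlying Gym environment while tracking game_over, which is the API Dopamine expects. A standalone sketch (not part of the patch) of wrapping an environment; the environment id and the import line are assumptions, since the patch places the file under dopamine/gym/ without an accompanying __init__.py:

from __future__ import print_function

import gym

from dopamine.gym.preprocessing import GymPreprocessing  # assumed import path

environment = GymPreprocessing(gym.make('CartPole-v0'))

observation = environment.reset()
episode_return = 0.0
while not environment.game_over:
  # A random policy is enough to exercise the pass-through step() contract.
  action = environment.action_space.sample()
  observation, reward, game_over, info = environment.step(action)
  episode_return += reward
print('episode return:', episode_return)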