Initial support for multiple observations (#256)
* Initial support for multiple observations

* Fix PPO for continuous control
asolano authored and awjuliani committed Jan 19, 2018
1 parent 921bb15 commit a1d35bf
Showing 3 changed files with 38 additions and 16 deletions.
15 changes: 14 additions & 1 deletion python/ppo/history.py
@@ -1,6 +1,6 @@
 import numpy as np

-history_keys = ['states', 'observations', 'actions', 'rewards', 'action_probs', 'epsilons',
+history_keys = ['states', 'actions', 'rewards', 'action_probs', 'epsilons',
                 'value_estimates', 'advantages', 'discounted_returns']


@@ -44,6 +44,8 @@ def empty_local_history(agent_dict):
     """
     for key in history_keys:
         agent_dict[key] = []
+    for i, _ in enumerate(key for key in agent_dict.keys() if key.startswith('observations')):
+        agent_dict['observations%d' % i] = []
     return agent_dict


@@ -55,6 +57,8 @@ def vectorize_history(agent_dict):
     """
     for key in history_keys:
         agent_dict[key] = np.array(agent_dict[key])
+    for key in (key for key in agent_dict.keys() if key.startswith('observations')):
+        agent_dict[key] = np.array(agent_dict[key])
     return agent_dict


@@ -70,6 +74,8 @@ def empty_all_history(agent_info):
         history_dict[agent] = empty_local_history(history_dict[agent])
         history_dict[agent]['cumulative_reward'] = 0
         history_dict[agent]['episode_steps'] = 0
+        for i, _ in enumerate(agent_info.observations):
+            history_dict[agent]['observations%d' % i] = []
     return history_dict


@@ -82,6 +88,8 @@ def append_history(global_buffer, local_buffer=None):
     """
     for key in history_keys:
         global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
+    for key in (key for key in local_buffer.keys() if key.startswith('observations')):
+        global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
     return global_buffer


@@ -94,6 +102,8 @@ def set_history(global_buffer, local_buffer=None):
     """
     for key in history_keys:
         global_buffer[key] = np.copy(local_buffer[key])
+    for key in (key for key in local_buffer.keys() if key.startswith('observations')):
+        global_buffer[key] = np.array(local_buffer[key])
     return global_buffer


@@ -108,4 +118,7 @@ def shuffle_buffer(global_buffer):
     for key in history_keys:
         if len(global_buffer[key]) > 0:
             global_buffer[key] = global_buffer[key][s]
+    for key in (key for key in global_buffer.keys() if key.startswith('observations')):
+        if len(global_buffer[key]) > 0:
+            global_buffer[key] = global_buffer[key][s]
     return global_buffer
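To make the buffer changes easier to follow, here is a small self-contained sketch (not part of the commit) of the keying scheme the new loops rely on: each camera's frames live under their own 'observations%d' key next to the fixed history_keys, so every helper can iterate over however many observation streams an agent exposes. The trimmed key list, the helper signature, and the shapes below are invented for illustration.

import numpy as np

history_keys = ['states', 'actions', 'rewards']  # trimmed for the example

def empty_local_history(num_observations):
    """Build an empty per-agent buffer with one list per camera (illustrative only)."""
    buffer = {key: [] for key in history_keys}
    for i in range(num_observations):
        buffer['observations%d' % i] = []  # e.g. observations0, observations1, ...
    return buffer

# Two cameras: append a fake 8x8 RGB frame to each stream for one step.
local = empty_local_history(num_observations=2)
for i in range(2):
    local['observations%d' % i].append(np.zeros((8, 8, 3)))
local['states'].append([0.0, 1.0])

# Vectorize every observation stream the same way the commit does for history_keys.
for key in (k for k in local if k.startswith('observations')):
    local[key] = np.array(local[key])
print(local['observations0'].shape)  # (1, 8, 8, 3)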
27 changes: 16 additions & 11 deletions python/ppo/models.py
@@ -61,6 +61,7 @@ def export_graph(model_path, env_name="env", target_nodes="action,value_estimate
 class PPOModel(object):
     def __init__(self):
         self.normalize = False
+        self.observation_in = []

     def create_global_steps(self):
         """Creates TF ops to track and increment global training step."""
@@ -89,11 +90,11 @@ def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, act
         else:
             c_channels = 3

-        self.observation_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
-                                             name='observation_0')
+        self.observation_in.append(tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
+                                                  name='observation_%d' % len(self.observation_in)))
         streams = []
         for i in range(num_streams):
-            self.conv1 = tf.layers.conv2d(self.observation_in, 16, kernel_size=[8, 8], strides=[4, 4],
+            self.conv1 = tf.layers.conv2d(self.observation_in[-1], 16, kernel_size=[8, 8], strides=[4, 4],
                                           use_bias=False, activation=activation)
             self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                                           use_bias=False, activation=activation)
@@ -213,10 +214,12 @@ def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
         self.create_reward_encoder()

         hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
-        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
+        encoders = []
+        for i in range(brain.number_observations):
+            height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+            bw = brain.camera_resolutions[i]['blackAndWhite']
+            encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers))
+            hidden_visual = tf.concat(encoders, axis=2)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
@@ -275,10 +278,12 @@ def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_la
         self.normalize = normalize

         hidden_state, hidden_visual, hidden = None, None, None
-        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
+        encoders = []
+        for i in range(brain.number_observations):
+            height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+            bw = brain.camera_resolutions[i]['blackAndWhite']
+            encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0])
+            hidden_visual = tf.concat(encoders, axis=1)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
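As a rough illustration of what the models.py change does, here is a NumPy-only analogue (not the commit's TensorFlow code) of encoding each camera separately and concatenating the results along the feature axis, the way the new code concatenates the per-camera encoders with tf.concat. The encode function, the resolutions, and the sizes are all made up for the example.

import numpy as np

def encode(frames, h_size):
    """Stand-in for create_visual_encoder: map a batch of frames to h_size features."""
    batch = frames.shape[0]
    rng = np.random.default_rng(0)
    weights = rng.standard_normal((frames.reshape(batch, -1).shape[1], h_size))
    return np.tanh(frames.reshape(batch, -1) @ weights)

# One input per camera, mirroring self.observation_in growing by one placeholder per call.
camera_resolutions = [(84, 84, 3), (32, 32, 1)]  # fabricated example resolutions
batch, h_size = 4, 16
observations = [np.zeros((batch, h, w, c)) for h, w, c in camera_resolutions]

# Encode each camera separately, then join the encodings along the feature axis,
# analogous to tf.concat(encoders, axis=1) in the discrete model above.
encoders = [encode(obs, h_size) for obs in observations]
hidden_visual = np.concatenate(encoders, axis=1)
print(hidden_visual.shape)  # (4, 32): h_size features per camera, side by side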
12 changes: 8 additions & 4 deletions python/ppo/trainer.py
@@ -57,7 +57,8 @@ def take_action(self, info, env, brain_name, steps, normalize):
             epsi = np.random.randn(len(info.states), env.brains[brain_name].action_space_size)
             feed_dict[self.model.epsilon] = epsi
         if self.use_observations:
-            feed_dict[self.model.observation_in] = np.vstack(info.observations)
+            for i, _ in enumerate(info.observations):
+                feed_dict[self.model.observation_in[i]] = info.observations[i]
         if self.use_states:
             feed_dict[self.model.state_in] = info.states
         if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states and normalize:
@@ -91,7 +92,8 @@ def add_experiences(self, info, next_info, epsi, actions, a_dist, value):
             idx = info.agents.index(agent)
             if not info.local_done[idx]:
                 if self.use_observations:
-                    history['observations'].append([info.observations[0][idx]])
+                    for i, _ in enumerate(info.observations):
+                        history['observations%d' % i].append([info.observations[i][idx]])
                 if self.use_states:
                     history['states'].append(info.states[idx])
                 if self.is_continuous:
@@ -120,7 +122,8 @@ def process_experiences(self, info, time_horizon, gamma, lambd):
                 else:
                     feed_dict = {self.model.batch_size: len(info.states)}
                     if self.use_observations:
-                        feed_dict[self.model.observation_in] = np.vstack(info.observations)
+                        for i, _ in enumerate(info.observations):
+                            feed_dict[self.model.observation_in[i]] = info.observations[i]
                     if self.use_states:
                         feed_dict[self.model.state_in] = info.states
                     value_next = self.sess.run(self.model.value, feed_dict)[l]
@@ -176,7 +179,8 @@ def update_model(self, batch_size, num_epoch):
                 if self.use_states:
                     feed_dict[self.model.state_in] = np.vstack(training_buffer['states'][start:end])
                 if self.use_observations:
-                    feed_dict[self.model.observation_in] = np.vstack(training_buffer['observations'][start:end])
+                    for i, _ in enumerate(self.model.observation_in):
+                        feed_dict[self.model.observation_in[i]] = np.vstack(training_buffer['observations%d' % i][start:end])
                 v_loss, p_loss, _ = self.sess.run([self.model.value_loss, self.model.policy_loss,
                                                    self.model.update_batch], feed_dict=feed_dict)
                 total_v += v_loss
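The trainer changes all follow the same pattern: instead of stacking every camera into a single placeholder, each observation batch is fed to its own entry of model.observation_in. Below is a framework-free sketch of that feed-dict pattern (not the trainer's actual TensorFlow session code); the placeholder names, shapes, and the fake observations list are illustrative.

import numpy as np

# Stand-ins for self.model.observation_in: one "placeholder" handle per camera.
observation_in = ['observation_0', 'observation_1']

# A fake BrainInfo-style batch: info.observations is a list with one array per camera.
observations = [np.zeros((4, 84, 84, 3)), np.zeros((4, 32, 32, 1))]

feed_dict = {}
for i, _ in enumerate(observations):
    # Matches the new trainer code: feed each camera's batch to its own placeholder.
    feed_dict[observation_in[i]] = observations[i]

for key, value in feed_dict.items():
    print(key, value.shape)  # observation_0 (4, 84, 84, 3) / observation_1 (4, 32, 32, 1)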

1 comment on commit a1d35bf

@awjuliani (Contributor)

Thanks again for making this commit, @asolano. We are going to be overwriting this change with the new v0.3 code, which is a re-implementation of this. Just wanted to let you know, in case you notice that your commit disappeared. Thanks again for contributing to ML-Agents!
