Initial support for multiple observations (#256)
* Initial support for multiple observations

* Fix PPO for continuous control
asolano authored and awjuliani committed Jan 19, 2018
1 parent 921bb15 commit a1d35bf
Showing 3 changed files with 38 additions and 16 deletions.
15 changes: 14 additions & 1 deletion python/ppo/history.py
@@ -1,6 +1,6 @@
 import numpy as np

-history_keys = ['states', 'observations', 'actions', 'rewards', 'action_probs', 'epsilons',
+history_keys = ['states', 'actions', 'rewards', 'action_probs', 'epsilons',
                 'value_estimates', 'advantages', 'discounted_returns']


@@ -44,6 +44,8 @@ def empty_local_history(agent_dict):
     """
     for key in history_keys:
         agent_dict[key] = []
+    for i, _ in enumerate(key for key in agent_dict.keys() if key.startswith('observations')):
+        agent_dict['observations%d' % i] = []
     return agent_dict


@@ -55,6 +57,8 @@ def vectorize_history(agent_dict):
     """
     for key in history_keys:
         agent_dict[key] = np.array(agent_dict[key])
+    for key in (key for key in agent_dict.keys() if key.startswith('observations')):
+        agent_dict[key] = np.array(agent_dict[key])
     return agent_dict


@@ -70,6 +74,8 @@ def empty_all_history(agent_info):
         history_dict[agent] = empty_local_history(history_dict[agent])
         history_dict[agent]['cumulative_reward'] = 0
         history_dict[agent]['episode_steps'] = 0
+        for i, _ in enumerate(agent_info.observations):
+            history_dict[agent]['observations%d' % i] = []
     return history_dict


@@ -82,6 +88,8 @@ def append_history(global_buffer, local_buffer=None):
     """
     for key in history_keys:
         global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
+    for key in (key for key in local_buffer.keys() if key.startswith('observations')):
+        global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
     return global_buffer


@@ -94,6 +102,8 @@ def set_history(global_buffer, local_buffer=None):
     """
     for key in history_keys:
         global_buffer[key] = np.copy(local_buffer[key])
+    for key in (key for key in local_buffer.keys() if key.startswith('observations')):
+        global_buffer[key] = np.array(local_buffer[key])
     return global_buffer


@@ -108,4 +118,7 @@ def shuffle_buffer(global_buffer):
     for key in history_keys:
         if len(global_buffer[key]) > 0:
             global_buffer[key] = global_buffer[key][s]
+    for key in (key for key in global_buffer.keys() if key.startswith('observations')):
+        if len(global_buffer[key]) > 0:
+            global_buffer[key] = global_buffer[key][s]
     return global_buffer
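To make the buffer changes easier to follow, here is a small self-contained sketch (not part of the commit) of the keying scheme the new loops rely on: each camera's frames live under their own 'observations%d' key next to the fixed history_keys, so every helper can iterate over however many observation streams an agent exposes. The trimmed key list, the helper signature, and the shapes below are invented for illustration.

import numpy as np

history_keys = ['states', 'actions', 'rewards']  # trimmed for the example

def empty_local_history(num_observations):
    """Build an empty per-agent buffer with one list per camera (illustrative only)."""
    buffer = {key: [] for key in history_keys}
    for i in range(num_observations):
        buffer['observations%d' % i] = []  # e.g. observations0, observations1, ...
    return buffer

# Two cameras: append a fake 8x8 RGB frame to each stream for one step.
local = empty_local_history(num_observations=2)
for i in range(2):
    local['observations%d' % i].append(np.zeros((8, 8, 3)))
local['states'].append([0.0, 1.0])

# Vectorize every observation stream the same way the commit does for history_keys.
for key in (k for k in local if k.startswith('observations')):
    local[key] = np.array(local[key])
print(local['observations0'].shape)  # (1, 8, 8, 3)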
27 changes: 16 additions & 11 deletions python/ppo/models.py
@@ -61,6 +61,7 @@ def export_graph(model_path, env_name="env", target_nodes="action,value_estimate
 class PPOModel(object):
     def __init__(self):
         self.normalize = False
+        self.observation_in = []

     def create_global_steps(self):
         """Creates TF ops to track and increment global training step."""
@@ -89,11 +90,11 @@ def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, act
         else:
             c_channels = 3

-        self.observation_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
-                                             name='observation_0')
+        self.observation_in.append(tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
+                                                  name='observation_%d' % len(self.observation_in)))
         streams = []
         for i in range(num_streams):
-            self.conv1 = tf.layers.conv2d(self.observation_in, 16, kernel_size=[8, 8], strides=[4, 4],
+            self.conv1 = tf.layers.conv2d(self.observation_in[-1], 16, kernel_size=[8, 8], strides=[4, 4],
                                           use_bias=False, activation=activation)
             self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                                           use_bias=False, activation=activation)
@@ -213,10 +214,12 @@ def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
         self.create_reward_encoder()

         hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
-        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
+        encoders = []
+        for i in range(brain.number_observations):
+            height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+            bw = brain.camera_resolutions[i]['blackAndWhite']
+            encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers))
+            hidden_visual = tf.concat(encoders, axis=2)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
@@ -275,10 +278,12 @@ def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_la
         self.normalize = normalize

         hidden_state, hidden_visual, hidden = None, None, None
-        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
+        encoders = []
+        for i in range(brain.number_observations):
+            height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+            bw = brain.camera_resolutions[i]['blackAndWhite']
+            encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0])
+            hidden_visual = tf.concat(encoders, axis=1)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
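As a rough illustration of what the models.py change does, here is a NumPy-only analogue (not the commit's TensorFlow code) of encoding each camera separately and concatenating the results along the feature axis, the way the new code concatenates the per-camera encoders with tf.concat. The encode function, the resolutions, and the sizes are all made up for the example.

import numpy as np

def encode(frames, h_size):
    """Stand-in for create_visual_encoder: map a batch of frames to h_size features."""
    batch = frames.shape[0]
    rng = np.random.default_rng(0)
    weights = rng.standard_normal((frames.reshape(batch, -1).shape[1], h_size))
    return np.tanh(frames.reshape(batch, -1) @ weights)

# One input per camera, mirroring self.observation_in growing by one placeholder per call.
camera_resolutions = [(84, 84, 3), (32, 32, 1)]  # fabricated example resolutions
batch, h_size = 4, 16
observations = [np.zeros((batch, h, w, c)) for h, w, c in camera_resolutions]

# Encode each camera separately, then join the encodings along the feature axis,
# analogous to tf.concat(encoders, axis=1) in the discrete model above.
encoders = [encode(obs, h_size) for obs in observations]
hidden_visual = np.concatenate(encoders, axis=1)
print(hidden_visual.shape)  # (4, 32): h_size features per camera, side by side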
12 changes: 8 additions & 4 deletions python/ppo/trainer.py
@@ -57,7 +57,8 @@ def take_action(self, info, env, brain_name, steps, normalize):
             epsi = np.random.randn(len(info.states), env.brains[brain_name].action_space_size)
             feed_dict[self.model.epsilon] = epsi
         if self.use_observations:
-            feed_dict[self.model.observation_in] = np.vstack(info.observations)
+            for i, _ in enumerate(info.observations):
+                feed_dict[self.model.observation_in[i]] = info.observations[i]
         if self.use_states:
             feed_dict[self.model.state_in] = info.states
         if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states and normalize:
@@ -91,7 +92,8 @@ def add_experiences(self, info, next_info, epsi, actions, a_dist, value):
             idx = info.agents.index(agent)
             if not info.local_done[idx]:
                 if self.use_observations:
-                    history['observations'].append([info.observations[0][idx]])
+                    for i, _ in enumerate(info.observations):
+                        history['observations%d' % i].append([info.observations[i][idx]])
                 if self.use_states:
                     history['states'].append(info.states[idx])
                 if self.is_continuous:
@@ -120,7 +122,8 @@ def process_experiences(self, info, time_horizon, gamma, lambd):
                 else:
                     feed_dict = {self.model.batch_size: len(info.states)}
                     if self.use_observations:
-                        feed_dict[self.model.observation_in] = np.vstack(info.observations)
+                        for i, _ in enumerate(info.observations):
+                            feed_dict[self.model.observation_in[i]] = info.observations[i]
                     if self.use_states:
                         feed_dict[self.model.state_in] = info.states
                     value_next = self.sess.run(self.model.value, feed_dict)[l]
@@ -176,7 +179,8 @@ def update_model(self, batch_size, num_epoch):
                 if self.use_states:
                     feed_dict[self.model.state_in] = np.vstack(training_buffer['states'][start:end])
                 if self.use_observations:
-                    feed_dict[self.model.observation_in] = np.vstack(training_buffer['observations'][start:end])
+                    for i, _ in enumerate(self.model.observation_in):
+                        feed_dict[self.model.observation_in[i]] = np.vstack(training_buffer['observations%d' % i][start:end])
                 v_loss, p_loss, _ = self.sess.run([self.model.value_loss, self.model.policy_loss,
                                                    self.model.update_batch], feed_dict=feed_dict)
                 total_v += v_loss
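The trainer changes all follow the same pattern: instead of stacking every camera into a single placeholder, each observation batch is fed to its own entry of model.observation_in. Below is a framework-free sketch of that feed-dict pattern (not the trainer's actual TensorFlow session code); the placeholder names, shapes, and the fake observations list are illustrative.

import numpy as np

# Stand-ins for self.model.observation_in: one "placeholder" handle per camera.
observation_in = ['observation_0', 'observation_1']

# A fake BrainInfo-style batch: info.observations is a list with one array per camera.
observations = [np.zeros((4, 84, 84, 3)), np.zeros((4, 32, 32, 1))]

feed_dict = {}
for i, _ in enumerate(observations):
    # Matches the new trainer code: feed each camera's batch to its own placeholder.
    feed_dict[observation_in[i]] = observations[i]

for key, value in feed_dict.items():
    print(key, value.shape)  # observation_0 (4, 84, 84, 3) / observation_1 (4, 32, 32, 1)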

1 comment on commit a1d35bf

@awjuliani (Contributor)

Thanks again for making this commit, @asolano. We are going to be overwriting this change with the new v0.3 code, which is a re-implementation of this. Just wanted to let you know, in case you notice that your commit disappeared. Thanks again for contributing to ML-Agents!
