diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4595810..2956142 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+* `td.discount` when used with replays coming from vectorized environments. (thanks @galatolofederico)
 * env.action_size and env.state_size when the number of vectorized environments is 1. (thanks @galatolofederico)
 * Actor-critic integration test being to finicky.
 * `cherry.onehot` support for numpy's float and integer types. (thanks @ngoby)
diff --git a/cherry/td.py b/cherry/td.py
index 03338d9..c62a64c 100644
--- a/cherry/td.py
+++ b/cherry/td.py
@@ -52,7 +52,7 @@ def discount(gamma, rewards, dones, bootstrap=0.0):
     msg = 'dones and rewards must have equal length.'
     assert rewards.size(0) == dones.size(0), msg
 
-    R = th.zeros_like(rewards[0]) + bootstrap
+    R = th.zeros_like(rewards) + bootstrap
     discounted = th.zeros_like(rewards)
     length = discounted.size(0)
     for t in reversed(range(length)):
diff --git a/tests/unit/rl_tests.py b/tests/unit/rl_tests.py
index 6bd0263..ccd63b9 100644
--- a/tests/unit/rl_tests.py
+++ b/tests/unit/rl_tests.py
@@ -9,6 +9,8 @@
 TAU = 0.9
 NUM_SAMPLES = 10
 VECTOR_SIZE = 5
+TIME_STEPS = 10
+NUM_ENVS = 4
 
 """
 TODO: Should test each method to make sure that they properly handle different
@@ -61,6 +63,52 @@ def setUp(self):
 
     def tearDown(self):
         pass
+
+
+    def test_vectorized_discount(self):
+        state = th.randn(TIME_STEPS, NUM_ENVS, VECTOR_SIZE)
+        action = th.randn(TIME_STEPS, NUM_ENVS)
+        reward = th.randn(TIME_STEPS, NUM_ENVS)
+        bootstrap = th.randn(NUM_ENVS)
+        done = th.zeros_like(reward)
+        for i in list(reversed(range(TIME_STEPS)))[:4]:
+            done[i, i % NUM_ENVS] = 1
+
+
+        # Compute the discounted rewards one environment at a time,
+        # as if the replays came from non-vectorized environments.
+        nonvec_discounted_rewards = []
+        for i in range(NUM_ENVS):
+            replay = ch.ExperienceReplay()
+            for t in range(TIME_STEPS):
+                replay.append(
+                    state[t, i, :], action[t, i],
+                    reward[t, i], state[t, i, :], done[t, i]
+                )
+            nonvec_discounted_rewards.append(
+                ch.td.discount(
+                    GAMMA, replay.reward(), replay.done(), bootstrap[i]
+                )
+            )
+        # Compute the discounted rewards from a single replay
+        # built from the vectorized environments.
+        replay = ch.ExperienceReplay()
+        for t in range(TIME_STEPS):
+            replay.append(
+                state[t, :, :], action[t, :],
+                reward[t, :], state[t, :, :], done[t, :]
+            )
+        vec_discounted_rewards = ch.td.discount(
+            GAMMA, replay.reward(), replay.done(), bootstrap
+        )
+
+        for i in range(NUM_ENVS):
+            assert th.all(
+                nonvec_discounted_rewards[i][:, 0]
+                ==
+                vec_discounted_rewards[:, i]
+            )
+
     def test_discount(self):
         vector = th.randn(VECTOR_SIZE)
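
For context (not part of the patch): the new test checks that discounting a single replay of shape (TIME_STEPS, NUM_ENVS) matches discounting each environment's trajectory separately. Below is a minimal, self-contained sketch in plain PyTorch of what that per-environment discounting means, assuming rewards and dones of shape (T, N), a per-environment bootstrap of shape (N,), and the common convention that a done flag at step t stops later rewards from flowing back into out[t]. The function name discounted_returns is illustrative only and is not cherry's API or its internal implementation.

import torch as th

def discounted_returns(gamma, rewards, dones, bootstrap):
    # rewards, dones: (T, N); bootstrap: value estimate for the state after the last step.
    R = th.zeros(rewards.size(1)) + bootstrap  # works for a scalar or an (N,)-shaped bootstrap
    out = th.zeros_like(rewards)
    for t in reversed(range(rewards.size(0))):
        # A done at step t cuts the return: nothing after t is accumulated into out[t].
        R = rewards[t] + gamma * (1.0 - dones[t]) * R
        out[t] = R
    return out

# Shape check mirroring the constants used in the new test.
T, N = 10, 4
returns = discounted_returns(0.95, th.randn(T, N), th.zeros(T, N), th.randn(N))
print(returns.shape)  # torch.Size([10, 4])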