From fea1a6fd05f7da544de0ab03fc70e526a8f33dd6 Mon Sep 17 00:00:00 2001
From: Federico Galatolo
Date: Fri, 21 Feb 2020 13:54:09 +0100
Subject: [PATCH 1/3] Wrote test for discounting with vectorized environments

---
 tests/unit/rl_tests.py | 48 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/tests/unit/rl_tests.py b/tests/unit/rl_tests.py
index 6bd0263..ccd63b9 100644
--- a/tests/unit/rl_tests.py
+++ b/tests/unit/rl_tests.py
@@ -9,6 +9,8 @@
 TAU = 0.9
 NUM_SAMPLES = 10
 VECTOR_SIZE = 5
+TIME_STEPS = 10
+NUM_ENVS = 4
 
 """
 TODO: Should test each method to make sure that they properly handle different
@@ -61,6 +63,52 @@ def setUp(self):
 
     def tearDown(self):
         pass
+
+
+    def test_vectorized_discount(self):
+        state = th.randn(TIME_STEPS, NUM_ENVS, VECTOR_SIZE)
+        action = th.randn(TIME_STEPS, NUM_ENVS)
+        reward = th.randn(TIME_STEPS, NUM_ENVS)
+        bootstrap = th.randn(NUM_ENVS)
+        done = th.zeros_like(reward)
+        for i in list(reversed(range(TIME_STEPS)))[:4]:
+            done[i, i % NUM_ENVS] = 1
+
+
+        # Computing the discounted rewards
+        # as non-vectorized environments
+        nonvec_discounted_rewards = []
+        for i in range(NUM_ENVS):
+            replay = ch.ExperienceReplay()
+            for t in range(TIME_STEPS):
+                replay.append(
+                    state[t, i, :], action[t, i],
+                    reward[t, i], state[t, i, :], done[t, i]
+                )
+            nonvec_discounted_rewards.append(
+                ch.td.discount(
+                    GAMMA, replay.reward(), replay.done(), bootstrap[i]
+                )
+            )
+
+        # Computing the discounted rewards
+        # as a vectorized environment
+        replay = ch.ExperienceReplay()
+        for t in range(TIME_STEPS):
+            replay.append(
+                state[t, :, :], action[t, :],
+                reward[t, :], state[t, :, :], done[t, :]
+            )
+        vec_discounted_rewards = ch.td.discount(
+            GAMMA, replay.reward(), replay.done(), bootstrap
+        )
+
+        for i in range(NUM_ENVS):
+            assert th.all(
+                nonvec_discounted_rewards[i][:, 0]
+                ==
+                vec_discounted_rewards[:, i]
+            )
 
     def test_discount(self):
         vector = th.randn(VECTOR_SIZE)

From 89898c4d7a15a73a5e40c132b9613f32e8d70c1c Mon Sep 17 00:00:00 2001
From: Federico Galatolo
Date: Fri, 21 Feb 2020 14:05:39 +0100
Subject: [PATCH 2/3] Bugfix: td.discount now correctly supports vectorized
 replays

---
 cherry/td.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cherry/td.py b/cherry/td.py
index 03338d9..c62a64c 100644
--- a/cherry/td.py
+++ b/cherry/td.py
@@ -52,7 +52,7 @@ def discount(gamma, rewards, dones, bootstrap=0.0):
     msg = 'dones and rewards must have equal length.'
     assert rewards.size(0) == dones.size(0), msg
 
-    R = th.zeros_like(rewards[0]) + bootstrap
+    R = th.zeros_like(rewards) + bootstrap
     discounted = th.zeros_like(rewards)
     length = discounted.size(0)
     for t in reversed(range(length)):

From 576236089e93a3183499bcc4cd2b75aee4c142c0 Mon Sep 17 00:00:00 2001
From: Federico Galatolo
Date: Wed, 26 Feb 2020 13:00:08 +0100
Subject: [PATCH 3/3] Updated changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4edc402..6c578a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,5 +18,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ### Fixed
 
+* Bugfix when using `td.discount` with replays coming from vectorized environments. (@galatolofederico)
 * Actor-critic integration test being too finicky.
 * `cherry.onehot` support for numpy's float and integer types. (thanks @ngoby)
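
A quick way to see the invariant the new test pins down, independent of cherry's
internals: discounting a (TIME_STEPS, NUM_ENVS) reward tensor with a
per-environment bootstrap vector must give the same result as discounting each
environment's column separately, because every step of the backward recursion
broadcasts elementwise over the environment dimension. The sketch below is a
minimal illustration in plain PyTorch; the helper `discounted_returns`, the
constants, and the episode-boundary placement are assumptions for the example,
not cherry's API.

import torch as th

GAMMA = 0.99
TIME_STEPS = 10
NUM_ENVS = 4

def discounted_returns(gamma, rewards, dones, bootstrap):
    # rewards, dones: (T, N) for N vectorized environments, or (T,)
    # for a single one; bootstrap matches the shape of rewards[0].
    R = th.zeros_like(rewards[0]) + bootstrap
    returns = th.zeros_like(rewards)
    for t in reversed(range(rewards.size(0))):
        # Elementwise over environments: the done mask resets the
        # running return only where an episode ended at step t.
        R = rewards[t] + gamma * (1.0 - dones[t]) * R
        returns[t] = R
    return returns

rewards = th.randn(TIME_STEPS, NUM_ENVS)
dones = th.zeros(TIME_STEPS, NUM_ENVS)
dones[3, 1] = 1.0  # env 1 ends an episode at step 3
bootstrap = th.randn(NUM_ENVS)

# Vectorized result vs. one environment at a time: they must agree.
vectorized = discounted_returns(GAMMA, rewards, dones, bootstrap)
for i in range(NUM_ENVS):
    per_env = discounted_returns(
        GAMMA, rewards[:, i], dones[:, i], bootstrap[i]
    )
    assert th.allclose(vectorized[:, i], per_env)

The done mask is what keeps the columns independent: at each step it zeroes the
running return only for the environments whose episode ended there, so no value
leaks across environments or across episode boundaries.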