thu-ml · Trinkle23897 · Jul 23, 2020 · Jul 23, 2020 · Jul 23, 2020 · Jul 23, 2020
diff --git a/docs/tutorials/concepts.rst b/docs/tutorials/concepts.rst
@@ -130,7 +130,7 @@ In short, :class:`~tianshou.data.Collector` has two main methods:
 * :meth:`~tianshou.data.Collector.collect`: let the policy perform (at least) a specified number of step ``n_step`` or episode ``n_episode`` and store the data in the replay buffer;
 * :meth:`~tianshou.data.Collector.sample`: sample a data batch from replay buffer; it will call :meth:`~tianshou.policy.BasePolicy.process_fn` before returning the final batch data.
 
-Why do we mention **at least** here? For a single environment, the collector will finish exactly ``n_step`` or ``n_episode``. However, for multiple environments, we could not directly store the collected data into the replay buffer, since it breaks the principle of storing data chronologically.
+Why do we mention **at least** here? For multiple environments, we cannot directly store the collected data into the replay buffer, since doing so would break the principle of storing data chronologically.
 
 The solution is to add some cache buffers inside the collector. Once collecting **a full episode of trajectory**, it will move the stored data from the cache buffer to the main buffer. To satisfy this condition, the collector will interact with environments that may exceed the given step number or episode number.
 

diff --git a/docs/tutorials/dqn.rst b/docs/tutorials/dqn.rst
@@ -179,7 +179,7 @@ Train a Policy with Customized Codes
 Tianshou supports user-defined training code. Here is the code snippet:
 ::
 
-    # pre-collect 5000 frames with random action before training
+    # pre-collect at least 5000 frames with random action before training
     policy.set_eps(1)
     train_collector.collect(n_step=5000)
 

diff --git a/test/base/test_collector.py b/test/base/test_collector.py
@@ -25,29 +25,39 @@ def learn(self):
         pass
 
 
-def preprocess_fn(**kwargs):
-    # modify info before adding into the buffer
-    # if info is not provided from env, it will be a ``Batch()``.
-    if not kwargs.get('info', Batch()).is_empty():
-        n = len(kwargs['obs'])
-        info = kwargs['info']
-        for i in range(n):
-            info[i].update(rew=kwargs['rew'][i])
-        return {'info': info}
-        # or: return Batch(info=info)
-    else:
-        return Batch()
-
-
-class Logger(object):
+class Logger:
     def __init__(self, writer):
         self.cnt = 0
         self.writer = writer
 
-    def log(self, info):
-        self.writer.add_scalar(
-            'key', np.mean(info['key']), global_step=self.cnt)
-        self.cnt += 1
+    def preprocess_fn(self, **kwargs):
+        # modify info before adding it to the buffer and log it to the writer
+        # if info is not provided from env, it will be a ``Batch()``.
+        if not kwargs.get('info', Batch()).is_empty():
+            n = len(kwargs['obs'])
+            info = kwargs['info']
+            for i in range(n):
+                info[i].update(rew=kwargs['rew'][i])
+            self.writer.add_scalar('key', np.mean(
+                info['key']), global_step=self.cnt)
+            self.cnt += 1
+            return Batch(info=info)
+            # or: return {'info': info}
+        else:
+            return Batch()
+
+    @staticmethod
+    def single_preprocess_fn(**kwargs):
+        # same as preprocess_fn above, but without logging to the writer
+        if not kwargs.get('info', Batch()).is_empty():
+            n = len(kwargs['obs'])
+            info = kwargs['info']
+            for i in range(n):
+                info[i].update(rew=kwargs['rew'][i])
+            return Batch(info=info)
+            # or: return {'info': info}
+        else:
+            return Batch()
 
 
 def test_collector():
@@ -60,16 +70,16 @@ def test_collector():
     policy = MyPolicy()
     env = env_fns[0]()
     c0 = Collector(policy, env, ReplayBuffer(size=100, ignore_obs_next=False),
-                   preprocess_fn)
-    c0.collect(n_step=3, log_fn=logger.log)
-    assert np.allclose(c0.buffer.obs[:3], [0, 1, 0])
-    assert np.allclose(c0.buffer[:3].obs_next, [1, 2, 1])
-    c0.collect(n_episode=3, log_fn=logger.log)
-    assert np.allclose(c0.buffer.obs[:8], [0, 1, 0, 1, 0, 1, 0, 1])
-    assert np.allclose(c0.buffer[:8].obs_next, [1, 2, 1, 2, 1, 2, 1, 2])
+                   logger.preprocess_fn)
+    c0.collect(n_step=3)
+    assert np.allclose(c0.buffer.obs[:4], [0, 1, 0, 1])
+    assert np.allclose(c0.buffer[:4].obs_next, [1, 2, 1, 2])
+    c0.collect(n_episode=3)
+    assert np.allclose(c0.buffer.obs[:10], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
+    assert np.allclose(c0.buffer[:10].obs_next, [1, 2, 1, 2, 1, 2, 1, 2, 1, 2])
     c0.collect(n_step=3, random=True)
     c1 = Collector(policy, venv, ReplayBuffer(size=100, ignore_obs_next=False),
-                   preprocess_fn)
+                   logger.preprocess_fn)
     c1.collect(n_step=6)
     assert np.allclose(c1.buffer.obs[:11], [0, 1, 0, 1, 2, 0, 1, 0, 1, 2, 3])
     assert np.allclose(c1.buffer[:11].obs_next,
@@ -80,7 +90,7 @@ def test_collector():
                        [1, 2, 3, 4, 5, 1, 2, 1, 2, 3])
     c1.collect(n_episode=3, random=True)
     c2 = Collector(policy, dum, ReplayBuffer(size=100, ignore_obs_next=False),
-                   preprocess_fn)
+                   logger.preprocess_fn)
     c2.collect(n_episode=[1, 2, 2, 2])
     assert np.allclose(c2.buffer.obs_next[:26], [
         1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5,
@@ -96,13 +106,15 @@ def test_collector():
 def test_collector_with_dict_state():
     env = MyTestEnv(size=5, sleep=0, dict_state=True)
     policy = MyPolicy(dict_state=True)
-    c0 = Collector(policy, env, ReplayBuffer(size=100), preprocess_fn)
+    c0 = Collector(policy, env, ReplayBuffer(size=100),
+                   Logger.single_preprocess_fn)
     c0.collect(n_step=3)
-    c0.collect(n_episode=3)
+    c0.collect(n_episode=2)
     env_fns = [lambda x=i: MyTestEnv(size=x, sleep=0, dict_state=True)
                for i in [2, 3, 4, 5]]
     envs = VectorEnv(env_fns)
-    c1 = Collector(policy, envs, ReplayBuffer(size=100), preprocess_fn)
+    c1 = Collector(policy, envs, ReplayBuffer(size=100),
+                   Logger.single_preprocess_fn)
     c1.collect(n_step=10)
     c1.collect(n_episode=[2, 1, 1, 2])
     batch = c1.sample(10)
@@ -113,7 +125,7 @@ def test_collector_with_dict_state():
         0., 1., 2., 0., 1., 0., 1., 2., 3., 0., 1., 2., 3., 4., 0., 1., 0.,
         1., 2., 0., 1., 0., 1., 2., 3., 0., 1., 2., 3., 4.])
     c2 = Collector(policy, envs, ReplayBuffer(size=100, stack_num=4),
-                   preprocess_fn)
+                   Logger.single_preprocess_fn)
     c2.collect(n_episode=[0, 0, 0, 10])
     batch = c2.sample(10)
     print(batch['obs_next']['index'])
@@ -125,16 +137,17 @@ def reward_metric(x):
     env = MyTestEnv(size=5, sleep=0, ma_rew=4)
     policy = MyPolicy()
     c0 = Collector(policy, env, ReplayBuffer(size=100),
-                   preprocess_fn, reward_metric=reward_metric)
+                   Logger.single_preprocess_fn, reward_metric=reward_metric)
+    # n_step=3 will collect a full episode
     r = c0.collect(n_step=3)['rew']
-    assert np.asanyarray(r).size == 1 and r == 0.
-    r = c0.collect(n_episode=3)['rew']
+    assert np.asanyarray(r).size == 1 and r == 4.
+    r = c0.collect(n_episode=2)['rew']
     assert np.asanyarray(r).size == 1 and r == 4.
     env_fns = [lambda x=i: MyTestEnv(size=x, sleep=0, ma_rew=4)
                for i in [2, 3, 4, 5]]
     envs = VectorEnv(env_fns)
     c1 = Collector(policy, envs, ReplayBuffer(size=100),
-                   preprocess_fn, reward_metric=reward_metric)
+                   Logger.single_preprocess_fn, reward_metric=reward_metric)
     r = c1.collect(n_step=10)['rew']
     assert np.asanyarray(r).size == 1 and r == 4.
     r = c1.collect(n_episode=[2, 1, 1, 2])['rew']
@@ -153,7 +166,7 @@ def reward_metric(x):
     assert np.allclose(c0.buffer[:len(c0.buffer)].rew,
                        [[x] * 4 for x in rew])
     c2 = Collector(policy, envs, ReplayBuffer(size=100, stack_num=4),
-                   preprocess_fn, reward_metric=reward_metric)
+                   Logger.single_preprocess_fn, reward_metric=reward_metric)
     r = c2.collect(n_episode=[0, 0, 0, 10])['rew']
     assert np.asanyarray(r).size == 1 and r == 4.
     batch = c2.sample(10)