From bc4d0bcf9990fab912ff0842efd96238a17cecdc Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Fri, 26 Feb 2021 15:36:59 +0800 Subject: [PATCH 01/13] remove rew_norm in offpolicy algorithm --- examples/mujoco/runnable/ant_v2_ddpg.py | 3 +-- examples/mujoco/runnable/ant_v2_td3.py | 3 +-- .../runnable/halfcheetahBullet_v0_sac.py | 3 +-- examples/mujoco/runnable/point_maze_td3.py | 3 +-- test/continuous/test_ddpg.py | 3 +-- test/continuous/test_sac_with_il.py | 3 +-- test/continuous/test_td3.py | 3 +-- test/discrete/test_sac.py | 1 - tianshou/policy/base.py | 18 +++++------------- tianshou/policy/modelfree/ddpg.py | 2 ++ tianshou/policy/modelfree/dqn.py | 2 ++ 11 files changed, 16 insertions(+), 28 deletions(-) diff --git a/examples/mujoco/runnable/ant_v2_ddpg.py b/examples/mujoco/runnable/ant_v2_ddpg.py index 53e9ac4d7..ce42434c0 100644 --- a/examples/mujoco/runnable/ant_v2_ddpg.py +++ b/examples/mujoco/runnable/ant_v2_ddpg.py @@ -72,8 +72,7 @@ def test_ddpg(args=get_args()): actor, actor_optim, critic, critic_optim, action_range=[env.action_space.low[0], env.action_space.high[0]], tau=args.tau, gamma=args.gamma, - exploration_noise=GaussianNoise(sigma=args.exploration_noise), - reward_normalization=True) + exploration_noise=GaussianNoise(sigma=args.exploration_noise)) # collector train_collector = Collector( policy, train_envs, diff --git a/examples/mujoco/runnable/ant_v2_td3.py b/examples/mujoco/runnable/ant_v2_td3.py index cbbd952f3..5e33c33da 100644 --- a/examples/mujoco/runnable/ant_v2_td3.py +++ b/examples/mujoco/runnable/ant_v2_td3.py @@ -80,8 +80,7 @@ def test_td3(args=get_args()): exploration_noise=GaussianNoise(sigma=args.exploration_noise), policy_noise=args.policy_noise, update_actor_freq=args.update_actor_freq, - noise_clip=args.noise_clip, - reward_normalization=True) + noise_clip=args.noise_clip) # collector train_collector = Collector( policy, train_envs, diff --git a/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py b/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py index db0ce6ec8..618492771 100644 --- a/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py +++ b/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py @@ -80,8 +80,7 @@ def test_sac(args=get_args()): policy = SACPolicy( actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, action_range=[env.action_space.low[0], env.action_space.high[0]], - tau=args.tau, gamma=args.gamma, alpha=args.alpha, - reward_normalization=True) + tau=args.tau, gamma=args.gamma, alpha=args.alpha) # collector train_collector = Collector( policy, train_envs, diff --git a/examples/mujoco/runnable/point_maze_td3.py b/examples/mujoco/runnable/point_maze_td3.py index ed2ce0efc..eda299244 100644 --- a/examples/mujoco/runnable/point_maze_td3.py +++ b/examples/mujoco/runnable/point_maze_td3.py @@ -85,8 +85,7 @@ def test_td3(args=get_args()): exploration_noise=GaussianNoise(sigma=args.exploration_noise), policy_noise=args.policy_noise, update_actor_freq=args.update_actor_freq, - noise_clip=args.noise_clip, - reward_normalization=True) + noise_clip=args.noise_clip) # collector train_collector = Collector( policy, train_envs, diff --git a/test/continuous/test_ddpg.py b/test/continuous/test_ddpg.py index 232eef17c..6d7020ad4 100644 --- a/test/continuous/test_ddpg.py +++ b/test/continuous/test_ddpg.py @@ -37,8 +37,7 @@ def get_args(): parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument('--rew-norm', type=int, default=1) - parser.add_argument('--ignore-done', type=int, default=1) + parser.add_argument('--rew-norm', type=int, default=0) parser.add_argument('--n-step', type=int, default=1) parser.add_argument( '--device', type=str, diff --git a/test/continuous/test_sac_with_il.py b/test/continuous/test_sac_with_il.py index 8d1842876..900c6e0c4 100644 --- a/test/continuous/test_sac_with_il.py +++ b/test/continuous/test_sac_with_il.py @@ -40,8 +40,7 @@ def get_args(): parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--rew-norm', type=int, default=1) - parser.add_argument('--ignore-done', type=int, default=1) + parser.add_argument('--rew-norm', type=int, default=0) parser.add_argument('--n-step', type=int, default=4) parser.add_argument( '--device', type=str, diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index c24741c3c..340cb261d 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -40,8 +40,7 @@ def get_args(): parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--rew-norm', type=int, default=1) - parser.add_argument('--ignore-done', type=int, default=1) + parser.add_argument('--rew-norm', type=int, default=0) parser.add_argument('--n-step', type=int, default=1) parser.add_argument( '--device', type=str, diff --git a/test/discrete/test_sac.py b/test/discrete/test_sac.py index b5871f66a..465331a99 100644 --- a/test/discrete/test_sac.py +++ b/test/discrete/test_sac.py @@ -39,7 +39,6 @@ def get_args(): parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.0) parser.add_argument('--rew-norm', type=int, default=0) - parser.add_argument('--ignore-done', type=int, default=0) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index cf2678f74..28e1200fc 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -286,15 +286,10 @@ def compute_nstep_return( :return: a Batch. The result will be stored in batch.returns as a torch.Tensor with the same shape as target_q_fn's return tensor. 
""" + assert rew_norm == False, ( + "Reward normalization in computing n-step return is unsupported for now.") rew = buffer.rew bsz = len(indice) - if rew_norm: # TODO: remove it or fix this bug - bfr = rew[:min(len(buffer), 1000)] # avoid large buffer - mean, std = bfr.mean(), bfr.std() - if np.isclose(std, 0, 1e-2): - mean, std = 0.0, 1.0 - else: - mean, std = 0.0, 1.0 indices = [indice] for _ in range(n_step - 1): indices.append(buffer.next(indices[-1])) @@ -308,8 +303,7 @@ def compute_nstep_return( target_q = target_q * BasePolicy.value_mask(buffer, terminal).reshape(-1, 1) end_flag = buffer.done.copy() end_flag[buffer.unfinished_index()] = True - target_q = _nstep_return(rew, end_flag, target_q, - indices, gamma, n_step, mean, std) + target_q = _nstep_return(rew, end_flag, target_q, indices, gamma, n_step) batch.returns = to_torch_as(target_q, target_q_torch) if hasattr(batch, "weight"): # prio buffer update @@ -325,7 +319,7 @@ def _compile(self) -> None: _gae_return(f32, f32, f64, b, 0.1, 0.1) _episodic_return(f64, f64, b, 0.1, 0.1) _episodic_return(f32, f64, b, 0.1, 0.1) - _nstep_return(f64, b, f32.reshape(-1, 1), i64, 0.1, 1, 0.0, 1.0) + _nstep_return(f64, b, f32.reshape(-1, 1), i64, 0.1, 1) @njit @@ -368,8 +362,6 @@ def _nstep_return( indices: np.ndarray, gamma: float, n_step: int, - mean: float, - std: float, ) -> np.ndarray: gamma_buffer = np.ones(n_step + 1) for i in range(1, n_step + 1): @@ -384,6 +376,6 @@ def _nstep_return( now = indices[n] gammas[end_flag[now] > 0] = n + 1 returns[end_flag[now] > 0] = 0.0 - returns = (rew[now].reshape(bsz, 1) - mean) / std + gamma * returns + returns = rew[now].reshape(bsz, 1) + gamma * returns target_q = target_q * gamma_buffer[gammas].reshape(bsz, 1) + returns return target_q.reshape(target_shape) diff --git a/tianshou/policy/modelfree/ddpg.py b/tianshou/policy/modelfree/ddpg.py index d91359a2f..fe1a77f9c 100644 --- a/tianshou/policy/modelfree/ddpg.py +++ b/tianshou/policy/modelfree/ddpg.py @@ -71,6 +71,8 @@ def __init__( # it is only a little difference to use GaussianNoise # self.noise = OUNoise() self._rew_norm = reward_normalization + assert self._rew_norm == False, ( + "Reward normalization in offpolicy algorithm is unsupported for now.") assert estimation_step > 0, "estimation_step should be greater than 0" self._n_step = estimation_step diff --git a/tianshou/policy/modelfree/dqn.py b/tianshou/policy/modelfree/dqn.py index e79ff3206..6466ec568 100644 --- a/tianshou/policy/modelfree/dqn.py +++ b/tianshou/policy/modelfree/dqn.py @@ -57,6 +57,8 @@ def __init__( self.model_old = deepcopy(self.model) self.model_old.eval() self._rew_norm = reward_normalization + assert self._rew_norm == False, ( + "Reward normalization in offpolicy algorithm is unsupported for now.") def set_eps(self, eps: float) -> None: """Set the eps for epsilon-greedy exploration.""" From 6c75da73db89467fa0a0410e1dd006da4ff903a3 Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Fri, 26 Feb 2021 15:43:39 +0800 Subject: [PATCH 02/13] all --- tianshou/policy/base.py | 2 +- tianshou/policy/modelfree/ddpg.py | 2 +- tianshou/policy/modelfree/dqn.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index 28e1200fc..7e910996b 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -286,7 +286,7 @@ def compute_nstep_return( :return: a Batch. The result will be stored in batch.returns as a torch.Tensor with the same shape as target_q_fn's return tensor. 
""" - assert rew_norm == False, ( + assert not rew_norm, ( "Reward normalization in computing n-step return is unsupported for now.") rew = buffer.rew bsz = len(indice) diff --git a/tianshou/policy/modelfree/ddpg.py b/tianshou/policy/modelfree/ddpg.py index fe1a77f9c..90068841f 100644 --- a/tianshou/policy/modelfree/ddpg.py +++ b/tianshou/policy/modelfree/ddpg.py @@ -71,7 +71,7 @@ def __init__( # it is only a little difference to use GaussianNoise # self.noise = OUNoise() self._rew_norm = reward_normalization - assert self._rew_norm == False, ( + assert not self._rew_norm, ( "Reward normalization in offpolicy algorithm is unsupported for now.") assert estimation_step > 0, "estimation_step should be greater than 0" self._n_step = estimation_step diff --git a/tianshou/policy/modelfree/dqn.py b/tianshou/policy/modelfree/dqn.py index 6466ec568..2ee42275f 100644 --- a/tianshou/policy/modelfree/dqn.py +++ b/tianshou/policy/modelfree/dqn.py @@ -57,7 +57,7 @@ def __init__( self.model_old = deepcopy(self.model) self.model_old.eval() self._rew_norm = reward_normalization - assert self._rew_norm == False, ( + assert not self._rew_norm, ( "Reward normalization in offpolicy algorithm is unsupported for now.") def set_eps(self, eps: float) -> None: From f5f9b24f7c5746624f9f72cc11637ef401bb4176 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 21:15:07 +0800 Subject: [PATCH 03/13] defaults -> Default --- tianshou/policy/imitation/base.py | 7 ++- tianshou/policy/imitation/discrete_bcq.py | 23 ++++----- tianshou/policy/modelbase/psrl.py | 2 +- tianshou/policy/modelfree/a2c.py | 39 +++++++------- tianshou/policy/modelfree/c51.py | 17 +++--- tianshou/policy/modelfree/ddpg.py | 22 +++----- tianshou/policy/modelfree/discrete_sac.py | 17 +++--- tianshou/policy/modelfree/dqn.py | 13 ++--- tianshou/policy/modelfree/pg.py | 14 +++-- tianshou/policy/modelfree/ppo.py | 63 ++++++++++------------- tianshou/policy/modelfree/qrdqn.py | 13 ++--- tianshou/policy/modelfree/sac.py | 40 ++++++-------- tianshou/policy/modelfree/td3.py | 46 +++++++---------- tianshou/utils/net/common.py | 10 ++-- 14 files changed, 134 insertions(+), 192 deletions(-) diff --git a/tianshou/policy/imitation/base.py b/tianshou/policy/imitation/base.py index 954bc81f6..a618dd480 100644 --- a/tianshou/policy/imitation/base.py +++ b/tianshou/policy/imitation/base.py @@ -14,7 +14,7 @@ class ImitationPolicy(BasePolicy): :class:`~tianshou.policy.BasePolicy`. (s -> a) :param torch.optim.Optimizer optim: for optimizing the model. :param str mode: indicate the imitation type ("continuous" or "discrete" - action space), defaults to "continuous". + action space). Default to "continuous". .. seealso:: @@ -32,9 +32,8 @@ def __init__( super().__init__(**kwargs) self.model = model self.optim = optim - assert ( - mode in ["continuous", "discrete"] - ), f"Mode {mode} is not in ['continuous', 'discrete']." + assert mode in ["continuous", "discrete"], \ + f"Mode {mode} is not in ['continuous', 'discrete']." self.mode = mode def forward( diff --git a/tianshou/policy/imitation/discrete_bcq.py b/tianshou/policy/imitation/discrete_bcq.py index 0061ea20f..610b164f1 100644 --- a/tianshou/policy/imitation/discrete_bcq.py +++ b/tianshou/policy/imitation/discrete_bcq.py @@ -17,16 +17,15 @@ class DiscreteBCQPolicy(DQNPolicy): :class:`~tianshou.policy.BasePolicy`. (s -> imtation_logits) :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. 
- :param int estimation_step: greater than 1, the number of steps to look - ahead. + :param int estimation_step: greater than 1, the number of steps to look ahead. :param int target_update_freq: the target network update frequency. :param float eval_eps: the epsilon-greedy noise added in evaluation. :param float unlikely_action_threshold: the threshold (tau) for unlikely - actions, as shown in Equ. (17) in the paper, defaults to 0.3. + actions, as shown in Equ. (17) in the paper. Default to 0.3. :param float imitation_logits_penalty: reguralization weight for imitation - logits, defaults to 1e-2. - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + logits. Default to 1e-2. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: @@ -52,9 +51,8 @@ def __init__( target_update_freq, reward_normalization, **kwargs) assert target_update_freq > 0, "BCQ needs target network setting." self.imitator = imitator - assert ( - 0.0 <= unlikely_action_threshold < 1.0 - ), "unlikely_action_threshold should be in [0, 1)" + assert 0.0 <= unlikely_action_threshold < 1.0, \ + "unlikely_action_threshold should be in [0, 1)" if unlikely_action_threshold > 0: self._log_tau = math.log(unlikely_action_threshold) else: @@ -69,9 +67,7 @@ def train(self, mode: bool = True) -> "DiscreteBCQPolicy": self.imitator.train(mode) return self - def _target_q( - self, buffer: ReplayBuffer, indice: np.ndarray - ) -> torch.Tensor: + def _target_q(self, buffer: ReplayBuffer, indice: np.ndarray) -> torch.Tensor: batch = buffer[indice] # batch.obs_next: s_{t+n} # target_Q = Q_old(s_, argmax(Q_new(s_, *))) act = self(batch, input="obs_next").act @@ -93,8 +89,7 @@ def forward( # type: ignore imitation_logits, _ = self.imitator(obs, state=state, info=batch.info) # mask actions for argmax - ratio = imitation_logits - imitation_logits.max( - dim=-1, keepdim=True).values + ratio = imitation_logits - imitation_logits.max(dim=-1, keepdim=True).values mask = (ratio < self._log_tau).float() action = (q_value - np.inf * mask).argmax(dim=-1) diff --git a/tianshou/policy/modelbase/psrl.py b/tianshou/policy/modelbase/psrl.py index dcf6a5d05..4a565976f 100644 --- a/tianshou/policy/modelbase/psrl.py +++ b/tianshou/policy/modelbase/psrl.py @@ -149,7 +149,7 @@ class PSRLPolicy(BasePolicy): :param float discount_factor: in [0, 1]. :param float epsilon: for precision control in value iteration. :param bool add_done_loop: whether to add an extra self-loop for the - terminal state in MDP, defaults to False. + terminal state in MDP. Default to False. .. seealso:: diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py index 1c8edb300..f79682789 100644 --- a/tianshou/policy/modelfree/a2c.py +++ b/tianshou/policy/modelfree/a2c.py @@ -2,7 +2,7 @@ import numpy as np from torch import nn import torch.nn.functional as F -from typing import Any, Dict, List, Union, Optional, Callable +from typing import Any, Dict, List, Type, Union, Optional from tianshou.policy import PGPolicy from tianshou.data import Batch, ReplayBuffer, to_torch_as, to_numpy @@ -17,20 +17,20 @@ class A2CPolicy(PGPolicy): :param torch.optim.Optimizer optim: the optimizer for actor and critic network. :param dist_fn: distribution class for computing the action. - :type dist_fn: Callable[[], torch.distributions.Distribution] - :param float discount_factor: in [0, 1], defaults to 0.99. - :param float vf_coef: weight for value loss, defaults to 0.5. 
- :param float ent_coef: weight for entropy loss, defaults to 0.01. - :param float max_grad_norm: clipping gradients in back propagation, - defaults to None. + :type dist_fn: Type[torch.distributions.Distribution] + :param float discount_factor: in [0, 1]. Default to 0.99. + :param float vf_coef: weight for value loss. Default to 0.5. + :param float ent_coef: weight for entropy loss. Default to 0.01. + :param float max_grad_norm: clipping gradients in back propagation. + Default to None. :param float gae_lambda: in [0, 1], param for Generalized Advantage - Estimation, defaults to 0.95. - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + Estimation. Default to 0.95. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. :param int max_batchsize: the maximum size of the batch when computing GAE, depends on the size of available memory and the memory cost of the - model; should be as large as possible within the memory constraint; - defaults to 256. + model; should be as large as possible within the memory constraint. + Default to 256. .. seealso:: @@ -43,7 +43,7 @@ def __init__( actor: torch.nn.Module, critic: torch.nn.Module, optim: torch.optim.Optimizer, - dist_fn: Callable[[], torch.distributions.Distribution], + dist_fn: Type[torch.distributions.Distribution], discount_factor: float = 0.99, vf_coef: float = 0.5, ent_coef: float = 0.01, @@ -77,9 +77,8 @@ def process_fn( v_.append(to_numpy(self.critic(b.obs_next))) v_ = np.concatenate(v_, axis=0) return self.compute_episodic_return( - batch, buffer, indice, - v_, gamma=self._gamma, - gae_lambda=self._lambda, rew_norm=self._rew_norm) + batch, buffer, indice, v_, + gamma=self._gamma, gae_lambda=self._lambda, rew_norm=self._rew_norm) def forward( self, @@ -105,7 +104,7 @@ def forward( if isinstance(logits, tuple): dist = self.dist_fn(*logits) else: - dist = self.dist_fn(logits) # type: ignore + dist = self.dist_fn(logits) act = dist.sample() return Batch(logits=logits, act=act, state=h, dist=dist) @@ -124,13 +123,11 @@ def learn( # type: ignore a_loss = -(log_prob * (r - v).detach()).mean() vf_loss = F.mse_loss(r, v) # type: ignore ent_loss = dist.entropy().mean() - loss = a_loss + self._weight_vf * vf_loss - \ - self._weight_ent * ent_loss + loss = a_loss + self._weight_vf * vf_loss - self._weight_ent * ent_loss loss.backward() if self._grad_norm is not None: nn.utils.clip_grad_norm_( - list(self.actor.parameters()) - + list(self.critic.parameters()), + list(self.actor.parameters()) + list(self.critic.parameters()), max_norm=self._grad_norm, ) self.optim.step() diff --git a/tianshou/policy/modelfree/c51.py b/tianshou/policy/modelfree/c51.py index dce2112a8..eb24f0eb8 100644 --- a/tianshou/policy/modelfree/c51.py +++ b/tianshou/policy/modelfree/c51.py @@ -14,17 +14,16 @@ class C51Policy(DQNPolicy): :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. :param int num_atoms: the number of atoms in the support set of the - value distribution, defaults to 51. - :param float v_min: the value of the smallest atom in the support set, - defaults to -10.0. - :param float v_max: the value of the largest atom in the support set, - defaults to 10.0. - :param int estimation_step: greater than 1, the number of steps to look - ahead. + value distribution. Default to 51. + :param float v_min: the value of the smallest atom in the support set. + Default to -10.0. 
+ :param float v_max: the value of the largest atom in the support set. + Default to 10.0. + :param int estimation_step: greater than 1, the number of steps to look ahead. :param int target_update_freq: the target network update frequency (0 if you do not use the target network). - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: diff --git a/tianshou/policy/modelfree/ddpg.py b/tianshou/policy/modelfree/ddpg.py index 90068841f..c858b29e2 100644 --- a/tianshou/policy/modelfree/ddpg.py +++ b/tianshou/policy/modelfree/ddpg.py @@ -15,19 +15,16 @@ class DDPGPolicy(BasePolicy): :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer actor_optim: the optimizer for actor network. :param torch.nn.Module critic: the critic network. (s, a -> Q(s, a)) - :param torch.optim.Optimizer critic_optim: the optimizer for critic - network. + :param torch.optim.Optimizer critic_optim: the optimizer for critic network. :param action_range: the action range (minimum, maximum). :type action_range: Tuple[float, float] - :param float tau: param for soft update of the target network, defaults to - 0.005. - :param float gamma: discount factor, in [0, 1], defaults to 0.99. + :param float tau: param for soft update of the target network. Default to 0.005. + :param float gamma: discount factor, in [0, 1]. Default to 0.99. :param BaseNoise exploration_noise: the exploration noise, - add to the action, defaults to ``GaussianNoise(sigma=0.1)``. + add to the action. Default to ``GaussianNoise(sigma=0.1)``. :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. - :param int estimation_step: greater than 1, the number of steps to look - ahead. + Default to False. + :param int estimation_step: greater than 1, the number of steps to look ahead. .. seealso:: @@ -71,9 +68,6 @@ def __init__( # it is only a little difference to use GaussianNoise # self.noise = OUNoise() self._rew_norm = reward_normalization - assert not self._rew_norm, ( - "Reward normalization in offpolicy algorithm is unsupported for now.") - assert estimation_step > 0, "estimation_step should be greater than 0" self._n_step = estimation_step def set_exp_noise(self, noise: Optional[BaseNoise]) -> None: @@ -91,9 +85,7 @@ def sync_weight(self) -> None: """Soft-update the weight for the target network.""" for o, n in zip(self.actor_old.parameters(), self.actor.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) - for o, n in zip( - self.critic_old.parameters(), self.critic.parameters() - ): + for o, n in zip(self.critic_old.parameters(), self.critic.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) def _target_q( diff --git a/tianshou/policy/modelfree/discrete_sac.py b/tianshou/policy/modelfree/discrete_sac.py index fd67d4738..a53bbbbf8 100644 --- a/tianshou/policy/modelfree/discrete_sac.py +++ b/tianshou/policy/modelfree/discrete_sac.py @@ -19,15 +19,14 @@ class DiscreteSACPolicy(SACPolicy): :param torch.nn.Module critic2: the second critic network. (s -> Q(s)) :param torch.optim.Optimizer critic2_optim: the optimizer for the second critic network. - :param float tau: param for soft update of the target network, defaults to - 0.005. - :param float gamma: discount factor, in [0, 1], defaults to 0.99. + :param float tau: param for soft update of the target network. Default to 0.005. 
+ :param float gamma: discount factor, in [0, 1]. Default to 0.99. :param (float, torch.Tensor, torch.optim.Optimizer) or float alpha: entropy - regularization coefficient, default to 0.2. - If a tuple (target_entropy, log_alpha, alpha_optim) is provided, then + regularization coefficient. Default to 0.2. + If a tuple (target_entropy, log_alpha, alpha_optim) is provided, the alpha is automatatically tuned. - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to ``False``. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: @@ -45,9 +44,7 @@ def __init__( critic2_optim: torch.optim.Optimizer, tau: float = 0.005, gamma: float = 0.99, - alpha: Union[ - float, Tuple[float, torch.Tensor, torch.optim.Optimizer] - ] = 0.2, + alpha: Union[float, Tuple[float, torch.Tensor, torch.optim.Optimizer]] = 0.2, reward_normalization: bool = False, estimation_step: int = 1, **kwargs: Any, diff --git a/tianshou/policy/modelfree/dqn.py b/tianshou/policy/modelfree/dqn.py index 2ee42275f..a4ad772fb 100644 --- a/tianshou/policy/modelfree/dqn.py +++ b/tianshou/policy/modelfree/dqn.py @@ -19,12 +19,11 @@ class DQNPolicy(BasePolicy): :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. - :param int estimation_step: greater than 1, the number of steps to look - ahead. + :param int estimation_step: greater than 1, the number of steps to look ahead. :param int target_update_freq: the target network update frequency (0 if you do not use the target network). - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: @@ -57,8 +56,6 @@ def __init__( self.model_old = deepcopy(self.model) self.model_old.eval() self._rew_norm = reward_normalization - assert not self._rew_norm, ( - "Reward normalization in offpolicy algorithm is unsupported for now.") def set_eps(self, eps: float) -> None: """Set the eps for epsilon-greedy exploration.""" @@ -74,9 +71,7 @@ def sync_weight(self) -> None: """Synchronize the weight for the target network.""" self.model_old.load_state_dict(self.model.state_dict()) - def _target_q( - self, buffer: ReplayBuffer, indice: np.ndarray - ) -> torch.Tensor: + def _target_q(self, buffer: ReplayBuffer, indice: np.ndarray) -> torch.Tensor: batch = buffer[indice] # batch.obs_next: s_{t+n} # target_Q = Q_old(s_, argmax(Q_new(s_, *))) if self._target: diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py index 82fb9f704..080ba70a2 100644 --- a/tianshou/policy/modelfree/pg.py +++ b/tianshou/policy/modelfree/pg.py @@ -1,6 +1,6 @@ import torch import numpy as np -from typing import Any, Dict, List, Union, Optional, Callable +from typing import Any, Dict, List, Type, Union, Optional from tianshou.policy import BasePolicy from tianshou.data import Batch, ReplayBuffer, to_torch_as @@ -13,8 +13,8 @@ class PGPolicy(BasePolicy): :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param dist_fn: distribution class for computing the action. - :type dist_fn: Callable[[], torch.distributions.Distribution] - :param float discount_factor: in [0, 1]. + :type dist_fn: Type[torch.distributions.Distribution] + :param float discount_factor: in [0, 1]. Default to 0.99. .. 
seealso:: @@ -26,7 +26,7 @@ def __init__( self, model: Optional[torch.nn.Module], optim: torch.optim.Optimizer, - dist_fn: Callable[[], torch.distributions.Distribution], + dist_fn: Type[torch.distributions.Distribution], discount_factor: float = 0.99, reward_normalization: bool = False, **kwargs: Any, @@ -36,9 +36,7 @@ def __init__( self.model: torch.nn.Module = model self.optim = optim self.dist_fn = dist_fn - assert ( - 0.0 <= discount_factor <= 1.0 - ), "discount factor should be in [0, 1]" + assert 0.0 <= discount_factor <= 1.0, "discount factor should be in [0, 1]" self._gamma = discount_factor self._rew_norm = reward_normalization @@ -83,7 +81,7 @@ def forward( if isinstance(logits, tuple): dist = self.dist_fn(*logits) else: - dist = self.dist_fn(logits) # type: ignore + dist = self.dist_fn(logits) act = dist.sample() return Batch(logits=logits, act=act, state=h, dist=dist) diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index 7fd6f1f26..953829195 100644 --- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -1,7 +1,7 @@ import torch import numpy as np from torch import nn -from typing import Any, Dict, List, Tuple, Union, Optional, Callable +from typing import Any, Dict, List, Type, Tuple, Union, Optional from tianshou.policy import PGPolicy from tianshou.data import Batch, ReplayBuffer, to_numpy, to_torch_as @@ -13,32 +13,31 @@ class PPOPolicy(PGPolicy): :param torch.nn.Module actor: the actor network following the rules in :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.nn.Module critic: the critic network. (s -> V(s)) - :param torch.optim.Optimizer optim: the optimizer for actor and critic - network. + :param torch.optim.Optimizer optim: the optimizer for actor and critic network. :param dist_fn: distribution class for computing the action. - :type dist_fn: Callable[[], torch.distributions.Distribution] - :param float discount_factor: in [0, 1], defaults to 0.99. - :param float max_grad_norm: clipping gradients in back propagation, - defaults to None. + :type dist_fn: Type[torch.distributions.Distribution] + :param float discount_factor: in [0, 1]. Default to 0.99. + :param float max_grad_norm: clipping gradients in back propagation. + Default to None. :param float eps_clip: :math:`\epsilon` in :math:`L_{CLIP}` in the original - paper, defaults to 0.2. - :param float vf_coef: weight for value loss, defaults to 0.5. - :param float ent_coef: weight for entropy loss, defaults to 0.01. + paper. Default to 0.2. + :param float vf_coef: weight for value loss. Default to 0.5. + :param float ent_coef: weight for entropy loss. Default to 0.01. :param action_range: the action range (minimum, maximum). :type action_range: (float, float) :param float gae_lambda: in [0, 1], param for Generalized Advantage - Estimation, defaults to 0.95. + Estimation. Default to 0.95. :param float dual_clip: a parameter c mentioned in arXiv:1912.09729 Equ. 5, - where c > 1 is a constant indicating the lower bound, - defaults to 5.0 (set ``None`` if you do not want to use it). - :param bool value_clip: a parameter mentioned in arXiv:1811.02553 Sec. 4.1, - defaults to True. - :param bool reward_normalization: normalize the returns to Normal(0, 1), - defaults to True. + where c > 1 is a constant indicating the lower bound. + Default to 5.0 (set None if you do not want to use it). + :param bool value_clip: a parameter mentioned in arXiv:1811.02553 Sec. 4.1. + Default to True. 
+ :param bool reward_normalization: normalize the returns to Normal(0, 1). + Default to True. :param int max_batchsize: the maximum size of the batch when computing GAE, depends on the size of available memory and the memory cost of the - model; should be as large as possible within the memory constraint; - defaults to 256. + model; should be as large as possible within the memory constraint. + Default to 256. .. seealso:: @@ -51,7 +50,7 @@ def __init__( actor: torch.nn.Module, critic: torch.nn.Module, optim: torch.optim.Optimizer, - dist_fn: Callable[[], torch.distributions.Distribution], + dist_fn: Type[torch.distributions.Distribution], discount_factor: float = 0.99, max_grad_norm: Optional[float] = None, eps_clip: float = 0.2, @@ -76,9 +75,8 @@ def __init__( self._batch = max_batchsize assert 0.0 <= gae_lambda <= 1.0, "GAE lambda should be in [0, 1]." self._lambda = gae_lambda - assert ( - dual_clip is None or dual_clip > 1.0 - ), "Dual-clip PPO parameter should greater than 1.0." + assert dual_clip is None or dual_clip > 1.0, \ + "Dual-clip PPO parameter should greater than 1.0." self._dual_clip = dual_clip self._value_clip = value_clip self._rew_norm = reward_normalization @@ -95,9 +93,7 @@ def process_fn( for b in batch.split(self._batch, shuffle=False, merge_last=True): v_.append(self.critic(b.obs_next)) v.append(self.critic(b.obs)) - old_log_prob.append( - self(b).dist.log_prob(to_torch_as(b.act, v[0])) - ) + old_log_prob.append(self(b).dist.log_prob(to_torch_as(b.act, v[0]))) v_ = to_numpy(torch.cat(v_, dim=0)) batch = self.compute_episodic_return( batch, buffer, indice, v_, gamma=self._gamma, @@ -137,7 +133,7 @@ def forward( if isinstance(logits, tuple): dist = self.dist_fn(*logits) else: - dist = self.dist_fn(logits) # type: ignore + dist = self.dist_fn(logits) act = dist.sample() if self._range: act = act.clamp(self._range[0], self._range[1]) @@ -154,8 +150,7 @@ def learn( # type: ignore ratio = (dist.log_prob(b.act) - b.logp_old).exp().float() ratio = ratio.reshape(ratio.size(0), -1).transpose(0, 1) surr1 = ratio * b.adv - surr2 = ratio.clamp(1.0 - self._eps_clip, - 1.0 + self._eps_clip) * b.adv + surr2 = ratio.clamp(1.0 - self._eps_clip, 1.0 + self._eps_clip) * b.adv if self._dual_clip: clip_loss = -torch.max( torch.min(surr1, surr2), self._dual_clip * b.adv @@ -164,8 +159,7 @@ def learn( # type: ignore clip_loss = -torch.min(surr1, surr2).mean() clip_losses.append(clip_loss.item()) if self._value_clip: - v_clip = b.v + (value - b.v).clamp( - -self._eps_clip, self._eps_clip) + v_clip = b.v + (value - b.v).clamp(-self._eps_clip, self._eps_clip) vf1 = (b.returns - value).pow(2) vf2 = (b.returns - v_clip).pow(2) vf_loss = 0.5 * torch.max(vf1, vf2).mean() @@ -174,15 +168,14 @@ def learn( # type: ignore vf_losses.append(vf_loss.item()) e_loss = dist.entropy().mean() ent_losses.append(e_loss.item()) - loss = clip_loss + self._weight_vf * vf_loss - \ - self._weight_ent * e_loss + loss = clip_loss + self._weight_vf * vf_loss \ + - self._weight_ent * e_loss losses.append(loss.item()) self.optim.zero_grad() loss.backward() if self._max_grad_norm: nn.utils.clip_grad_norm_( - list(self.actor.parameters()) - + list(self.critic.parameters()), + list(self.actor.parameters()) + list(self.critic.parameters()), self._max_grad_norm) self.optim.step() return { diff --git a/tianshou/policy/modelfree/qrdqn.py b/tianshou/policy/modelfree/qrdqn.py index 8816b6b1a..ffc93aeee 100644 --- a/tianshou/policy/modelfree/qrdqn.py +++ b/tianshou/policy/modelfree/qrdqn.py @@ -16,13 +16,12 @@ class 
QRDQNPolicy(DQNPolicy): :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. :param int num_quantiles: the number of quantile midpoints in the inverse - cumulative distribution function of the value, defaults to 200. - :param int estimation_step: greater than 1, the number of steps to look - ahead. + cumulative distribution function of the value. Default to 200. + :param int estimation_step: greater than 1, the number of steps to look ahead. :param int target_update_freq: the target network update frequency (0 if you do not use the target network). - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: @@ -50,9 +49,7 @@ def __init__( ((tau[:-1] + tau[1:]) / 2).view(1, -1, 1), requires_grad=False) warnings.filterwarnings("ignore", message="Using a target size") - def _target_q( - self, buffer: ReplayBuffer, indice: np.ndarray - ) -> torch.Tensor: + def _target_q(self, buffer: ReplayBuffer, indice: np.ndarray) -> torch.Tensor: batch = buffer[indice] # batch.obs_next: s_{t+n} if self._target: a = self(batch, input="obs_next").act diff --git a/tianshou/policy/modelfree/sac.py b/tianshou/policy/modelfree/sac.py index cb53fad7f..68bef3971 100644 --- a/tianshou/policy/modelfree/sac.py +++ b/tianshou/policy/modelfree/sac.py @@ -15,30 +15,27 @@ class SACPolicy(DDPGPolicy): :param torch.nn.Module actor: the actor network following the rules in :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer actor_optim: the optimizer for actor network. - :param torch.nn.Module critic1: the first critic network. (s, a -> Q(s, - a)) + :param torch.nn.Module critic1: the first critic network. (s, a -> Q(s, a)) :param torch.optim.Optimizer critic1_optim: the optimizer for the first critic network. - :param torch.nn.Module critic2: the second critic network. (s, a -> Q(s, - a)) + :param torch.nn.Module critic2: the second critic network. (s, a -> Q(s, a)) :param torch.optim.Optimizer critic2_optim: the optimizer for the second critic network. :param action_range: the action range (minimum, maximum). :type action_range: Tuple[float, float] - :param float tau: param for soft update of the target network, defaults to - 0.005. - :param float gamma: discount factor, in [0, 1], defaults to 0.99. + :param float tau: param for soft update of the target network. Default to 0.005. + :param float gamma: discount factor, in [0, 1]. Default to 0.99. :param (float, torch.Tensor, torch.optim.Optimizer) or float alpha: entropy - regularization coefficient, default to 0.2. + regularization coefficient. Default to 0.2. If a tuple (target_entropy, log_alpha, alpha_optim) is provided, then alpha is automatatically tuned. - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. - :param BaseNoise exploration_noise: add a noise to action for exploration, - defaults to None. This is useful when solving hard-exploration problem. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. + :param BaseNoise exploration_noise: add a noise to action for exploration. + Default to None. This is useful when solving hard-exploration problem. :param bool deterministic_eval: whether to use deterministic action (mean - of Gaussian policy) instead of stochastic action sampled by the policy, - defaults to True. 
+ of Gaussian policy) instead of stochastic action sampled by the policy. + Default to True. .. seealso:: @@ -57,9 +54,7 @@ def __init__( action_range: Tuple[float, float], tau: float = 0.005, gamma: float = 0.99, - alpha: Union[ - float, Tuple[float, torch.Tensor, torch.optim.Optimizer] - ] = 0.2, + alpha: Union[float, Tuple[float, torch.Tensor, torch.optim.Optimizer]] = 0.2, reward_normalization: bool = False, estimation_step: int = 1, exploration_noise: Optional[BaseNoise] = None, @@ -98,13 +93,9 @@ def train(self, mode: bool = True) -> "SACPolicy": return self def sync_weight(self) -> None: - for o, n in zip( - self.critic1_old.parameters(), self.critic1.parameters() - ): + for o, n in zip(self.critic1_old.parameters(), self.critic1.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) - for o, n in zip( - self.critic2_old.parameters(), self.critic2.parameters() - ): + for o, n in zip(self.critic2_old.parameters(), self.critic2.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) def forward( # type: ignore @@ -128,8 +119,7 @@ def forward( # type: ignore log_prob = dist.log_prob(x).unsqueeze(-1) log_prob = log_prob - torch.log(y).sum(-1, keepdim=True) - return Batch( - logits=logits, act=act, state=h, dist=dist, log_prob=log_prob) + return Batch(logits=logits, act=act, state=h, dist=dist, log_prob=log_prob) def _target_q( self, buffer: ReplayBuffer, indice: np.ndarray diff --git a/tianshou/policy/modelfree/td3.py b/tianshou/policy/modelfree/td3.py index 23e16d88a..bd6572205 100644 --- a/tianshou/policy/modelfree/td3.py +++ b/tianshou/policy/modelfree/td3.py @@ -14,29 +14,26 @@ class TD3Policy(DDPGPolicy): :param torch.nn.Module actor: the actor network following the rules in :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer actor_optim: the optimizer for actor network. - :param torch.nn.Module critic1: the first critic network. (s, a -> Q(s, - a)) + :param torch.nn.Module critic1: the first critic network. (s, a -> Q(s, a)) :param torch.optim.Optimizer critic1_optim: the optimizer for the first critic network. - :param torch.nn.Module critic2: the second critic network. (s, a -> Q(s, - a)) + :param torch.nn.Module critic2: the second critic network. (s, a -> Q(s, a)) :param torch.optim.Optimizer critic2_optim: the optimizer for the second critic network. :param action_range: the action range (minimum, maximum). :type action_range: Tuple[float, float] - :param float tau: param for soft update of the target network, defaults to - 0.005. - :param float gamma: discount factor, in [0, 1], defaults to 0.99. - :param float exploration_noise: the exploration noise, add to the action, - defaults to ``GaussianNoise(sigma=0.1)`` - :param float policy_noise: the noise used in updating policy network, - default to 0.2. - :param int update_actor_freq: the update frequency of actor network, - default to 2. - :param float noise_clip: the clipping range used in updating policy - network, default to 0.5. - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + :param float tau: param for soft update of the target network. Default to 0.005. + :param float gamma: discount factor, in [0, 1]. Default to 0.99. + :param float exploration_noise: the exploration noise, add to the action. + Default to ``GaussianNoise(sigma=0.1)`` + :param float policy_noise: the noise used in updating policy network. + Default to 0.2. + :param int update_actor_freq: the update frequency of actor network. 
+ Default to 2. + :param float noise_clip: the clipping range used in updating policy network. + Default to 0.5. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: @@ -88,18 +85,12 @@ def train(self, mode: bool = True) -> "TD3Policy": def sync_weight(self) -> None: for o, n in zip(self.actor_old.parameters(), self.actor.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) - for o, n in zip( - self.critic1_old.parameters(), self.critic1.parameters() - ): + for o, n in zip(self.critic1_old.parameters(), self.critic1.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) - for o, n in zip( - self.critic2_old.parameters(), self.critic2.parameters() - ): + for o, n in zip(self.critic2_old.parameters(), self.critic2.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) - def _target_q( - self, buffer: ReplayBuffer, indice: np.ndarray - ) -> torch.Tensor: + def _target_q(self, buffer: ReplayBuffer, indice: np.ndarray) -> torch.Tensor: batch = buffer[indice] # batch.obs: s_{t+n} a_ = self(batch, model="actor_old", input="obs_next").act dev = a_.device @@ -134,8 +125,7 @@ def learn(self, batch: Batch, **kwargs: Any) -> Dict[str, float]: self.critic2_optim.step() batch.weight = (td1 + td2) / 2.0 # prio-buffer if self._cnt % self._freq == 0: - actor_loss = -self.critic1( - batch.obs, self(batch, eps=0.0).act).mean() + actor_loss = -self.critic1(batch.obs, self(batch, eps=0.0).act).mean() self.actor_optim.zero_grad() actor_loss.backward() self._last = actor_loss.item() diff --git a/tianshou/utils/net/common.py b/tianshou/utils/net/common.py index 1da33650b..b41346e9a 100644 --- a/tianshou/utils/net/common.py +++ b/tianshou/utils/net/common.py @@ -34,7 +34,7 @@ class MLP(nn.Module): :param hidden_sizes: shape of MLP passed in as a list, not incluing input_dim and output_dim. :param norm_layer: use which normalization before activation, e.g., - ``nn.LayerNorm`` and ``nn.BatchNorm1d``, defaults to no normalization. + ``nn.LayerNorm`` and ``nn.BatchNorm1d``. Default to no normalization. You can also pass a list of normalization modules with the same length of hidden_sizes, to use different normalization module in different layers. Default to no normalization. @@ -103,7 +103,7 @@ class Net(nn.Module): :param action_shape: int or a sequence of int of the shape of action. :param hidden_sizes: shape of MLP passed in as a list. :param norm_layer: use which normalization before activation, e.g., - ``nn.LayerNorm`` and ``nn.BatchNorm1d``, defaults to no normalization. + ``nn.LayerNorm`` and ``nn.BatchNorm1d``. Default to no normalization. You can also pass a list of normalization modules with the same length of hidden_sizes, to use different normalization module in different layers. Default to no normalization. @@ -118,13 +118,13 @@ class Net(nn.Module): :param bool concat: whether the input shape is concatenated by state_shape and action_shape. If it is True, ``action_shape`` is not the output shape, but affects the input shape only. - :param int num_atoms: in order to expand to the net of distributional RL, - defaults to 1 (not use). + :param int num_atoms: in order to expand to the net of distributional RL. + Default to 1 (not use). :param bool dueling_param: whether to use dueling network to calculate Q values (for Dueling DQN). 
If you want to use dueling option, you should pass a tuple of two dict (first for Q and second for V) stating self-defined arguments as stated in - class:`~tianshou.utils.net.common.MLP`. Defaults to None. + class:`~tianshou.utils.net.common.MLP`. Default to None. .. seealso:: From 76be2466872ded7bd674b6b74fe76f7f7e551384 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 21:26:49 +0800 Subject: [PATCH 04/13] action=store_true --- test/continuous/test_ddpg.py | 5 ++--- test/continuous/test_sac_with_il.py | 2 +- test/continuous/test_td3.py | 5 ++--- test/discrete/test_a2c_with_il.py | 2 +- test/discrete/test_sac.py | 5 ++--- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/test/continuous/test_ddpg.py b/test/continuous/test_ddpg.py index 6d7020ad4..16aa475dd 100644 --- a/test/continuous/test_ddpg.py +++ b/test/continuous/test_ddpg.py @@ -31,13 +31,12 @@ def get_args(): parser.add_argument('--step-per-collect', type=int, default=4) parser.add_argument('--update-per-step', type=float, default=0.25) parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) + parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) parser.add_argument('--training-num', type=int, default=4) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--rew-norm', type=int, default=0) + parser.add_argument('--rew-norm', action="store_true", default=False) parser.add_argument('--n-step', type=int, default=1) parser.add_argument( '--device', type=str, diff --git a/test/continuous/test_sac_with_il.py b/test/continuous/test_sac_with_il.py index 900c6e0c4..b1ca67bd4 100644 --- a/test/continuous/test_sac_with_il.py +++ b/test/continuous/test_sac_with_il.py @@ -40,7 +40,7 @@ def get_args(): parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--rew-norm', type=int, default=0) + parser.add_argument('--rew-norm', action="store_true", default=False) parser.add_argument('--n-step', type=int, default=4) parser.add_argument( '--device', type=str, diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index 340cb261d..9a0f3a9c3 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -34,13 +34,12 @@ def get_args(): parser.add_argument('--step-per-collect', type=int, default=10) parser.add_argument('--update-per-step', type=float, default=0.1) parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) + parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) parser.add_argument('--training-num', type=int, default=10) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument('--rew-norm', type=int, default=0) + parser.add_argument('--rew-norm', action="store_true", default=False) parser.add_argument('--n-step', type=int, default=1) parser.add_argument( '--device', type=str, diff --git a/test/discrete/test_a2c_with_il.py b/test/discrete/test_a2c_with_il.py index 196dc28b5..882cb440a 100644 --- a/test/discrete/test_a2c_with_il.py +++ b/test/discrete/test_a2c_with_il.py @@ -47,7 +47,7 @@ def get_args(): parser.add_argument('--ent-coef', type=float, default=0.0) parser.add_argument('--max-grad-norm', type=float, default=None) parser.add_argument('--gae-lambda', type=float, default=1.) - parser.add_argument('--rew-norm', type=bool, default=False) + parser.add_argument('--rew-norm', action="store_true", default=False) args = parser.parse_known_args()[0] return args diff --git a/test/discrete/test_sac.py b/test/discrete/test_sac.py index 465331a99..5f5a3f0cd 100644 --- a/test/discrete/test_sac.py +++ b/test/discrete/test_sac.py @@ -32,13 +32,12 @@ def get_args(): parser.add_argument('--step-per-collect', type=int, default=5) parser.add_argument('--update-per-step', type=float, default=0.2) parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) + parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) parser.add_argument('--training-num', type=int, default=5) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.0) - parser.add_argument('--rew-norm', type=int, default=0) + parser.add_argument('--rew-norm', action="store_true", default=False) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') From 6f766181d2cb022a6626e5fc951c989262a9afe2 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 21:40:00 +0800 Subject: [PATCH 05/13] fix test_ddpg: pass 10 seed within avg 30s --- test/continuous/test_ddpg.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/continuous/test_ddpg.py b/test/continuous/test_ddpg.py index 16aa475dd..093aed196 100644 --- a/test/continuous/test_ddpg.py +++ b/test/continuous/test_ddpg.py @@ -26,18 +26,18 @@ def get_args(): parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--tau', type=float, default=0.005) parser.add_argument('--exploration-noise', type=float, default=0.1) - parser.add_argument('--epoch', type=int, default=20) - parser.add_argument('--step-per-epoch', type=int, default=9600) - parser.add_argument('--step-per-collect', type=int, default=4) - parser.add_argument('--update-per-step', type=float, default=0.25) + parser.add_argument('--epoch', type=int, default=5) + parser.add_argument('--step-per-epoch', type=int, default=20000) + parser.add_argument('--step-per-collect', type=int, default=8) + parser.add_argument('--update-per-step', type=float, default=0.125) parser.add_argument('--batch-size', type=int, default=128) parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=4) + parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) 
parser.add_argument('--rew-norm', action="store_true", default=False) - parser.add_argument('--n-step', type=int, default=1) + parser.add_argument('--n-step', type=int, default=3) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') From 9291d5ac3cf95ba0dbd4d3af4c09e72c1e8355ae Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 21:52:57 +0800 Subject: [PATCH 06/13] fix test_td3: pass 10 seed within avg 35s --- test/continuous/test_td3.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index 9a0f3a9c3..41bb2f835 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -21,7 +21,7 @@ def get_args(): parser.add_argument('--task', type=str, default='Pendulum-v0') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=3e-4) + parser.add_argument('--actor-lr', type=float, default=1e-4) parser.add_argument('--critic-lr', type=float, default=1e-3) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--tau', type=float, default=0.005) @@ -29,18 +29,18 @@ def get_args(): parser.add_argument('--policy-noise', type=float, default=0.2) parser.add_argument('--noise-clip', type=float, default=0.5) parser.add_argument('--update-actor-freq', type=int, default=2) - parser.add_argument('--epoch', type=int, default=20) + parser.add_argument('--epoch', type=int, default=5) parser.add_argument('--step-per-epoch', type=int, default=20000) - parser.add_argument('--step-per-collect', type=int, default=10) - parser.add_argument('--update-per-step', type=float, default=0.1) + parser.add_argument('--step-per-collect', type=int, default=8) + parser.add_argument('--update-per-step', type=float, default=0.125) parser.add_argument('--batch-size', type=int, default=128) parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=10) + parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) 
parser.add_argument('--rew-norm', action="store_true", default=False) - parser.add_argument('--n-step', type=int, default=1) + parser.add_argument('--n-step', type=int, default=3) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') From 72e074d52d9c52ba3fa15edb2093261ca4c2b02d Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 22:29:48 +0800 Subject: [PATCH 07/13] fix test_drqn: 10 seed avg < 20s --- test/continuous/test_ppo.py | 2 +- test/continuous/test_sac_with_il.py | 2 +- test/discrete/test_drqn.py | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/continuous/test_ppo.py b/test/continuous/test_ppo.py index b4fd383c7..762c58838 100644 --- a/test/continuous/test_ppo.py +++ b/test/continuous/test_ppo.py @@ -23,7 +23,7 @@ def get_args(): parser.add_argument('--buffer-size', type=int, default=20000) parser.add_argument('--lr', type=float, default=1e-3) parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=20) + parser.add_argument('--epoch', type=int, default=5) parser.add_argument('--step-per-epoch', type=int, default=150000) parser.add_argument('--episode-per-collect', type=int, default=16) parser.add_argument('--repeat-per-collect', type=int, default=2) diff --git a/test/continuous/test_sac_with_il.py b/test/continuous/test_sac_with_il.py index b1ca67bd4..ac533fcf4 100644 --- a/test/continuous/test_sac_with_il.py +++ b/test/continuous/test_sac_with_il.py @@ -26,7 +26,7 @@ def get_args(): parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--tau', type=float, default=0.005) parser.add_argument('--alpha', type=float, default=0.2) - parser.add_argument('--epoch', type=int, default=20) + parser.add_argument('--epoch', type=int, default=5) parser.add_argument('--step-per-epoch', type=int, default=24000) parser.add_argument('--il-step-per-epoch', type=int, default=500) parser.add_argument('--step-per-collect', type=int, default=10) diff --git a/test/discrete/test_drqn.py b/test/discrete/test_drqn.py index 33f0432a3..39bef8dbc 100644 --- a/test/discrete/test_drqn.py +++ b/test/discrete/test_drqn.py @@ -24,15 +24,15 @@ def get_args(): parser.add_argument('--stack-num', type=int, default=4) parser.add_argument('--lr', type=float, default=1e-3) parser.add_argument('--gamma', type=float, default=0.95) - parser.add_argument('--n-step', type=int, default=4) + parser.add_argument('--n-step', type=int, default=3) parser.add_argument('--target-update-freq', type=int, default=320) - parser.add_argument('--epoch', type=int, default=10) - parser.add_argument('--step-per-epoch', type=int, default=10000) - parser.add_argument('--update-per-step', type=float, default=0.1) - parser.add_argument('--step-per-collect', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--layer-num', type=int, default=3) - parser.add_argument('--training-num', type=int, default=10) + parser.add_argument('--epoch', type=int, default=5) + parser.add_argument('--step-per-epoch', type=int, default=20000) + parser.add_argument('--update-per-step', type=float, default=1 / 16) + parser.add_argument('--step-per-collect', type=int, default=16) + parser.add_argument('--batch-size', type=int, default=128) + parser.add_argument('--layer-num', type=int, default=2) + parser.add_argument('--training-num', type=int, default=16) parser.add_argument('--test-num', type=int, default=100) 
parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) From f6ef057cf178875627d5841549498454cb58edd0 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 22:31:45 +0800 Subject: [PATCH 08/13] test td3 seed --- test/continuous/test_td3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index 41bb2f835..6b53a79ff 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -19,7 +19,7 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--task', type=str, default='Pendulum-v0') - parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--seed', type=int, default=1626) parser.add_argument('--buffer-size', type=int, default=20000) parser.add_argument('--actor-lr', type=float, default=1e-4) parser.add_argument('--critic-lr', type=float, default=1e-3) From 19a66f864af13da33a973c26a749300dcbd9ab55 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 22:53:05 +0800 Subject: [PATCH 09/13] fix test_sac --- test/discrete/test_sac.py | 17 +++++++++-------- tianshou/trainer/offline.py | 2 +- tianshou/trainer/offpolicy.py | 4 ++-- tianshou/trainer/onpolicy.py | 4 ++-- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/test/discrete/test_sac.py b/test/discrete/test_sac.py index 5f5a3f0cd..ad594dbfc 100644 --- a/test/discrete/test_sac.py +++ b/test/discrete/test_sac.py @@ -20,24 +20,25 @@ def get_args(): parser.add_argument('--task', type=str, default='CartPole-v0') parser.add_argument('--seed', type=int, default=1626) parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=3e-4) + parser.add_argument('--actor-lr', type=float, default=1e-4) parser.add_argument('--critic-lr', type=float, default=1e-3) parser.add_argument('--alpha-lr', type=float, default=3e-4) parser.add_argument('--gamma', type=float, default=0.95) parser.add_argument('--tau', type=float, default=0.005) parser.add_argument('--alpha', type=float, default=0.05) - parser.add_argument('--auto_alpha', type=int, default=0) + parser.add_argument('--auto-alpha', action="store_true", default=False) parser.add_argument('--epoch', type=int, default=5) - parser.add_argument('--step-per-epoch', type=int, default=5000) - parser.add_argument('--step-per-collect', type=int, default=5) - parser.add_argument('--update-per-step', type=float, default=0.2) - parser.add_argument('--batch-size', type=int, default=64) + parser.add_argument('--step-per-epoch', type=int, default=10000) + parser.add_argument('--step-per-collect', type=int, default=10) + parser.add_argument('--update-per-step', type=float, default=0.1) + parser.add_argument('--batch-size', type=int, default=128) parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=5) + parser.add_argument('--training-num', type=int, default=10) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.0) parser.add_argument('--rew-norm', action="store_true", default=False) + parser.add_argument('--n-step', type=int, default=3) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -85,7 +86,7 @@ def test_discrete_sac(args=get_args()): policy = DiscreteSACPolicy( actor, actor_optim, critic1, 
critic1_optim, critic2, critic2_optim, - args.tau, args.gamma, args.alpha, + args.tau, args.gamma, args.alpha, estimation_step=args.n_step, reward_normalization=args.rew_norm) # collector train_collector = Collector( diff --git a/tianshou/trainer/offline.py b/tianshou/trainer/offline.py index b3588ae5b..13f96faeb 100644 --- a/tianshou/trainer/offline.py +++ b/tianshou/trainer/offline.py @@ -79,7 +79,7 @@ def offline_trainer( for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() - data[k] = f"{losses[k]:.6f}" + data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) # test diff --git a/tianshou/trainer/offpolicy.py b/tianshou/trainer/offpolicy.py index 5f233bfef..72a243d9a 100644 --- a/tianshou/trainer/offpolicy.py +++ b/tianshou/trainer/offpolicy.py @@ -106,7 +106,7 @@ def offpolicy_trainer( data = { "env_step": str(env_step), "rew": f"{last_rew:.2f}", - "len": str(last_len), + "len": str(int(last_len)), "n/ep": str(int(result["n/ep"])), "n/st": str(int(result["n/st"])), } @@ -130,7 +130,7 @@ def offpolicy_trainer( for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() - data[k] = f"{losses[k]:.6f}" + data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) if t.n <= t.total: diff --git a/tianshou/trainer/onpolicy.py b/tianshou/trainer/onpolicy.py index 5f5254d66..dae20a741 100644 --- a/tianshou/trainer/onpolicy.py +++ b/tianshou/trainer/onpolicy.py @@ -113,7 +113,7 @@ def onpolicy_trainer( data = { "env_step": str(env_step), "rew": f"{last_rew:.2f}", - "len": str(last_len), + "len": str(int(last_len)), "n/ep": str(int(result["n/ep"])), "n/st": str(int(result["n/st"])), } @@ -140,7 +140,7 @@ def onpolicy_trainer( for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() - data[k] = f"{losses[k]:.6f}" + data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) if t.n <= t.total: From 33e6ae09f52973fcc4da95c1f6b50a53f8a8309c Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 22:55:53 +0800 Subject: [PATCH 10/13] td3 seed=1 --- test/continuous/test_td3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index 6b53a79ff..86331e993 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -19,7 +19,7 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--task', type=str, default='Pendulum-v0') - parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--seed', type=int, default=1) parser.add_argument('--buffer-size', type=int, default=20000) parser.add_argument('--actor-lr', type=float, default=1e-4) parser.add_argument('--critic-lr', type=float, default=1e-3) From ec1096ba8f7ed85cb5f8f0aa03259597a125d75a Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 23:28:55 +0800 Subject: [PATCH 11/13] change psrl seed to see what happens --- test/modelbase/test_psrl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/modelbase/test_psrl.py b/test/modelbase/test_psrl.py index c04261868..d89a7f4bc 100644 --- a/test/modelbase/test_psrl.py +++ b/test/modelbase/test_psrl.py @@ -16,7 +16,7 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--task', type=str, default='NChain-v0') - parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--seed', type=int, default=1) parser.add_argument('--buffer-size', 
type=int, default=50000) parser.add_argument('--epoch', type=int, default=5) parser.add_argument('--step-per-epoch', type=int, default=1000) @@ -29,7 +29,7 @@ def get_args(): parser.add_argument('--rew-std-prior', type=float, default=1.0) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--eps', type=float, default=0.01) - parser.add_argument('--add-done-loop', action='store_true') + parser.add_argument('--add-done-loop', action="store_true", default=False) return parser.parse_known_args()[0] From 41465345c5227beecbf03cde13e299e3eee13715 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Sat, 27 Feb 2021 10:42:31 +0800 Subject: [PATCH 12/13] remove runnable/ --- examples/atari/runnable/atari.py | 133 ---- examples/atari/runnable/pong_a2c.py | 110 ---- examples/atari/runnable/pong_ppo.py | 115 ---- examples/mujoco/runnable/ant_v2_ddpg.py | 108 ---- examples/mujoco/runnable/ant_v2_td3.py | 117 ---- .../runnable/halfcheetahBullet_v0_sac.py | 121 ---- examples/mujoco/runnable/mujoco/__init__.py | 0 .../mujoco/runnable/mujoco/assets/point.xml | 34 -- .../mujoco/runnable/mujoco/maze_env_utils.py | 196 ------ examples/mujoco/runnable/mujoco/point.py | 93 --- .../mujoco/runnable/mujoco/point_maze_env.py | 568 ------------------ examples/mujoco/runnable/mujoco/register.py | 27 - examples/mujoco/runnable/point_maze_td3.py | 126 ---- 13 files changed, 1748 deletions(-) delete mode 100644 examples/atari/runnable/atari.py delete mode 100644 examples/atari/runnable/pong_a2c.py delete mode 100644 examples/atari/runnable/pong_ppo.py delete mode 100644 examples/mujoco/runnable/ant_v2_ddpg.py delete mode 100644 examples/mujoco/runnable/ant_v2_td3.py delete mode 100644 examples/mujoco/runnable/halfcheetahBullet_v0_sac.py delete mode 100644 examples/mujoco/runnable/mujoco/__init__.py delete mode 100644 examples/mujoco/runnable/mujoco/assets/point.xml delete mode 100644 examples/mujoco/runnable/mujoco/maze_env_utils.py delete mode 100644 examples/mujoco/runnable/mujoco/point.py delete mode 100644 examples/mujoco/runnable/mujoco/point_maze_env.py delete mode 100644 examples/mujoco/runnable/mujoco/register.py delete mode 100644 examples/mujoco/runnable/point_maze_td3.py diff --git a/examples/atari/runnable/atari.py b/examples/atari/runnable/atari.py deleted file mode 100644 index 8e2ea5168..000000000 --- a/examples/atari/runnable/atari.py +++ /dev/null @@ -1,133 +0,0 @@ -import cv2 -import gym -import numpy as np -from gym.spaces.box import Box -from tianshou.data import Batch - -SIZE = 84 -FRAME = 4 - - -def create_atari_environment(name=None, sticky_actions=True, - max_episode_steps=2000): - game_version = 'v0' if sticky_actions else 'v4' - name = '{}NoFrameskip-{}'.format(name, game_version) - env = gym.make(name) - env = env.env - env = preprocessing(env, max_episode_steps=max_episode_steps) - return env - - -def preprocess_fn(obs=None, act=None, rew=None, done=None, - obs_next=None, info=None, policy=None, **kwargs): - if obs_next is not None: - obs_next = np.reshape(obs_next, (-1, *obs_next.shape[2:])) - obs_next = np.moveaxis(obs_next, 0, -1) - obs_next = cv2.resize(obs_next, (SIZE, SIZE)) - obs_next = np.asanyarray(obs_next, dtype=np.uint8) - obs_next = np.reshape(obs_next, (-1, FRAME, SIZE, SIZE)) - obs_next = np.moveaxis(obs_next, 1, -1) - elif obs is not None: - obs = np.reshape(obs, (-1, *obs.shape[2:])) - obs = np.moveaxis(obs, 0, -1) - obs = cv2.resize(obs, (SIZE, SIZE)) - obs = np.asanyarray(obs, dtype=np.uint8) - obs = np.reshape(obs, (-1, FRAME, SIZE, SIZE)) - obs = 
np.moveaxis(obs, 1, -1) - - return Batch(obs=obs, act=act, rew=rew, done=done, - obs_next=obs_next, info=info) - - -class preprocessing(object): - def __init__(self, env, frame_skip=4, terminal_on_life_loss=False, - size=84, max_episode_steps=2000): - self.max_episode_steps = max_episode_steps - self.env = env - self.terminal_on_life_loss = terminal_on_life_loss - self.frame_skip = frame_skip - self.size = size - self.count = 0 - obs_dims = self.env.observation_space - - self.screen_buffer = [ - np.empty((obs_dims.shape[0], obs_dims.shape[1]), dtype=np.uint8), - np.empty((obs_dims.shape[0], obs_dims.shape[1]), dtype=np.uint8) - ] - - self.game_over = False - self.lives = 0 - - @property - def observation_space(self): - return Box(low=0, high=255, - shape=(self.size, self.size, self.frame_skip), - dtype=np.uint8) - - def action_space(self): - return self.env.action_space - - def reward_range(self): - return self.env.reward_range - - def metadata(self): - return self.env.metadata - - def close(self): - return self.env.close() - - def reset(self): - self.count = 0 - self.env.reset() - self.lives = self.env.ale.lives() - self._grayscale_obs(self.screen_buffer[0]) - self.screen_buffer[1].fill(0) - - return np.array([self._pool_and_resize() - for _ in range(self.frame_skip)]) - - def render(self, mode='human'): - return self.env.render(mode) - - def step(self, action): - total_reward = 0. - observation = [] - for t in range(self.frame_skip): - self.count += 1 - _, reward, terminal, info = self.env.step(action) - total_reward += reward - - if self.terminal_on_life_loss: - lives = self.env.ale.lives() - is_terminal = terminal or lives < self.lives - self.lives = lives - else: - is_terminal = terminal - - if is_terminal: - break - elif t >= self.frame_skip - 2: - t_ = t - (self.frame_skip - 2) - self._grayscale_obs(self.screen_buffer[t_]) - - observation.append(self._pool_and_resize()) - if len(observation) == 0: - observation = [self._pool_and_resize() - for _ in range(self.frame_skip)] - while len(observation) > 0 and \ - len(observation) < self.frame_skip: - observation.append(observation[-1]) - terminal = self.count >= self.max_episode_steps - return np.array(observation), total_reward, \ - (terminal or is_terminal), info - - def _grayscale_obs(self, output): - self.env.ale.getScreenGrayscale(output) - return output - - def _pool_and_resize(self): - if self.frame_skip > 1: - np.maximum(self.screen_buffer[0], self.screen_buffer[1], - out=self.screen_buffer[0]) - - return self.screen_buffer[0] diff --git a/examples/atari/runnable/pong_a2c.py b/examples/atari/runnable/pong_a2c.py deleted file mode 100644 index 023824ce6..000000000 --- a/examples/atari/runnable/pong_a2c.py +++ /dev/null @@ -1,110 +0,0 @@ -import os -import torch -import pprint -import argparse -import numpy as np -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import A2CPolicy -from tianshou.utils import BasicLogger -from tianshou.env import SubprocVectorEnv -from tianshou.utils.net.common import Net -from tianshou.trainer import onpolicy_trainer -from tianshou.data import Collector, VectorReplayBuffer -from tianshou.utils.net.discrete import Actor, Critic - -from atari import create_atari_environment, preprocess_fn - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Pong') - parser.add_argument('--seed', type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--lr', type=float, default=3e-4) - 
parser.add_argument('--gamma', type=float, default=0.9) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=1000) - parser.add_argument('--episode-per-collect', type=int, default=10) - parser.add_argument('--repeat-per-collect', type=int, default=1) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128, 128]) - parser.add_argument('--training-num', type=int, default=8) - parser.add_argument('--test-num', type=int, default=8) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) - - parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - # a2c special - parser.add_argument('--vf-coef', type=float, default=0.5) - parser.add_argument('--ent-coef', type=float, default=0.001) - parser.add_argument('--max-grad-norm', type=float, default=None) - parser.add_argument('--max-episode-steps', type=int, default=2000) - return parser.parse_args() - - -def test_a2c(args=get_args()): - env = create_atari_environment(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.env.action_space.shape or env.env.action_space.n - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: create_atari_environment(args.task) - for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: create_atari_environment(args.task) - for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = Actor(net, args.action_shape, device=args.device).to(args.device) - critic = Critic(net, device=args.device).to(args.device) - optim = torch.optim.Adam(set( - actor.parameters()).union(critic.parameters()), lr=args.lr) - dist = torch.distributions.Categorical - policy = A2CPolicy( - actor, critic, optim, dist, args.gamma, vf_coef=args.vf_coef, - ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs)), - preprocess_fn=preprocess_fn, exploration_noise=True) - test_collector = Collector(policy, test_envs, preprocess_fn=preprocess_fn) - # log - log_path = os.path.join(args.logdir, args.task, 'a2c') - writer = SummaryWriter(log_path) - logger = BasicLogger(writer) - - def stop_fn(mean_rewards): - if env.env.spec.reward_threshold: - return mean_rewards >= env.spec.reward_threshold - else: - return False - - # trainer - result = onpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.repeat_per_collect, args.test_num, args.batch_size, - episode_per_collect=args.episode_per_collect, stop_fn=stop_fn, logger=logger) - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! 
- env = create_atari_environment(args.task) - collector = Collector(policy, env, preprocess_fn=preprocess_fn) - result = collector.collect(n_episode=1, render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - test_a2c() diff --git a/examples/atari/runnable/pong_ppo.py b/examples/atari/runnable/pong_ppo.py deleted file mode 100644 index 36728de6f..000000000 --- a/examples/atari/runnable/pong_ppo.py +++ /dev/null @@ -1,115 +0,0 @@ -import os -import torch -import pprint -import argparse -import numpy as np -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import PPOPolicy -from tianshou.utils import BasicLogger -from tianshou.env import SubprocVectorEnv -from tianshou.utils.net.common import Net -from tianshou.trainer import onpolicy_trainer -from tianshou.utils.net.discrete import Actor, Critic -from tianshou.data import Collector, VectorReplayBuffer - -from atari import create_atari_environment, preprocess_fn - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Pong') - parser.add_argument('--seed', type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=1000) - parser.add_argument('--episode-per-collect', type=int, default=10) - parser.add_argument('--repeat-per-collect', type=int, default=2) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=8) - parser.add_argument('--test-num', type=int, default=8) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - # ppo special - parser.add_argument('--vf-coef', type=float, default=0.5) - parser.add_argument('--ent-coef', type=float, default=0.0) - parser.add_argument('--eps-clip', type=float, default=0.2) - parser.add_argument('--max-grad-norm', type=float, default=0.5) - parser.add_argument('--max-episode-steps', type=int, default=2000) - return parser.parse_args() - - -def test_ppo(args=get_args()): - env = create_atari_environment(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.action_space().shape or env.action_space().n - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv([ - lambda: create_atari_environment(args.task) - for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv([ - lambda: create_atari_environment(args.task) - for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = Actor(net, args.action_shape, device=args.device).to(args.device) - critic = Critic(net, device=args.device).to(args.device) - optim = torch.optim.Adam(set( - actor.parameters()).union(critic.parameters()), lr=args.lr) - dist = torch.distributions.Categorical - policy = PPOPolicy( - actor, critic, optim, dist, args.gamma, - max_grad_norm=args.max_grad_norm, - eps_clip=args.eps_clip, - vf_coef=args.vf_coef, - ent_coef=args.ent_coef, - action_range=None) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs)), - preprocess_fn=preprocess_fn, exploration_noise=True) - test_collector = Collector(policy, test_envs, preprocess_fn=preprocess_fn) - # log - log_path = os.path.join(args.logdir, args.task, 'ppo') - writer = SummaryWriter(log_path) - logger = BasicLogger(writer) - - def stop_fn(mean_rewards): - if env.env.spec.reward_threshold: - return mean_rewards >= env.spec.reward_threshold - else: - return False - - # trainer - result = onpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.repeat_per_collect, args.test_num, args.batch_size, - episode_per_collect=args.episode_per_collect, stop_fn=stop_fn, logger=logger) - - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! 
- env = create_atari_environment(args.task) - collector = Collector(policy, env, preprocess_fn=preprocess_fn) - result = collector.collect(n_step=2000, render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - test_ppo() diff --git a/examples/mujoco/runnable/ant_v2_ddpg.py b/examples/mujoco/runnable/ant_v2_ddpg.py deleted file mode 100644 index ce42434c0..000000000 --- a/examples/mujoco/runnable/ant_v2_ddpg.py +++ /dev/null @@ -1,108 +0,0 @@ -import os -import gym -import torch -import pprint -import argparse -import numpy as np -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import DDPGPolicy -from tianshou.utils import BasicLogger -from tianshou.env import SubprocVectorEnv -from tianshou.utils.net.common import Net -from tianshou.trainer import offpolicy_trainer -from tianshou.exploration import GaussianNoise -from tianshou.data import Collector, VectorReplayBuffer -from tianshou.utils.net.continuous import Actor, Critic - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v2') - parser.add_argument('--seed', type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=1e-4) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--exploration-noise', type=float, default=0.1) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=2400) - parser.add_argument('--step-per-collect', type=int, default=4) - parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=4) - parser.add_argument('--test-num', type=int, default=100) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - return parser.parse_args() - - -def test_ddpg(args=get_args()): - env = gym.make(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.action_space.shape or env.action_space.n - args.max_action = env.action_space.high[0] - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = Actor(net, args.action_shape, max_action=args.max_action, - device=args.device).to(args.device) - actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic = Critic(net, device=args.device).to(args.device) - critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr) - policy = DDPGPolicy( - actor, actor_optim, critic, critic_optim, - action_range=[env.action_space.low[0], env.action_space.high[0]], - tau=args.tau, gamma=args.gamma, - exploration_noise=GaussianNoise(sigma=args.exploration_noise)) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, len(train_envs)), - exploration_noise=True) - test_collector = Collector(policy, test_envs) - # log - log_path = os.path.join(args.logdir, args.task, 'ddpg') - writer = SummaryWriter(log_path) - logger = BasicLogger(writer) - - def stop_fn(mean_rewards): - return mean_rewards >= env.spec.reward_threshold - - # trainer - result = offpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.step_per_collect, args.test_num, - args.batch_size, stop_fn=stop_fn, logger=logger) - assert stop_fn(result['best_reward']) - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! 
- policy.eval() - test_envs.seed(args.seed) - test_collector.reset() - result = test_collector.collect(n_episode=args.test_num, render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - test_ddpg() diff --git a/examples/mujoco/runnable/ant_v2_td3.py b/examples/mujoco/runnable/ant_v2_td3.py deleted file mode 100644 index 5e33c33da..000000000 --- a/examples/mujoco/runnable/ant_v2_td3.py +++ /dev/null @@ -1,117 +0,0 @@ -import os -import gym -import torch -import pprint -import argparse -import numpy as np -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import TD3Policy -from tianshou.utils import BasicLogger -from tianshou.env import SubprocVectorEnv -from tianshou.utils.net.common import Net -from tianshou.exploration import GaussianNoise -from tianshou.trainer import offpolicy_trainer -from tianshou.data import Collector, VectorReplayBuffer -from tianshou.utils.net.continuous import Actor, Critic - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v2') - parser.add_argument('--seed', type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=3e-4) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--exploration-noise', type=float, default=0.1) - parser.add_argument('--policy-noise', type=float, default=0.2) - parser.add_argument('--noise-clip', type=float, default=0.5) - parser.add_argument('--update-actor-freq', type=int, default=2) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=2400) - parser.add_argument('--step-per-collect', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=10) - parser.add_argument('--test-num', type=int, default=100) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - return parser.parse_args() - - -def test_td3(args=get_args()): - env = gym.make(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.action_space.shape or env.action_space.n - args.max_action = env.action_space.high[0] - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = Actor(net, args.action_shape, max_action=args.max_action, - device=args.device).to(args.device) - actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic1 = Critic(net, device=args.device).to(args.device) - critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) - critic2 = Critic(net, device=args.device).to(args.device) - critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) - policy = TD3Policy( - actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, - action_range=[env.action_space.low[0], env.action_space.high[0]], - tau=args.tau, gamma=args.gamma, - exploration_noise=GaussianNoise(sigma=args.exploration_noise), - policy_noise=args.policy_noise, - update_actor_freq=args.update_actor_freq, - noise_clip=args.noise_clip) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, len(train_envs)), - exploration_noise=True) - test_collector = Collector(policy, test_envs) - # train_collector.collect(n_step=args.buffer_size) - # log - log_path = os.path.join(args.logdir, args.task, 'td3') - writer = SummaryWriter(log_path) - logger = BasicLogger(writer) - - def stop_fn(mean_rewards): - return mean_rewards >= env.spec.reward_threshold - - # trainer - result = offpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.step_per_collect, args.test_num, - args.batch_size, stop_fn=stop_fn, logger=logger) - assert stop_fn(result['best_reward']) - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! 
- policy.eval() - test_envs.seed(args.seed) - test_collector.reset() - result = test_collector.collect(n_episode=args.test_num, render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - test_td3() diff --git a/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py b/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py deleted file mode 100644 index 618492771..000000000 --- a/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py +++ /dev/null @@ -1,121 +0,0 @@ -import os -import gym -import torch -import pprint -import datetime -import argparse -import numpy as np -import pybullet_envs -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import SACPolicy -from tianshou.utils import BasicLogger -from tianshou.utils.net.common import Net -from tianshou.env import SubprocVectorEnv -from tianshou.trainer import offpolicy_trainer -from tianshou.data import Collector, VectorReplayBuffer -from tianshou.utils.net.continuous import ActorProb, Critic - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetahBulletEnv-v0') - parser.add_argument('--run-id', type=str, default='test') - parser.add_argument('--seed', type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=3e-4) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--alpha', type=float, default=0.2) - parser.add_argument('--epoch', type=int, default=200) - parser.add_argument('--step-per-epoch', type=int, default=1000) - parser.add_argument('--step-per-collect', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=10) - parser.add_argument('--test-num', type=int, default=4) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--log-interval', type=int, default=100) - parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - return parser.parse_args() - - -def test_sac(args=get_args()): - torch.set_num_threads(1) - env = gym.make(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.action_space.shape or env.action_space.n - args.max_action = env.action_space.high[0] - # you can also use tianshou.env.SubprocVectorEnv - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = ActorProb(net, args.action_shape, max_action=args.max_action, - device=args.device, unbounded=True).to(args.device) - actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic1 = Critic(net, device=args.device).to(args.device) - critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic2 = Critic(net, device=args.device).to(args.device) - critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) - policy = SACPolicy( - actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, - action_range=[env.action_space.low[0], env.action_space.high[0]], - tau=args.tau, gamma=args.gamma, alpha=args.alpha) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, len(train_envs)), - exploration_noise=True) - test_collector = Collector(policy, test_envs) - # train_collector.collect(n_step=args.buffer_size) - # log - log_path = os.path.join(args.logdir, args.task, 'sac', 'seed_' + str( - args.seed) + '_' + datetime.datetime.now().strftime('%m%d-%H%M%S')) - writer = SummaryWriter(log_path) - logger = BasicLogger(writer, train_interval=args.log_interval) - - def stop_fn(mean_rewards): - return mean_rewards >= env.spec.reward_threshold - - # trainer - result = offpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.step_per_collect, args.test_num, - args.batch_size, stop_fn=stop_fn, - logger=logger) - assert stop_fn(result['best_reward']) - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! 
- policy.eval() - test_envs.seed(args.seed) - test_collector.reset() - result = test_collector.collect(n_episode=args.test_num, - render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - __all__ = ('pybullet_envs',) # Avoid F401 error :) - test_sac() diff --git a/examples/mujoco/runnable/mujoco/__init__.py b/examples/mujoco/runnable/mujoco/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/mujoco/runnable/mujoco/assets/point.xml b/examples/mujoco/runnable/mujoco/assets/point.xml deleted file mode 100644 index 38cc64407..000000000 --- a/examples/mujoco/runnable/mujoco/assets/point.xml +++ /dev/null @@ -1,34 +0,0 @@ - - - diff --git a/examples/mujoco/runnable/mujoco/maze_env_utils.py b/examples/mujoco/runnable/mujoco/maze_env_utils.py deleted file mode 100644 index dafce77f5..000000000 --- a/examples/mujoco/runnable/mujoco/maze_env_utils.py +++ /dev/null @@ -1,196 +0,0 @@ -"""Adapted from rllab maze_env_utils.py.""" -import math - - -class Move(object): - X = 11 - Y = 12 - Z = 13 - XY = 14 - XZ = 15 - YZ = 16 - XYZ = 17 - SpinXY = 18 - - -def can_move_x(movable): - return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ, - Move.SpinXY] - - -def can_move_y(movable): - return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ, - Move.SpinXY] - - -def can_move_z(movable): - return movable in [Move.Z, Move.XZ, Move.YZ, Move.XYZ] - - -def can_spin(movable): - return movable in [Move.SpinXY] - - -def can_move(movable): - return can_move_x(movable) or can_move_y(movable) or can_move_z(movable) - - -def construct_maze(maze_id='Maze'): - if maze_id == 'Maze': - structure = [ - [1, 1, 1, 1, 1], - [1, 'r', 0, 0, 1], - [1, 1, 1, 0, 1], - [1, 'g', 0, 0, 1], - [1, 1, 1, 1, 1], - ] - elif maze_id == 'Maze1': - structure = [ - [1, 1, 1, 1, 1, 1, 1, 1], - [1, 'r', 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 1, 1, 0, 1], - [1, 1, 1, 1, 1, 0, 0, 1], - [1, 0, 0, 0, 1, 0, 1, 1], - [1, 0, 0, 0, 1, 0, 1, 1], - [1, 0, 1, 0, 0, 0, 0, 1], - [1, 1, 1, 1, 1, 1, 1, 1], - ] - elif maze_id == 'Maze2': - structure = [ - [0, 0, 0, 0, 0], - [0, 'r', 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - ] - # transfer maze - elif maze_id == 'Maze3': - structure = [ - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 'r', 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1], - [1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1], - [1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1], - [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 'g', 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - ] - elif maze_id == 'Maze4': - structure = [ - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 'r', 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], - [1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 'g', 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - ] - elif maze_id == 'Push': - structure = [ - [1, 1, 1, 1, 1], - [1, 0, 'r', 1, 1], - [1, 0, Move.XY, 0, 1], - [1, 1, 0, 1, 1], - [1, 1, 1, 1, 1], - ] - elif maze_id == 'Fall': - structure = [ - [1, 1, 1, 1], - [1, 'r', 0, 1], - [1, 0, Move.YZ, 1], - [1, -1, -1, 1], - [1, 0, 0, 1], - [1, 1, 1, 1], - ] - elif maze_id == 'Block': - structure = [ - [1, 1, 1, 1, 1], - [1, 'r', 
0, 0, 1], - [1, 0, 0, 0, 1], - [1, 0, 0, 0, 1], - [1, 1, 1, 1, 1], - ] - elif maze_id == 'BlockMaze': - structure = [ - [1, 1, 1, 1], - [1, 'r', 0, 1], - [1, 1, 0, 1], - [1, 0, 0, 1], - [1, 1, 1, 1], - ] - else: - raise NotImplementedError( - 'The provided MazeId %s is not recognized' % maze_id) - - return structure - - -def line_intersect(pt1, pt2, ptA, ptB): - """ - Taken from https://www.cs.hmc.edu/ACM/lectures/intersections.html - this returns the intersection of Line(pt1,pt2) and Line(ptA,ptB) - """ - - DET_TOLERANCE = 0.00000001 - - # the first line is pt1 + r*(pt2-pt1) - # in component form: - x1, y1 = pt1 - x2, y2 = pt2 - dx1 = x2 - x1 - dy1 = y2 - y1 - - # the second line is ptA + s*(ptB-ptA) - x, y = ptA - xB, yB = ptB - dx = xB - x - dy = yB - y - - DET = (-dx1 * dy + dy1 * dx) - - if math.fabs(DET) < DET_TOLERANCE: - return (0, 0, 0, 0, 0) - - # now, the determinant should be OK - DETinv = 1.0 / DET - - # find the scalar amount along the "self" segment - r = DETinv * (-dy * (x - x1) + dx * (y - y1)) - - # find the scalar amount along the input line - s = DETinv * (-dy1 * (x - x1) + dx1 * (y - y1)) - - # return the average of the two descriptions - xi = (x1 + r * dx1 + x + s * dx) / 2.0 - yi = (y1 + r * dy1 + y + s * dy) / 2.0 - return (xi, yi, 1, r, s) - - -def ray_segment_intersect(ray, segment): - """ - Check if the ray originated from (x, y) with direction theta - intersects the line segment (x1, y1) -- (x2, y2), and return - the intersection point if there is one - """ - (x, y), theta = ray - # (x1, y1), (x2, y2) = segment - pt1 = (x, y) - len = 1 - pt2 = (x + len * math.cos(theta), y + len * math.sin(theta)) - xo, yo, valid, r, s = line_intersect(pt1, pt2, *segment) - if valid and r >= 0 and 0 <= s <= 1: - return (xo, yo) - return None - - -def point_distance(p1, p2): - x1, y1 = p1 - x2, y2 = p2 - return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5 diff --git a/examples/mujoco/runnable/mujoco/point.py b/examples/mujoco/runnable/mujoco/point.py deleted file mode 100644 index 2a6a08d41..000000000 --- a/examples/mujoco/runnable/mujoco/point.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Wrapper for creating the ant environment in gym_mujoco.""" - -import math -import numpy as np -from gym import utils -from gym.envs.mujoco import mujoco_env - - -class PointEnv(mujoco_env.MujocoEnv, utils.EzPickle): - FILE = "point.xml" - ORI_IND = 2 - - def __init__(self, file_path=None, expose_all_qpos=True, noisy_init=False): - self._expose_all_qpos = expose_all_qpos - self.noisy_init = noisy_init - mujoco_env.MujocoEnv.__init__(self, file_path, 1) - utils.EzPickle.__init__(self) - - @property - def physics(self): - return self.model - - def _step(self, a): - return self.step(a) - - def step(self, action): - # action[0] is velocity, action[1] is direction - action[0] = 0.2 * action[0] - qpos = np.copy(self.data.qpos) - qpos[2] += action[1] - ori = qpos[2] - # compute increment in each direction - dx = math.cos(ori) * action[0] - dy = math.sin(ori) * action[0] - # ensure that the robot is within reasonable range - qpos[0] = np.clip(qpos[0] + dx, -100, 100) - qpos[1] = np.clip(qpos[1] + dy, -100, 100) - qvel = np.squeeze(self.data.qvel) - self.set_state(qpos, qvel) - for _ in range(0, self.frame_skip): - # self.physics.step() - self.sim.step() - next_obs = self._get_obs() - reward = 0 - done = False - info = {} - return next_obs, reward, done, info - - def _get_obs(self): - if self._expose_all_qpos: - return np.concatenate([ - self.data.qpos.flat[:3], # Only point-relevant coords. 
- self.data.qvel.flat[:3]]) - return np.concatenate([ - self.data.qpos.flat[2:3], - self.data.qvel.flat[:3]]) - - def reset_model(self): - if self.noisy_init: - qpos = self.init_qpos + self.np_random.uniform( - size=self.model.nq, low=-.1, high=.1) - qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 - - else: - qpos = self.init_qpos - qvel = self.init_qvel - - # Set everything other than point to original position and 0 velocity. - qpos[3:] = self.init_qpos[3:] - qvel[3:] = 0. - self.set_state(qpos, qvel) - return self._get_obs() - - def get_ori(self): - return self.data.qpos[self.__class__.ORI_IND] - - def set_xy(self, xy): - qpos = np.copy(self.data.qpos) - qpos[0] = xy[0] - qpos[1] = xy[1] - - qvel = self.data.qvel - self.set_state(qpos, qvel) - - def get_xy(self): - qpos = np.copy(self.data.qpos) - return qpos[:2] - - def viewer_setup(self): - - self.viewer.cam.trackbodyid = -1 - self.viewer.cam.distance = 80 - self.viewer.cam.elevation = -90 diff --git a/examples/mujoco/runnable/mujoco/point_maze_env.py b/examples/mujoco/runnable/mujoco/point_maze_env.py deleted file mode 100644 index c8e8ef84b..000000000 --- a/examples/mujoco/runnable/mujoco/point_maze_env.py +++ /dev/null @@ -1,568 +0,0 @@ -"""Adapted from rllab maze_env.py.""" - -import os -import tempfile -import xml.etree.ElementTree as ET -import math -import numpy as np -import gym -from . import maze_env_utils -from .point import PointEnv -from gym.utils import seeding - -# Directory that contains mujoco xml files. -MODEL_DIR = os.path.join(os.path.dirname(__file__), 'assets') - - -class PointMazeEnv(gym.Env): - MODEL_CLASS = PointEnv - - MAZE_HEIGHT = None - MAZE_SIZE_SCALING = None - - def __init__( - self, - maze_id=None, - maze_height=0.5, - maze_size_scaling=8, - n_bins=0, - sensor_range=3., - sensor_span=2 * math.pi, - observe_blocks=False, - put_spin_near_agent=False, - top_down_view=False, - manual_collision=False, - goal=None, - EPS=0.25, - max_episode_steps=2000, - *args, - **kwargs): - self._maze_id = maze_id - - model_cls = self.__class__.MODEL_CLASS - if model_cls is None: - raise "MODEL_CLASS unspecified!" - xml_path = os.path.join(MODEL_DIR, model_cls.FILE) - self.tree = tree = ET.parse(xml_path) - self.worldbody = worldbody = tree.find(".//worldbody") - self.visualize_goal = False - self.max_episode_steps = max_episode_steps - self.t = 0 - self.MAZE_HEIGHT = height = maze_height - self.MAZE_SIZE_SCALING = size_scaling = maze_size_scaling - self._n_bins = n_bins - self._sensor_range = sensor_range * size_scaling - self._sensor_span = sensor_span - self._observe_blocks = observe_blocks - self._put_spin_near_agent = put_spin_near_agent - self._top_down_view = top_down_view - self._manual_collision = manual_collision - - self.MAZE_STRUCTURE = structure = maze_env_utils.construct_maze( - maze_id=self._maze_id) - # Elevate the maze to allow for falling. - self.elevated = any(-1 in row for row in structure) - self.blocks = any( - any(maze_env_utils.can_move(r) for r in row) - for row in structure) # Are there any movable blocks? - - torso_x, torso_y = self._find_robot() # x, y coordinates - self._init_torso_x = torso_x - self._init_torso_y = torso_y - self._init_positions = [ - (x - torso_x, y - torso_y) - for x, y in self._find_all_robots()] - - self._view = np.zeros([5, 5, 3]) - - height_offset = 0. 
- if self.elevated: - height_offset = height * size_scaling - torso = tree.find(".//body[@name='torso']") - torso.set('pos', '0 0 %.2f' % (0.75 + height_offset)) - if self.blocks: - default = tree.find(".//default") - default.find('.//geom').set('solimp', '.995 .995 .01') - - self.movable_blocks = [] - for i in range(len(structure)): - for j in range(len(structure[0])): - struct = structure[i][j] - if struct == 'r' and self._put_spin_near_agent: - struct = maze_env_utils.Move.SpinXY - if self.elevated and struct not in [-1]: - # Create elevated platform. - ET.SubElement( - worldbody, "geom", - name="elevated_%d_%d" % (i, j), - pos="%f %f %f" % (j * size_scaling - torso_x, - i * size_scaling - torso_y, - height / 2 * size_scaling), - size="%f %f %f" % (0.5 * size_scaling, - 0.5 * size_scaling, - height / 2 * size_scaling), - type="box", - material="", - contype="1", - conaffinity="1", - rgba="0.9 0.9 0.9 1", - ) - if struct == 1: # Unmovable block. - # Offset all coordinates so that robot starts at the origin - ET.SubElement( - worldbody, "geom", - name="block_%d_%d" % (i, j), - pos="%f %f %f" % (j * size_scaling - torso_x, - i * size_scaling - torso_y, - height_offset + - height / 2 * size_scaling), - size="%f %f %f" % (0.5 * size_scaling, - 0.5 * size_scaling, - height / 2 * size_scaling), - type="box", - material="", - contype="1", - conaffinity="1", - rgba="0.4 0.4 0.4 1", - ) - elif maze_env_utils.can_move(struct): - name = "movable_%d_%d" % (i, j) - self.movable_blocks.append((name, struct)) - falling = maze_env_utils.can_move_z(struct) - spinning = maze_env_utils.can_spin(struct) - x_offset = 0.25 * size_scaling if spinning else 0.0 - y_offset = 0.0 - shrink = 0.1 if spinning else 0.99 if falling else 1.0 - height_shrink = 0.1 if spinning else 1.0 - _x = j * size_scaling - torso_x + x_offset - _y = i * size_scaling - torso_y + y_offset - _z = height / 2 * size_scaling * height_shrink - movable_body = ET.SubElement( - worldbody, "body", - name=name, - pos="%f %f %f" % (_x, _y, height_offset + _z), - ) - ET.SubElement( - movable_body, "geom", - name="block_%d_%d" % (i, j), - pos="0 0 0", - size="%f %f %f" % (0.5 * size_scaling * shrink, - 0.5 * size_scaling * shrink, - _z), - type="box", - material="", - mass="0.001" if falling else "0.0002", - contype="1", - conaffinity="1", - rgba="0.9 0.1 0.1 1" - ) - if maze_env_utils.can_move_x(struct): - ET.SubElement( - movable_body, "joint", - armature="0", - axis="1 0 0", - damping="0.0", - limited="true" if falling else "false", - range="%f %f" % (-size_scaling, size_scaling), - margin="0.01", - name="movable_x_%d_%d" % (i, j), - pos="0 0 0", - type="slide" - ) - if maze_env_utils.can_move_y(struct): - ET.SubElement( - movable_body, "joint", - armature="0", - axis="0 1 0", - damping="0.0", - limited="true" if falling else "false", - range="%f %f" % (-size_scaling, size_scaling), - margin="0.01", - name="movable_y_%d_%d" % (i, j), - pos="0 0 0", - type="slide" - ) - if maze_env_utils.can_move_z(struct): - ET.SubElement( - movable_body, "joint", - armature="0", - axis="0 0 1", - damping="0.0", - limited="true", - range="%f 0" % (-height_offset), - margin="0.01", - name="movable_z_%d_%d" % (i, j), - pos="0 0 0", - type="slide" - ) - if maze_env_utils.can_spin(struct): - ET.SubElement( - movable_body, "joint", - armature="0", - axis="0 0 1", - damping="0.0", - limited="false", - name="spinable_%d_%d" % (i, j), - pos="0 0 0", - type="ball" - ) - - torso = tree.find(".//body[@name='torso']") - geoms = torso.findall(".//geom") - for geom in geoms: - 
if 'name' not in geom.attrib: - raise Exception("Every geom of the torso must have a name " - "defined") - - _, file_path = tempfile.mkstemp(text=True, suffix='.xml') - tree.write(file_path) - - self.wrapped_env = model_cls(*args, file_path=file_path, **kwargs) - self.args = args - self.kwargs = kwargs - self.GOAL = goal - if self.GOAL is not None: - self.GOAL = self.unwrapped._rowcol_to_xy(*self.GOAL) - self.EPS = EPS - - def get_ori(self): - return self.wrapped_env.get_ori() - - def get_top_down_view(self): - self._view = np.zeros_like(self._view) - - def valid(row, col): - return self._view.shape[0] > row >= 0 \ - and self._view.shape[1] > col >= 0 - - def update_view(x, y, d, row=None, col=None): - if row is None or col is None: - x = x - self._robot_x - y = y - self._robot_y - - row, col = self._xy_to_rowcol(x, y) - update_view(x, y, d, row=row, col=col) - return - - row, row_frac, col, col_frac = int(row), row % 1, int(col), col % 1 - if row_frac < 0: - row_frac += 1 - if col_frac < 0: - col_frac += 1 - - if valid(row, col): - self._view[row, col, d] += ( - (min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) * - (min(1., col_frac + 0.5) - max(0., col_frac - 0.5))) - if valid(row - 1, col): - self._view[row - 1, col, d] += ( - (max(0., 0.5 - row_frac)) * - (min(1., col_frac + 0.5) - max(0., col_frac - 0.5))) - if valid(row + 1, col): - self._view[row + 1, col, d] += ( - (max(0., row_frac - 0.5)) * - (min(1., col_frac + 0.5) - max(0., col_frac - 0.5))) - if valid(row, col - 1): - self._view[row, col - 1, d] += ( - (min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) * - (max(0., 0.5 - col_frac))) - if valid(row, col + 1): - self._view[row, col + 1, d] += ( - (min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) * - (max(0., col_frac - 0.5))) - if valid(row - 1, col - 1): - self._view[row - 1, col - 1, d] += ( - (max(0., 0.5 - row_frac)) * max(0., 0.5 - col_frac)) - if valid(row - 1, col + 1): - self._view[row - 1, col + 1, d] += ( - (max(0., 0.5 - row_frac)) * max(0., col_frac - 0.5)) - if valid(row + 1, col + 1): - self._view[row + 1, col + 1, d] += ( - (max(0., row_frac - 0.5)) * max(0., col_frac - 0.5)) - if valid(row + 1, col - 1): - self._view[row + 1, col - 1, d] += ( - (max(0., row_frac - 0.5)) * max(0., 0.5 - col_frac)) - - # Draw ant. - robot_x, robot_y = self.wrapped_env.get_body_com("torso")[:2] - self._robot_x = robot_x - self._robot_y = robot_y - self._robot_ori = self.get_ori() - - structure = self.MAZE_STRUCTURE - size_scaling = self.MAZE_SIZE_SCALING - - # Draw immovable blocks and chasms. - for i in range(len(structure)): - for j in range(len(structure[0])): - if structure[i][j] == 1: # Wall. - update_view(j * size_scaling - self._init_torso_x, - i * size_scaling - self._init_torso_y, - 0) - if structure[i][j] == -1: # Chasm. - update_view(j * size_scaling - self._init_torso_x, - i * size_scaling - self._init_torso_y, - 1) - - # Draw movable blocks. 
- for block_name, block_type in self.movable_blocks: - block_x, block_y = self.wrapped_env.get_body_com(block_name)[:2] - update_view(block_x, block_y, 2) - - import cv2 - cv2.imshow('x.jpg', cv2.resize( - np.uint8(self._view * 255), (512, 512), - interpolation=cv2.INTER_CUBIC)) - cv2.waitKey(0) - - return self._view - - def get_range_sensor_obs(self): - """Returns egocentric range sensor observations of maze.""" - robot_x, robot_y, robot_z = self.wrapped_env.get_body_com("torso")[:3] - ori = self.get_ori() - - structure = self.MAZE_STRUCTURE - size_scaling = self.MAZE_SIZE_SCALING - height = self.MAZE_HEIGHT - - segments = [] - # Get line segments (corresponding to outer boundary) of each immovable - # block or drop-off. - for i in range(len(structure)): - for j in range(len(structure[0])): - if structure[i][j] in [1, -1]: # There's a wall or drop-off. - cx = j * size_scaling - self._init_torso_x - cy = i * size_scaling - self._init_torso_y - x1 = cx - 0.5 * size_scaling - x2 = cx + 0.5 * size_scaling - y1 = cy - 0.5 * size_scaling - y2 = cy + 0.5 * size_scaling - struct_segments = [ - ((x1, y1), (x2, y1)), - ((x2, y1), (x2, y2)), - ((x2, y2), (x1, y2)), - ((x1, y2), (x1, y1)), - ] - for seg in struct_segments: - segments.append(dict( - segment=seg, - type=structure[i][j], - )) - - for block_name, block_type in self.movable_blocks: - block_x, block_y, block_z = \ - self.wrapped_env.get_body_com(block_name)[:3] - if (block_z + height * size_scaling / 2 >= robot_z and - robot_z >= block_z - height * size_scaling / 2): - # Block in view. - x1 = block_x - 0.5 * size_scaling - x2 = block_x + 0.5 * size_scaling - y1 = block_y - 0.5 * size_scaling - y2 = block_y + 0.5 * size_scaling - struct_segments = [ - ((x1, y1), (x2, y1)), - ((x2, y1), (x2, y2)), - ((x2, y2), (x1, y2)), - ((x1, y2), (x1, y1)), - ] - for seg in struct_segments: - segments.append(dict( - segment=seg, - type=block_type, - )) - - # 3 for wall, drop-off, block - sensor_readings = np.zeros((self._n_bins, 3)) - for ray_idx in range(self._n_bins): - ray_ori = (ori - self._sensor_span * 0.5 + ( - 2 * ray_idx + 1.0) / - (2 * self._n_bins) * self._sensor_span) - ray_segments = [] - # Get all segments that intersect with ray. - for seg in segments: - p = maze_env_utils.ray_segment_intersect( - ray=((robot_x, robot_y), ray_ori), - segment=seg["segment"]) - if p is not None: - ray_segments.append(dict( - segment=seg["segment"], - type=seg["type"], - ray_ori=ray_ori, - distance=maze_env_utils.point_distance( - p, (robot_x, robot_y)), - )) - if len(ray_segments) > 0: - # Find out which segment is intersected first. - first_seg = sorted( - ray_segments, key=lambda x: x["distance"])[0] - seg_type = first_seg["type"] - idx = (0 if seg_type == 1 else # Wall. - 1 if seg_type == -1 else # Drop-off. - 2 if maze_env_utils.can_move(seg_type) else # Block. 
- None) - if first_seg["distance"] <= self._sensor_range: - sensor_readings[ray_idx][idx] = \ - (self._sensor_range - first_seg[ - "distance"]) / self._sensor_range - return sensor_readings - - def _get_obs(self): - wrapped_obs = self.wrapped_env._get_obs() - if self._top_down_view: - self.get_top_down_view() - - if self._observe_blocks: - additional_obs = [] - for block_name, block_type in self.movable_blocks: - additional_obs.append( - self.wrapped_env.get_body_com(block_name)) - wrapped_obs = np.concatenate([wrapped_obs[:3]] + additional_obs + - [wrapped_obs[3:]]) - - self.get_range_sensor_obs() - return wrapped_obs - - def seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] - - def reset(self, goal=None): - self.goal = goal - - if self.visualize_goal: # remove the prev goal and add a new goal - goal_x, goal_y = goal[0], goal[1] - size_scaling = self.MAZE_SIZE_SCALING - # remove the original goal - try: - self.worldbody.remove(self.goal_element) - except AttributeError: - pass - # offset all coordinates so that robot starts at the origin - self.goal_element = \ - ET.SubElement( - self.worldbody, "geom", - name="goal_%d_%d" % (goal_x, goal_y), - pos="%f %f %f" % (goal_x, - goal_y, - self.MAZE_HEIGHT / 2 * size_scaling), - # smaller than the block to prevent collision - size="%f %f %f" % (0.1 * size_scaling, - 0.1 * size_scaling, - self.MAZE_HEIGHT / 2 * size_scaling), - type="box", - material="", - contype="1", - conaffinity="1", - rgba="1.0 0.0 0.0 0.5" - ) - # Note: running the lines below will make the robot position wrong! - # (because the graph is rebuilt) - torso = self.tree.find(".//body[@name='torso']") - geoms = torso.findall(".//geom") - for geom in geoms: - if 'name' not in geom.attrib: - raise Exception("Every geom of the torso must have a name " - "defined") - _, file_path = tempfile.mkstemp(text=True, suffix='.xml') - self.tree.write(file_path) - # here we write a temporal file with the robot specifications. - # Why not the original one?? - - model_cls = self.__class__.MODEL_CLASS - # file to the robot specifications; model_cls is AntEnv - self.wrapped_env = model_cls( - *self.args, file_path=file_path, **self.kwargs) - - self.t = 0 - self.trajectory = [] - self.wrapped_env.reset() - if len(self._init_positions) > 1: - xy = self._init_positions[self.np_random.randint( - len(self._init_positions))] - self.wrapped_env.set_xy(xy) - return self._get_obs() - - @property - def viewer(self): - return self.wrapped_env.viewer - - def render(self, *args, **kwargs): - return self.wrapped_env.render(*args, **kwargs) - - @property - def observation_space(self): - shape = self._get_obs().shape - high = np.inf * np.ones(shape) - low = -high - return gym.spaces.Box(low, high) - - @property - def action_space(self): - return self.wrapped_env.action_space - - def _find_robot(self): - structure = self.MAZE_STRUCTURE - size_scaling = self.MAZE_SIZE_SCALING - for i in range(len(structure)): - for j in range(len(structure[0])): - if structure[i][j] == 'r': - return j * size_scaling, i * size_scaling - assert False, 'No robot in maze specification.' 
- - def _find_all_robots(self): - structure = self.MAZE_STRUCTURE - size_scaling = self.MAZE_SIZE_SCALING - coords = [] - for i in range(len(structure)): - for j in range(len(structure[0])): - if structure[i][j] == 'r': - coords.append((j * size_scaling, i * size_scaling)) - return coords - - def _is_in_collision(self, pos): - x, y = pos - structure = self.MAZE_STRUCTURE - scale = self.MAZE_SIZE_SCALING - for i in range(len(structure)): - for j in range(len(structure[0])): - if structure[i][j] == 1: - minx = j * scale - scale * 0.5 - self._init_torso_x - maxx = j * scale + scale * 0.5 - self._init_torso_x - miny = i * scale - scale * 0.5 - self._init_torso_y - maxy = i * scale + scale * 0.5 - self._init_torso_y - if minx <= x <= maxx and miny <= y <= maxy: - return True - return False - - def _rowcol_to_xy(self, j, i): - scale = self.MAZE_SIZE_SCALING - minx = j * scale - scale * 0.5 - self._init_torso_x - maxx = j * scale + scale * 0.5 - self._init_torso_x - miny = i * scale - scale * 0.5 - self._init_torso_y - maxy = i * scale + scale * 0.5 - self._init_torso_y - return (minx + maxx) / 2, (miny + maxy) / 2 - - def step(self, action): - self.t += 1 - if self._manual_collision: - old_pos = self.wrapped_env.get_xy() - inner_next_obs, inner_reward, inner_done, info = \ - self.wrapped_env.step(action) - new_pos = self.wrapped_env.get_xy() - if self._is_in_collision(new_pos): - self.wrapped_env.set_xy(old_pos) - else: - inner_next_obs, inner_reward, inner_done, info = \ - self.wrapped_env.step(action) - next_obs = self._get_obs() - done = False - if self.goal is not None: - done = bool(((next_obs[:2] - self.goal[:2]) ** 2).sum() < self.EPS) - - new_pos = self.wrapped_env.get_xy() - if self._is_in_collision(new_pos) or inner_done: - done = True - if self.t >= self.max_episode_steps: - done = True - return next_obs, inner_reward, done, info diff --git a/examples/mujoco/runnable/mujoco/register.py b/examples/mujoco/runnable/mujoco/register.py deleted file mode 100644 index 82acac2af..000000000 --- a/examples/mujoco/runnable/mujoco/register.py +++ /dev/null @@ -1,27 +0,0 @@ -from gym.envs.registration import register - - -def reg(): - register( - id='PointMaze-v0', - entry_point='mujoco.point_maze_env:PointMazeEnv', - kwargs={ - "maze_size_scaling": 4, - "maze_id": "Maze2", - "maze_height": 0.5, - "manual_collision": True, - "goal": (1, 3), - } - ) - - register( - id='PointMaze-v1', - entry_point='mujoco.point_maze_env:PointMazeEnv', - kwargs={ - "maze_size_scaling": 2, - "maze_id": "Maze2", - "maze_height": 0.5, - "manual_collision": True, - "goal": (1, 3), - } - ) diff --git a/examples/mujoco/runnable/point_maze_td3.py b/examples/mujoco/runnable/point_maze_td3.py deleted file mode 100644 index eda299244..000000000 --- a/examples/mujoco/runnable/point_maze_td3.py +++ /dev/null @@ -1,126 +0,0 @@ -import os -import gym -import torch -import pprint -import argparse -import numpy as np -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import TD3Policy -from tianshou.utils import BasicLogger -from tianshou.utils.net.common import Net -from tianshou.env import SubprocVectorEnv -from tianshou.exploration import GaussianNoise -from tianshou.trainer import offpolicy_trainer -from tianshou.data import Collector, VectorReplayBuffer -from tianshou.utils.net.continuous import Actor, Critic - -from mujoco.register import reg - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='PointMaze-v1') - parser.add_argument('--seed', 
type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=3e-5) - parser.add_argument('--critic-lr', type=float, default=1e-4) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--exploration-noise', type=float, default=0.1) - parser.add_argument('--policy-noise', type=float, default=0.2) - parser.add_argument('--noise-clip', type=float, default=0.5) - parser.add_argument('--update-actor-freq', type=int, default=2) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=2400) - parser.add_argument('--step-per-collect', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=10) - parser.add_argument('--test-num', type=int, default=100) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) - parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - return parser.parse_args() - - -def test_td3(args=get_args()): - reg() - env = gym.make(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.action_space.shape or env.action_space.n - args.max_action = env.action_space.high[0] - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = Actor(net, args.action_shape, max_action=args.max_action, - device=args.device).to(args.device) - actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic1 = Critic(net, device=args.device).to(args.device) - critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic2 = Critic(net, device=args.device).to(args.device) - critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) - policy = TD3Policy( - actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, - action_range=[env.action_space.low[0], env.action_space.high[0]], - tau=args.tau, gamma=args.gamma, - exploration_noise=GaussianNoise(sigma=args.exploration_noise), - policy_noise=args.policy_noise, - update_actor_freq=args.update_actor_freq, - noise_clip=args.noise_clip) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, len(train_envs)), - exploration_noise=True) - test_collector = Collector(policy, test_envs) - # train_collector.collect(n_step=args.buffer_size) - # log - log_path = os.path.join(args.logdir, args.task, 'td3') - writer = SummaryWriter(log_path) - logger = BasicLogger(writer) - - def stop_fn(mean_rewards): - if env.spec.reward_threshold: - return 
mean_rewards >= env.spec.reward_threshold - else: - return False - - # trainer - result = offpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.step_per_collect, args.test_num, - args.batch_size, stop_fn=stop_fn, logger=logger) - assert stop_fn(result['best_reward']) - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! - policy.eval() - test_envs.seed(args.seed) - test_collector.reset() - result = test_collector.collect(n_episode=args.test_num, - render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - test_td3() From c530139d4a1385ed4d61055c6683a3dd0dd7211f Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Sat, 27 Feb 2021 10:51:12 +0800 Subject: [PATCH 13/13] greater --- tianshou/data/buffer.py | 2 +- tianshou/policy/base.py | 4 ++-- tianshou/policy/imitation/discrete_bcq.py | 2 +- tianshou/policy/modelfree/c51.py | 4 ++-- tianshou/policy/modelfree/ddpg.py | 2 +- tianshou/policy/modelfree/dqn.py | 4 ++-- tianshou/policy/modelfree/qrdqn.py | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tianshou/data/buffer.py b/tianshou/data/buffer.py index 2ceb8081f..05f5ecef9 100644 --- a/tianshou/data/buffer.py +++ b/tianshou/data/buffer.py @@ -61,7 +61,7 @@ def __init__( } super().__init__() self.maxsize = size - assert stack_num > 0, "stack_num should greater than 0" + assert stack_num > 0, "stack_num should be greater than 0" self.stack_num = stack_num self._indices = np.arange(size) self._save_obs_next = not ignore_obs_next diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index 7e910996b..730ee28b0 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -286,8 +286,8 @@ def compute_nstep_return( :return: a Batch. The result will be stored in batch.returns as a torch.Tensor with the same shape as target_q_fn's return tensor. """ - assert not rew_norm, ( - "Reward normalization in computing n-step return is unsupported for now.") + assert not rew_norm, \ + "Reward normalization in computing n-step returns is unsupported now." rew = buffer.rew bsz = len(indice) indices = [indice] diff --git a/tianshou/policy/imitation/discrete_bcq.py b/tianshou/policy/imitation/discrete_bcq.py index 610b164f1..5d7082243 100644 --- a/tianshou/policy/imitation/discrete_bcq.py +++ b/tianshou/policy/imitation/discrete_bcq.py @@ -17,7 +17,7 @@ class DiscreteBCQPolicy(DQNPolicy): :class:`~tianshou.policy.BasePolicy`. (s -> imtation_logits) :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. - :param int estimation_step: greater than 1, the number of steps to look ahead. + :param int estimation_step: the number of steps to look ahead. Default to 1. :param int target_update_freq: the target network update frequency. :param float eval_eps: the epsilon-greedy noise added in evaluation. :param float unlikely_action_threshold: the threshold (tau) for unlikely diff --git a/tianshou/policy/modelfree/c51.py b/tianshou/policy/modelfree/c51.py index eb24f0eb8..20ef89c1a 100644 --- a/tianshou/policy/modelfree/c51.py +++ b/tianshou/policy/modelfree/c51.py @@ -19,9 +19,9 @@ class C51Policy(DQNPolicy): Default to -10.0. :param float v_max: the value of the largest atom in the support set. Default to 10.0. - :param int estimation_step: greater than 1, the number of steps to look ahead. 
+ :param int estimation_step: the number of steps to look ahead. Default to 1. :param int target_update_freq: the target network update frequency (0 if - you do not use the target network). + you do not use the target network). Default to 0. :param bool reward_normalization: normalize the reward to Normal(0, 1). Default to False. diff --git a/tianshou/policy/modelfree/ddpg.py b/tianshou/policy/modelfree/ddpg.py index c858b29e2..9a4dad062 100644 --- a/tianshou/policy/modelfree/ddpg.py +++ b/tianshou/policy/modelfree/ddpg.py @@ -24,7 +24,7 @@ class DDPGPolicy(BasePolicy): add to the action. Default to ``GaussianNoise(sigma=0.1)``. :param bool reward_normalization: normalize the reward to Normal(0, 1), Default to False. - :param int estimation_step: greater than 1, the number of steps to look ahead. + :param int estimation_step: the number of steps to look ahead. Default to 1. .. seealso:: diff --git a/tianshou/policy/modelfree/dqn.py b/tianshou/policy/modelfree/dqn.py index a4ad772fb..bd1fea14a 100644 --- a/tianshou/policy/modelfree/dqn.py +++ b/tianshou/policy/modelfree/dqn.py @@ -19,9 +19,9 @@ class DQNPolicy(BasePolicy): :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. - :param int estimation_step: greater than 1, the number of steps to look ahead. + :param int estimation_step: the number of steps to look ahead. Default to 1. :param int target_update_freq: the target network update frequency (0 if - you do not use the target network). + you do not use the target network). Default to 0. :param bool reward_normalization: normalize the reward to Normal(0, 1). Default to False. diff --git a/tianshou/policy/modelfree/qrdqn.py b/tianshou/policy/modelfree/qrdqn.py index ffc93aeee..7e154e7f7 100644 --- a/tianshou/policy/modelfree/qrdqn.py +++ b/tianshou/policy/modelfree/qrdqn.py @@ -17,7 +17,7 @@ class QRDQNPolicy(DQNPolicy): :param float discount_factor: in [0, 1]. :param int num_quantiles: the number of quantile midpoints in the inverse cumulative distribution function of the value. Default to 200. - :param int estimation_step: greater than 1, the number of steps to look ahead. + :param int estimation_step: the number of steps to look ahead. Default to 1. :param int target_update_freq: the target network update frequency (0 if you do not use the target network). :param bool reward_normalization: normalize the reward to Normal(0, 1).
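
Below is a minimal usage sketch (not part of the patch) showing how the parameters documented in the docstrings above fit together, assuming the tianshou API as it exists in this patch series; the network shapes and hyperparameter values are illustrative assumptions only:

import torch
from tianshou.policy import DQNPolicy
from tianshou.utils.net.common import Net

# Illustrative shapes for a small discrete-action task.
state_shape, action_shape = (4,), 2
net = Net(state_shape, action_shape, hidden_sizes=[64, 64])
optim = torch.optim.Adam(net.parameters(), lr=1e-3)
policy = DQNPolicy(
    net, optim,
    discount_factor=0.99,
    estimation_step=3,           # number of steps to look ahead (default 1)
    target_update_freq=320,      # 0 disables the target network (default 0)
    reward_normalization=False,  # must stay False when n-step returns are
                                 # computed, per the assert in compute_nstep_return
)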