diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index c8d5838a8..ac52b1979 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -8,7 +8,7 @@ jobs:
     if: "!contains(github.event.head_commit.message, 'ci skip')"
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.6, 3.7, 3.8, 3.9]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 0be924025..fb2fdd237 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,7 +1,8 @@
 gym
-tqdm
-torch
 numba
-tensorboard
+numpy>=1.20
 sphinx<4
 sphinxcontrib-bibtex
+tensorboard
+torch
+tqdm
diff --git a/test/continuous/test_ppo.py b/test/continuous/test_ppo.py
index 6ab4717d6..98d69a269 100644
--- a/test/continuous/test_ppo.py
+++ b/test/continuous/test_ppo.py
@@ -81,12 +81,12 @@ def test_ppo(args=get_args()):
         args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device
     ), device=args.device).to(args.device)
     # orthogonal initialization
-    for m in list(actor.modules()) + list(critic.modules()):
+    for m in set(actor.modules()).union(critic.modules()):
         if isinstance(m, torch.nn.Linear):
             torch.nn.init.orthogonal_(m.weight)
             torch.nn.init.zeros_(m.bias)
     optim = torch.optim.Adam(
-        list(actor.parameters()) + list(critic.parameters()), lr=args.lr)
+        set(actor.parameters()).union(critic.parameters()), lr=args.lr)
 
     # replace DiagGuassian with Independent(Normal) which is equivalent
     # pass *logits to be consistent with policy.forward
diff --git a/test/discrete/test_a2c_with_il.py b/test/discrete/test_a2c_with_il.py
index 9714219e9..fde783da5 100644
--- a/test/discrete/test_a2c_with_il.py
+++ b/test/discrete/test_a2c_with_il.py
@@ -75,7 +75,7 @@ def test_a2c_with_il(args=get_args()):
     actor = Actor(net, args.action_shape, device=args.device).to(args.device)
     critic = Critic(net, device=args.device).to(args.device)
     optim = torch.optim.Adam(
-        list(actor.parameters()) + list(critic.parameters()), lr=args.lr)
+        set(actor.parameters()).union(critic.parameters()), lr=args.lr)
     dist = torch.distributions.Categorical
     policy = A2CPolicy(
         actor, critic, optim, dist,
diff --git a/test/discrete/test_ppo.py b/test/discrete/test_ppo.py
index ee63b9b2a..2cf6d49ca 100644
--- a/test/discrete/test_ppo.py
+++ b/test/discrete/test_ppo.py
@@ -72,12 +72,12 @@ def test_ppo(args=get_args()):
     actor = Actor(net, args.action_shape, device=args.device).to(args.device)
     critic = Critic(net, device=args.device).to(args.device)
     # orthogonal initialization
-    for m in list(actor.modules()) + list(critic.modules()):
+    for m in set(actor.modules()).union(critic.modules()):
         if isinstance(m, torch.nn.Linear):
             torch.nn.init.orthogonal_(m.weight)
             torch.nn.init.zeros_(m.bias)
     optim = torch.optim.Adam(
-        list(actor.parameters()) + list(critic.parameters()), lr=args.lr)
+        set(actor.parameters()).union(critic.parameters()), lr=args.lr)
     dist = torch.distributions.Categorical
     policy = PPOPolicy(
         actor, critic, optim, dist,
diff --git a/test/discrete/test_qrdqn.py b/test/discrete/test_qrdqn.py
index 956099bb1..f6bf5ae6b 100644
--- a/test/discrete/test_qrdqn.py
+++ b/test/discrete/test_qrdqn.py
@@ -18,7 +18,7 @@ def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--task', type=str, default='CartPole-v0')
-    parser.add_argument('--seed', type=int, default=0)
+    parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--eps-test', type=float, default=0.05)
     parser.add_argument('--eps-train', type=float, default=0.1)
     parser.add_argument('--buffer-size', type=int, default=20000)
diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py
index 75504b407..3e05ce0b6 100644
--- a/tianshou/policy/modelfree/a2c.py
+++ b/tianshou/policy/modelfree/a2c.py
@@ -129,7 +129,7 @@ def learn(  # type: ignore
                 loss.backward()
                 if self._grad_norm:  # clip large gradient
                     nn.utils.clip_grad_norm_(
-                        list(self.actor.parameters()) + list(self.critic.parameters()),
+                        set(self.actor.parameters()).union(self.critic.parameters()),
                         max_norm=self._grad_norm)
                 self.optim.step()
                 actor_losses.append(actor_loss.item())
diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py
index c7aed4d8a..aaf34bd3e 100644
--- a/tianshou/policy/modelfree/ppo.py
+++ b/tianshou/policy/modelfree/ppo.py
@@ -139,7 +139,7 @@ def learn(  # type: ignore
                 loss.backward()
                 if self._grad_norm:  # clip large gradient
                     nn.utils.clip_grad_norm_(
-                        list(self.actor.parameters()) + list(self.critic.parameters()),
+                        set(self.actor.parameters()).union(self.critic.parameters()),
                         max_norm=self._grad_norm)
                 self.optim.step()
                 clip_losses.append(clip_loss.item())
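
Note: the list-concatenation to set-union change above matters when the actor and critic share a backbone (as in test/discrete/test_a2c_with_il.py, where both heads wrap the same `net`): concatenating the two parameter lists hands the shared tensors to the optimizer and to the gradient clipper twice, while the set union keeps each parameter tensor once. Below is a minimal standalone sketch of that effect; the toy `body`/`actor`/`critic` modules are hypothetical stand-ins, not Tianshou's actual networks.

    import torch
    import torch.nn as nn

    # Hypothetical toy setup: both heads reuse the same `body` module.
    body = nn.Linear(4, 64)
    actor = nn.Sequential(body, nn.Linear(64, 2))
    critic = nn.Sequential(body, nn.Linear(64, 1))

    concatenated = list(actor.parameters()) + list(critic.parameters())
    deduplicated = set(actor.parameters()).union(critic.parameters())

    print(len(concatenated))  # 8 -- the shared body's weight and bias appear twice
    print(len(deduplicated))  # 6 -- each parameter tensor counted once

    # torch.optim.Adam and nn.utils.clip_grad_norm_ both accept an iterable of
    # parameters, so the deduplicated collection can be passed directly.
    optim = torch.optim.Adam(deduplicated, lr=1e-3)

One trade-off worth noting: iterating a Python set is not order-stable across interpreter runs, so the exact parameter ordering the optimizer sees may vary between runs, which can matter for strict reproducibility.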