From de13d5d30d30d20ca209627a6d858add414c258d Mon Sep 17 00:00:00 2001
From: Jiayi Weng
Date: Mon, 4 Oct 2021 09:23:12 -0400
Subject: [PATCH 1/3] support envpool async api

---
 tianshou/data/collector.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tianshou/data/collector.py b/tianshou/data/collector.py
index d52cc511d..b52988f5d 100644
--- a/tianshou/data/collector.py
+++ b/tianshou/data/collector.py
@@ -338,7 +338,7 @@ def __init__(
         preprocess_fn: Optional[Callable[..., Batch]] = None,
         exploration_noise: bool = False,
     ) -> None:
-        assert env.is_async
+        # assert env.is_async
         super().__init__(policy, env, buffer, preprocess_fn, exploration_noise)
 
     def reset_env(self) -> None:
@@ -452,7 +452,10 @@ def collect(
             obs_next, rew, done, info = result
 
             # change self.data here because ready_env_ids has changed
-            ready_env_ids = np.array([i["env_id"] for i in info])
+            try:
+                ready_env_ids = info["env_id"]
+            except TypeError:
+                ready_env_ids = np.array([i["env_id"] for i in info])
             self.data = whole_data[ready_env_ids]
 
             self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)

From ee57c5a9a2b04b0cdf74cbcf1cfa99306cf741b9 Mon Sep 17 00:00:00 2001
From: Jiayi Weng
Date: Mon, 4 Oct 2021 10:35:45 -0400
Subject: [PATCH 2/3] fix #449

---
 test/continuous/test_ppo.py       |  9 ++++-----
 test/discrete/test_a2c_with_il.py |  6 ++----
 test/discrete/test_ppo.py         |  9 ++++-----
 tianshou/policy/modelfree/a2c.py  |  5 +++--
 tianshou/policy/modelfree/ppo.py  |  3 +--
 tianshou/utils/net/common.py      | 16 ++++++++++++++++
 6 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/test/continuous/test_ppo.py b/test/continuous/test_ppo.py
index 473222816..9bfb4e324 100644
--- a/test/continuous/test_ppo.py
+++ b/test/continuous/test_ppo.py
@@ -13,7 +13,7 @@
 from tianshou.policy import PPOPolicy
 from tianshou.trainer import onpolicy_trainer
 from tianshou.utils import TensorboardLogger
-from tianshou.utils.net.common import Net
+from tianshou.utils.net.common import ActorCritic, Net
 from tianshou.utils.net.continuous import ActorProb, Critic
 
 
@@ -84,14 +84,13 @@ def test_ppo(args=get_args()):
         Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device),
         device=args.device
     ).to(args.device)
+    actor_critic = ActorCritic(actor, critic)
     # orthogonal initialization
-    for m in set(actor.modules()).union(critic.modules()):
+    for m in actor_critic.modules():
         if isinstance(m, torch.nn.Linear):
             torch.nn.init.orthogonal_(m.weight)
             torch.nn.init.zeros_(m.bias)
-    optim = torch.optim.Adam(
-        set(actor.parameters()).union(critic.parameters()), lr=args.lr
-    )
+    optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)
 
     # replace DiagGuassian with Independent(Normal) which is equivalent
     # pass *logits to be consistent with policy.forward
diff --git a/test/discrete/test_a2c_with_il.py b/test/discrete/test_a2c_with_il.py
index 745295826..81f8bedde 100644
--- a/test/discrete/test_a2c_with_il.py
+++ b/test/discrete/test_a2c_with_il.py
@@ -12,7 +12,7 @@
 from tianshou.policy import A2CPolicy, ImitationPolicy
 from tianshou.trainer import offpolicy_trainer, onpolicy_trainer
 from tianshou.utils import TensorboardLogger
-from tianshou.utils.net.common import Net
+from tianshou.utils.net.common import ActorCritic, Net
 from tianshou.utils.net.discrete import Actor, Critic
 
 
@@ -74,9 +74,7 @@ def test_a2c_with_il(args=get_args()):
     net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
     actor = Actor(net, args.action_shape, device=args.device).to(args.device)
    critic = Critic(net, device=args.device).to(args.device)
-    optim = torch.optim.Adam(
-        set(actor.parameters()).union(critic.parameters()), lr=args.lr
-    )
+    optim = torch.optim.Adam(ActorCritic(actor, critic).parameters(), lr=args.lr)
     dist = torch.distributions.Categorical
     policy = A2CPolicy(
         actor,
diff --git a/test/discrete/test_ppo.py b/test/discrete/test_ppo.py
index 96650b14b..8c1469649 100644
--- a/test/discrete/test_ppo.py
+++ b/test/discrete/test_ppo.py
@@ -12,7 +12,7 @@
 from tianshou.policy import PPOPolicy
 from tianshou.trainer import onpolicy_trainer
 from tianshou.utils import TensorboardLogger
-from tianshou.utils.net.common import Net
+from tianshou.utils.net.common import ActorCritic, Net
 from tianshou.utils.net.discrete import Actor, Critic
 
 
@@ -73,14 +73,13 @@ def test_ppo(args=get_args()):
     net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
     actor = Actor(net, args.action_shape, device=args.device).to(args.device)
     critic = Critic(net, device=args.device).to(args.device)
+    actor_critic = ActorCritic(actor, critic)
     # orthogonal initialization
-    for m in set(actor.modules()).union(critic.modules()):
+    for m in actor_critic.modules():
         if isinstance(m, torch.nn.Linear):
             torch.nn.init.orthogonal_(m.weight)
             torch.nn.init.zeros_(m.bias)
-    optim = torch.optim.Adam(
-        set(actor.parameters()).union(critic.parameters()), lr=args.lr
-    )
+    optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)
     dist = torch.distributions.Categorical
     policy = PPOPolicy(
         actor,
diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py
index e44b58b58..f67d25d5a 100644
--- a/tianshou/policy/modelfree/a2c.py
+++ b/tianshou/policy/modelfree/a2c.py
@@ -7,6 +7,7 @@
 
 from tianshou.data import Batch, ReplayBuffer, to_torch_as
 from tianshou.policy import PGPolicy
+from tianshou.utils.net.common import ActorCritic
 
 
 class A2CPolicy(PGPolicy):
@@ -70,6 +71,7 @@ def __init__(
         self._weight_ent = ent_coef
         self._grad_norm = max_grad_norm
         self._batch = max_batchsize
+        self._actor_critic = ActorCritic(self.actor, self.critic)
 
     def process_fn(
         self, batch: Batch, buffer: ReplayBuffer, indices: np.ndarray
@@ -136,8 +138,7 @@ def learn(  # type: ignore
                 loss.backward()
                 if self._grad_norm:  # clip large gradient
                     nn.utils.clip_grad_norm_(
-                        set(self.actor.parameters()).union(self.critic.parameters()),
-                        max_norm=self._grad_norm
+                        self._actor_critic.parameters(), max_norm=self._grad_norm
                     )
                 self.optim.step()
                 actor_losses.append(actor_loss.item())
diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py
index e1e17aa2f..f27463fdf 100644
--- a/tianshou/policy/modelfree/ppo.py
+++ b/tianshou/policy/modelfree/ppo.py
@@ -140,8 +140,7 @@ def learn(  # type: ignore
                 loss.backward()
                 if self._grad_norm:  # clip large gradient
                     nn.utils.clip_grad_norm_(
-                        set(self.actor.parameters()).union(self.critic.parameters()),
-                        max_norm=self._grad_norm
+                        self._actor_critic.parameters(), max_norm=self._grad_norm
                     )
                 self.optim.step()
                 clip_losses.append(clip_loss.item())
diff --git a/tianshou/utils/net/common.py b/tianshou/utils/net/common.py
index b518a54a7..5a84efa18 100644
--- a/tianshou/utils/net/common.py
+++ b/tianshou/utils/net/common.py
@@ -262,3 +262,19 @@ def forward(
         s = self.fc2(s[:, -1])
         # please ensure the first dim is batch size: [bsz, len, ...]
         return s, {"h": h.transpose(0, 1).detach(), "c": c.transpose(0, 1).detach()}
+
+
+class ActorCritic(nn.Module):
+    """An actor-critic network for parsing parameters.
+
+    Using ``actor_critic.parameters()`` instead of set.union or list+list to avoid
+    issue #449.
+
+    :param nn.Module actor: the actor network.
+    :param nn.Module critic: the critic network.
+    """
+
+    def __init__(self, actor: nn.Module, critic: nn.Module) -> None:
+        super().__init__()
+        self.actor = actor
+        self.critic = critic

From 086453f9dcc85f08b6c62e5043df1b80df342972 Mon Sep 17 00:00:00 2001
From: Jiayi Weng
Date: Mon, 4 Oct 2021 10:54:39 -0400
Subject: [PATCH 3/3] fix ci

---
 tianshou/data/collector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tianshou/data/collector.py b/tianshou/data/collector.py
index b52988f5d..8aaef2d55 100644
--- a/tianshou/data/collector.py
+++ b/tianshou/data/collector.py
@@ -454,7 +454,7 @@ def collect(
             # change self.data here because ready_env_ids has changed
             try:
                 ready_env_ids = info["env_id"]
-            except TypeError:
+            except Exception:
                 ready_env_ids = np.array([i["env_id"] for i in info])
             self.data = whole_data[ready_env_ids]
 