From 8717ed489dcee56eb6213f9a4453cc86a58fc46e Mon Sep 17 00:00:00 2001
From: chy <308604256@qq.com>
Date: Sun, 21 Mar 2021 18:37:17 +0800
Subject: [PATCH 1/3] le_schedular

---
 tianshou/policy/modelfree/a2c.py | 2 ++
 tianshou/policy/modelfree/pg.py  | 7 +++++++
 tianshou/policy/modelfree/ppo.py | 2 ++
 3 files changed, 11 insertions(+)

diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py
index 79fb308de..8a9de707e 100644
--- a/tianshou/policy/modelfree/a2c.py
+++ b/tianshou/policy/modelfree/a2c.py
@@ -142,6 +142,8 @@ def learn(  # type: ignore
                 vf_losses.append(vf_loss.item())
                 ent_losses.append(ent_loss.item())
                 losses.append(loss.item())
+
+        self.update_lr_scheduler()
         return {
             "loss": losses,
             "loss/actor": actor_losses,
diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py
index 742423aa5..d92e33ab7 100644
--- a/tianshou/policy/modelfree/pg.py
+++ b/tianshou/policy/modelfree/pg.py
@@ -32,6 +32,7 @@ class PGPolicy(BasePolicy):
     def __init__(
         self,
         model: Optional[torch.nn.Module],
+        #TODO lack doc
         optim: torch.optim.Optimizer,
         dist_fn: Type[torch.distributions.Distribution],
         discount_factor: float = 0.99,
@@ -110,8 +111,14 @@ def learn(  # type: ignore
                 loss.backward()
                 self.optim.step()
                 losses.append(loss.item())
+
+        self.update_lr_scheduler()
         return {"loss": losses}
 
+    def update_lr_scheduler(self):
+        if hasattr(self.optim, "lr_scheduler"):
+            self.optim.lr_scheduler.step()
+
     # def _vanilla_returns(self, batch):
     #     returns = batch.rew[:]
     #     last = 0
diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py
index 8f96ce38c..7ad0bf3ad 100644
--- a/tianshou/policy/modelfree/ppo.py
+++ b/tianshou/policy/modelfree/ppo.py
@@ -179,6 +179,8 @@ def learn(  # type: ignore
                         list(self.actor.parameters()) + list(self.critic.parameters()),
                         self._max_grad_norm)
                 self.optim.step()
+
+        self.update_lr_scheduler()
         return {
             "loss": losses,
             "loss/clip": clip_losses,

From e53adf6bd50152324c04e75fa8f4ba2e10319858 Mon Sep 17 00:00:00 2001
From: chy <308604256@qq.com>
Date: Mon, 22 Mar 2021 15:07:51 +0800
Subject: [PATCH 2/3] change how to use

# update learning rate if given lr_scheduler
---
 tianshou/policy/modelfree/a2c.py |  4 +++-
 tianshou/policy/modelfree/pg.py  | 12 +++++++-----
 tianshou/policy/modelfree/ppo.py |  4 +++-
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py
index 8a9de707e..11ed35d57 100644
--- a/tianshou/policy/modelfree/a2c.py
+++ b/tianshou/policy/modelfree/a2c.py
@@ -142,8 +142,10 @@ def learn(  # type: ignore
                 vf_losses.append(vf_loss.item())
                 ent_losses.append(ent_loss.item())
                 losses.append(loss.item())
+        # update learning rate if given lr_scheduler
+        if self.lr_scheduler is not None:
+            self.lr_scheduler.step()
 
-        self.update_lr_scheduler()
         return {
             "loss": losses,
             "loss/actor": actor_losses,
diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py
index d92e33ab7..014fe9a65 100644
--- a/tianshou/policy/modelfree/pg.py
+++ b/tianshou/policy/modelfree/pg.py
@@ -39,6 +39,8 @@ def __init__(
         reward_normalization: bool = False,
         action_scaling: bool = True,
         action_bound_method: str = "clip",
+        # TODO doc
+        lr_scheduler: Optional[torch.optim.lr_scheduler] = None,
         **kwargs: Any,
     ) -> None:
         super().__init__(action_scaling=action_scaling,
@@ -46,6 +48,7 @@ def __init__(
         if model is not None:
             self.model: torch.nn.Module = model
         self.optim = optim
+        self.lr_scheduler = lr_scheduler
         self.dist_fn = dist_fn
         assert 0.0 <= discount_factor <= 1.0, "discount factor should be in [0, 1]"
         self._gamma = discount_factor
@@ -111,13 +114,12 @@ def learn(  # type: ignore
                 loss.backward()
                 self.optim.step()
                 losses.append(loss.item())
-
-        self.update_lr_scheduler()
+        # update learning rate if given lr_scheduler
+        if self.lr_scheduler is not None:
+            self.lr_scheduler.step()
+
         return {"loss": losses}
 
-    def update_lr_scheduler(self):
-        if hasattr(self.optim, "lr_scheduler"):
-            self.optim.lr_scheduler.step()
 
     # def _vanilla_returns(self, batch):
     #     returns = batch.rew[:]
diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py
index 7ad0bf3ad..dbc35dc2d 100644
--- a/tianshou/policy/modelfree/ppo.py
+++ b/tianshou/policy/modelfree/ppo.py
@@ -179,8 +179,10 @@ def learn(  # type: ignore
                         list(self.actor.parameters()) + list(self.critic.parameters()),
                         self._max_grad_norm)
                 self.optim.step()
+        # update learning rate if given lr_scheduler
+        if self.lr_scheduler is not None:
+            self.lr_scheduler.step()
 
-        self.update_lr_scheduler()
         return {
             "loss": losses,
             "loss/clip": clip_losses,

From 334456ed906b7ea8d17027c610f7b09166d70293 Mon Sep 17 00:00:00 2001
From: Trinkle23897
Date: Mon, 22 Mar 2021 16:38:03 +0800
Subject: [PATCH 3/3] fix test

---
 tianshou/policy/modelfree/a2c.py | 4 +++-
 tianshou/policy/modelfree/pg.py  | 9 ++++-----
 tianshou/policy/modelfree/ppo.py | 4 +++-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py
index 11ed35d57..0abf62cd1 100644
--- a/tianshou/policy/modelfree/a2c.py
+++ b/tianshou/policy/modelfree/a2c.py
@@ -38,6 +38,8 @@ class A2CPolicy(PGPolicy):
         squashing) for now, or empty string for no bounding. Default to "clip".
     :param Optional[gym.Space] action_space: env's action space, mandatory if you want
         to use option "action_scaling" or "action_bound_method". Default to None.
+    :param lr_scheduler: a learning rate scheduler that adjusts the learning rate in
+        optimizer in each policy.update(). Default to None (no lr_scheduler).
 
     .. seealso::
 
@@ -142,7 +144,7 @@ def learn(  # type: ignore
                 vf_losses.append(vf_loss.item())
                 ent_losses.append(ent_loss.item())
                 losses.append(loss.item())
-        # update learning rate if given lr_scheduler
+        # update learning rate if lr_scheduler is given
         if self.lr_scheduler is not None:
             self.lr_scheduler.step()
 
diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py
index 014fe9a65..4333112b4 100644
--- a/tianshou/policy/modelfree/pg.py
+++ b/tianshou/policy/modelfree/pg.py
@@ -22,6 +22,8 @@ class PGPolicy(BasePolicy):
         squashing) for now, or empty string for no bounding. Default to "clip".
     :param Optional[gym.Space] action_space: env's action space, mandatory if you want
         to use option "action_scaling" or "action_bound_method". Default to None.
+    :param lr_scheduler: a learning rate scheduler that adjusts the learning rate in
+        optimizer in each policy.update(). Default to None (no lr_scheduler).
 
     .. seealso::
 
@@ -32,15 +34,13 @@ class PGPolicy(BasePolicy):
     def __init__(
         self,
         model: Optional[torch.nn.Module],
-        #TODO lack doc
         optim: torch.optim.Optimizer,
         dist_fn: Type[torch.distributions.Distribution],
         discount_factor: float = 0.99,
         reward_normalization: bool = False,
         action_scaling: bool = True,
         action_bound_method: str = "clip",
-        # TODO doc
-        lr_scheduler: Optional[torch.optim.lr_scheduler] = None,
+        lr_scheduler: Optional[torch.optim.lr_scheduler.LambdaLR] = None,
         **kwargs: Any,
     ) -> None:
         super().__init__(action_scaling=action_scaling,
@@ -114,13 +114,12 @@ def learn(  # type: ignore
                 loss.backward()
                 self.optim.step()
                 losses.append(loss.item())
-        # update learning rate if given lr_scheduler
+        # update learning rate if lr_scheduler is given
         if self.lr_scheduler is not None:
             self.lr_scheduler.step()
 
         return {"loss": losses}
 
-
     # def _vanilla_returns(self, batch):
     #     returns = batch.rew[:]
     #     last = 0
diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py
index dbc35dc2d..4d81dd6cd 100644
--- a/tianshou/policy/modelfree/ppo.py
+++ b/tianshou/policy/modelfree/ppo.py
@@ -43,6 +43,8 @@ class PPOPolicy(PGPolicy):
         squashing) for now, or empty string for no bounding. Default to "clip".
     :param Optional[gym.Space] action_space: env's action space, mandatory if you want
         to use option "action_scaling" or "action_bound_method". Default to None.
+    :param lr_scheduler: a learning rate scheduler that adjusts the learning rate in
+        optimizer in each policy.update(). Default to None (no lr_scheduler).
 
     .. seealso::
 
@@ -179,7 +181,7 @@ def learn(  # type: ignore
                         list(self.actor.parameters()) + list(self.critic.parameters()),
                         self._max_grad_norm)
                 self.optim.step()
-        # update learning rate if given lr_scheduler
+        # update learning rate if lr_scheduler is given
         if self.lr_scheduler is not None:
             self.lr_scheduler.step()
 
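Usage sketch (not part of the patches): with this series applied, a scheduler built on the
policy's optimizer can be passed via the new lr_scheduler argument and is stepped once at
the end of each policy.update(). The toy Actor network, the obs_dim/act_dim/max_update_num
values, and the linear decay schedule below are illustrative assumptions, not code from
this PR.

import torch
from torch.distributions import Categorical
from torch.optim.lr_scheduler import LambdaLR

from tianshou.policy import PGPolicy


class Actor(torch.nn.Module):
    """Toy obs -> action-probability network in the (output, state) convention PGPolicy expects."""

    def __init__(self, obs_dim: int, act_dim: int) -> None:
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(obs_dim, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, act_dim),
            torch.nn.Softmax(dim=-1),  # probabilities, so Categorical(probs) works as dist_fn
        )

    def forward(self, obs, state=None, info=None):
        probs = self.net(torch.as_tensor(obs, dtype=torch.float32))
        return probs, state


obs_dim, act_dim = 4, 2    # illustrative sizes (e.g. a CartPole-like task)
max_update_num = 1000      # assumed total number of policy.update() calls

actor = Actor(obs_dim, act_dim)
optim = torch.optim.Adam(actor.parameters(), lr=3e-4)
# decay the learning rate linearly to zero over max_update_num updates
lr_scheduler = LambdaLR(optim, lr_lambda=lambda n: 1.0 - n / max_update_num)

policy = PGPolicy(actor, optim, Categorical, lr_scheduler=lr_scheduler)
# each policy.update(...) now ends with a single lr_scheduler.step(), per the patches above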