From bc4d0bcf9990fab912ff0842efd96238a17cecdc Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Fri, 26 Feb 2021 15:36:59 +0800 Subject: [PATCH 01/13] remove rew_norm in offpolicy algorithm --- examples/mujoco/runnable/ant_v2_ddpg.py | 3 +-- examples/mujoco/runnable/ant_v2_td3.py | 3 +-- .../runnable/halfcheetahBullet_v0_sac.py | 3 +-- examples/mujoco/runnable/point_maze_td3.py | 3 +-- test/continuous/test_ddpg.py | 3 +-- test/continuous/test_sac_with_il.py | 3 +-- test/continuous/test_td3.py | 3 +-- test/discrete/test_sac.py | 1 - tianshou/policy/base.py | 18 +++++------------- tianshou/policy/modelfree/ddpg.py | 2 ++ tianshou/policy/modelfree/dqn.py | 2 ++ 11 files changed, 16 insertions(+), 28 deletions(-) diff --git a/examples/mujoco/runnable/ant_v2_ddpg.py b/examples/mujoco/runnable/ant_v2_ddpg.py index 53e9ac4d7..ce42434c0 100644 --- a/examples/mujoco/runnable/ant_v2_ddpg.py +++ b/examples/mujoco/runnable/ant_v2_ddpg.py @@ -72,8 +72,7 @@ def test_ddpg(args=get_args()): actor, actor_optim, critic, critic_optim, action_range=[env.action_space.low[0], env.action_space.high[0]], tau=args.tau, gamma=args.gamma, - exploration_noise=GaussianNoise(sigma=args.exploration_noise), - reward_normalization=True) + exploration_noise=GaussianNoise(sigma=args.exploration_noise)) # collector train_collector = Collector( policy, train_envs, diff --git a/examples/mujoco/runnable/ant_v2_td3.py b/examples/mujoco/runnable/ant_v2_td3.py index cbbd952f3..5e33c33da 100644 --- a/examples/mujoco/runnable/ant_v2_td3.py +++ b/examples/mujoco/runnable/ant_v2_td3.py @@ -80,8 +80,7 @@ def test_td3(args=get_args()): exploration_noise=GaussianNoise(sigma=args.exploration_noise), policy_noise=args.policy_noise, update_actor_freq=args.update_actor_freq, - noise_clip=args.noise_clip, - reward_normalization=True) + noise_clip=args.noise_clip) # collector train_collector = Collector( policy, train_envs, diff --git a/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py b/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py index db0ce6ec8..618492771 100644 --- a/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py +++ b/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py @@ -80,8 +80,7 @@ def test_sac(args=get_args()): policy = SACPolicy( actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, action_range=[env.action_space.low[0], env.action_space.high[0]], - tau=args.tau, gamma=args.gamma, alpha=args.alpha, - reward_normalization=True) + tau=args.tau, gamma=args.gamma, alpha=args.alpha) # collector train_collector = Collector( policy, train_envs, diff --git a/examples/mujoco/runnable/point_maze_td3.py b/examples/mujoco/runnable/point_maze_td3.py index ed2ce0efc..eda299244 100644 --- a/examples/mujoco/runnable/point_maze_td3.py +++ b/examples/mujoco/runnable/point_maze_td3.py @@ -85,8 +85,7 @@ def test_td3(args=get_args()): exploration_noise=GaussianNoise(sigma=args.exploration_noise), policy_noise=args.policy_noise, update_actor_freq=args.update_actor_freq, - noise_clip=args.noise_clip, - reward_normalization=True) + noise_clip=args.noise_clip) # collector train_collector = Collector( policy, train_envs, diff --git a/test/continuous/test_ddpg.py b/test/continuous/test_ddpg.py index 232eef17c..6d7020ad4 100644 --- a/test/continuous/test_ddpg.py +++ b/test/continuous/test_ddpg.py @@ -37,8 +37,7 @@ def get_args(): parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument('--rew-norm', type=int, default=1) - parser.add_argument('--ignore-done', type=int, default=1) + parser.add_argument('--rew-norm', type=int, default=0) parser.add_argument('--n-step', type=int, default=1) parser.add_argument( '--device', type=str, diff --git a/test/continuous/test_sac_with_il.py b/test/continuous/test_sac_with_il.py index 8d1842876..900c6e0c4 100644 --- a/test/continuous/test_sac_with_il.py +++ b/test/continuous/test_sac_with_il.py @@ -40,8 +40,7 @@ def get_args(): parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--rew-norm', type=int, default=1) - parser.add_argument('--ignore-done', type=int, default=1) + parser.add_argument('--rew-norm', type=int, default=0) parser.add_argument('--n-step', type=int, default=4) parser.add_argument( '--device', type=str, diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index c24741c3c..340cb261d 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -40,8 +40,7 @@ def get_args(): parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--rew-norm', type=int, default=1) - parser.add_argument('--ignore-done', type=int, default=1) + parser.add_argument('--rew-norm', type=int, default=0) parser.add_argument('--n-step', type=int, default=1) parser.add_argument( '--device', type=str, diff --git a/test/discrete/test_sac.py b/test/discrete/test_sac.py index b5871f66a..465331a99 100644 --- a/test/discrete/test_sac.py +++ b/test/discrete/test_sac.py @@ -39,7 +39,6 @@ def get_args(): parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.0) parser.add_argument('--rew-norm', type=int, default=0) - parser.add_argument('--ignore-done', type=int, default=0) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index cf2678f74..28e1200fc 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -286,15 +286,10 @@ def compute_nstep_return( :return: a Batch. The result will be stored in batch.returns as a torch.Tensor with the same shape as target_q_fn's return tensor. 
""" + assert rew_norm == False, ( + "Reward normalization in computing n-step return is unsupported for now.") rew = buffer.rew bsz = len(indice) - if rew_norm: # TODO: remove it or fix this bug - bfr = rew[:min(len(buffer), 1000)] # avoid large buffer - mean, std = bfr.mean(), bfr.std() - if np.isclose(std, 0, 1e-2): - mean, std = 0.0, 1.0 - else: - mean, std = 0.0, 1.0 indices = [indice] for _ in range(n_step - 1): indices.append(buffer.next(indices[-1])) @@ -308,8 +303,7 @@ def compute_nstep_return( target_q = target_q * BasePolicy.value_mask(buffer, terminal).reshape(-1, 1) end_flag = buffer.done.copy() end_flag[buffer.unfinished_index()] = True - target_q = _nstep_return(rew, end_flag, target_q, - indices, gamma, n_step, mean, std) + target_q = _nstep_return(rew, end_flag, target_q, indices, gamma, n_step) batch.returns = to_torch_as(target_q, target_q_torch) if hasattr(batch, "weight"): # prio buffer update @@ -325,7 +319,7 @@ def _compile(self) -> None: _gae_return(f32, f32, f64, b, 0.1, 0.1) _episodic_return(f64, f64, b, 0.1, 0.1) _episodic_return(f32, f64, b, 0.1, 0.1) - _nstep_return(f64, b, f32.reshape(-1, 1), i64, 0.1, 1, 0.0, 1.0) + _nstep_return(f64, b, f32.reshape(-1, 1), i64, 0.1, 1) @njit @@ -368,8 +362,6 @@ def _nstep_return( indices: np.ndarray, gamma: float, n_step: int, - mean: float, - std: float, ) -> np.ndarray: gamma_buffer = np.ones(n_step + 1) for i in range(1, n_step + 1): @@ -384,6 +376,6 @@ def _nstep_return( now = indices[n] gammas[end_flag[now] > 0] = n + 1 returns[end_flag[now] > 0] = 0.0 - returns = (rew[now].reshape(bsz, 1) - mean) / std + gamma * returns + returns = rew[now].reshape(bsz, 1) + gamma * returns target_q = target_q * gamma_buffer[gammas].reshape(bsz, 1) + returns return target_q.reshape(target_shape) diff --git a/tianshou/policy/modelfree/ddpg.py b/tianshou/policy/modelfree/ddpg.py index d91359a2f..fe1a77f9c 100644 --- a/tianshou/policy/modelfree/ddpg.py +++ b/tianshou/policy/modelfree/ddpg.py @@ -71,6 +71,8 @@ def __init__( # it is only a little difference to use GaussianNoise # self.noise = OUNoise() self._rew_norm = reward_normalization + assert self._rew_norm == False, ( + "Reward normalization in offpolicy algorithm is unsupported for now.") assert estimation_step > 0, "estimation_step should be greater than 0" self._n_step = estimation_step diff --git a/tianshou/policy/modelfree/dqn.py b/tianshou/policy/modelfree/dqn.py index e79ff3206..6466ec568 100644 --- a/tianshou/policy/modelfree/dqn.py +++ b/tianshou/policy/modelfree/dqn.py @@ -57,6 +57,8 @@ def __init__( self.model_old = deepcopy(self.model) self.model_old.eval() self._rew_norm = reward_normalization + assert self._rew_norm == False, ( + "Reward normalization in offpolicy algorithm is unsupported for now.") def set_eps(self, eps: float) -> None: """Set the eps for epsilon-greedy exploration.""" From 6c75da73db89467fa0a0410e1dd006da4ff903a3 Mon Sep 17 00:00:00 2001 From: chy <308604256@qq.com> Date: Fri, 26 Feb 2021 15:43:39 +0800 Subject: [PATCH 02/13] all --- tianshou/policy/base.py | 2 +- tianshou/policy/modelfree/ddpg.py | 2 +- tianshou/policy/modelfree/dqn.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index 28e1200fc..7e910996b 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -286,7 +286,7 @@ def compute_nstep_return( :return: a Batch. The result will be stored in batch.returns as a torch.Tensor with the same shape as target_q_fn's return tensor. 
""" - assert rew_norm == False, ( + assert not rew_norm, ( "Reward normalization in computing n-step return is unsupported for now.") rew = buffer.rew bsz = len(indice) diff --git a/tianshou/policy/modelfree/ddpg.py b/tianshou/policy/modelfree/ddpg.py index fe1a77f9c..90068841f 100644 --- a/tianshou/policy/modelfree/ddpg.py +++ b/tianshou/policy/modelfree/ddpg.py @@ -71,7 +71,7 @@ def __init__( # it is only a little difference to use GaussianNoise # self.noise = OUNoise() self._rew_norm = reward_normalization - assert self._rew_norm == False, ( + assert not self._rew_norm, ( "Reward normalization in offpolicy algorithm is unsupported for now.") assert estimation_step > 0, "estimation_step should be greater than 0" self._n_step = estimation_step diff --git a/tianshou/policy/modelfree/dqn.py b/tianshou/policy/modelfree/dqn.py index 6466ec568..2ee42275f 100644 --- a/tianshou/policy/modelfree/dqn.py +++ b/tianshou/policy/modelfree/dqn.py @@ -57,7 +57,7 @@ def __init__( self.model_old = deepcopy(self.model) self.model_old.eval() self._rew_norm = reward_normalization - assert self._rew_norm == False, ( + assert not self._rew_norm, ( "Reward normalization in offpolicy algorithm is unsupported for now.") def set_eps(self, eps: float) -> None: From f5f9b24f7c5746624f9f72cc11637ef401bb4176 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 21:15:07 +0800 Subject: [PATCH 03/13] defaults -> Default --- tianshou/policy/imitation/base.py | 7 ++- tianshou/policy/imitation/discrete_bcq.py | 23 ++++----- tianshou/policy/modelbase/psrl.py | 2 +- tianshou/policy/modelfree/a2c.py | 39 +++++++------- tianshou/policy/modelfree/c51.py | 17 +++--- tianshou/policy/modelfree/ddpg.py | 22 +++----- tianshou/policy/modelfree/discrete_sac.py | 17 +++--- tianshou/policy/modelfree/dqn.py | 13 ++--- tianshou/policy/modelfree/pg.py | 14 +++-- tianshou/policy/modelfree/ppo.py | 63 ++++++++++------------- tianshou/policy/modelfree/qrdqn.py | 13 ++--- tianshou/policy/modelfree/sac.py | 40 ++++++-------- tianshou/policy/modelfree/td3.py | 46 +++++++---------- tianshou/utils/net/common.py | 10 ++-- 14 files changed, 134 insertions(+), 192 deletions(-) diff --git a/tianshou/policy/imitation/base.py b/tianshou/policy/imitation/base.py index 954bc81f6..a618dd480 100644 --- a/tianshou/policy/imitation/base.py +++ b/tianshou/policy/imitation/base.py @@ -14,7 +14,7 @@ class ImitationPolicy(BasePolicy): :class:`~tianshou.policy.BasePolicy`. (s -> a) :param torch.optim.Optimizer optim: for optimizing the model. :param str mode: indicate the imitation type ("continuous" or "discrete" - action space), defaults to "continuous". + action space). Default to "continuous". .. seealso:: @@ -32,9 +32,8 @@ def __init__( super().__init__(**kwargs) self.model = model self.optim = optim - assert ( - mode in ["continuous", "discrete"] - ), f"Mode {mode} is not in ['continuous', 'discrete']." + assert mode in ["continuous", "discrete"], \ + f"Mode {mode} is not in ['continuous', 'discrete']." self.mode = mode def forward( diff --git a/tianshou/policy/imitation/discrete_bcq.py b/tianshou/policy/imitation/discrete_bcq.py index 0061ea20f..610b164f1 100644 --- a/tianshou/policy/imitation/discrete_bcq.py +++ b/tianshou/policy/imitation/discrete_bcq.py @@ -17,16 +17,15 @@ class DiscreteBCQPolicy(DQNPolicy): :class:`~tianshou.policy.BasePolicy`. (s -> imtation_logits) :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. 
- :param int estimation_step: greater than 1, the number of steps to look - ahead. + :param int estimation_step: greater than 1, the number of steps to look ahead. :param int target_update_freq: the target network update frequency. :param float eval_eps: the epsilon-greedy noise added in evaluation. :param float unlikely_action_threshold: the threshold (tau) for unlikely - actions, as shown in Equ. (17) in the paper, defaults to 0.3. + actions, as shown in Equ. (17) in the paper. Default to 0.3. :param float imitation_logits_penalty: reguralization weight for imitation - logits, defaults to 1e-2. - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + logits. Default to 1e-2. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: @@ -52,9 +51,8 @@ def __init__( target_update_freq, reward_normalization, **kwargs) assert target_update_freq > 0, "BCQ needs target network setting." self.imitator = imitator - assert ( - 0.0 <= unlikely_action_threshold < 1.0 - ), "unlikely_action_threshold should be in [0, 1)" + assert 0.0 <= unlikely_action_threshold < 1.0, \ + "unlikely_action_threshold should be in [0, 1)" if unlikely_action_threshold > 0: self._log_tau = math.log(unlikely_action_threshold) else: @@ -69,9 +67,7 @@ def train(self, mode: bool = True) -> "DiscreteBCQPolicy": self.imitator.train(mode) return self - def _target_q( - self, buffer: ReplayBuffer, indice: np.ndarray - ) -> torch.Tensor: + def _target_q(self, buffer: ReplayBuffer, indice: np.ndarray) -> torch.Tensor: batch = buffer[indice] # batch.obs_next: s_{t+n} # target_Q = Q_old(s_, argmax(Q_new(s_, *))) act = self(batch, input="obs_next").act @@ -93,8 +89,7 @@ def forward( # type: ignore imitation_logits, _ = self.imitator(obs, state=state, info=batch.info) # mask actions for argmax - ratio = imitation_logits - imitation_logits.max( - dim=-1, keepdim=True).values + ratio = imitation_logits - imitation_logits.max(dim=-1, keepdim=True).values mask = (ratio < self._log_tau).float() action = (q_value - np.inf * mask).argmax(dim=-1) diff --git a/tianshou/policy/modelbase/psrl.py b/tianshou/policy/modelbase/psrl.py index dcf6a5d05..4a565976f 100644 --- a/tianshou/policy/modelbase/psrl.py +++ b/tianshou/policy/modelbase/psrl.py @@ -149,7 +149,7 @@ class PSRLPolicy(BasePolicy): :param float discount_factor: in [0, 1]. :param float epsilon: for precision control in value iteration. :param bool add_done_loop: whether to add an extra self-loop for the - terminal state in MDP, defaults to False. + terminal state in MDP. Default to False. .. seealso:: diff --git a/tianshou/policy/modelfree/a2c.py b/tianshou/policy/modelfree/a2c.py index 1c8edb300..f79682789 100644 --- a/tianshou/policy/modelfree/a2c.py +++ b/tianshou/policy/modelfree/a2c.py @@ -2,7 +2,7 @@ import numpy as np from torch import nn import torch.nn.functional as F -from typing import Any, Dict, List, Union, Optional, Callable +from typing import Any, Dict, List, Type, Union, Optional from tianshou.policy import PGPolicy from tianshou.data import Batch, ReplayBuffer, to_torch_as, to_numpy @@ -17,20 +17,20 @@ class A2CPolicy(PGPolicy): :param torch.optim.Optimizer optim: the optimizer for actor and critic network. :param dist_fn: distribution class for computing the action. - :type dist_fn: Callable[[], torch.distributions.Distribution] - :param float discount_factor: in [0, 1], defaults to 0.99. - :param float vf_coef: weight for value loss, defaults to 0.5. 
- :param float ent_coef: weight for entropy loss, defaults to 0.01. - :param float max_grad_norm: clipping gradients in back propagation, - defaults to None. + :type dist_fn: Type[torch.distributions.Distribution] + :param float discount_factor: in [0, 1]. Default to 0.99. + :param float vf_coef: weight for value loss. Default to 0.5. + :param float ent_coef: weight for entropy loss. Default to 0.01. + :param float max_grad_norm: clipping gradients in back propagation. + Default to None. :param float gae_lambda: in [0, 1], param for Generalized Advantage - Estimation, defaults to 0.95. - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + Estimation. Default to 0.95. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. :param int max_batchsize: the maximum size of the batch when computing GAE, depends on the size of available memory and the memory cost of the - model; should be as large as possible within the memory constraint; - defaults to 256. + model; should be as large as possible within the memory constraint. + Default to 256. .. seealso:: @@ -43,7 +43,7 @@ def __init__( actor: torch.nn.Module, critic: torch.nn.Module, optim: torch.optim.Optimizer, - dist_fn: Callable[[], torch.distributions.Distribution], + dist_fn: Type[torch.distributions.Distribution], discount_factor: float = 0.99, vf_coef: float = 0.5, ent_coef: float = 0.01, @@ -77,9 +77,8 @@ def process_fn( v_.append(to_numpy(self.critic(b.obs_next))) v_ = np.concatenate(v_, axis=0) return self.compute_episodic_return( - batch, buffer, indice, - v_, gamma=self._gamma, - gae_lambda=self._lambda, rew_norm=self._rew_norm) + batch, buffer, indice, v_, + gamma=self._gamma, gae_lambda=self._lambda, rew_norm=self._rew_norm) def forward( self, @@ -105,7 +104,7 @@ def forward( if isinstance(logits, tuple): dist = self.dist_fn(*logits) else: - dist = self.dist_fn(logits) # type: ignore + dist = self.dist_fn(logits) act = dist.sample() return Batch(logits=logits, act=act, state=h, dist=dist) @@ -124,13 +123,11 @@ def learn( # type: ignore a_loss = -(log_prob * (r - v).detach()).mean() vf_loss = F.mse_loss(r, v) # type: ignore ent_loss = dist.entropy().mean() - loss = a_loss + self._weight_vf * vf_loss - \ - self._weight_ent * ent_loss + loss = a_loss + self._weight_vf * vf_loss - self._weight_ent * ent_loss loss.backward() if self._grad_norm is not None: nn.utils.clip_grad_norm_( - list(self.actor.parameters()) - + list(self.critic.parameters()), + list(self.actor.parameters()) + list(self.critic.parameters()), max_norm=self._grad_norm, ) self.optim.step() diff --git a/tianshou/policy/modelfree/c51.py b/tianshou/policy/modelfree/c51.py index dce2112a8..eb24f0eb8 100644 --- a/tianshou/policy/modelfree/c51.py +++ b/tianshou/policy/modelfree/c51.py @@ -14,17 +14,16 @@ class C51Policy(DQNPolicy): :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. :param int num_atoms: the number of atoms in the support set of the - value distribution, defaults to 51. - :param float v_min: the value of the smallest atom in the support set, - defaults to -10.0. - :param float v_max: the value of the largest atom in the support set, - defaults to 10.0. - :param int estimation_step: greater than 1, the number of steps to look - ahead. + value distribution. Default to 51. + :param float v_min: the value of the smallest atom in the support set. + Default to -10.0. 
+ :param float v_max: the value of the largest atom in the support set. + Default to 10.0. + :param int estimation_step: greater than 1, the number of steps to look ahead. :param int target_update_freq: the target network update frequency (0 if you do not use the target network). - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: diff --git a/tianshou/policy/modelfree/ddpg.py b/tianshou/policy/modelfree/ddpg.py index 90068841f..c858b29e2 100644 --- a/tianshou/policy/modelfree/ddpg.py +++ b/tianshou/policy/modelfree/ddpg.py @@ -15,19 +15,16 @@ class DDPGPolicy(BasePolicy): :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer actor_optim: the optimizer for actor network. :param torch.nn.Module critic: the critic network. (s, a -> Q(s, a)) - :param torch.optim.Optimizer critic_optim: the optimizer for critic - network. + :param torch.optim.Optimizer critic_optim: the optimizer for critic network. :param action_range: the action range (minimum, maximum). :type action_range: Tuple[float, float] - :param float tau: param for soft update of the target network, defaults to - 0.005. - :param float gamma: discount factor, in [0, 1], defaults to 0.99. + :param float tau: param for soft update of the target network. Default to 0.005. + :param float gamma: discount factor, in [0, 1]. Default to 0.99. :param BaseNoise exploration_noise: the exploration noise, - add to the action, defaults to ``GaussianNoise(sigma=0.1)``. + add to the action. Default to ``GaussianNoise(sigma=0.1)``. :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. - :param int estimation_step: greater than 1, the number of steps to look - ahead. + Default to False. + :param int estimation_step: greater than 1, the number of steps to look ahead. .. seealso:: @@ -71,9 +68,6 @@ def __init__( # it is only a little difference to use GaussianNoise # self.noise = OUNoise() self._rew_norm = reward_normalization - assert not self._rew_norm, ( - "Reward normalization in offpolicy algorithm is unsupported for now.") - assert estimation_step > 0, "estimation_step should be greater than 0" self._n_step = estimation_step def set_exp_noise(self, noise: Optional[BaseNoise]) -> None: @@ -91,9 +85,7 @@ def sync_weight(self) -> None: """Soft-update the weight for the target network.""" for o, n in zip(self.actor_old.parameters(), self.actor.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) - for o, n in zip( - self.critic_old.parameters(), self.critic.parameters() - ): + for o, n in zip(self.critic_old.parameters(), self.critic.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) def _target_q( diff --git a/tianshou/policy/modelfree/discrete_sac.py b/tianshou/policy/modelfree/discrete_sac.py index fd67d4738..a53bbbbf8 100644 --- a/tianshou/policy/modelfree/discrete_sac.py +++ b/tianshou/policy/modelfree/discrete_sac.py @@ -19,15 +19,14 @@ class DiscreteSACPolicy(SACPolicy): :param torch.nn.Module critic2: the second critic network. (s -> Q(s)) :param torch.optim.Optimizer critic2_optim: the optimizer for the second critic network. - :param float tau: param for soft update of the target network, defaults to - 0.005. - :param float gamma: discount factor, in [0, 1], defaults to 0.99. + :param float tau: param for soft update of the target network. Default to 0.005. 
+ :param float gamma: discount factor, in [0, 1]. Default to 0.99. :param (float, torch.Tensor, torch.optim.Optimizer) or float alpha: entropy - regularization coefficient, default to 0.2. - If a tuple (target_entropy, log_alpha, alpha_optim) is provided, then + regularization coefficient. Default to 0.2. + If a tuple (target_entropy, log_alpha, alpha_optim) is provided, the alpha is automatatically tuned. - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to ``False``. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: @@ -45,9 +44,7 @@ def __init__( critic2_optim: torch.optim.Optimizer, tau: float = 0.005, gamma: float = 0.99, - alpha: Union[ - float, Tuple[float, torch.Tensor, torch.optim.Optimizer] - ] = 0.2, + alpha: Union[float, Tuple[float, torch.Tensor, torch.optim.Optimizer]] = 0.2, reward_normalization: bool = False, estimation_step: int = 1, **kwargs: Any, diff --git a/tianshou/policy/modelfree/dqn.py b/tianshou/policy/modelfree/dqn.py index 2ee42275f..a4ad772fb 100644 --- a/tianshou/policy/modelfree/dqn.py +++ b/tianshou/policy/modelfree/dqn.py @@ -19,12 +19,11 @@ class DQNPolicy(BasePolicy): :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. - :param int estimation_step: greater than 1, the number of steps to look - ahead. + :param int estimation_step: greater than 1, the number of steps to look ahead. :param int target_update_freq: the target network update frequency (0 if you do not use the target network). - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: @@ -57,8 +56,6 @@ def __init__( self.model_old = deepcopy(self.model) self.model_old.eval() self._rew_norm = reward_normalization - assert not self._rew_norm, ( - "Reward normalization in offpolicy algorithm is unsupported for now.") def set_eps(self, eps: float) -> None: """Set the eps for epsilon-greedy exploration.""" @@ -74,9 +71,7 @@ def sync_weight(self) -> None: """Synchronize the weight for the target network.""" self.model_old.load_state_dict(self.model.state_dict()) - def _target_q( - self, buffer: ReplayBuffer, indice: np.ndarray - ) -> torch.Tensor: + def _target_q(self, buffer: ReplayBuffer, indice: np.ndarray) -> torch.Tensor: batch = buffer[indice] # batch.obs_next: s_{t+n} # target_Q = Q_old(s_, argmax(Q_new(s_, *))) if self._target: diff --git a/tianshou/policy/modelfree/pg.py b/tianshou/policy/modelfree/pg.py index 82fb9f704..080ba70a2 100644 --- a/tianshou/policy/modelfree/pg.py +++ b/tianshou/policy/modelfree/pg.py @@ -1,6 +1,6 @@ import torch import numpy as np -from typing import Any, Dict, List, Union, Optional, Callable +from typing import Any, Dict, List, Type, Union, Optional from tianshou.policy import BasePolicy from tianshou.data import Batch, ReplayBuffer, to_torch_as @@ -13,8 +13,8 @@ class PGPolicy(BasePolicy): :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param dist_fn: distribution class for computing the action. - :type dist_fn: Callable[[], torch.distributions.Distribution] - :param float discount_factor: in [0, 1]. + :type dist_fn: Type[torch.distributions.Distribution] + :param float discount_factor: in [0, 1]. Default to 0.99. .. 
seealso:: @@ -26,7 +26,7 @@ def __init__( self, model: Optional[torch.nn.Module], optim: torch.optim.Optimizer, - dist_fn: Callable[[], torch.distributions.Distribution], + dist_fn: Type[torch.distributions.Distribution], discount_factor: float = 0.99, reward_normalization: bool = False, **kwargs: Any, @@ -36,9 +36,7 @@ def __init__( self.model: torch.nn.Module = model self.optim = optim self.dist_fn = dist_fn - assert ( - 0.0 <= discount_factor <= 1.0 - ), "discount factor should be in [0, 1]" + assert 0.0 <= discount_factor <= 1.0, "discount factor should be in [0, 1]" self._gamma = discount_factor self._rew_norm = reward_normalization @@ -83,7 +81,7 @@ def forward( if isinstance(logits, tuple): dist = self.dist_fn(*logits) else: - dist = self.dist_fn(logits) # type: ignore + dist = self.dist_fn(logits) act = dist.sample() return Batch(logits=logits, act=act, state=h, dist=dist) diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index 7fd6f1f26..953829195 100644 --- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -1,7 +1,7 @@ import torch import numpy as np from torch import nn -from typing import Any, Dict, List, Tuple, Union, Optional, Callable +from typing import Any, Dict, List, Type, Tuple, Union, Optional from tianshou.policy import PGPolicy from tianshou.data import Batch, ReplayBuffer, to_numpy, to_torch_as @@ -13,32 +13,31 @@ class PPOPolicy(PGPolicy): :param torch.nn.Module actor: the actor network following the rules in :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.nn.Module critic: the critic network. (s -> V(s)) - :param torch.optim.Optimizer optim: the optimizer for actor and critic - network. + :param torch.optim.Optimizer optim: the optimizer for actor and critic network. :param dist_fn: distribution class for computing the action. - :type dist_fn: Callable[[], torch.distributions.Distribution] - :param float discount_factor: in [0, 1], defaults to 0.99. - :param float max_grad_norm: clipping gradients in back propagation, - defaults to None. + :type dist_fn: Type[torch.distributions.Distribution] + :param float discount_factor: in [0, 1]. Default to 0.99. + :param float max_grad_norm: clipping gradients in back propagation. + Default to None. :param float eps_clip: :math:`\epsilon` in :math:`L_{CLIP}` in the original - paper, defaults to 0.2. - :param float vf_coef: weight for value loss, defaults to 0.5. - :param float ent_coef: weight for entropy loss, defaults to 0.01. + paper. Default to 0.2. + :param float vf_coef: weight for value loss. Default to 0.5. + :param float ent_coef: weight for entropy loss. Default to 0.01. :param action_range: the action range (minimum, maximum). :type action_range: (float, float) :param float gae_lambda: in [0, 1], param for Generalized Advantage - Estimation, defaults to 0.95. + Estimation. Default to 0.95. :param float dual_clip: a parameter c mentioned in arXiv:1912.09729 Equ. 5, - where c > 1 is a constant indicating the lower bound, - defaults to 5.0 (set ``None`` if you do not want to use it). - :param bool value_clip: a parameter mentioned in arXiv:1811.02553 Sec. 4.1, - defaults to True. - :param bool reward_normalization: normalize the returns to Normal(0, 1), - defaults to True. + where c > 1 is a constant indicating the lower bound. + Default to 5.0 (set None if you do not want to use it). + :param bool value_clip: a parameter mentioned in arXiv:1811.02553 Sec. 4.1. + Default to True. 
+ :param bool reward_normalization: normalize the returns to Normal(0, 1). + Default to True. :param int max_batchsize: the maximum size of the batch when computing GAE, depends on the size of available memory and the memory cost of the - model; should be as large as possible within the memory constraint; - defaults to 256. + model; should be as large as possible within the memory constraint. + Default to 256. .. seealso:: @@ -51,7 +50,7 @@ def __init__( actor: torch.nn.Module, critic: torch.nn.Module, optim: torch.optim.Optimizer, - dist_fn: Callable[[], torch.distributions.Distribution], + dist_fn: Type[torch.distributions.Distribution], discount_factor: float = 0.99, max_grad_norm: Optional[float] = None, eps_clip: float = 0.2, @@ -76,9 +75,8 @@ def __init__( self._batch = max_batchsize assert 0.0 <= gae_lambda <= 1.0, "GAE lambda should be in [0, 1]." self._lambda = gae_lambda - assert ( - dual_clip is None or dual_clip > 1.0 - ), "Dual-clip PPO parameter should greater than 1.0." + assert dual_clip is None or dual_clip > 1.0, \ + "Dual-clip PPO parameter should greater than 1.0." self._dual_clip = dual_clip self._value_clip = value_clip self._rew_norm = reward_normalization @@ -95,9 +93,7 @@ def process_fn( for b in batch.split(self._batch, shuffle=False, merge_last=True): v_.append(self.critic(b.obs_next)) v.append(self.critic(b.obs)) - old_log_prob.append( - self(b).dist.log_prob(to_torch_as(b.act, v[0])) - ) + old_log_prob.append(self(b).dist.log_prob(to_torch_as(b.act, v[0]))) v_ = to_numpy(torch.cat(v_, dim=0)) batch = self.compute_episodic_return( batch, buffer, indice, v_, gamma=self._gamma, @@ -137,7 +133,7 @@ def forward( if isinstance(logits, tuple): dist = self.dist_fn(*logits) else: - dist = self.dist_fn(logits) # type: ignore + dist = self.dist_fn(logits) act = dist.sample() if self._range: act = act.clamp(self._range[0], self._range[1]) @@ -154,8 +150,7 @@ def learn( # type: ignore ratio = (dist.log_prob(b.act) - b.logp_old).exp().float() ratio = ratio.reshape(ratio.size(0), -1).transpose(0, 1) surr1 = ratio * b.adv - surr2 = ratio.clamp(1.0 - self._eps_clip, - 1.0 + self._eps_clip) * b.adv + surr2 = ratio.clamp(1.0 - self._eps_clip, 1.0 + self._eps_clip) * b.adv if self._dual_clip: clip_loss = -torch.max( torch.min(surr1, surr2), self._dual_clip * b.adv @@ -164,8 +159,7 @@ def learn( # type: ignore clip_loss = -torch.min(surr1, surr2).mean() clip_losses.append(clip_loss.item()) if self._value_clip: - v_clip = b.v + (value - b.v).clamp( - -self._eps_clip, self._eps_clip) + v_clip = b.v + (value - b.v).clamp(-self._eps_clip, self._eps_clip) vf1 = (b.returns - value).pow(2) vf2 = (b.returns - v_clip).pow(2) vf_loss = 0.5 * torch.max(vf1, vf2).mean() @@ -174,15 +168,14 @@ def learn( # type: ignore vf_losses.append(vf_loss.item()) e_loss = dist.entropy().mean() ent_losses.append(e_loss.item()) - loss = clip_loss + self._weight_vf * vf_loss - \ - self._weight_ent * e_loss + loss = clip_loss + self._weight_vf * vf_loss \ + - self._weight_ent * e_loss losses.append(loss.item()) self.optim.zero_grad() loss.backward() if self._max_grad_norm: nn.utils.clip_grad_norm_( - list(self.actor.parameters()) - + list(self.critic.parameters()), + list(self.actor.parameters()) + list(self.critic.parameters()), self._max_grad_norm) self.optim.step() return { diff --git a/tianshou/policy/modelfree/qrdqn.py b/tianshou/policy/modelfree/qrdqn.py index 8816b6b1a..ffc93aeee 100644 --- a/tianshou/policy/modelfree/qrdqn.py +++ b/tianshou/policy/modelfree/qrdqn.py @@ -16,13 +16,12 @@ class 
QRDQNPolicy(DQNPolicy): :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. :param int num_quantiles: the number of quantile midpoints in the inverse - cumulative distribution function of the value, defaults to 200. - :param int estimation_step: greater than 1, the number of steps to look - ahead. + cumulative distribution function of the value. Default to 200. + :param int estimation_step: greater than 1, the number of steps to look ahead. :param int target_update_freq: the target network update frequency (0 if you do not use the target network). - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: @@ -50,9 +49,7 @@ def __init__( ((tau[:-1] + tau[1:]) / 2).view(1, -1, 1), requires_grad=False) warnings.filterwarnings("ignore", message="Using a target size") - def _target_q( - self, buffer: ReplayBuffer, indice: np.ndarray - ) -> torch.Tensor: + def _target_q(self, buffer: ReplayBuffer, indice: np.ndarray) -> torch.Tensor: batch = buffer[indice] # batch.obs_next: s_{t+n} if self._target: a = self(batch, input="obs_next").act diff --git a/tianshou/policy/modelfree/sac.py b/tianshou/policy/modelfree/sac.py index cb53fad7f..68bef3971 100644 --- a/tianshou/policy/modelfree/sac.py +++ b/tianshou/policy/modelfree/sac.py @@ -15,30 +15,27 @@ class SACPolicy(DDPGPolicy): :param torch.nn.Module actor: the actor network following the rules in :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer actor_optim: the optimizer for actor network. - :param torch.nn.Module critic1: the first critic network. (s, a -> Q(s, - a)) + :param torch.nn.Module critic1: the first critic network. (s, a -> Q(s, a)) :param torch.optim.Optimizer critic1_optim: the optimizer for the first critic network. - :param torch.nn.Module critic2: the second critic network. (s, a -> Q(s, - a)) + :param torch.nn.Module critic2: the second critic network. (s, a -> Q(s, a)) :param torch.optim.Optimizer critic2_optim: the optimizer for the second critic network. :param action_range: the action range (minimum, maximum). :type action_range: Tuple[float, float] - :param float tau: param for soft update of the target network, defaults to - 0.005. - :param float gamma: discount factor, in [0, 1], defaults to 0.99. + :param float tau: param for soft update of the target network. Default to 0.005. + :param float gamma: discount factor, in [0, 1]. Default to 0.99. :param (float, torch.Tensor, torch.optim.Optimizer) or float alpha: entropy - regularization coefficient, default to 0.2. + regularization coefficient. Default to 0.2. If a tuple (target_entropy, log_alpha, alpha_optim) is provided, then alpha is automatatically tuned. - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. - :param BaseNoise exploration_noise: add a noise to action for exploration, - defaults to None. This is useful when solving hard-exploration problem. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. + :param BaseNoise exploration_noise: add a noise to action for exploration. + Default to None. This is useful when solving hard-exploration problem. :param bool deterministic_eval: whether to use deterministic action (mean - of Gaussian policy) instead of stochastic action sampled by the policy, - defaults to True. 
+ of Gaussian policy) instead of stochastic action sampled by the policy. + Default to True. .. seealso:: @@ -57,9 +54,7 @@ def __init__( action_range: Tuple[float, float], tau: float = 0.005, gamma: float = 0.99, - alpha: Union[ - float, Tuple[float, torch.Tensor, torch.optim.Optimizer] - ] = 0.2, + alpha: Union[float, Tuple[float, torch.Tensor, torch.optim.Optimizer]] = 0.2, reward_normalization: bool = False, estimation_step: int = 1, exploration_noise: Optional[BaseNoise] = None, @@ -98,13 +93,9 @@ def train(self, mode: bool = True) -> "SACPolicy": return self def sync_weight(self) -> None: - for o, n in zip( - self.critic1_old.parameters(), self.critic1.parameters() - ): + for o, n in zip(self.critic1_old.parameters(), self.critic1.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) - for o, n in zip( - self.critic2_old.parameters(), self.critic2.parameters() - ): + for o, n in zip(self.critic2_old.parameters(), self.critic2.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) def forward( # type: ignore @@ -128,8 +119,7 @@ def forward( # type: ignore log_prob = dist.log_prob(x).unsqueeze(-1) log_prob = log_prob - torch.log(y).sum(-1, keepdim=True) - return Batch( - logits=logits, act=act, state=h, dist=dist, log_prob=log_prob) + return Batch(logits=logits, act=act, state=h, dist=dist, log_prob=log_prob) def _target_q( self, buffer: ReplayBuffer, indice: np.ndarray diff --git a/tianshou/policy/modelfree/td3.py b/tianshou/policy/modelfree/td3.py index 23e16d88a..bd6572205 100644 --- a/tianshou/policy/modelfree/td3.py +++ b/tianshou/policy/modelfree/td3.py @@ -14,29 +14,26 @@ class TD3Policy(DDPGPolicy): :param torch.nn.Module actor: the actor network following the rules in :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer actor_optim: the optimizer for actor network. - :param torch.nn.Module critic1: the first critic network. (s, a -> Q(s, - a)) + :param torch.nn.Module critic1: the first critic network. (s, a -> Q(s, a)) :param torch.optim.Optimizer critic1_optim: the optimizer for the first critic network. - :param torch.nn.Module critic2: the second critic network. (s, a -> Q(s, - a)) + :param torch.nn.Module critic2: the second critic network. (s, a -> Q(s, a)) :param torch.optim.Optimizer critic2_optim: the optimizer for the second critic network. :param action_range: the action range (minimum, maximum). :type action_range: Tuple[float, float] - :param float tau: param for soft update of the target network, defaults to - 0.005. - :param float gamma: discount factor, in [0, 1], defaults to 0.99. - :param float exploration_noise: the exploration noise, add to the action, - defaults to ``GaussianNoise(sigma=0.1)`` - :param float policy_noise: the noise used in updating policy network, - default to 0.2. - :param int update_actor_freq: the update frequency of actor network, - default to 2. - :param float noise_clip: the clipping range used in updating policy - network, default to 0.5. - :param bool reward_normalization: normalize the reward to Normal(0, 1), - defaults to False. + :param float tau: param for soft update of the target network. Default to 0.005. + :param float gamma: discount factor, in [0, 1]. Default to 0.99. + :param float exploration_noise: the exploration noise, add to the action. + Default to ``GaussianNoise(sigma=0.1)`` + :param float policy_noise: the noise used in updating policy network. + Default to 0.2. + :param int update_actor_freq: the update frequency of actor network. 
+ Default to 2. + :param float noise_clip: the clipping range used in updating policy network. + Default to 0.5. + :param bool reward_normalization: normalize the reward to Normal(0, 1). + Default to False. .. seealso:: @@ -88,18 +85,12 @@ def train(self, mode: bool = True) -> "TD3Policy": def sync_weight(self) -> None: for o, n in zip(self.actor_old.parameters(), self.actor.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) - for o, n in zip( - self.critic1_old.parameters(), self.critic1.parameters() - ): + for o, n in zip(self.critic1_old.parameters(), self.critic1.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) - for o, n in zip( - self.critic2_old.parameters(), self.critic2.parameters() - ): + for o, n in zip(self.critic2_old.parameters(), self.critic2.parameters()): o.data.copy_(o.data * (1.0 - self._tau) + n.data * self._tau) - def _target_q( - self, buffer: ReplayBuffer, indice: np.ndarray - ) -> torch.Tensor: + def _target_q(self, buffer: ReplayBuffer, indice: np.ndarray) -> torch.Tensor: batch = buffer[indice] # batch.obs: s_{t+n} a_ = self(batch, model="actor_old", input="obs_next").act dev = a_.device @@ -134,8 +125,7 @@ def learn(self, batch: Batch, **kwargs: Any) -> Dict[str, float]: self.critic2_optim.step() batch.weight = (td1 + td2) / 2.0 # prio-buffer if self._cnt % self._freq == 0: - actor_loss = -self.critic1( - batch.obs, self(batch, eps=0.0).act).mean() + actor_loss = -self.critic1(batch.obs, self(batch, eps=0.0).act).mean() self.actor_optim.zero_grad() actor_loss.backward() self._last = actor_loss.item() diff --git a/tianshou/utils/net/common.py b/tianshou/utils/net/common.py index 1da33650b..b41346e9a 100644 --- a/tianshou/utils/net/common.py +++ b/tianshou/utils/net/common.py @@ -34,7 +34,7 @@ class MLP(nn.Module): :param hidden_sizes: shape of MLP passed in as a list, not incluing input_dim and output_dim. :param norm_layer: use which normalization before activation, e.g., - ``nn.LayerNorm`` and ``nn.BatchNorm1d``, defaults to no normalization. + ``nn.LayerNorm`` and ``nn.BatchNorm1d``. Default to no normalization. You can also pass a list of normalization modules with the same length of hidden_sizes, to use different normalization module in different layers. Default to no normalization. @@ -103,7 +103,7 @@ class Net(nn.Module): :param action_shape: int or a sequence of int of the shape of action. :param hidden_sizes: shape of MLP passed in as a list. :param norm_layer: use which normalization before activation, e.g., - ``nn.LayerNorm`` and ``nn.BatchNorm1d``, defaults to no normalization. + ``nn.LayerNorm`` and ``nn.BatchNorm1d``. Default to no normalization. You can also pass a list of normalization modules with the same length of hidden_sizes, to use different normalization module in different layers. Default to no normalization. @@ -118,13 +118,13 @@ class Net(nn.Module): :param bool concat: whether the input shape is concatenated by state_shape and action_shape. If it is True, ``action_shape`` is not the output shape, but affects the input shape only. - :param int num_atoms: in order to expand to the net of distributional RL, - defaults to 1 (not use). + :param int num_atoms: in order to expand to the net of distributional RL. + Default to 1 (not use). :param bool dueling_param: whether to use dueling network to calculate Q values (for Dueling DQN). 
If you want to use dueling option, you should pass a tuple of two dict (first for Q and second for V) stating self-defined arguments as stated in - class:`~tianshou.utils.net.common.MLP`. Defaults to None. + class:`~tianshou.utils.net.common.MLP`. Default to None. .. seealso:: From 76be2466872ded7bd674b6b74fe76f7f7e551384 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 21:26:49 +0800 Subject: [PATCH 04/13] action=store_true --- test/continuous/test_ddpg.py | 5 ++--- test/continuous/test_sac_with_il.py | 2 +- test/continuous/test_td3.py | 5 ++--- test/discrete/test_a2c_with_il.py | 2 +- test/discrete/test_sac.py | 5 ++--- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/test/continuous/test_ddpg.py b/test/continuous/test_ddpg.py index 6d7020ad4..16aa475dd 100644 --- a/test/continuous/test_ddpg.py +++ b/test/continuous/test_ddpg.py @@ -31,13 +31,12 @@ def get_args(): parser.add_argument('--step-per-collect', type=int, default=4) parser.add_argument('--update-per-step', type=float, default=0.25) parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) + parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) parser.add_argument('--training-num', type=int, default=4) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--rew-norm', type=int, default=0) + parser.add_argument('--rew-norm', action="store_true", default=False) parser.add_argument('--n-step', type=int, default=1) parser.add_argument( '--device', type=str, diff --git a/test/continuous/test_sac_with_il.py b/test/continuous/test_sac_with_il.py index 900c6e0c4..b1ca67bd4 100644 --- a/test/continuous/test_sac_with_il.py +++ b/test/continuous/test_sac_with_il.py @@ -40,7 +40,7 @@ def get_args(): parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--rew-norm', type=int, default=0) + parser.add_argument('--rew-norm', action="store_true", default=False) parser.add_argument('--n-step', type=int, default=4) parser.add_argument( '--device', type=str, diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index 340cb261d..9a0f3a9c3 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -34,13 +34,12 @@ def get_args(): parser.add_argument('--step-per-collect', type=int, default=10) parser.add_argument('--update-per-step', type=float, default=0.1) parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) + parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) parser.add_argument('--training-num', type=int, default=10) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument('--rew-norm', type=int, default=0) + parser.add_argument('--rew-norm', action="store_true", default=False) parser.add_argument('--n-step', type=int, default=1) parser.add_argument( '--device', type=str, diff --git a/test/discrete/test_a2c_with_il.py b/test/discrete/test_a2c_with_il.py index 196dc28b5..882cb440a 100644 --- a/test/discrete/test_a2c_with_il.py +++ b/test/discrete/test_a2c_with_il.py @@ -47,7 +47,7 @@ def get_args(): parser.add_argument('--ent-coef', type=float, default=0.0) parser.add_argument('--max-grad-norm', type=float, default=None) parser.add_argument('--gae-lambda', type=float, default=1.) - parser.add_argument('--rew-norm', type=bool, default=False) + parser.add_argument('--rew-norm', action="store_true", default=False) args = parser.parse_known_args()[0] return args diff --git a/test/discrete/test_sac.py b/test/discrete/test_sac.py index 465331a99..5f5a3f0cd 100644 --- a/test/discrete/test_sac.py +++ b/test/discrete/test_sac.py @@ -32,13 +32,12 @@ def get_args(): parser.add_argument('--step-per-collect', type=int, default=5) parser.add_argument('--update-per-step', type=float, default=0.2) parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) + parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) parser.add_argument('--training-num', type=int, default=5) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.0) - parser.add_argument('--rew-norm', type=int, default=0) + parser.add_argument('--rew-norm', action="store_true", default=False) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') From 6f766181d2cb022a6626e5fc951c989262a9afe2 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 21:40:00 +0800 Subject: [PATCH 05/13] fix test_ddpg: pass 10 seed within avg 30s --- test/continuous/test_ddpg.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/continuous/test_ddpg.py b/test/continuous/test_ddpg.py index 16aa475dd..093aed196 100644 --- a/test/continuous/test_ddpg.py +++ b/test/continuous/test_ddpg.py @@ -26,18 +26,18 @@ def get_args(): parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--tau', type=float, default=0.005) parser.add_argument('--exploration-noise', type=float, default=0.1) - parser.add_argument('--epoch', type=int, default=20) - parser.add_argument('--step-per-epoch', type=int, default=9600) - parser.add_argument('--step-per-collect', type=int, default=4) - parser.add_argument('--update-per-step', type=float, default=0.25) + parser.add_argument('--epoch', type=int, default=5) + parser.add_argument('--step-per-epoch', type=int, default=20000) + parser.add_argument('--step-per-collect', type=int, default=8) + parser.add_argument('--update-per-step', type=float, default=0.125) parser.add_argument('--batch-size', type=int, default=128) parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=4) + parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) 
parser.add_argument('--rew-norm', action="store_true", default=False) - parser.add_argument('--n-step', type=int, default=1) + parser.add_argument('--n-step', type=int, default=3) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') From 9291d5ac3cf95ba0dbd4d3af4c09e72c1e8355ae Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 21:52:57 +0800 Subject: [PATCH 06/13] fix test_td3: pass 10 seed within avg 35s --- test/continuous/test_td3.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index 9a0f3a9c3..41bb2f835 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -21,7 +21,7 @@ def get_args(): parser.add_argument('--task', type=str, default='Pendulum-v0') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=3e-4) + parser.add_argument('--actor-lr', type=float, default=1e-4) parser.add_argument('--critic-lr', type=float, default=1e-3) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--tau', type=float, default=0.005) @@ -29,18 +29,18 @@ def get_args(): parser.add_argument('--policy-noise', type=float, default=0.2) parser.add_argument('--noise-clip', type=float, default=0.5) parser.add_argument('--update-actor-freq', type=int, default=2) - parser.add_argument('--epoch', type=int, default=20) + parser.add_argument('--epoch', type=int, default=5) parser.add_argument('--step-per-epoch', type=int, default=20000) - parser.add_argument('--step-per-collect', type=int, default=10) - parser.add_argument('--update-per-step', type=float, default=0.1) + parser.add_argument('--step-per-collect', type=int, default=8) + parser.add_argument('--update-per-step', type=float, default=0.125) parser.add_argument('--batch-size', type=int, default=128) parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=10) + parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) 
parser.add_argument('--rew-norm', action="store_true", default=False) - parser.add_argument('--n-step', type=int, default=1) + parser.add_argument('--n-step', type=int, default=3) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') From 72e074d52d9c52ba3fa15edb2093261ca4c2b02d Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 22:29:48 +0800 Subject: [PATCH 07/13] fix test_drqn: 10 seed avg < 20s --- test/continuous/test_ppo.py | 2 +- test/continuous/test_sac_with_il.py | 2 +- test/discrete/test_drqn.py | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/continuous/test_ppo.py b/test/continuous/test_ppo.py index b4fd383c7..762c58838 100644 --- a/test/continuous/test_ppo.py +++ b/test/continuous/test_ppo.py @@ -23,7 +23,7 @@ def get_args(): parser.add_argument('--buffer-size', type=int, default=20000) parser.add_argument('--lr', type=float, default=1e-3) parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=20) + parser.add_argument('--epoch', type=int, default=5) parser.add_argument('--step-per-epoch', type=int, default=150000) parser.add_argument('--episode-per-collect', type=int, default=16) parser.add_argument('--repeat-per-collect', type=int, default=2) diff --git a/test/continuous/test_sac_with_il.py b/test/continuous/test_sac_with_il.py index b1ca67bd4..ac533fcf4 100644 --- a/test/continuous/test_sac_with_il.py +++ b/test/continuous/test_sac_with_il.py @@ -26,7 +26,7 @@ def get_args(): parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--tau', type=float, default=0.005) parser.add_argument('--alpha', type=float, default=0.2) - parser.add_argument('--epoch', type=int, default=20) + parser.add_argument('--epoch', type=int, default=5) parser.add_argument('--step-per-epoch', type=int, default=24000) parser.add_argument('--il-step-per-epoch', type=int, default=500) parser.add_argument('--step-per-collect', type=int, default=10) diff --git a/test/discrete/test_drqn.py b/test/discrete/test_drqn.py index 33f0432a3..39bef8dbc 100644 --- a/test/discrete/test_drqn.py +++ b/test/discrete/test_drqn.py @@ -24,15 +24,15 @@ def get_args(): parser.add_argument('--stack-num', type=int, default=4) parser.add_argument('--lr', type=float, default=1e-3) parser.add_argument('--gamma', type=float, default=0.95) - parser.add_argument('--n-step', type=int, default=4) + parser.add_argument('--n-step', type=int, default=3) parser.add_argument('--target-update-freq', type=int, default=320) - parser.add_argument('--epoch', type=int, default=10) - parser.add_argument('--step-per-epoch', type=int, default=10000) - parser.add_argument('--update-per-step', type=float, default=0.1) - parser.add_argument('--step-per-collect', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--layer-num', type=int, default=3) - parser.add_argument('--training-num', type=int, default=10) + parser.add_argument('--epoch', type=int, default=5) + parser.add_argument('--step-per-epoch', type=int, default=20000) + parser.add_argument('--update-per-step', type=float, default=1 / 16) + parser.add_argument('--step-per-collect', type=int, default=16) + parser.add_argument('--batch-size', type=int, default=128) + parser.add_argument('--layer-num', type=int, default=2) + parser.add_argument('--training-num', type=int, default=16) parser.add_argument('--test-num', type=int, default=100) 
parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.) From f6ef057cf178875627d5841549498454cb58edd0 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 22:31:45 +0800 Subject: [PATCH 08/13] test td3 seed --- test/continuous/test_td3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index 41bb2f835..6b53a79ff 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -19,7 +19,7 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--task', type=str, default='Pendulum-v0') - parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--seed', type=int, default=1626) parser.add_argument('--buffer-size', type=int, default=20000) parser.add_argument('--actor-lr', type=float, default=1e-4) parser.add_argument('--critic-lr', type=float, default=1e-3) From 19a66f864af13da33a973c26a749300dcbd9ab55 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 22:53:05 +0800 Subject: [PATCH 09/13] fix test_sac --- test/discrete/test_sac.py | 17 +++++++++-------- tianshou/trainer/offline.py | 2 +- tianshou/trainer/offpolicy.py | 4 ++-- tianshou/trainer/onpolicy.py | 4 ++-- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/test/discrete/test_sac.py b/test/discrete/test_sac.py index 5f5a3f0cd..ad594dbfc 100644 --- a/test/discrete/test_sac.py +++ b/test/discrete/test_sac.py @@ -20,24 +20,25 @@ def get_args(): parser.add_argument('--task', type=str, default='CartPole-v0') parser.add_argument('--seed', type=int, default=1626) parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=3e-4) + parser.add_argument('--actor-lr', type=float, default=1e-4) parser.add_argument('--critic-lr', type=float, default=1e-3) parser.add_argument('--alpha-lr', type=float, default=3e-4) parser.add_argument('--gamma', type=float, default=0.95) parser.add_argument('--tau', type=float, default=0.005) parser.add_argument('--alpha', type=float, default=0.05) - parser.add_argument('--auto_alpha', type=int, default=0) + parser.add_argument('--auto-alpha', action="store_true", default=False) parser.add_argument('--epoch', type=int, default=5) - parser.add_argument('--step-per-epoch', type=int, default=5000) - parser.add_argument('--step-per-collect', type=int, default=5) - parser.add_argument('--update-per-step', type=float, default=0.2) - parser.add_argument('--batch-size', type=int, default=64) + parser.add_argument('--step-per-epoch', type=int, default=10000) + parser.add_argument('--step-per-collect', type=int, default=10) + parser.add_argument('--update-per-step', type=float, default=0.1) + parser.add_argument('--batch-size', type=int, default=128) parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=5) + parser.add_argument('--training-num', type=int, default=10) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') parser.add_argument('--render', type=float, default=0.0) parser.add_argument('--rew-norm', action="store_true", default=False) + parser.add_argument('--n-step', type=int, default=3) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -85,7 +86,7 @@ def test_discrete_sac(args=get_args()): policy = DiscreteSACPolicy( actor, actor_optim, critic1, 
critic1_optim, critic2, critic2_optim, - args.tau, args.gamma, args.alpha, + args.tau, args.gamma, args.alpha, estimation_step=args.n_step, reward_normalization=args.rew_norm) # collector train_collector = Collector( diff --git a/tianshou/trainer/offline.py b/tianshou/trainer/offline.py index b3588ae5b..13f96faeb 100644 --- a/tianshou/trainer/offline.py +++ b/tianshou/trainer/offline.py @@ -79,7 +79,7 @@ def offline_trainer( for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() - data[k] = f"{losses[k]:.6f}" + data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) # test diff --git a/tianshou/trainer/offpolicy.py b/tianshou/trainer/offpolicy.py index 5f233bfef..72a243d9a 100644 --- a/tianshou/trainer/offpolicy.py +++ b/tianshou/trainer/offpolicy.py @@ -106,7 +106,7 @@ def offpolicy_trainer( data = { "env_step": str(env_step), "rew": f"{last_rew:.2f}", - "len": str(last_len), + "len": str(int(last_len)), "n/ep": str(int(result["n/ep"])), "n/st": str(int(result["n/st"])), } @@ -130,7 +130,7 @@ def offpolicy_trainer( for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() - data[k] = f"{losses[k]:.6f}" + data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) if t.n <= t.total: diff --git a/tianshou/trainer/onpolicy.py b/tianshou/trainer/onpolicy.py index 5f5254d66..dae20a741 100644 --- a/tianshou/trainer/onpolicy.py +++ b/tianshou/trainer/onpolicy.py @@ -113,7 +113,7 @@ def onpolicy_trainer( data = { "env_step": str(env_step), "rew": f"{last_rew:.2f}", - "len": str(last_len), + "len": str(int(last_len)), "n/ep": str(int(result["n/ep"])), "n/st": str(int(result["n/st"])), } @@ -140,7 +140,7 @@ def onpolicy_trainer( for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() - data[k] = f"{losses[k]:.6f}" + data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) if t.n <= t.total: From 33e6ae09f52973fcc4da95c1f6b50a53f8a8309c Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 22:55:53 +0800 Subject: [PATCH 10/13] td3 seed=1 --- test/continuous/test_td3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index 6b53a79ff..86331e993 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -19,7 +19,7 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--task', type=str, default='Pendulum-v0') - parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--seed', type=int, default=1) parser.add_argument('--buffer-size', type=int, default=20000) parser.add_argument('--actor-lr', type=float, default=1e-4) parser.add_argument('--critic-lr', type=float, default=1e-3) From ec1096ba8f7ed85cb5f8f0aa03259597a125d75a Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Fri, 26 Feb 2021 23:28:55 +0800 Subject: [PATCH 11/13] change psrl seed to see what happens --- test/modelbase/test_psrl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/modelbase/test_psrl.py b/test/modelbase/test_psrl.py index c04261868..d89a7f4bc 100644 --- a/test/modelbase/test_psrl.py +++ b/test/modelbase/test_psrl.py @@ -16,7 +16,7 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument('--task', type=str, default='NChain-v0') - parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--seed', type=int, default=1) parser.add_argument('--buffer-size', 
type=int, default=50000) parser.add_argument('--epoch', type=int, default=5) parser.add_argument('--step-per-epoch', type=int, default=1000) @@ -29,7 +29,7 @@ def get_args(): parser.add_argument('--rew-std-prior', type=float, default=1.0) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--eps', type=float, default=0.01) - parser.add_argument('--add-done-loop', action='store_true') + parser.add_argument('--add-done-loop', action="store_true", default=False) return parser.parse_known_args()[0] From 41465345c5227beecbf03cde13e299e3eee13715 Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Sat, 27 Feb 2021 10:42:31 +0800 Subject: [PATCH 12/13] remove runnable/ --- examples/atari/runnable/atari.py | 133 ---- examples/atari/runnable/pong_a2c.py | 110 ---- examples/atari/runnable/pong_ppo.py | 115 ---- examples/mujoco/runnable/ant_v2_ddpg.py | 108 ---- examples/mujoco/runnable/ant_v2_td3.py | 117 ---- .../runnable/halfcheetahBullet_v0_sac.py | 121 ---- examples/mujoco/runnable/mujoco/__init__.py | 0 .../mujoco/runnable/mujoco/assets/point.xml | 34 -- .../mujoco/runnable/mujoco/maze_env_utils.py | 196 ------ examples/mujoco/runnable/mujoco/point.py | 93 --- .../mujoco/runnable/mujoco/point_maze_env.py | 568 ------------------ examples/mujoco/runnable/mujoco/register.py | 27 - examples/mujoco/runnable/point_maze_td3.py | 126 ---- 13 files changed, 1748 deletions(-) delete mode 100644 examples/atari/runnable/atari.py delete mode 100644 examples/atari/runnable/pong_a2c.py delete mode 100644 examples/atari/runnable/pong_ppo.py delete mode 100644 examples/mujoco/runnable/ant_v2_ddpg.py delete mode 100644 examples/mujoco/runnable/ant_v2_td3.py delete mode 100644 examples/mujoco/runnable/halfcheetahBullet_v0_sac.py delete mode 100644 examples/mujoco/runnable/mujoco/__init__.py delete mode 100644 examples/mujoco/runnable/mujoco/assets/point.xml delete mode 100644 examples/mujoco/runnable/mujoco/maze_env_utils.py delete mode 100644 examples/mujoco/runnable/mujoco/point.py delete mode 100644 examples/mujoco/runnable/mujoco/point_maze_env.py delete mode 100644 examples/mujoco/runnable/mujoco/register.py delete mode 100644 examples/mujoco/runnable/point_maze_td3.py diff --git a/examples/atari/runnable/atari.py b/examples/atari/runnable/atari.py deleted file mode 100644 index 8e2ea5168..000000000 --- a/examples/atari/runnable/atari.py +++ /dev/null @@ -1,133 +0,0 @@ -import cv2 -import gym -import numpy as np -from gym.spaces.box import Box -from tianshou.data import Batch - -SIZE = 84 -FRAME = 4 - - -def create_atari_environment(name=None, sticky_actions=True, - max_episode_steps=2000): - game_version = 'v0' if sticky_actions else 'v4' - name = '{}NoFrameskip-{}'.format(name, game_version) - env = gym.make(name) - env = env.env - env = preprocessing(env, max_episode_steps=max_episode_steps) - return env - - -def preprocess_fn(obs=None, act=None, rew=None, done=None, - obs_next=None, info=None, policy=None, **kwargs): - if obs_next is not None: - obs_next = np.reshape(obs_next, (-1, *obs_next.shape[2:])) - obs_next = np.moveaxis(obs_next, 0, -1) - obs_next = cv2.resize(obs_next, (SIZE, SIZE)) - obs_next = np.asanyarray(obs_next, dtype=np.uint8) - obs_next = np.reshape(obs_next, (-1, FRAME, SIZE, SIZE)) - obs_next = np.moveaxis(obs_next, 1, -1) - elif obs is not None: - obs = np.reshape(obs, (-1, *obs.shape[2:])) - obs = np.moveaxis(obs, 0, -1) - obs = cv2.resize(obs, (SIZE, SIZE)) - obs = np.asanyarray(obs, dtype=np.uint8) - obs = np.reshape(obs, (-1, FRAME, SIZE, SIZE)) - obs = 
np.moveaxis(obs, 1, -1) - - return Batch(obs=obs, act=act, rew=rew, done=done, - obs_next=obs_next, info=info) - - -class preprocessing(object): - def __init__(self, env, frame_skip=4, terminal_on_life_loss=False, - size=84, max_episode_steps=2000): - self.max_episode_steps = max_episode_steps - self.env = env - self.terminal_on_life_loss = terminal_on_life_loss - self.frame_skip = frame_skip - self.size = size - self.count = 0 - obs_dims = self.env.observation_space - - self.screen_buffer = [ - np.empty((obs_dims.shape[0], obs_dims.shape[1]), dtype=np.uint8), - np.empty((obs_dims.shape[0], obs_dims.shape[1]), dtype=np.uint8) - ] - - self.game_over = False - self.lives = 0 - - @property - def observation_space(self): - return Box(low=0, high=255, - shape=(self.size, self.size, self.frame_skip), - dtype=np.uint8) - - def action_space(self): - return self.env.action_space - - def reward_range(self): - return self.env.reward_range - - def metadata(self): - return self.env.metadata - - def close(self): - return self.env.close() - - def reset(self): - self.count = 0 - self.env.reset() - self.lives = self.env.ale.lives() - self._grayscale_obs(self.screen_buffer[0]) - self.screen_buffer[1].fill(0) - - return np.array([self._pool_and_resize() - for _ in range(self.frame_skip)]) - - def render(self, mode='human'): - return self.env.render(mode) - - def step(self, action): - total_reward = 0. - observation = [] - for t in range(self.frame_skip): - self.count += 1 - _, reward, terminal, info = self.env.step(action) - total_reward += reward - - if self.terminal_on_life_loss: - lives = self.env.ale.lives() - is_terminal = terminal or lives < self.lives - self.lives = lives - else: - is_terminal = terminal - - if is_terminal: - break - elif t >= self.frame_skip - 2: - t_ = t - (self.frame_skip - 2) - self._grayscale_obs(self.screen_buffer[t_]) - - observation.append(self._pool_and_resize()) - if len(observation) == 0: - observation = [self._pool_and_resize() - for _ in range(self.frame_skip)] - while len(observation) > 0 and \ - len(observation) < self.frame_skip: - observation.append(observation[-1]) - terminal = self.count >= self.max_episode_steps - return np.array(observation), total_reward, \ - (terminal or is_terminal), info - - def _grayscale_obs(self, output): - self.env.ale.getScreenGrayscale(output) - return output - - def _pool_and_resize(self): - if self.frame_skip > 1: - np.maximum(self.screen_buffer[0], self.screen_buffer[1], - out=self.screen_buffer[0]) - - return self.screen_buffer[0] diff --git a/examples/atari/runnable/pong_a2c.py b/examples/atari/runnable/pong_a2c.py deleted file mode 100644 index 023824ce6..000000000 --- a/examples/atari/runnable/pong_a2c.py +++ /dev/null @@ -1,110 +0,0 @@ -import os -import torch -import pprint -import argparse -import numpy as np -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import A2CPolicy -from tianshou.utils import BasicLogger -from tianshou.env import SubprocVectorEnv -from tianshou.utils.net.common import Net -from tianshou.trainer import onpolicy_trainer -from tianshou.data import Collector, VectorReplayBuffer -from tianshou.utils.net.discrete import Actor, Critic - -from atari import create_atari_environment, preprocess_fn - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Pong') - parser.add_argument('--seed', type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--lr', type=float, default=3e-4) - 
parser.add_argument('--gamma', type=float, default=0.9) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=1000) - parser.add_argument('--episode-per-collect', type=int, default=10) - parser.add_argument('--repeat-per-collect', type=int, default=1) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128, 128]) - parser.add_argument('--training-num', type=int, default=8) - parser.add_argument('--test-num', type=int, default=8) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) - - parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - # a2c special - parser.add_argument('--vf-coef', type=float, default=0.5) - parser.add_argument('--ent-coef', type=float, default=0.001) - parser.add_argument('--max-grad-norm', type=float, default=None) - parser.add_argument('--max-episode-steps', type=int, default=2000) - return parser.parse_args() - - -def test_a2c(args=get_args()): - env = create_atari_environment(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.env.action_space.shape or env.env.action_space.n - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: create_atari_environment(args.task) - for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: create_atari_environment(args.task) - for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = Actor(net, args.action_shape, device=args.device).to(args.device) - critic = Critic(net, device=args.device).to(args.device) - optim = torch.optim.Adam(set( - actor.parameters()).union(critic.parameters()), lr=args.lr) - dist = torch.distributions.Categorical - policy = A2CPolicy( - actor, critic, optim, dist, args.gamma, vf_coef=args.vf_coef, - ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs)), - preprocess_fn=preprocess_fn, exploration_noise=True) - test_collector = Collector(policy, test_envs, preprocess_fn=preprocess_fn) - # log - log_path = os.path.join(args.logdir, args.task, 'a2c') - writer = SummaryWriter(log_path) - logger = BasicLogger(writer) - - def stop_fn(mean_rewards): - if env.env.spec.reward_threshold: - return mean_rewards >= env.spec.reward_threshold - else: - return False - - # trainer - result = onpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.repeat_per_collect, args.test_num, args.batch_size, - episode_per_collect=args.episode_per_collect, stop_fn=stop_fn, logger=logger) - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! 
- env = create_atari_environment(args.task) - collector = Collector(policy, env, preprocess_fn=preprocess_fn) - result = collector.collect(n_episode=1, render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - test_a2c() diff --git a/examples/atari/runnable/pong_ppo.py b/examples/atari/runnable/pong_ppo.py deleted file mode 100644 index 36728de6f..000000000 --- a/examples/atari/runnable/pong_ppo.py +++ /dev/null @@ -1,115 +0,0 @@ -import os -import torch -import pprint -import argparse -import numpy as np -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import PPOPolicy -from tianshou.utils import BasicLogger -from tianshou.env import SubprocVectorEnv -from tianshou.utils.net.common import Net -from tianshou.trainer import onpolicy_trainer -from tianshou.utils.net.discrete import Actor, Critic -from tianshou.data import Collector, VectorReplayBuffer - -from atari import create_atari_environment, preprocess_fn - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Pong') - parser.add_argument('--seed', type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=1000) - parser.add_argument('--episode-per-collect', type=int, default=10) - parser.add_argument('--repeat-per-collect', type=int, default=2) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=8) - parser.add_argument('--test-num', type=int, default=8) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - # ppo special - parser.add_argument('--vf-coef', type=float, default=0.5) - parser.add_argument('--ent-coef', type=float, default=0.0) - parser.add_argument('--eps-clip', type=float, default=0.2) - parser.add_argument('--max-grad-norm', type=float, default=0.5) - parser.add_argument('--max-episode-steps', type=int, default=2000) - return parser.parse_args() - - -def test_ppo(args=get_args()): - env = create_atari_environment(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.action_space().shape or env.action_space().n - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv([ - lambda: create_atari_environment(args.task) - for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv([ - lambda: create_atari_environment(args.task) - for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = Actor(net, args.action_shape, device=args.device).to(args.device) - critic = Critic(net, device=args.device).to(args.device) - optim = torch.optim.Adam(set( - actor.parameters()).union(critic.parameters()), lr=args.lr) - dist = torch.distributions.Categorical - policy = PPOPolicy( - actor, critic, optim, dist, args.gamma, - max_grad_norm=args.max_grad_norm, - eps_clip=args.eps_clip, - vf_coef=args.vf_coef, - ent_coef=args.ent_coef, - action_range=None) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs)), - preprocess_fn=preprocess_fn, exploration_noise=True) - test_collector = Collector(policy, test_envs, preprocess_fn=preprocess_fn) - # log - log_path = os.path.join(args.logdir, args.task, 'ppo') - writer = SummaryWriter(log_path) - logger = BasicLogger(writer) - - def stop_fn(mean_rewards): - if env.env.spec.reward_threshold: - return mean_rewards >= env.spec.reward_threshold - else: - return False - - # trainer - result = onpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.repeat_per_collect, args.test_num, args.batch_size, - episode_per_collect=args.episode_per_collect, stop_fn=stop_fn, logger=logger) - - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! 
- env = create_atari_environment(args.task) - collector = Collector(policy, env, preprocess_fn=preprocess_fn) - result = collector.collect(n_step=2000, render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - test_ppo() diff --git a/examples/mujoco/runnable/ant_v2_ddpg.py b/examples/mujoco/runnable/ant_v2_ddpg.py deleted file mode 100644 index ce42434c0..000000000 --- a/examples/mujoco/runnable/ant_v2_ddpg.py +++ /dev/null @@ -1,108 +0,0 @@ -import os -import gym -import torch -import pprint -import argparse -import numpy as np -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import DDPGPolicy -from tianshou.utils import BasicLogger -from tianshou.env import SubprocVectorEnv -from tianshou.utils.net.common import Net -from tianshou.trainer import offpolicy_trainer -from tianshou.exploration import GaussianNoise -from tianshou.data import Collector, VectorReplayBuffer -from tianshou.utils.net.continuous import Actor, Critic - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v2') - parser.add_argument('--seed', type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=1e-4) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--exploration-noise', type=float, default=0.1) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=2400) - parser.add_argument('--step-per-collect', type=int, default=4) - parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=4) - parser.add_argument('--test-num', type=int, default=100) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - return parser.parse_args() - - -def test_ddpg(args=get_args()): - env = gym.make(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.action_space.shape or env.action_space.n - args.max_action = env.action_space.high[0] - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = Actor(net, args.action_shape, max_action=args.max_action, - device=args.device).to(args.device) - actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic = Critic(net, device=args.device).to(args.device) - critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr) - policy = DDPGPolicy( - actor, actor_optim, critic, critic_optim, - action_range=[env.action_space.low[0], env.action_space.high[0]], - tau=args.tau, gamma=args.gamma, - exploration_noise=GaussianNoise(sigma=args.exploration_noise)) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, len(train_envs)), - exploration_noise=True) - test_collector = Collector(policy, test_envs) - # log - log_path = os.path.join(args.logdir, args.task, 'ddpg') - writer = SummaryWriter(log_path) - logger = BasicLogger(writer) - - def stop_fn(mean_rewards): - return mean_rewards >= env.spec.reward_threshold - - # trainer - result = offpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.step_per_collect, args.test_num, - args.batch_size, stop_fn=stop_fn, logger=logger) - assert stop_fn(result['best_reward']) - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! 
- policy.eval() - test_envs.seed(args.seed) - test_collector.reset() - result = test_collector.collect(n_episode=args.test_num, render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - test_ddpg() diff --git a/examples/mujoco/runnable/ant_v2_td3.py b/examples/mujoco/runnable/ant_v2_td3.py deleted file mode 100644 index 5e33c33da..000000000 --- a/examples/mujoco/runnable/ant_v2_td3.py +++ /dev/null @@ -1,117 +0,0 @@ -import os -import gym -import torch -import pprint -import argparse -import numpy as np -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import TD3Policy -from tianshou.utils import BasicLogger -from tianshou.env import SubprocVectorEnv -from tianshou.utils.net.common import Net -from tianshou.exploration import GaussianNoise -from tianshou.trainer import offpolicy_trainer -from tianshou.data import Collector, VectorReplayBuffer -from tianshou.utils.net.continuous import Actor, Critic - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v2') - parser.add_argument('--seed', type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=3e-4) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--exploration-noise', type=float, default=0.1) - parser.add_argument('--policy-noise', type=float, default=0.2) - parser.add_argument('--noise-clip', type=float, default=0.5) - parser.add_argument('--update-actor-freq', type=int, default=2) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=2400) - parser.add_argument('--step-per-collect', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=10) - parser.add_argument('--test-num', type=int, default=100) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - return parser.parse_args() - - -def test_td3(args=get_args()): - env = gym.make(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.action_space.shape or env.action_space.n - args.max_action = env.action_space.high[0] - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = Actor(net, args.action_shape, max_action=args.max_action, - device=args.device).to(args.device) - actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic1 = Critic(net, device=args.device).to(args.device) - critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) - critic2 = Critic(net, device=args.device).to(args.device) - critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) - policy = TD3Policy( - actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, - action_range=[env.action_space.low[0], env.action_space.high[0]], - tau=args.tau, gamma=args.gamma, - exploration_noise=GaussianNoise(sigma=args.exploration_noise), - policy_noise=args.policy_noise, - update_actor_freq=args.update_actor_freq, - noise_clip=args.noise_clip) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, len(train_envs)), - exploration_noise=True) - test_collector = Collector(policy, test_envs) - # train_collector.collect(n_step=args.buffer_size) - # log - log_path = os.path.join(args.logdir, args.task, 'td3') - writer = SummaryWriter(log_path) - logger = BasicLogger(writer) - - def stop_fn(mean_rewards): - return mean_rewards >= env.spec.reward_threshold - - # trainer - result = offpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.step_per_collect, args.test_num, - args.batch_size, stop_fn=stop_fn, logger=logger) - assert stop_fn(result['best_reward']) - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! 
- policy.eval() - test_envs.seed(args.seed) - test_collector.reset() - result = test_collector.collect(n_episode=args.test_num, render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - test_td3() diff --git a/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py b/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py deleted file mode 100644 index 618492771..000000000 --- a/examples/mujoco/runnable/halfcheetahBullet_v0_sac.py +++ /dev/null @@ -1,121 +0,0 @@ -import os -import gym -import torch -import pprint -import datetime -import argparse -import numpy as np -import pybullet_envs -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import SACPolicy -from tianshou.utils import BasicLogger -from tianshou.utils.net.common import Net -from tianshou.env import SubprocVectorEnv -from tianshou.trainer import offpolicy_trainer -from tianshou.data import Collector, VectorReplayBuffer -from tianshou.utils.net.continuous import ActorProb, Critic - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetahBulletEnv-v0') - parser.add_argument('--run-id', type=str, default='test') - parser.add_argument('--seed', type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=3e-4) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--alpha', type=float, default=0.2) - parser.add_argument('--epoch', type=int, default=200) - parser.add_argument('--step-per-epoch', type=int, default=1000) - parser.add_argument('--step-per-collect', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=10) - parser.add_argument('--test-num', type=int, default=4) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--log-interval', type=int, default=100) - parser.add_argument('--render', type=float, default=0.) 
- parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - return parser.parse_args() - - -def test_sac(args=get_args()): - torch.set_num_threads(1) - env = gym.make(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.action_space.shape or env.action_space.n - args.max_action = env.action_space.high[0] - # you can also use tianshou.env.SubprocVectorEnv - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = ActorProb(net, args.action_shape, max_action=args.max_action, - device=args.device, unbounded=True).to(args.device) - actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic1 = Critic(net, device=args.device).to(args.device) - critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic2 = Critic(net, device=args.device).to(args.device) - critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) - policy = SACPolicy( - actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, - action_range=[env.action_space.low[0], env.action_space.high[0]], - tau=args.tau, gamma=args.gamma, alpha=args.alpha) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, len(train_envs)), - exploration_noise=True) - test_collector = Collector(policy, test_envs) - # train_collector.collect(n_step=args.buffer_size) - # log - log_path = os.path.join(args.logdir, args.task, 'sac', 'seed_' + str( - args.seed) + '_' + datetime.datetime.now().strftime('%m%d-%H%M%S')) - writer = SummaryWriter(log_path) - logger = BasicLogger(writer, train_interval=args.log_interval) - - def stop_fn(mean_rewards): - return mean_rewards >= env.spec.reward_threshold - - # trainer - result = offpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.step_per_collect, args.test_num, - args.batch_size, stop_fn=stop_fn, - logger=logger) - assert stop_fn(result['best_reward']) - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! 
- policy.eval() - test_envs.seed(args.seed) - test_collector.reset() - result = test_collector.collect(n_episode=args.test_num, - render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - __all__ = ('pybullet_envs',) # Avoid F401 error :) - test_sac() diff --git a/examples/mujoco/runnable/mujoco/__init__.py b/examples/mujoco/runnable/mujoco/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/mujoco/runnable/mujoco/assets/point.xml b/examples/mujoco/runnable/mujoco/assets/point.xml deleted file mode 100644 index 38cc64407..000000000 --- a/examples/mujoco/runnable/mujoco/assets/point.xml +++ /dev/null @@ -1,34 +0,0 @@ - - - diff --git a/examples/mujoco/runnable/mujoco/maze_env_utils.py b/examples/mujoco/runnable/mujoco/maze_env_utils.py deleted file mode 100644 index dafce77f5..000000000 --- a/examples/mujoco/runnable/mujoco/maze_env_utils.py +++ /dev/null @@ -1,196 +0,0 @@ -"""Adapted from rllab maze_env_utils.py.""" -import math - - -class Move(object): - X = 11 - Y = 12 - Z = 13 - XY = 14 - XZ = 15 - YZ = 16 - XYZ = 17 - SpinXY = 18 - - -def can_move_x(movable): - return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ, - Move.SpinXY] - - -def can_move_y(movable): - return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ, - Move.SpinXY] - - -def can_move_z(movable): - return movable in [Move.Z, Move.XZ, Move.YZ, Move.XYZ] - - -def can_spin(movable): - return movable in [Move.SpinXY] - - -def can_move(movable): - return can_move_x(movable) or can_move_y(movable) or can_move_z(movable) - - -def construct_maze(maze_id='Maze'): - if maze_id == 'Maze': - structure = [ - [1, 1, 1, 1, 1], - [1, 'r', 0, 0, 1], - [1, 1, 1, 0, 1], - [1, 'g', 0, 0, 1], - [1, 1, 1, 1, 1], - ] - elif maze_id == 'Maze1': - structure = [ - [1, 1, 1, 1, 1, 1, 1, 1], - [1, 'r', 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 1, 1, 0, 1], - [1, 1, 1, 1, 1, 0, 0, 1], - [1, 0, 0, 0, 1, 0, 1, 1], - [1, 0, 0, 0, 1, 0, 1, 1], - [1, 0, 1, 0, 0, 0, 0, 1], - [1, 1, 1, 1, 1, 1, 1, 1], - ] - elif maze_id == 'Maze2': - structure = [ - [0, 0, 0, 0, 0], - [0, 'r', 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - ] - # transfer maze - elif maze_id == 'Maze3': - structure = [ - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 'r', 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1], - [1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1], - [1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1], - [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 'g', 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - ] - elif maze_id == 'Maze4': - structure = [ - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 'r', 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], - [1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1, 0, 0, 0, 'g', 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - ] - elif maze_id == 'Push': - structure = [ - [1, 1, 1, 1, 1], - [1, 0, 'r', 1, 1], - [1, 0, Move.XY, 0, 1], - [1, 1, 0, 1, 1], - [1, 1, 1, 1, 1], - ] - elif maze_id == 'Fall': - structure = [ - [1, 1, 1, 1], - [1, 'r', 0, 1], - [1, 0, Move.YZ, 1], - [1, -1, -1, 1], - [1, 0, 0, 1], - [1, 1, 1, 1], - ] - elif maze_id == 'Block': - structure = [ - [1, 1, 1, 1, 1], - [1, 'r', 
0, 0, 1], - [1, 0, 0, 0, 1], - [1, 0, 0, 0, 1], - [1, 1, 1, 1, 1], - ] - elif maze_id == 'BlockMaze': - structure = [ - [1, 1, 1, 1], - [1, 'r', 0, 1], - [1, 1, 0, 1], - [1, 0, 0, 1], - [1, 1, 1, 1], - ] - else: - raise NotImplementedError( - 'The provided MazeId %s is not recognized' % maze_id) - - return structure - - -def line_intersect(pt1, pt2, ptA, ptB): - """ - Taken from https://www.cs.hmc.edu/ACM/lectures/intersections.html - this returns the intersection of Line(pt1,pt2) and Line(ptA,ptB) - """ - - DET_TOLERANCE = 0.00000001 - - # the first line is pt1 + r*(pt2-pt1) - # in component form: - x1, y1 = pt1 - x2, y2 = pt2 - dx1 = x2 - x1 - dy1 = y2 - y1 - - # the second line is ptA + s*(ptB-ptA) - x, y = ptA - xB, yB = ptB - dx = xB - x - dy = yB - y - - DET = (-dx1 * dy + dy1 * dx) - - if math.fabs(DET) < DET_TOLERANCE: - return (0, 0, 0, 0, 0) - - # now, the determinant should be OK - DETinv = 1.0 / DET - - # find the scalar amount along the "self" segment - r = DETinv * (-dy * (x - x1) + dx * (y - y1)) - - # find the scalar amount along the input line - s = DETinv * (-dy1 * (x - x1) + dx1 * (y - y1)) - - # return the average of the two descriptions - xi = (x1 + r * dx1 + x + s * dx) / 2.0 - yi = (y1 + r * dy1 + y + s * dy) / 2.0 - return (xi, yi, 1, r, s) - - -def ray_segment_intersect(ray, segment): - """ - Check if the ray originated from (x, y) with direction theta - intersects the line segment (x1, y1) -- (x2, y2), and return - the intersection point if there is one - """ - (x, y), theta = ray - # (x1, y1), (x2, y2) = segment - pt1 = (x, y) - len = 1 - pt2 = (x + len * math.cos(theta), y + len * math.sin(theta)) - xo, yo, valid, r, s = line_intersect(pt1, pt2, *segment) - if valid and r >= 0 and 0 <= s <= 1: - return (xo, yo) - return None - - -def point_distance(p1, p2): - x1, y1 = p1 - x2, y2 = p2 - return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5 diff --git a/examples/mujoco/runnable/mujoco/point.py b/examples/mujoco/runnable/mujoco/point.py deleted file mode 100644 index 2a6a08d41..000000000 --- a/examples/mujoco/runnable/mujoco/point.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Wrapper for creating the ant environment in gym_mujoco.""" - -import math -import numpy as np -from gym import utils -from gym.envs.mujoco import mujoco_env - - -class PointEnv(mujoco_env.MujocoEnv, utils.EzPickle): - FILE = "point.xml" - ORI_IND = 2 - - def __init__(self, file_path=None, expose_all_qpos=True, noisy_init=False): - self._expose_all_qpos = expose_all_qpos - self.noisy_init = noisy_init - mujoco_env.MujocoEnv.__init__(self, file_path, 1) - utils.EzPickle.__init__(self) - - @property - def physics(self): - return self.model - - def _step(self, a): - return self.step(a) - - def step(self, action): - # action[0] is velocity, action[1] is direction - action[0] = 0.2 * action[0] - qpos = np.copy(self.data.qpos) - qpos[2] += action[1] - ori = qpos[2] - # compute increment in each direction - dx = math.cos(ori) * action[0] - dy = math.sin(ori) * action[0] - # ensure that the robot is within reasonable range - qpos[0] = np.clip(qpos[0] + dx, -100, 100) - qpos[1] = np.clip(qpos[1] + dy, -100, 100) - qvel = np.squeeze(self.data.qvel) - self.set_state(qpos, qvel) - for _ in range(0, self.frame_skip): - # self.physics.step() - self.sim.step() - next_obs = self._get_obs() - reward = 0 - done = False - info = {} - return next_obs, reward, done, info - - def _get_obs(self): - if self._expose_all_qpos: - return np.concatenate([ - self.data.qpos.flat[:3], # Only point-relevant coords. 
- self.data.qvel.flat[:3]]) - return np.concatenate([ - self.data.qpos.flat[2:3], - self.data.qvel.flat[:3]]) - - def reset_model(self): - if self.noisy_init: - qpos = self.init_qpos + self.np_random.uniform( - size=self.model.nq, low=-.1, high=.1) - qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 - - else: - qpos = self.init_qpos - qvel = self.init_qvel - - # Set everything other than point to original position and 0 velocity. - qpos[3:] = self.init_qpos[3:] - qvel[3:] = 0. - self.set_state(qpos, qvel) - return self._get_obs() - - def get_ori(self): - return self.data.qpos[self.__class__.ORI_IND] - - def set_xy(self, xy): - qpos = np.copy(self.data.qpos) - qpos[0] = xy[0] - qpos[1] = xy[1] - - qvel = self.data.qvel - self.set_state(qpos, qvel) - - def get_xy(self): - qpos = np.copy(self.data.qpos) - return qpos[:2] - - def viewer_setup(self): - - self.viewer.cam.trackbodyid = -1 - self.viewer.cam.distance = 80 - self.viewer.cam.elevation = -90 diff --git a/examples/mujoco/runnable/mujoco/point_maze_env.py b/examples/mujoco/runnable/mujoco/point_maze_env.py deleted file mode 100644 index c8e8ef84b..000000000 --- a/examples/mujoco/runnable/mujoco/point_maze_env.py +++ /dev/null @@ -1,568 +0,0 @@ -"""Adapted from rllab maze_env.py.""" - -import os -import tempfile -import xml.etree.ElementTree as ET -import math -import numpy as np -import gym -from . import maze_env_utils -from .point import PointEnv -from gym.utils import seeding - -# Directory that contains mujoco xml files. -MODEL_DIR = os.path.join(os.path.dirname(__file__), 'assets') - - -class PointMazeEnv(gym.Env): - MODEL_CLASS = PointEnv - - MAZE_HEIGHT = None - MAZE_SIZE_SCALING = None - - def __init__( - self, - maze_id=None, - maze_height=0.5, - maze_size_scaling=8, - n_bins=0, - sensor_range=3., - sensor_span=2 * math.pi, - observe_blocks=False, - put_spin_near_agent=False, - top_down_view=False, - manual_collision=False, - goal=None, - EPS=0.25, - max_episode_steps=2000, - *args, - **kwargs): - self._maze_id = maze_id - - model_cls = self.__class__.MODEL_CLASS - if model_cls is None: - raise "MODEL_CLASS unspecified!" - xml_path = os.path.join(MODEL_DIR, model_cls.FILE) - self.tree = tree = ET.parse(xml_path) - self.worldbody = worldbody = tree.find(".//worldbody") - self.visualize_goal = False - self.max_episode_steps = max_episode_steps - self.t = 0 - self.MAZE_HEIGHT = height = maze_height - self.MAZE_SIZE_SCALING = size_scaling = maze_size_scaling - self._n_bins = n_bins - self._sensor_range = sensor_range * size_scaling - self._sensor_span = sensor_span - self._observe_blocks = observe_blocks - self._put_spin_near_agent = put_spin_near_agent - self._top_down_view = top_down_view - self._manual_collision = manual_collision - - self.MAZE_STRUCTURE = structure = maze_env_utils.construct_maze( - maze_id=self._maze_id) - # Elevate the maze to allow for falling. - self.elevated = any(-1 in row for row in structure) - self.blocks = any( - any(maze_env_utils.can_move(r) for r in row) - for row in structure) # Are there any movable blocks? - - torso_x, torso_y = self._find_robot() # x, y coordinates - self._init_torso_x = torso_x - self._init_torso_y = torso_y - self._init_positions = [ - (x - torso_x, y - torso_y) - for x, y in self._find_all_robots()] - - self._view = np.zeros([5, 5, 3]) - - height_offset = 0. 
- if self.elevated: - height_offset = height * size_scaling - torso = tree.find(".//body[@name='torso']") - torso.set('pos', '0 0 %.2f' % (0.75 + height_offset)) - if self.blocks: - default = tree.find(".//default") - default.find('.//geom').set('solimp', '.995 .995 .01') - - self.movable_blocks = [] - for i in range(len(structure)): - for j in range(len(structure[0])): - struct = structure[i][j] - if struct == 'r' and self._put_spin_near_agent: - struct = maze_env_utils.Move.SpinXY - if self.elevated and struct not in [-1]: - # Create elevated platform. - ET.SubElement( - worldbody, "geom", - name="elevated_%d_%d" % (i, j), - pos="%f %f %f" % (j * size_scaling - torso_x, - i * size_scaling - torso_y, - height / 2 * size_scaling), - size="%f %f %f" % (0.5 * size_scaling, - 0.5 * size_scaling, - height / 2 * size_scaling), - type="box", - material="", - contype="1", - conaffinity="1", - rgba="0.9 0.9 0.9 1", - ) - if struct == 1: # Unmovable block. - # Offset all coordinates so that robot starts at the origin - ET.SubElement( - worldbody, "geom", - name="block_%d_%d" % (i, j), - pos="%f %f %f" % (j * size_scaling - torso_x, - i * size_scaling - torso_y, - height_offset + - height / 2 * size_scaling), - size="%f %f %f" % (0.5 * size_scaling, - 0.5 * size_scaling, - height / 2 * size_scaling), - type="box", - material="", - contype="1", - conaffinity="1", - rgba="0.4 0.4 0.4 1", - ) - elif maze_env_utils.can_move(struct): - name = "movable_%d_%d" % (i, j) - self.movable_blocks.append((name, struct)) - falling = maze_env_utils.can_move_z(struct) - spinning = maze_env_utils.can_spin(struct) - x_offset = 0.25 * size_scaling if spinning else 0.0 - y_offset = 0.0 - shrink = 0.1 if spinning else 0.99 if falling else 1.0 - height_shrink = 0.1 if spinning else 1.0 - _x = j * size_scaling - torso_x + x_offset - _y = i * size_scaling - torso_y + y_offset - _z = height / 2 * size_scaling * height_shrink - movable_body = ET.SubElement( - worldbody, "body", - name=name, - pos="%f %f %f" % (_x, _y, height_offset + _z), - ) - ET.SubElement( - movable_body, "geom", - name="block_%d_%d" % (i, j), - pos="0 0 0", - size="%f %f %f" % (0.5 * size_scaling * shrink, - 0.5 * size_scaling * shrink, - _z), - type="box", - material="", - mass="0.001" if falling else "0.0002", - contype="1", - conaffinity="1", - rgba="0.9 0.1 0.1 1" - ) - if maze_env_utils.can_move_x(struct): - ET.SubElement( - movable_body, "joint", - armature="0", - axis="1 0 0", - damping="0.0", - limited="true" if falling else "false", - range="%f %f" % (-size_scaling, size_scaling), - margin="0.01", - name="movable_x_%d_%d" % (i, j), - pos="0 0 0", - type="slide" - ) - if maze_env_utils.can_move_y(struct): - ET.SubElement( - movable_body, "joint", - armature="0", - axis="0 1 0", - damping="0.0", - limited="true" if falling else "false", - range="%f %f" % (-size_scaling, size_scaling), - margin="0.01", - name="movable_y_%d_%d" % (i, j), - pos="0 0 0", - type="slide" - ) - if maze_env_utils.can_move_z(struct): - ET.SubElement( - movable_body, "joint", - armature="0", - axis="0 0 1", - damping="0.0", - limited="true", - range="%f 0" % (-height_offset), - margin="0.01", - name="movable_z_%d_%d" % (i, j), - pos="0 0 0", - type="slide" - ) - if maze_env_utils.can_spin(struct): - ET.SubElement( - movable_body, "joint", - armature="0", - axis="0 0 1", - damping="0.0", - limited="false", - name="spinable_%d_%d" % (i, j), - pos="0 0 0", - type="ball" - ) - - torso = tree.find(".//body[@name='torso']") - geoms = torso.findall(".//geom") - for geom in geoms: - 
if 'name' not in geom.attrib: - raise Exception("Every geom of the torso must have a name " - "defined") - - _, file_path = tempfile.mkstemp(text=True, suffix='.xml') - tree.write(file_path) - - self.wrapped_env = model_cls(*args, file_path=file_path, **kwargs) - self.args = args - self.kwargs = kwargs - self.GOAL = goal - if self.GOAL is not None: - self.GOAL = self.unwrapped._rowcol_to_xy(*self.GOAL) - self.EPS = EPS - - def get_ori(self): - return self.wrapped_env.get_ori() - - def get_top_down_view(self): - self._view = np.zeros_like(self._view) - - def valid(row, col): - return self._view.shape[0] > row >= 0 \ - and self._view.shape[1] > col >= 0 - - def update_view(x, y, d, row=None, col=None): - if row is None or col is None: - x = x - self._robot_x - y = y - self._robot_y - - row, col = self._xy_to_rowcol(x, y) - update_view(x, y, d, row=row, col=col) - return - - row, row_frac, col, col_frac = int(row), row % 1, int(col), col % 1 - if row_frac < 0: - row_frac += 1 - if col_frac < 0: - col_frac += 1 - - if valid(row, col): - self._view[row, col, d] += ( - (min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) * - (min(1., col_frac + 0.5) - max(0., col_frac - 0.5))) - if valid(row - 1, col): - self._view[row - 1, col, d] += ( - (max(0., 0.5 - row_frac)) * - (min(1., col_frac + 0.5) - max(0., col_frac - 0.5))) - if valid(row + 1, col): - self._view[row + 1, col, d] += ( - (max(0., row_frac - 0.5)) * - (min(1., col_frac + 0.5) - max(0., col_frac - 0.5))) - if valid(row, col - 1): - self._view[row, col - 1, d] += ( - (min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) * - (max(0., 0.5 - col_frac))) - if valid(row, col + 1): - self._view[row, col + 1, d] += ( - (min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) * - (max(0., col_frac - 0.5))) - if valid(row - 1, col - 1): - self._view[row - 1, col - 1, d] += ( - (max(0., 0.5 - row_frac)) * max(0., 0.5 - col_frac)) - if valid(row - 1, col + 1): - self._view[row - 1, col + 1, d] += ( - (max(0., 0.5 - row_frac)) * max(0., col_frac - 0.5)) - if valid(row + 1, col + 1): - self._view[row + 1, col + 1, d] += ( - (max(0., row_frac - 0.5)) * max(0., col_frac - 0.5)) - if valid(row + 1, col - 1): - self._view[row + 1, col - 1, d] += ( - (max(0., row_frac - 0.5)) * max(0., 0.5 - col_frac)) - - # Draw ant. - robot_x, robot_y = self.wrapped_env.get_body_com("torso")[:2] - self._robot_x = robot_x - self._robot_y = robot_y - self._robot_ori = self.get_ori() - - structure = self.MAZE_STRUCTURE - size_scaling = self.MAZE_SIZE_SCALING - - # Draw immovable blocks and chasms. - for i in range(len(structure)): - for j in range(len(structure[0])): - if structure[i][j] == 1: # Wall. - update_view(j * size_scaling - self._init_torso_x, - i * size_scaling - self._init_torso_y, - 0) - if structure[i][j] == -1: # Chasm. - update_view(j * size_scaling - self._init_torso_x, - i * size_scaling - self._init_torso_y, - 1) - - # Draw movable blocks. 
- for block_name, block_type in self.movable_blocks: - block_x, block_y = self.wrapped_env.get_body_com(block_name)[:2] - update_view(block_x, block_y, 2) - - import cv2 - cv2.imshow('x.jpg', cv2.resize( - np.uint8(self._view * 255), (512, 512), - interpolation=cv2.INTER_CUBIC)) - cv2.waitKey(0) - - return self._view - - def get_range_sensor_obs(self): - """Returns egocentric range sensor observations of maze.""" - robot_x, robot_y, robot_z = self.wrapped_env.get_body_com("torso")[:3] - ori = self.get_ori() - - structure = self.MAZE_STRUCTURE - size_scaling = self.MAZE_SIZE_SCALING - height = self.MAZE_HEIGHT - - segments = [] - # Get line segments (corresponding to outer boundary) of each immovable - # block or drop-off. - for i in range(len(structure)): - for j in range(len(structure[0])): - if structure[i][j] in [1, -1]: # There's a wall or drop-off. - cx = j * size_scaling - self._init_torso_x - cy = i * size_scaling - self._init_torso_y - x1 = cx - 0.5 * size_scaling - x2 = cx + 0.5 * size_scaling - y1 = cy - 0.5 * size_scaling - y2 = cy + 0.5 * size_scaling - struct_segments = [ - ((x1, y1), (x2, y1)), - ((x2, y1), (x2, y2)), - ((x2, y2), (x1, y2)), - ((x1, y2), (x1, y1)), - ] - for seg in struct_segments: - segments.append(dict( - segment=seg, - type=structure[i][j], - )) - - for block_name, block_type in self.movable_blocks: - block_x, block_y, block_z = \ - self.wrapped_env.get_body_com(block_name)[:3] - if (block_z + height * size_scaling / 2 >= robot_z and - robot_z >= block_z - height * size_scaling / 2): - # Block in view. - x1 = block_x - 0.5 * size_scaling - x2 = block_x + 0.5 * size_scaling - y1 = block_y - 0.5 * size_scaling - y2 = block_y + 0.5 * size_scaling - struct_segments = [ - ((x1, y1), (x2, y1)), - ((x2, y1), (x2, y2)), - ((x2, y2), (x1, y2)), - ((x1, y2), (x1, y1)), - ] - for seg in struct_segments: - segments.append(dict( - segment=seg, - type=block_type, - )) - - # 3 for wall, drop-off, block - sensor_readings = np.zeros((self._n_bins, 3)) - for ray_idx in range(self._n_bins): - ray_ori = (ori - self._sensor_span * 0.5 + ( - 2 * ray_idx + 1.0) / - (2 * self._n_bins) * self._sensor_span) - ray_segments = [] - # Get all segments that intersect with ray. - for seg in segments: - p = maze_env_utils.ray_segment_intersect( - ray=((robot_x, robot_y), ray_ori), - segment=seg["segment"]) - if p is not None: - ray_segments.append(dict( - segment=seg["segment"], - type=seg["type"], - ray_ori=ray_ori, - distance=maze_env_utils.point_distance( - p, (robot_x, robot_y)), - )) - if len(ray_segments) > 0: - # Find out which segment is intersected first. - first_seg = sorted( - ray_segments, key=lambda x: x["distance"])[0] - seg_type = first_seg["type"] - idx = (0 if seg_type == 1 else # Wall. - 1 if seg_type == -1 else # Drop-off. - 2 if maze_env_utils.can_move(seg_type) else # Block. 
- None) - if first_seg["distance"] <= self._sensor_range: - sensor_readings[ray_idx][idx] = \ - (self._sensor_range - first_seg[ - "distance"]) / self._sensor_range - return sensor_readings - - def _get_obs(self): - wrapped_obs = self.wrapped_env._get_obs() - if self._top_down_view: - self.get_top_down_view() - - if self._observe_blocks: - additional_obs = [] - for block_name, block_type in self.movable_blocks: - additional_obs.append( - self.wrapped_env.get_body_com(block_name)) - wrapped_obs = np.concatenate([wrapped_obs[:3]] + additional_obs + - [wrapped_obs[3:]]) - - self.get_range_sensor_obs() - return wrapped_obs - - def seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] - - def reset(self, goal=None): - self.goal = goal - - if self.visualize_goal: # remove the prev goal and add a new goal - goal_x, goal_y = goal[0], goal[1] - size_scaling = self.MAZE_SIZE_SCALING - # remove the original goal - try: - self.worldbody.remove(self.goal_element) - except AttributeError: - pass - # offset all coordinates so that robot starts at the origin - self.goal_element = \ - ET.SubElement( - self.worldbody, "geom", - name="goal_%d_%d" % (goal_x, goal_y), - pos="%f %f %f" % (goal_x, - goal_y, - self.MAZE_HEIGHT / 2 * size_scaling), - # smaller than the block to prevent collision - size="%f %f %f" % (0.1 * size_scaling, - 0.1 * size_scaling, - self.MAZE_HEIGHT / 2 * size_scaling), - type="box", - material="", - contype="1", - conaffinity="1", - rgba="1.0 0.0 0.0 0.5" - ) - # Note: running the lines below will make the robot position wrong! - # (because the graph is rebuilt) - torso = self.tree.find(".//body[@name='torso']") - geoms = torso.findall(".//geom") - for geom in geoms: - if 'name' not in geom.attrib: - raise Exception("Every geom of the torso must have a name " - "defined") - _, file_path = tempfile.mkstemp(text=True, suffix='.xml') - self.tree.write(file_path) - # here we write a temporal file with the robot specifications. - # Why not the original one?? - - model_cls = self.__class__.MODEL_CLASS - # file to the robot specifications; model_cls is AntEnv - self.wrapped_env = model_cls( - *self.args, file_path=file_path, **self.kwargs) - - self.t = 0 - self.trajectory = [] - self.wrapped_env.reset() - if len(self._init_positions) > 1: - xy = self._init_positions[self.np_random.randint( - len(self._init_positions))] - self.wrapped_env.set_xy(xy) - return self._get_obs() - - @property - def viewer(self): - return self.wrapped_env.viewer - - def render(self, *args, **kwargs): - return self.wrapped_env.render(*args, **kwargs) - - @property - def observation_space(self): - shape = self._get_obs().shape - high = np.inf * np.ones(shape) - low = -high - return gym.spaces.Box(low, high) - - @property - def action_space(self): - return self.wrapped_env.action_space - - def _find_robot(self): - structure = self.MAZE_STRUCTURE - size_scaling = self.MAZE_SIZE_SCALING - for i in range(len(structure)): - for j in range(len(structure[0])): - if structure[i][j] == 'r': - return j * size_scaling, i * size_scaling - assert False, 'No robot in maze specification.' 
- - def _find_all_robots(self): - structure = self.MAZE_STRUCTURE - size_scaling = self.MAZE_SIZE_SCALING - coords = [] - for i in range(len(structure)): - for j in range(len(structure[0])): - if structure[i][j] == 'r': - coords.append((j * size_scaling, i * size_scaling)) - return coords - - def _is_in_collision(self, pos): - x, y = pos - structure = self.MAZE_STRUCTURE - scale = self.MAZE_SIZE_SCALING - for i in range(len(structure)): - for j in range(len(structure[0])): - if structure[i][j] == 1: - minx = j * scale - scale * 0.5 - self._init_torso_x - maxx = j * scale + scale * 0.5 - self._init_torso_x - miny = i * scale - scale * 0.5 - self._init_torso_y - maxy = i * scale + scale * 0.5 - self._init_torso_y - if minx <= x <= maxx and miny <= y <= maxy: - return True - return False - - def _rowcol_to_xy(self, j, i): - scale = self.MAZE_SIZE_SCALING - minx = j * scale - scale * 0.5 - self._init_torso_x - maxx = j * scale + scale * 0.5 - self._init_torso_x - miny = i * scale - scale * 0.5 - self._init_torso_y - maxy = i * scale + scale * 0.5 - self._init_torso_y - return (minx + maxx) / 2, (miny + maxy) / 2 - - def step(self, action): - self.t += 1 - if self._manual_collision: - old_pos = self.wrapped_env.get_xy() - inner_next_obs, inner_reward, inner_done, info = \ - self.wrapped_env.step(action) - new_pos = self.wrapped_env.get_xy() - if self._is_in_collision(new_pos): - self.wrapped_env.set_xy(old_pos) - else: - inner_next_obs, inner_reward, inner_done, info = \ - self.wrapped_env.step(action) - next_obs = self._get_obs() - done = False - if self.goal is not None: - done = bool(((next_obs[:2] - self.goal[:2]) ** 2).sum() < self.EPS) - - new_pos = self.wrapped_env.get_xy() - if self._is_in_collision(new_pos) or inner_done: - done = True - if self.t >= self.max_episode_steps: - done = True - return next_obs, inner_reward, done, info diff --git a/examples/mujoco/runnable/mujoco/register.py b/examples/mujoco/runnable/mujoco/register.py deleted file mode 100644 index 82acac2af..000000000 --- a/examples/mujoco/runnable/mujoco/register.py +++ /dev/null @@ -1,27 +0,0 @@ -from gym.envs.registration import register - - -def reg(): - register( - id='PointMaze-v0', - entry_point='mujoco.point_maze_env:PointMazeEnv', - kwargs={ - "maze_size_scaling": 4, - "maze_id": "Maze2", - "maze_height": 0.5, - "manual_collision": True, - "goal": (1, 3), - } - ) - - register( - id='PointMaze-v1', - entry_point='mujoco.point_maze_env:PointMazeEnv', - kwargs={ - "maze_size_scaling": 2, - "maze_id": "Maze2", - "maze_height": 0.5, - "manual_collision": True, - "goal": (1, 3), - } - ) diff --git a/examples/mujoco/runnable/point_maze_td3.py b/examples/mujoco/runnable/point_maze_td3.py deleted file mode 100644 index eda299244..000000000 --- a/examples/mujoco/runnable/point_maze_td3.py +++ /dev/null @@ -1,126 +0,0 @@ -import os -import gym -import torch -import pprint -import argparse -import numpy as np -from torch.utils.tensorboard import SummaryWriter - -from tianshou.policy import TD3Policy -from tianshou.utils import BasicLogger -from tianshou.utils.net.common import Net -from tianshou.env import SubprocVectorEnv -from tianshou.exploration import GaussianNoise -from tianshou.trainer import offpolicy_trainer -from tianshou.data import Collector, VectorReplayBuffer -from tianshou.utils.net.continuous import Actor, Critic - -from mujoco.register import reg - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='PointMaze-v1') - parser.add_argument('--seed', 
type=int, default=1626) - parser.add_argument('--buffer-size', type=int, default=20000) - parser.add_argument('--actor-lr', type=float, default=3e-5) - parser.add_argument('--critic-lr', type=float, default=1e-4) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--exploration-noise', type=float, default=0.1) - parser.add_argument('--policy-noise', type=float, default=0.2) - parser.add_argument('--noise-clip', type=float, default=0.5) - parser.add_argument('--update-actor-freq', type=int, default=2) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=2400) - parser.add_argument('--step-per-collect', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=128) - parser.add_argument('--hidden-sizes', type=int, - nargs='*', default=[128, 128]) - parser.add_argument('--training-num', type=int, default=10) - parser.add_argument('--test-num', type=int, default=100) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) - parser.add_argument( - '--device', type=str, - default='cuda' if torch.cuda.is_available() else 'cpu') - return parser.parse_args() - - -def test_td3(args=get_args()): - reg() - env = gym.make(args.task) - args.state_shape = env.observation_space.shape or env.observation_space.n - args.action_shape = env.action_space.shape or env.action_space.n - args.max_action = env.action_space.high[0] - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)]) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)]) - # seed - np.random.seed(args.seed) - torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) - # model - net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, - device=args.device) - actor = Actor(net, args.action_shape, max_action=args.max_action, - device=args.device).to(args.device) - actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic1 = Critic(net, device=args.device).to(args.device) - critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, concat=True, device=args.device) - critic2 = Critic(net, device=args.device).to(args.device) - critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) - policy = TD3Policy( - actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, - action_range=[env.action_space.low[0], env.action_space.high[0]], - tau=args.tau, gamma=args.gamma, - exploration_noise=GaussianNoise(sigma=args.exploration_noise), - policy_noise=args.policy_noise, - update_actor_freq=args.update_actor_freq, - noise_clip=args.noise_clip) - # collector - train_collector = Collector( - policy, train_envs, - VectorReplayBuffer(args.buffer_size, len(train_envs)), - exploration_noise=True) - test_collector = Collector(policy, test_envs) - # train_collector.collect(n_step=args.buffer_size) - # log - log_path = os.path.join(args.logdir, args.task, 'td3') - writer = SummaryWriter(log_path) - logger = BasicLogger(writer) - - def stop_fn(mean_rewards): - if env.spec.reward_threshold: - return 
mean_rewards >= env.spec.reward_threshold - else: - return False - - # trainer - result = offpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.step_per_collect, args.test_num, - args.batch_size, stop_fn=stop_fn, logger=logger) - assert stop_fn(result['best_reward']) - if __name__ == '__main__': - pprint.pprint(result) - # Let's watch its performance! - policy.eval() - test_envs.seed(args.seed) - test_collector.reset() - result = test_collector.collect(n_episode=args.test_num, - render=args.render) - rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews.mean()}, length: {lens.mean()}") - - -if __name__ == '__main__': - test_td3() From c530139d4a1385ed4d61055c6683a3dd0dd7211f Mon Sep 17 00:00:00 2001 From: Trinkle23897 Date: Sat, 27 Feb 2021 10:51:12 +0800 Subject: [PATCH 13/13] greater --- tianshou/data/buffer.py | 2 +- tianshou/policy/base.py | 4 ++-- tianshou/policy/imitation/discrete_bcq.py | 2 +- tianshou/policy/modelfree/c51.py | 4 ++-- tianshou/policy/modelfree/ddpg.py | 2 +- tianshou/policy/modelfree/dqn.py | 4 ++-- tianshou/policy/modelfree/qrdqn.py | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tianshou/data/buffer.py b/tianshou/data/buffer.py index 2ceb8081f..05f5ecef9 100644 --- a/tianshou/data/buffer.py +++ b/tianshou/data/buffer.py @@ -61,7 +61,7 @@ def __init__( } super().__init__() self.maxsize = size - assert stack_num > 0, "stack_num should greater than 0" + assert stack_num > 0, "stack_num should be greater than 0" self.stack_num = stack_num self._indices = np.arange(size) self._save_obs_next = not ignore_obs_next diff --git a/tianshou/policy/base.py b/tianshou/policy/base.py index 7e910996b..730ee28b0 100644 --- a/tianshou/policy/base.py +++ b/tianshou/policy/base.py @@ -286,8 +286,8 @@ def compute_nstep_return( :return: a Batch. The result will be stored in batch.returns as a torch.Tensor with the same shape as target_q_fn's return tensor. """ - assert not rew_norm, ( - "Reward normalization in computing n-step return is unsupported for now.") + assert not rew_norm, \ + "Reward normalization in computing n-step returns is unsupported now." rew = buffer.rew bsz = len(indice) indices = [indice] diff --git a/tianshou/policy/imitation/discrete_bcq.py b/tianshou/policy/imitation/discrete_bcq.py index 610b164f1..5d7082243 100644 --- a/tianshou/policy/imitation/discrete_bcq.py +++ b/tianshou/policy/imitation/discrete_bcq.py @@ -17,7 +17,7 @@ class DiscreteBCQPolicy(DQNPolicy): :class:`~tianshou.policy.BasePolicy`. (s -> imtation_logits) :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. - :param int estimation_step: greater than 1, the number of steps to look ahead. + :param int estimation_step: the number of steps to look ahead. Default to 1. :param int target_update_freq: the target network update frequency. :param float eval_eps: the epsilon-greedy noise added in evaluation. :param float unlikely_action_threshold: the threshold (tau) for unlikely diff --git a/tianshou/policy/modelfree/c51.py b/tianshou/policy/modelfree/c51.py index eb24f0eb8..20ef89c1a 100644 --- a/tianshou/policy/modelfree/c51.py +++ b/tianshou/policy/modelfree/c51.py @@ -19,9 +19,9 @@ class C51Policy(DQNPolicy): Default to -10.0. :param float v_max: the value of the largest atom in the support set. Default to 10.0. - :param int estimation_step: greater than 1, the number of steps to look ahead. 
+ :param int estimation_step: the number of steps to look ahead. Default to 1. :param int target_update_freq: the target network update frequency (0 if - you do not use the target network). + you do not use the target network). Default to 0. :param bool reward_normalization: normalize the reward to Normal(0, 1). Default to False. diff --git a/tianshou/policy/modelfree/ddpg.py b/tianshou/policy/modelfree/ddpg.py index c858b29e2..9a4dad062 100644 --- a/tianshou/policy/modelfree/ddpg.py +++ b/tianshou/policy/modelfree/ddpg.py @@ -24,7 +24,7 @@ class DDPGPolicy(BasePolicy): add to the action. Default to ``GaussianNoise(sigma=0.1)``. :param bool reward_normalization: normalize the reward to Normal(0, 1), Default to False. - :param int estimation_step: greater than 1, the number of steps to look ahead. + :param int estimation_step: the number of steps to look ahead. Default to 1. .. seealso:: diff --git a/tianshou/policy/modelfree/dqn.py b/tianshou/policy/modelfree/dqn.py index a4ad772fb..bd1fea14a 100644 --- a/tianshou/policy/modelfree/dqn.py +++ b/tianshou/policy/modelfree/dqn.py @@ -19,9 +19,9 @@ class DQNPolicy(BasePolicy): :class:`~tianshou.policy.BasePolicy`. (s -> logits) :param torch.optim.Optimizer optim: a torch.optim for optimizing the model. :param float discount_factor: in [0, 1]. - :param int estimation_step: greater than 1, the number of steps to look ahead. + :param int estimation_step: the number of steps to look ahead. Default to 1. :param int target_update_freq: the target network update frequency (0 if - you do not use the target network). + you do not use the target network). Default to 0. :param bool reward_normalization: normalize the reward to Normal(0, 1). Default to False. diff --git a/tianshou/policy/modelfree/qrdqn.py b/tianshou/policy/modelfree/qrdqn.py index ffc93aeee..7e154e7f7 100644 --- a/tianshou/policy/modelfree/qrdqn.py +++ b/tianshou/policy/modelfree/qrdqn.py @@ -17,7 +17,7 @@ class QRDQNPolicy(DQNPolicy): :param float discount_factor: in [0, 1]. :param int num_quantiles: the number of quantile midpoints in the inverse cumulative distribution function of the value. Default to 200. - :param int estimation_step: greater than 1, the number of steps to look ahead. + :param int estimation_step: the number of steps to look ahead. Default to 1. :param int target_update_freq: the target network update frequency (0 if you do not use the target network). :param bool reward_normalization: normalize the reward to Normal(0, 1).
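
Below is a minimal usage sketch (not part of the patch) showing how the parameters documented in the docstrings above fit together, assuming the tianshou API as it exists in this patch series; the network shapes and hyperparameter values are illustrative assumptions only:

import torch
from tianshou.policy import DQNPolicy
from tianshou.utils.net.common import Net

# Illustrative shapes for a small discrete-action task.
state_shape, action_shape = (4,), 2
net = Net(state_shape, action_shape, hidden_sizes=[64, 64])
optim = torch.optim.Adam(net.parameters(), lr=1e-3)
policy = DQNPolicy(
    net, optim,
    discount_factor=0.99,
    estimation_step=3,           # number of steps to look ahead (default 1)
    target_update_freq=320,      # 0 disables the target network (default 0)
    reward_normalization=False,  # must stay False when n-step returns are
                                 # computed, per the assert in compute_nstep_return
)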