From e45884545f44c84ac177a8eb4dad522cd1a74445 Mon Sep 17 00:00:00 2001 From: mehooz Date: Thu, 26 Mar 2020 02:06:48 +0800 Subject: [PATCH 01/17] update atari.py --- setup.py | 4 +++- test/base/test_buffer.py | 1 + test/base/test_collector.py | 1 + tianshou/__init__.py | 2 +- tianshou/data/batch.py | 2 +- tianshou/data/buffer.py | 2 +- tianshou/data/collector.py | 6 +++--- tianshou/env/__init__.py | 2 +- tianshou/env/vecenv.py | 3 ++- tianshou/exploration/random.py | 2 +- tianshou/policy/a2c.py | 4 ++-- tianshou/policy/ddpg.py | 2 ++ tianshou/policy/pg.py | 2 +- tianshou/policy/ppo.py | 6 +++--- 14 files changed, 23 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index 0e2b66de6..ca3582e38 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,9 @@ 'examples', 'examples.*', 'docs', 'docs.*']), install_requires=[ - 'gym', + 'gym>=0.15.0', + 'atari_py', + 'mujoco_py' 'tqdm', 'numpy', 'cloudpickle', diff --git a/test/base/test_buffer.py b/test/base/test_buffer.py index b2bfe126f..38d3bc557 100644 --- a/test/base/test_buffer.py +++ b/test/base/test_buffer.py @@ -1,4 +1,5 @@ from tianshou.data import ReplayBuffer + if __name__ == '__main__': from env import MyTestEnv else: # pytest diff --git a/test/base/test_collector.py b/test/base/test_collector.py index 2f50d7532..483b01b73 100644 --- a/test/base/test_collector.py +++ b/test/base/test_collector.py @@ -11,6 +11,7 @@ class MyPolicy(BasePolicy): """docstring for MyPolicy""" + def __init__(self): super().__init__() diff --git a/tianshou/__init__.py b/tianshou/__init__.py index 35280106f..95a94beb4 100644 --- a/tianshou/__init__.py +++ b/tianshou/__init__.py @@ -1,4 +1,4 @@ -from tianshou import data, env, utils, policy, trainer,\ +from tianshou import data, env, utils, policy, trainer, \ exploration __version__ = '0.2.0' diff --git a/tianshou/data/batch.py b/tianshou/data/batch.py index 13777abbd..d2af6b552 100644 --- a/tianshou/data/batch.py +++ b/tianshou/data/batch.py @@ -37,7 +37,7 @@ def append(self, batch): else: raise TypeError( 'No support for append with type {} in class Batch.' - .format(type(batch.__dict__[k]))) + .format(type(batch.__dict__[k]))) def split(self, size=None, permute=True): length = min([ diff --git a/tianshou/data/buffer.py b/tianshou/data/buffer.py index ccb0e92af..82a34bf89 100644 --- a/tianshou/data/buffer.py +++ b/tianshou/data/buffer.py @@ -47,7 +47,7 @@ def add(self, obs, act, rew, done, obs_next=0, info={}, weight=None): ''' weight: importance weights, disabled here ''' - assert isinstance(info, dict),\ + assert isinstance(info, dict), \ 'You should return a dict in the last argument of env.step().' self._add_to_buffer('obs', obs) self._add_to_buffer('act', act) diff --git a/tianshou/data/collector.py b/tianshou/data/collector.py index 036d199c8..1c2fe9ddf 100644 --- a/tianshou/data/collector.py +++ b/tianshou/data/collector.py @@ -31,8 +31,8 @@ def __init__(self, policy, env, buffer=None, stat_size=100): if self._multi_env: self.env_num = len(env) if isinstance(self.buffer, list): - assert len(self.buffer) == self.env_num,\ - 'The number of data buffer does not match the number of '\ + assert len(self.buffer) == self.env_num, \ + 'The number of data buffer does not match the number of ' \ 'input env.' 
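
The install_requires hunk above adds 'mujoco_py' without a trailing comma before 'tqdm'. Python concatenates adjacent string literals, so the two entries silently merge into a single, non-existent requirement; the later patches in this series drop that line and move the optional dependencies into extras_require instead. A minimal sketch of the pitfall (the list below is illustrative, not the actual setup.py):

    # Adjacent string literals are joined at parse time, so a missing comma
    # turns two requirements into one bogus package name.
    install_requires = [
        'gym>=0.15.0',
        'mujoco_py'   # <- missing comma
        'tqdm',
    ]
    print(install_requires)  # ['gym>=0.15.0', 'mujoco_pytqdm']
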
self._multi_buf = True elif isinstance(self.buffer, ReplayBuffer): @@ -87,7 +87,7 @@ def collect(self, n_step=0, n_episode=0, render=0): if not self._multi_env: n_episode = np.sum(n_episode) start_time = time.time() - assert sum([(n_step != 0), (n_episode != 0)]) == 1,\ + assert sum([(n_step != 0), (n_episode != 0)]) == 1, \ "One and only one collection number specification permitted!" cur_step = 0 cur_episode = np.zeros(self.env_num) if self._multi_env else 0 diff --git a/tianshou/env/__init__.py b/tianshou/env/__init__.py index 045f703af..81f20145b 100644 --- a/tianshou/env/__init__.py +++ b/tianshou/env/__init__.py @@ -1,6 +1,6 @@ from tianshou.env.utils import CloudpickleWrapper from tianshou.env.common import EnvWrapper, FrameStack -from tianshou.env.vecenv import BaseVectorEnv, VectorEnv,\ +from tianshou.env.vecenv import BaseVectorEnv, VectorEnv, \ SubprocVectorEnv, RayVectorEnv __all__ = [ diff --git a/tianshou/env/vecenv.py b/tianshou/env/vecenv.py index 40b4dcb33..4f43a6591 100644 --- a/tianshou/env/vecenv.py +++ b/tianshou/env/vecenv.py @@ -1,6 +1,7 @@ import numpy as np from abc import ABC, abstractmethod from multiprocessing import Process, Pipe + try: import ray except ImportError: @@ -122,7 +123,7 @@ def __init__(self, env_fns): zip(*[Pipe() for _ in range(self.env_num)]) self.processes = [ Process(target=worker, args=( - parent, child, CloudpickleWrapper(env_fn)), daemon=True) + parent, child, CloudpickleWrapper(env_fn)), daemon=True) for (parent, child, env_fn) in zip( self.parent_remote, self.child_remote, env_fns) ] diff --git a/tianshou/exploration/random.py b/tianshou/exploration/random.py index 011afbed8..12d92eaa0 100644 --- a/tianshou/exploration/random.py +++ b/tianshou/exploration/random.py @@ -14,7 +14,7 @@ def __call__(self, size, mu=.1): if self.x is None or self.x.shape != size: self.x = 0 self.x = self.x + self.alpha * (mu - self.x) + \ - self.beta * np.random.normal(size=size) + self.beta * np.random.normal(size=size) return self.x def reset(self): diff --git a/tianshou/policy/a2c.py b/tianshou/policy/a2c.py index a1b649079..4de99cf26 100644 --- a/tianshou/policy/a2c.py +++ b/tianshou/policy/a2c.py @@ -40,8 +40,8 @@ def learn(self, batch, batch_size=None, repeat=1): vf_loss = F.mse_loss(r[:, None], v) ent_loss = dist.entropy().mean() loss = actor_loss \ - + self._w_vf * vf_loss \ - - self._w_ent * ent_loss + + self._w_vf * vf_loss \ + - self._w_ent * ent_loss loss.backward() if self._grad_norm: nn.utils.clip_grad_norm_( diff --git a/tianshou/policy/ddpg.py b/tianshou/policy/ddpg.py index 231b44822..810e09bca 100644 --- a/tianshou/policy/ddpg.py +++ b/tianshou/policy/ddpg.py @@ -5,6 +5,8 @@ from tianshou.data import Batch from tianshou.policy import BasePolicy + + # from tianshou.exploration import OUNoise diff --git a/tianshou/policy/pg.py b/tianshou/policy/pg.py index 740089328..2f5223368 100644 --- a/tianshou/policy/pg.py +++ b/tianshou/policy/pg.py @@ -35,7 +35,7 @@ def __call__(self, batch, state=None): def learn(self, batch, batch_size=None, repeat=1): losses = [] batch.returns = (batch.returns - batch.returns.mean()) \ - / (batch.returns.std() + self._eps) + / (batch.returns.std() + self._eps) for _ in range(repeat): for b in batch.split(batch_size): self.optim.zero_grad() diff --git a/tianshou/policy/ppo.py b/tianshou/policy/ppo.py index 0fead3b06..f972a26d4 100644 --- a/tianshou/policy/ppo.py +++ b/tianshou/policy/ppo.py @@ -59,7 +59,7 @@ def sync_weight(self): def learn(self, batch, batch_size=None, repeat=1): losses, clip_losses, vf_losses, 
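
The random.py hunk above is the Ornstein-Uhlenbeck exploration noise referenced by the commented-out import in ddpg.py: each call pulls the internal state back toward mu by a factor alpha and adds Gaussian noise scaled by beta, giving temporally correlated perturbations rather than independent samples. A standalone sketch of the same recursion (class name and parameter values here are illustrative, not the library's exact defaults):

    import numpy as np

    class OUNoiseSketch:
        """x_{t+1} = x_t + alpha * (mu - x_t) + beta * N(0, 1), elementwise."""

        def __init__(self, alpha=0.15, beta=0.3):
            self.alpha, self.beta, self.x = alpha, beta, None

        def __call__(self, size, mu=0.1):
            # reset the state when the requested shape changes, as in the hunk above
            if self.x is None or self.x.shape != size:
                self.x = 0.
            self.x = self.x + self.alpha * (mu - self.x) \
                + self.beta * np.random.normal(size=size)
            return self.x

    noise = OUNoiseSketch()
    print(noise(size=(3,)))   # each call is correlated with the previous one
    print(noise(size=(3,)))
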
ent_losses = [], [], [], [] batch.returns = (batch.returns - batch.returns.mean()) \ - / (batch.returns.std() + self._eps) + / (batch.returns.std() + self._eps) batch.act = torch.tensor(batch.act) batch.returns = torch.tensor(batch.returns)[:, None] for _ in range(repeat): @@ -82,13 +82,13 @@ def learn(self, batch, batch_size=None, repeat=1): ent_loss = dist.entropy().mean() ent_losses.append(ent_loss.detach().cpu().numpy()) loss = clip_loss \ - + self._w_vf * vf_loss - self._w_ent * ent_loss + + self._w_vf * vf_loss - self._w_ent * ent_loss losses.append(loss.detach().cpu().numpy()) self.optim.zero_grad() loss.backward() nn.utils.clip_grad_norm_(list( self.actor.parameters()) + list(self.critic.parameters()), - self._max_grad_norm) + self._max_grad_norm) self.optim.step() self.sync_weight() return { From 920b398d445ed7083e7e9f6954d2ab34b621fd0e Mon Sep 17 00:00:00 2001 From: mehooz Date: Thu, 26 Mar 2020 08:38:34 +0800 Subject: [PATCH 02/17] fix setup.py pass the pytest --- setup.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ca3582e38..66fb7a5ce 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,6 @@ 'docs', 'docs.*']), install_requires=[ 'gym>=0.15.0', - 'atari_py', 'mujoco_py' 'tqdm', 'numpy', @@ -46,4 +45,12 @@ 'tensorboard', 'torch>=1.4.0', ], + extras_require={ + 'atari': [ + 'atari_py', + ], + 'mujoco': [ + 'mujoco_py', + ] + }, ) From 3c6b1305bda3bca7111e9dfad5a419889adef510 Mon Sep 17 00:00:00 2001 From: mehooz Date: Thu, 26 Mar 2020 08:45:25 +0800 Subject: [PATCH 03/17] fix setup.py pass the pytest --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 66fb7a5ce..590d5fa4d 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,6 @@ 'docs', 'docs.*']), install_requires=[ 'gym>=0.15.0', - 'mujoco_py' 'tqdm', 'numpy', 'cloudpickle', From 0a9c3bc3939908e96faf31971ec68e252fb676a0 Mon Sep 17 00:00:00 2001 From: mehooz Date: Thu, 26 Mar 2020 11:56:45 +0800 Subject: [PATCH 04/17] add args "render" --- test/continuous/test_ddpg.py | 3 ++- test/continuous/test_ppo.py | 3 ++- test/continuous/test_sac.py | 3 ++- test/continuous/test_td3.py | 3 ++- test/discrete/test_a2c.py | 4 +++- test/discrete/test_dqn.py | 3 ++- test/discrete/test_pg.py | 3 ++- test/discrete/test_ppo.py | 3 ++- 8 files changed, 17 insertions(+), 8 deletions(-) diff --git a/test/continuous/test_ddpg.py b/test/continuous/test_ddpg.py index cb100da4c..b181cf281 100644 --- a/test/continuous/test_ddpg.py +++ b/test/continuous/test_ddpg.py @@ -34,6 +34,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -97,7 +98,7 @@ def stop_fn(x): # Let's watch its performance! 
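
Both the pg and ppo learn() hunks above normalize the collected returns to zero mean and unit variance before computing the loss; the small _eps added to the denominator only matters when every return in a batch happens to be identical, in which case a bare std() would divide by zero. A quick numeric check (values are made up):

    import numpy as np

    eps = np.finfo(np.float32).eps.item()
    returns = np.array([10., 10., 10.])   # degenerate batch: std() == 0
    normalized = (returns - returns.mean()) / (returns.std() + eps)
    print(normalized)                     # [0. 0. 0.] instead of nan/inf
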
env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/continuous/test_ppo.py b/test/continuous/test_ppo.py index 3867becf2..bbab03547 100644 --- a/test/continuous/test_ppo.py +++ b/test/continuous/test_ppo.py @@ -32,6 +32,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=16) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -106,7 +107,7 @@ def stop_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/continuous/test_sac.py b/test/continuous/test_sac.py index a900940d3..54ba68328 100644 --- a/test/continuous/test_sac.py +++ b/test/continuous/test_sac.py @@ -34,6 +34,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -102,7 +103,7 @@ def stop_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index 78db4934e..074efc316 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -37,6 +37,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -106,7 +107,7 @@ def stop_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/discrete/test_a2c.py b/test/discrete/test_a2c.py index 20ef0efc0..4fc1ad866 100644 --- a/test/discrete/test_a2c.py +++ b/test/discrete/test_a2c.py @@ -32,6 +32,8 @@ def get_args(): parser.add_argument('--training-num', type=int, default=32) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) + parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -91,7 +93,7 @@ def stop_fn(x): # Let's watch its performance! 
env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/discrete/test_dqn.py b/test/discrete/test_dqn.py index 3a119936f..e3665e11c 100644 --- a/test/discrete/test_dqn.py +++ b/test/discrete/test_dqn.py @@ -35,6 +35,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -99,7 +100,7 @@ def test_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/discrete/test_pg.py b/test/discrete/test_pg.py index e0f4a083b..0b605e02a 100644 --- a/test/discrete/test_pg.py +++ b/test/discrete/test_pg.py @@ -86,6 +86,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -139,7 +140,7 @@ def stop_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/discrete/test_ppo.py b/test/discrete/test_ppo.py index d21dd6c02..5346df952 100644 --- a/test/discrete/test_ppo.py +++ b/test/discrete/test_ppo.py @@ -32,6 +32,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=32) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -96,7 +97,7 @@ def stop_fn(x): # Let's watch its performance! 
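
The --render argument added across the test scripts replaces the hard-coded render=1 / 35 in the final evaluation collect() call. Judging from that old value, the float is most plausibly a per-frame delay in seconds (1/35 ≈ 35 FPS), with the default 0. keeping rendering disabled; treat that reading as an assumption, since the collector's handling of the value is not shown in these hunks. A small sketch of the flag as the scripts define it:

    import argparse

    parser = argparse.ArgumentParser()
    # the help text states the assumed meaning of the float, not wording from the patch
    parser.add_argument('--render', type=float, default=0.,
                        help='assumed: delay in seconds between rendered frames; 0. disables rendering')
    args = parser.parse_args(['--render', str(1 / 35)])  # reproduce the previously hard-coded value
    print(args.render)
    # the scripts then forward it unchanged:
    # result = collector.collect(n_episode=1, render=args.render)
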
env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() From d83c0e7e0c0d25fda9b063ebe8eb3eeaba9cf338 Mon Sep 17 00:00:00 2001 From: mehooz Date: Thu, 26 Mar 2020 18:45:04 +0800 Subject: [PATCH 05/17] change the tensorboard writter --- .idea/workspace.xml | 203 +++++++++++++++++++--------------- test/continuous/test_ddpg.py | 10 +- test/continuous/test_ppo.py | 10 +- test/continuous/test_sac.py | 10 +- test/continuous/test_td3.py | 10 +- test/discrete/test_a2c.py | 4 +- test/discrete/test_dqn.py | 4 +- test/discrete/test_pg.py | 4 +- test/discrete/test_ppo.py | 4 +- tianshou/data/collector.py | 4 +- tianshou/policy/a2c.py | 45 ++++---- tianshou/policy/pg.py | 83 +++++++------- tianshou/policy/ppo.py | 101 ++++++++--------- tianshou/trainer/offpolicy.py | 6 +- tianshou/trainer/onpolicy.py | 6 +- 15 files changed, 266 insertions(+), 238 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 9b1b570e8..cf9f34f89 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,13 +2,21 @@ - - - + + + + + + + + + + + - - + + - - + + @@ -31,11 +39,11 @@ - - + + - - + + @@ -43,11 +51,11 @@ - - + + - - + + @@ -55,25 +63,23 @@ - - + + - - - - - + + + - - + + - - + + - + @@ -82,8 +88,8 @@ - - + + @@ -94,8 +100,8 @@ - - + + @@ -103,11 +109,11 @@ - + - - + + @@ -118,8 +124,8 @@ - - + + @@ -127,12 +133,14 @@ - - + + - - - + + + + + @@ -146,6 +154,11 @@ + + + SummaryWriter + + - - + - + + @@ -316,7 +326,6 @@ - @@ -333,7 +342,9 @@ \ No newline at end of file From d8c41520436b55efe382f0b6d10f3f7e8849d583 Mon Sep 17 00:00:00 2001 From: mehooz Date: Fri, 27 Mar 2020 09:27:33 +0800 Subject: [PATCH 09/17] remove some wrong local files --- .idea/deployment.xml | 50 --- .idea/modules.xml | 8 - .idea/tianshou.iml | 10 - .idea/vcs.xml | 6 - .idea/workspace.xml | 877 ------------------------------------------- 5 files changed, 951 deletions(-) delete mode 100644 .idea/deployment.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/tianshou.iml delete mode 100644 .idea/vcs.xml delete mode 100644 .idea/workspace.xml diff --git a/.idea/deployment.xml b/.idea/deployment.xml deleted file mode 100644 index ed4b5ca00..000000000 --- a/.idea/deployment.xml +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 449f73cb0..000000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/tianshou.iml b/.idea/tianshou.iml deleted file mode 100644 index bb8c7e04c..000000000 --- a/.idea/tianshou.iml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7f4..000000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml deleted file mode 100644 index 2b26e03f2..000000000 --- a/.idea/workspace.xml +++ /dev/null @@ -1,877 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
[remainder of the deleted .idea/workspace.xml: IDE-local editor state (find history, run configurations) with no bearing on the library code]
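
The fifth patch (changing the TensorBoard writer) modifies the trainers, the collector, the policies and the test scripts according to its diffstat, but those hunks are not included above. The sketch below therefore only illustrates the general pattern of logging with torch.utils.tensorboard.SummaryWriter; how that commit actually threads the writer through the trainer is an assumption, not something shown here.

    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter('log/CartPole-v0/dqn')           # hypothetical log directory
    for step, reward in enumerate([10.0, 55.0, 200.0]):     # dummy statistics, not real results
        writer.add_scalar('train/reward', reward, global_step=step)
    writer.close()

    # presumably the test scripts then hand the writer to the trainer, e.g.:
    # result = offpolicy_trainer(policy, train_collector, test_collector, ..., writer=writer)
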