diff --git a/setup.py b/setup.py
index 0e2b66de6..590d5fa4d 100644
--- a/setup.py
+++ b/setup.py
@@ -37,11 +37,19 @@
                                     'examples', 'examples.*',
                                     'docs', 'docs.*']),
     install_requires=[
-        'gym',
+        'gym>=0.15.0',
         'tqdm',
         'numpy',
         'cloudpickle',
         'tensorboard',
         'torch>=1.4.0',
     ],
+    extras_require={
+        'atari': [
+            'atari_py',
+        ],
+        'mujoco': [
+            'mujoco_py',
+        ]
+    },
 )
diff --git a/test/base/test_buffer.py b/test/base/test_buffer.py
index b2bfe126f..38d3bc557 100644
--- a/test/base/test_buffer.py
+++ b/test/base/test_buffer.py
@@ -1,4 +1,5 @@
 from tianshou.data import ReplayBuffer
+
 if __name__ == '__main__':
     from env import MyTestEnv
 else:  # pytest
diff --git a/test/base/test_collector.py b/test/base/test_collector.py
index 2f50d7532..483b01b73 100644
--- a/test/base/test_collector.py
+++ b/test/base/test_collector.py
@@ -11,6 +11,7 @@

 class MyPolicy(BasePolicy):
     """docstring for MyPolicy"""
+
     def __init__(self):
         super().__init__()

diff --git a/tianshou/__init__.py b/tianshou/__init__.py
index 35280106f..95a94beb4 100644
--- a/tianshou/__init__.py
+++ b/tianshou/__init__.py
@@ -1,4 +1,4 @@
-from tianshou import data, env, utils, policy, trainer,\
+from tianshou import data, env, utils, policy, trainer, \
     exploration

 __version__ = '0.2.0'
diff --git a/tianshou/data/batch.py b/tianshou/data/batch.py
index 13777abbd..d2af6b552 100644
--- a/tianshou/data/batch.py
+++ b/tianshou/data/batch.py
@@ -37,7 +37,7 @@ def append(self, batch):
             else:
                 raise TypeError(
                     'No support for append with type {} in class Batch.'
-                    .format(type(batch.__dict__[k])))
+                        .format(type(batch.__dict__[k])))

     def split(self, size=None, permute=True):
         length = min([
diff --git a/tianshou/data/buffer.py b/tianshou/data/buffer.py
index ccb0e92af..82a34bf89 100644
--- a/tianshou/data/buffer.py
+++ b/tianshou/data/buffer.py
@@ -47,7 +47,7 @@ def add(self, obs, act, rew, done, obs_next=0, info={}, weight=None):
         '''
         weight: importance weights, disabled here
         '''
-        assert isinstance(info, dict),\
+        assert isinstance(info, dict), \
             'You should return a dict in the last argument of env.step().'
         self._add_to_buffer('obs', obs)
         self._add_to_buffer('act', act)
diff --git a/tianshou/data/collector.py b/tianshou/data/collector.py
index 036d199c8..1c2fe9ddf 100644
--- a/tianshou/data/collector.py
+++ b/tianshou/data/collector.py
@@ -31,8 +31,8 @@ def __init__(self, policy, env, buffer=None, stat_size=100):
         if self._multi_env:
             self.env_num = len(env)
             if isinstance(self.buffer, list):
-                assert len(self.buffer) == self.env_num,\
-                    'The number of data buffer does not match the number of '\
+                assert len(self.buffer) == self.env_num, \
+                    'The number of data buffer does not match the number of ' \
                     'input env.'
                 self._multi_buf = True
             elif isinstance(self.buffer, ReplayBuffer):
@@ -87,7 +87,7 @@ def collect(self, n_step=0, n_episode=0, render=0):
         if not self._multi_env:
             n_episode = np.sum(n_episode)
         start_time = time.time()
-        assert sum([(n_step != 0), (n_episode != 0)]) == 1,\
+        assert sum([(n_step != 0), (n_episode != 0)]) == 1, \
            "One and only one collection number specification permitted!"
         cur_step = 0
         cur_episode = np.zeros(self.env_num) if self._multi_env else 0
diff --git a/tianshou/env/__init__.py b/tianshou/env/__init__.py
index 045f703af..81f20145b 100644
--- a/tianshou/env/__init__.py
+++ b/tianshou/env/__init__.py
@@ -1,6 +1,6 @@
 from tianshou.env.utils import CloudpickleWrapper
 from tianshou.env.common import EnvWrapper, FrameStack
-from tianshou.env.vecenv import BaseVectorEnv, VectorEnv,\
+from tianshou.env.vecenv import BaseVectorEnv, VectorEnv, \
     SubprocVectorEnv, RayVectorEnv

 __all__ = [
diff --git a/tianshou/env/vecenv.py b/tianshou/env/vecenv.py
index 40b4dcb33..4f43a6591 100644
--- a/tianshou/env/vecenv.py
+++ b/tianshou/env/vecenv.py
@@ -1,6 +1,7 @@
 import numpy as np
 from abc import ABC, abstractmethod
 from multiprocessing import Process, Pipe
+
 try:
     import ray
 except ImportError:
@@ -122,7 +123,7 @@ def __init__(self, env_fns):
             zip(*[Pipe() for _ in range(self.env_num)])
         self.processes = [
             Process(target=worker, args=(
-                parent, child, CloudpickleWrapper(env_fn)), daemon=True)
+                    parent, child, CloudpickleWrapper(env_fn)), daemon=True)
             for (parent, child, env_fn) in zip(
                 self.parent_remote, self.child_remote, env_fns)
         ]
diff --git a/tianshou/exploration/random.py b/tianshou/exploration/random.py
index 011afbed8..12d92eaa0 100644
--- a/tianshou/exploration/random.py
+++ b/tianshou/exploration/random.py
@@ -14,7 +14,7 @@ def __call__(self, size, mu=.1):
         if self.x is None or self.x.shape != size:
             self.x = 0
         self.x = self.x + self.alpha * (mu - self.x) + \
-            self.beta * np.random.normal(size=size)
+                self.beta * np.random.normal(size=size)
         return self.x

     def reset(self):
diff --git a/tianshou/policy/a2c.py b/tianshou/policy/a2c.py
index a1b649079..4de99cf26 100644
--- a/tianshou/policy/a2c.py
+++ b/tianshou/policy/a2c.py
@@ -40,8 +40,8 @@ def learn(self, batch, batch_size=None, repeat=1):
             vf_loss = F.mse_loss(r[:, None], v)
             ent_loss = dist.entropy().mean()
             loss = actor_loss \
-                + self._w_vf * vf_loss \
-                - self._w_ent * ent_loss
+                    + self._w_vf * vf_loss \
+                    - self._w_ent * ent_loss
             loss.backward()
             if self._grad_norm:
                 nn.utils.clip_grad_norm_(
diff --git a/tianshou/policy/ddpg.py b/tianshou/policy/ddpg.py
index 231b44822..810e09bca 100644
--- a/tianshou/policy/ddpg.py
+++ b/tianshou/policy/ddpg.py
@@ -5,6 +5,8 @@
 from tianshou.data import Batch
 from tianshou.policy import BasePolicy
+
+
 # from tianshou.exploration import OUNoise
diff --git a/tianshou/policy/pg.py b/tianshou/policy/pg.py
index 740089328..2f5223368 100644
--- a/tianshou/policy/pg.py
+++ b/tianshou/policy/pg.py
@@ -35,7 +35,7 @@ def __call__(self, batch, state=None):
     def learn(self, batch, batch_size=None, repeat=1):
         losses = []
         batch.returns = (batch.returns - batch.returns.mean()) \
-            / (batch.returns.std() + self._eps)
+                / (batch.returns.std() + self._eps)
         for _ in range(repeat):
             for b in batch.split(batch_size):
                 self.optim.zero_grad()
diff --git a/tianshou/policy/ppo.py b/tianshou/policy/ppo.py
index 0fead3b06..f972a26d4 100644
--- a/tianshou/policy/ppo.py
+++ b/tianshou/policy/ppo.py
@@ -59,7 +59,7 @@ def sync_weight(self):
     def learn(self, batch, batch_size=None, repeat=1):
         losses, clip_losses, vf_losses, ent_losses = [], [], [], []
         batch.returns = (batch.returns - batch.returns.mean()) \
-            / (batch.returns.std() + self._eps)
+                / (batch.returns.std() + self._eps)
         batch.act = torch.tensor(batch.act)
         batch.returns = torch.tensor(batch.returns)[:, None]
         for _ in range(repeat):
@@ -82,13 +82,13 @@ def learn(self, batch, batch_size=None, repeat=1):
                 ent_loss = dist.entropy().mean()
                 ent_losses.append(ent_loss.detach().cpu().numpy())
                 loss = clip_loss \
-                    + self._w_vf * vf_loss - self._w_ent * ent_loss
+                        + self._w_vf * vf_loss - self._w_ent * ent_loss
                 losses.append(loss.detach().cpu().numpy())
                 self.optim.zero_grad()
                 loss.backward()
                 nn.utils.clip_grad_norm_(list(
                     self.actor.parameters()) + list(self.critic.parameters()),
-                    self._max_grad_norm)
+                        self._max_grad_norm)
                 self.optim.step()
         self.sync_weight()
         return {