From f3782c532ec52eaa152ea3f6b572754087345cc9 Mon Sep 17 00:00:00 2001 From: Jiayi Weng Date: Wed, 4 May 2022 18:31:01 -0400 Subject: [PATCH 1/5] add vecenv wrappers --- .github/workflows/extra_sys.yml | 2 +- .github/workflows/gputest.yml | 2 +- .github/workflows/profile.yml | 2 +- .github/workflows/pytest.yml | 2 +- Makefile | 2 +- README.md | 15 +--- setup.py | 2 +- test/base/test_env.py | 83 +++++++++++++++++++++- tianshou/__init__.py | 2 +- tianshou/env/__init__.py | 3 + tianshou/env/venv_wrappers.py | 119 ++++++++++++++++++++++++++++++++ tianshou/env/venvs.py | 48 +++---------- 12 files changed, 222 insertions(+), 60 deletions(-) create mode 100644 tianshou/env/venv_wrappers.py diff --git a/.github/workflows/extra_sys.yml b/.github/workflows/extra_sys.yml index 4e6ecdad0..89599525e 100644 --- a/.github/workflows/extra_sys.yml +++ b/.github/workflows/extra_sys.yml @@ -28,4 +28,4 @@ jobs: wandb login e2366d661b89f2bee877c40bee15502d67b7abef - name: Test with pytest run: | - pytest test/base test/continuous --cov=tianshou --durations=0 -v + pytest test/base test/continuous --cov=tianshou --durations=0 -v --color=yes diff --git a/.github/workflows/gputest.yml b/.github/workflows/gputest.yml index 8032bd3f9..5b409b8eb 100644 --- a/.github/workflows/gputest.yml +++ b/.github/workflows/gputest.yml @@ -24,4 +24,4 @@ jobs: - name: Test with pytest # ignore test/throughput which only profiles the code run: | - pytest test --ignore-glob='*profile.py' --cov=tianshou --cov-report=xml --durations=0 -v + pytest test --ignore-glob='*profile.py' --cov=tianshou --cov-report=xml --durations=0 -v --color=yes diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index 82c793e99..541ea0219 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -20,4 +20,4 @@ jobs: python -m pip install ".[dev]" --upgrade - name: Test with pytest run: | - pytest test/throughput --durations=0 -v + pytest test/throughput --durations=0 -v --color=yes diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index cd44457cd..f7ace0751 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -27,7 +27,7 @@ jobs: - name: Test with pytest # ignore test/throughput which only profiles the code run: | - pytest test --ignore-glob='*profile.py' --ignore="test/3rd_party" --cov=tianshou --cov-report=xml --cov-report=term-missing --durations=0 -v + pytest test --ignore-glob='*profile.py' --ignore="test/3rd_party" --cov=tianshou --cov-report=xml --cov-report=term-missing --durations=0 -v --color=yes - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 with: diff --git a/Makefile b/Makefile index b9967f886..5bc170698 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ pytest: $(call check_install, pytest) $(call check_install, pytest_cov) $(call check_install, pytest_xdist) - pytest test --cov ${PROJECT_PATH} --durations 0 -v --cov-report term-missing + pytest test --cov ${PROJECT_PATH} --durations 0 -v --cov-report term-missing --color=yes mypy: $(call check_install, mypy) diff --git a/README.md b/README.md index 46e3f9a15..0807fd8bb 100644 --- a/README.md +++ b/README.md @@ -4,16 +4,7 @@ --- -[![PyPI](https://img.shields.io/pypi/v/tianshou)](https://pypi.org/project/tianshou/) -[![Conda](https://img.shields.io/conda/vn/conda-forge/tianshou)](https://github.com/conda-forge/tianshou-feedstock) -[![Read the Docs](https://img.shields.io/readthedocs/tianshou)](https://tianshou.readthedocs.io/en/master) -[![Read the 
Docs](https://img.shields.io/readthedocs/tianshou-docs-zh-cn?label=%E4%B8%AD%E6%96%87%E6%96%87%E6%A1%A3)](https://tianshou.readthedocs.io/zh/master/) -[![Unittest](https://github.com/thu-ml/tianshou/workflows/Unittest/badge.svg?branch=master)](https://github.com/thu-ml/tianshou/actions) -[![codecov](https://img.shields.io/codecov/c/gh/thu-ml/tianshou)](https://codecov.io/gh/thu-ml/tianshou) -[![GitHub issues](https://img.shields.io/github/issues/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/issues) -[![GitHub stars](https://img.shields.io/github/stars/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/stargazers) -[![GitHub forks](https://img.shields.io/github/forks/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/network) -[![GitHub license](https://img.shields.io/github/license/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/blob/master/LICENSE) +[![PyPI](https://img.shields.io/pypi/v/tianshou)](https://pypi.org/project/tianshou/) [![Conda](https://img.shields.io/conda/vn/conda-forge/tianshou)](https://github.com/conda-forge/tianshou-feedstock) [![Read the Docs](https://img.shields.io/readthedocs/tianshou)](https://tianshou.readthedocs.io/en/master) [![Read the Docs](https://img.shields.io/readthedocs/tianshou-docs-zh-cn?label=%E4%B8%AD%E6%96%87%E6%96%87%E6%A1%A3)](https://tianshou.readthedocs.io/zh/master/) [![Unittest](https://github.com/thu-ml/tianshou/workflows/Unittest/badge.svg?branch=master)](https://github.com/thu-ml/tianshou/actions) [![codecov](https://img.shields.io/codecov/c/gh/thu-ml/tianshou)](https://codecov.io/gh/thu-ml/tianshou) [![GitHub issues](https://img.shields.io/github/issues/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/issues) [![GitHub stars](https://img.shields.io/github/stars/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/stargazers) [![GitHub forks](https://img.shields.io/github/forks/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/network) [![GitHub license](https://img.shields.io/github/license/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/blob/master/LICENSE) **Tianshou** ([天授](https://baike.baidu.com/item/%E5%A4%A9%E6%8E%88)) is a reinforcement learning platform based on pure PyTorch. Unlike existing reinforcement learning libraries, which are mainly based on TensorFlow, have many nested classes, unfriendly API, or slow-speed, Tianshou provides a fast-speed modularized framework and pythonic API for building the deep reinforcement learning agent with the least number of lines of code. The supported interface algorithms currently include: @@ -48,7 +39,7 @@ - [Posterior Sampling Reinforcement Learning (PSRL)](https://www.ece.uvic.ca/~bctill/papers/learning/Strens_2000.pdf) - [Intrinsic Curiosity Module (ICM)](https://arxiv.org/pdf/1705.05363.pdf) -Here is Tianshou's other features: +Here are Tianshou's other features: - Elegant framework, using only ~4000 lines of code - State-of-the-art [MuJoCo benchmark](https://github.com/thu-ml/tianshou/tree/master/examples/mujoco) for REINFORCE/A2C/TRPO/PPO/DDPG/TD3/SAC algorithms @@ -132,7 +123,7 @@ The example scripts are under [test/](https://github.com/thu-ml/tianshou/blob/ma (2): not all algorithms support this feature -(3): TQC and QR-DQN in [sb3-contrib](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib) instead of main repo +(3): TQC and QR-DQN in [sb3-contrib](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib) instead of main repo (4): super fast APPO! 
diff --git a/setup.py b/setup.py index 83ac36580..0ae409056 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ def get_extras_require() -> str: "pybullet": ["pybullet"], } if sys.platform == "linux": - req["dev"].append("envpool>=0.4.5") + req["dev"].append("envpool>=0.5.3") return req diff --git a/test/base/test_env.py b/test/base/test_env.py index 7e507c4ed..6aec96d47 100644 --- a/test/base/test_env.py +++ b/test/base/test_env.py @@ -2,16 +2,29 @@ import time import numpy as np +import pytest from gym.spaces.discrete import Discrete from tianshou.data import Batch -from tianshou.env import DummyVectorEnv, RayVectorEnv, ShmemVectorEnv, SubprocVectorEnv +from tianshou.env import ( + DummyVectorEnv, + RayVectorEnv, + ShmemVectorEnv, + SubprocVectorEnv, + VectorEnvNormObs, +) +from tianshou.utils import RunningMeanStd if __name__ == '__main__': from env import MyTestEnv, NXEnv else: # pytest from test.base.env import MyTestEnv, NXEnv +try: + import envpool +except ImportError: + envpool = None + def has_ray(): try: @@ -195,7 +208,7 @@ def assert_get(v, expected): v.close() -def test_env_obs(): +def test_env_obs_dtype(): for obs_type in ["array", "object"]: envs = SubprocVectorEnv( [lambda i=x: NXEnv(i, obs_type) for x in [5, 10, 15, 20]] @@ -206,8 +219,72 @@ def test_env_obs(): assert obs.dtype == object +def run_align_norm_obs(raw_env, train_env, test_env, action_list): + eps = np.finfo(np.float32).eps.item() + raw_obs, train_obs = [raw_env.reset()], [train_env.reset()] + for action in action_list: + obs, rew, done, info = raw_env.step(action) + raw_obs.append(obs) + if np.any(done): + raw_obs.append(raw_env.reset(np.where(done)[0])) + obs, rew, done, info = train_env.step(action) + train_obs.append(obs) + if np.any(done): + train_obs.append(train_env.reset(np.where(done)[0])) + ref_rms = RunningMeanStd() + for ro, to in zip(raw_obs, train_obs): + ref_rms.update(ro) + no = (ro - ref_rms.mean) / np.sqrt(ref_rms.var + eps) + assert np.allclose(no, to) + assert np.allclose(ref_rms.mean, train_env.get_obs_rms().mean) + assert np.allclose(ref_rms.var, train_env.get_obs_rms().var) + assert np.allclose(ref_rms.mean, test_env.get_obs_rms().mean) + assert np.allclose(ref_rms.var, test_env.get_obs_rms().var) + test_obs = [test_env.reset()] + for action in action_list: + obs, rew, done, info = test_env.step(action) + test_obs.append(obs) + if np.any(done): + test_obs.append(test_env.reset(np.where(done)[0])) + for ro, to in zip(raw_obs, test_obs): + no = (ro - ref_rms.mean) / np.sqrt(ref_rms.var + eps) + assert np.allclose(no, to) + + +def test_venv_norm_obs(): + sizes = np.array([5, 10, 15, 20]) + action = np.array([1, 1, 1, 1]) + total_step = 30 + action_list = [action] * total_step + env_fns = [lambda i=x: MyTestEnv(size=i, array_state=True) for x in sizes] + raw = DummyVectorEnv(env_fns) + train_env = VectorEnvNormObs(DummyVectorEnv(env_fns)) + print(train_env.observation_space) + test_env = VectorEnvNormObs( + DummyVectorEnv(env_fns), update_obs_rms=False, obs_rms=train_env.get_obs_rms() + ) + run_align_norm_obs(raw, train_env, test_env, action_list) + + +@pytest.mark.skipif(envpool is None, reason="EnvPool doesn't support this platform") +def test_venv_wrapper_envpool(): + raw = envpool.make_gym("Ant-v3", num_envs=4) + train = VectorEnvNormObs(envpool.make_gym("Ant-v3", num_envs=4)) + test = VectorEnvNormObs( + envpool.make_gym("Ant-v3", num_envs=4), + update_obs_rms=False, + obs_rms=train.get_obs_rms() + ) + actions = [ + np.array([raw.action_space.sample() for _ in range(4)]) for i in 
range(30) + ] + run_align_norm_obs(raw, train, test, actions) + + if __name__ == '__main__': - test_env_obs() + test_venv_norm_obs() + test_venv_wrapper_envpool() + test_env_obs_dtype() test_vecenv() test_async_env() test_async_check_id() diff --git a/tianshou/__init__.py b/tianshou/__init__.py index 5a9abbf1b..1d5ee077a 100644 --- a/tianshou/__init__.py +++ b/tianshou/__init__.py @@ -1,6 +1,6 @@ from tianshou import data, env, exploration, policy, trainer, utils -__version__ = "0.4.7" +__version__ = "0.4.8" __all__ = [ "env", diff --git a/tianshou/env/__init__.py b/tianshou/env/__init__.py index 3845f2691..66fbc6810 100644 --- a/tianshou/env/__init__.py +++ b/tianshou/env/__init__.py @@ -1,5 +1,6 @@ """Env package.""" +from tianshou.env.venv_wrappers import VectorEnvNormObs, VectorEnvWrapper from tianshou.env.venvs import ( BaseVectorEnv, DummyVectorEnv, @@ -19,5 +20,7 @@ "SubprocVectorEnv", "ShmemVectorEnv", "RayVectorEnv", + "VectorEnvWrapper", + "VectorEnvNormObs", "PettingZooEnv", ] diff --git a/tianshou/env/venv_wrappers.py b/tianshou/env/venv_wrappers.py new file mode 100644 index 000000000..d2bdcf822 --- /dev/null +++ b/tianshou/env/venv_wrappers.py @@ -0,0 +1,119 @@ +from typing import Any, List, Optional, Tuple, Union + +import numpy as np + +from tianshou.env.venvs import GYM_RESERVED_KEYS, BaseVectorEnv +from tianshou.utils import RunningMeanStd + + +class VectorEnvWrapper(BaseVectorEnv): + """Base class for vectorized environments wrapper.""" + + def __init__(self, venv: BaseVectorEnv) -> None: + self.venv = venv + + def __len__(self) -> int: + return len(self.venv) + + def __getattribute__(self, key: str) -> Any: + if key in GYM_RESERVED_KEYS: # reserved keys in gym.Env + return getattr(self.venv, key) + else: + return super().__getattribute__(key) + + def get_env_attr( + self, + key: str, + id: Optional[Union[int, List[int], np.ndarray]] = None, + ) -> List[Any]: + return self.venv.get_env_attr(key, id) + + def set_env_attr( + self, + key: str, + value: Any, + id: Optional[Union[int, List[int], np.ndarray]] = None, + ) -> None: + return self.venv.set_env_attr(key, value, id) + + # TODO: compatible issue with reset -> (obs, info) + def reset( + self, id: Optional[Union[int, List[int], np.ndarray]] = None + ) -> np.ndarray: + return self.venv.reset(id) + + def step( + self, + action: np.ndarray, + id: Optional[Union[int, List[int], np.ndarray]] = None + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + return self.venv.step(action, id) + + def seed( + self, + seed: Optional[Union[int, List[int]]] = None + ) -> List[Optional[List[int]]]: + return self.venv.seed(seed) + + def render(self, **kwargs: Any) -> List[Any]: + return self.venv.render(**kwargs) + + def close(self) -> None: + self.venv.close() + + +class VectorEnvNormObs(VectorEnvWrapper): + """An observation normalization wrapper for vectorized environments. + + :param obs_rms: class to track mean&std of observation. If not given, it will + initialize a new one. Usually in envs that is used to evaluate algorithm, + obs_rms should be passed in. Default to None. + :param bool update_obs_rms: whether to update obs_rms. Default to True. + :param float clip_obs: the maximum absolute value for observation. Default to + 10.0. + :param float epsilon: To avoid division by zero. 
+ """ + + def __init__( + self, + venv: BaseVectorEnv, + obs_rms: Optional[RunningMeanStd] = None, + update_obs_rms: bool = True, + clip_obs: float = 10.0, + epsilon: float = np.finfo(np.float32).eps.item(), + ) -> None: + super().__init__(venv) + # initialize observation running mean/std + self.update_obs_rms = update_obs_rms + self.obs_rms = RunningMeanStd() if obs_rms is None else obs_rms + self.clip_max = clip_obs + self.eps = epsilon + + # TODO: compatible issue with reset -> (obs, info) + def reset( + self, id: Optional[Union[int, List[int], np.ndarray]] = None + ) -> np.ndarray: + obs = self.venv.reset(id) + if self.obs_rms and self.update_obs_rms: + self.obs_rms.update(obs) + return self._norm_obs(obs) + + def step( + self, + action: np.ndarray, + id: Optional[Union[int, List[int], np.ndarray]] = None + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + obs, rew, done, info = self.venv.step(action, id) + if self.obs_rms and self.update_obs_rms: + self.obs_rms.update(obs) + return self._norm_obs(obs), rew, done, info + + def _norm_obs(self, obs: np.ndarray) -> np.ndarray: + if self.obs_rms: + obs = (obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.eps) + obs = np.clip(obs, -self.clip_max, self.clip_max) + return obs + + def get_obs_rms(self) -> RunningMeanStd: + """Return observation running mean/std.""" + return self.obs_rms diff --git a/tianshou/env/venvs.py b/tianshou/env/venvs.py index 044ecaaa4..0a714e564 100644 --- a/tianshou/env/venvs.py +++ b/tianshou/env/venvs.py @@ -9,11 +9,14 @@ RayEnvWorker, SubprocEnvWorker, ) -from tianshou.utils import RunningMeanStd + +GYM_RESERVED_KEYS = [ + "metadata", "reward_range", "spec", "action_space", "observation_space" +] class BaseVectorEnv(object): - """Base class for vectorized environments wrapper. + """Base class for vectorized environments. Usage: :: @@ -61,13 +64,6 @@ def seed(self, seed): :param float timeout: use in asynchronous simulation same as above, in each vectorized step it only deal with those environments spending time within ``timeout`` seconds. - :param bool norm_obs: Whether to track mean/std of data and normalize observation - on return. For now, observation normalization only support observation of - type np.ndarray. - :param obs_rms: class to track mean&std of observation. If not given, it will - initialize a new one. Usually in envs that is used to evaluate algorithm, - obs_rms should be passed in. Default to None. - :param bool update_obs_rms: Whether to update obs_rms. Default to True. """ def __init__( @@ -76,9 +72,6 @@ def __init__( worker_fn: Callable[[Callable[[], gym.Env]], EnvWorker], wait_num: Optional[int] = None, timeout: Optional[float] = None, - norm_obs: bool = False, - obs_rms: Optional[RunningMeanStd] = None, - update_obs_rms: bool = True, ) -> None: self._env_fns = env_fns # A VectorEnv contains a pool of EnvWorkers, which corresponds to @@ -106,12 +99,6 @@ def __init__( self.ready_id = list(range(self.env_num)) self.is_closed = False - # initialize observation running mean/std - self.norm_obs = norm_obs - self.update_obs_rms = update_obs_rms - self.obs_rms = RunningMeanStd() if obs_rms is None and norm_obs else obs_rms - self.__eps = np.finfo(np.float32).eps.item() - def _assert_is_not_closed(self) -> None: assert not self.is_closed, \ f"Methods of {self.__class__.__name__} cannot be called after close." @@ -127,9 +114,7 @@ def __getattribute__(self, key: str) -> Any: ``action_space``. 
However, we would like the attribute lookup to go straight into the worker (in fact, this vector env's action_space is always None). """ - if key in [ - 'metadata', 'reward_range', 'spec', 'action_space', 'observation_space' - ]: # reserved keys in gym.Env + if key in GYM_RESERVED_KEYS: # reserved keys in gym.Env return self.get_env_attr(key) else: return super().__getattribute__(key) @@ -137,7 +122,7 @@ def __getattribute__(self, key: str) -> Any: def get_env_attr( self, key: str, - id: Optional[Union[int, List[int], np.ndarray]] = None + id: Optional[Union[int, List[int], np.ndarray]] = None, ) -> List[Any]: """Get an attribute from the underlying environments. @@ -162,7 +147,7 @@ def set_env_attr( self, key: str, value: Any, - id: Optional[Union[int, List[int], np.ndarray]] = None + id: Optional[Union[int, List[int], np.ndarray]] = None, ) -> None: """Set an attribute in the underlying environments. @@ -218,9 +203,7 @@ def reset( obs = np.stack(obs_list) except ValueError: # different len(obs) obs = np.array(obs_list, dtype=object) - if self.obs_rms and self.update_obs_rms: - self.obs_rms.update(obs) - return self.normalize_obs(obs) + return obs def step( self, @@ -299,9 +282,7 @@ def step( rew_stack, done_stack, info_stack = map( np.stack, [rew_list, done_list, info_list] ) - if self.obs_rms and self.update_obs_rms: - self.obs_rms.update(obs_stack) - return self.normalize_obs(obs_stack), rew_stack, done_stack, info_stack + return obs_stack, rew_stack, done_stack, info_stack def seed( self, @@ -347,15 +328,6 @@ def close(self) -> None: w.close() self.is_closed = True - def normalize_obs(self, obs: np.ndarray) -> np.ndarray: - """Normalize observations by statistics in obs_rms.""" - if self.obs_rms and self.norm_obs: - clip_max = 10.0 # this magic number is from openai baselines - # see baselines/common/vec_env/vec_normalize.py#L10 - obs = (obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.__eps) - obs = np.clip(obs, -clip_max, clip_max) - return obs - class DummyVectorEnv(BaseVectorEnv): """Dummy vectorized environment wrapper, implemented in for-loop. 
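
For reference, a minimal usage sketch of the `VectorEnvNormObs` wrapper introduced in this patch: a training vector env tracks observation running mean/std, and an evaluation env reuses those statistics without updating them. The task name `CartPole-v1` and the env count are placeholders, not part of the patch; the wrapper calls (`update_obs_rms`, `get_obs_rms`, the `obs_rms` constructor argument) mirror `test_venv_norm_obs` above. Note that a later patch in this series replaces the `obs_rms` constructor argument with a `set_obs_rms()` setter.

```python
import gym
import numpy as np

from tianshou.env import DummyVectorEnv, VectorEnvNormObs

# Plain vectorized envs (CartPole-v1 is only a placeholder task).
env_fns = [lambda: gym.make("CartPole-v1") for _ in range(4)]

# Training envs: update the running mean/std on every reset/step.
train_envs = VectorEnvNormObs(DummyVectorEnv(env_fns))

# Evaluation envs: normalize with the shared statistics but never update them.
test_envs = VectorEnvNormObs(
    DummyVectorEnv(env_fns),
    update_obs_rms=False,
    obs_rms=train_envs.get_obs_rms(),
)

obs = train_envs.reset()  # observations come back already normalized
obs, rew, done, info = train_envs.step(np.array([0, 1, 0, 1]))
```
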
From 96902d1e540e4596419ed5b44840b28be2b9c734 Mon Sep 17 00:00:00 2001 From: Jiayi Weng Date: Wed, 4 May 2022 22:02:26 -0400 Subject: [PATCH 2/5] using envpool for mujoco example --- examples/mujoco/analysis.py | 4 +-- examples/mujoco/mujoco_a2c.py | 45 ++++++++++++----------------- examples/mujoco/mujoco_ddpg.py | 30 ++++++------------- examples/mujoco/mujoco_env.py | 41 ++++++++++++++++++++++++++ examples/mujoco/mujoco_npg.py | 39 ++++++++++--------------- examples/mujoco/mujoco_ppo.py | 43 +++++++++++---------------- examples/mujoco/mujoco_redq.py | 28 +++++------------- examples/mujoco/mujoco_reinforce.py | 41 ++++++++++---------------- examples/mujoco/mujoco_sac.py | 34 +++++++--------------- examples/mujoco/mujoco_td3.py | 32 +++++++------------- examples/mujoco/mujoco_trpo.py | 41 ++++++++++---------------- test/base/test_env.py | 10 +++---- tianshou/env/venv_wrappers.py | 11 +++---- 13 files changed, 173 insertions(+), 226 deletions(-) create mode 100644 examples/mujoco/mujoco_env.py diff --git a/examples/mujoco/analysis.py b/examples/mujoco/analysis.py index ed0bb6872..00df841eb 100755 --- a/examples/mujoco/analysis.py +++ b/examples/mujoco/analysis.py @@ -9,7 +9,7 @@ from tools import csv2numpy, find_all_files, group_files -def numerical_anysis(root_dir, xlim, norm=False): +def numerical_analysis(root_dir, xlim, norm=False): file_pattern = re.compile(r".*/test_rew_\d+seeds.csv$") norm_group_pattern = re.compile(r"(/|^)\w+?\-v(\d|$)") output_group_pattern = re.compile(r".*?(?=(/|^)\w+?\-v\d)") @@ -96,4 +96,4 @@ def numerical_anysis(root_dir, xlim, norm=False): help="Normalize all results according to environment." ) args = parser.parse_args() - numerical_anysis(args.root_dir, args.xlim, norm=args.norm) + numerical_analysis(args.root_dir, args.xlim, norm=args.norm) diff --git a/examples/mujoco/mujoco_a2c.py b/examples/mujoco/mujoco_a2c.py index b8fc3370c..d79d80445 100755 --- a/examples/mujoco/mujoco_a2c.py +++ b/examples/mujoco/mujoco_a2c.py @@ -5,16 +5,15 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch import nn from torch.distributions import Independent, Normal from torch.optim.lr_scheduler import LambdaLR from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import A2CPolicy from tianshou.trainer import onpolicy_trainer from tianshou.utils import TensorboardLogger @@ -24,7 +23,7 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetah-v3') + parser.add_argument('--task', type=str, default='Ant-v3') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--buffer-size', type=int, default=4096) parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64]) @@ -56,55 +55,43 @@ def get_args(): '--watch', default=False, action='store_true', - help='watch the play of pre-trained policy only' + help='watch the play of pre-trained policy only', ) return parser.parse_args() def test_a2c(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=True + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions 
shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True - ) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)], - norm_obs=True, - obs_rms=train_envs.obs_rms, - update_obs_rms=False - ) - # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, unbounded=True, - device=args.device + device=args.device, ).to(args.device) net_c = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) critic = Critic(net_c, device=args.device).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) @@ -125,7 +112,7 @@ def test_a2c(args=get_args()): list(actor.parameters()) + list(critic.parameters()), lr=args.lr, eps=1e-5, - alpha=0.99 + alpha=0.99, ) lr_scheduler = None @@ -156,12 +143,15 @@ def dist(*logits): action_scaling=True, action_bound_method=args.bound_action_method, lr_scheduler=lr_scheduler, - action_space=env.action_space + action_space=env.action_space, ) # load a previous policy if args.resume_path: - policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) + ckpt = torch.load(args.resume_path, map_location=args.device) + policy.load_state_dict(ckpt["model"]) + train_envs.set_obs_rms(ckpt["obs_rms"]) + test_envs.set_obs_rms(ckpt["obs_rms"]) print("Loaded agent from: ", args.resume_path) # collector @@ -180,7 +170,8 @@ def dist(*logits): logger = TensorboardLogger(writer, update_interval=100, train_interval=100) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} + torch.save(state, os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -196,7 +187,7 @@ def save_best_fn(policy): step_per_collect=args.step_per_collect, save_best_fn=save_best_fn, logger=logger, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) diff --git a/examples/mujoco/mujoco_ddpg.py b/examples/mujoco/mujoco_ddpg.py index 51955fe71..d5d335649 100755 --- a/examples/mujoco/mujoco_ddpg.py +++ b/examples/mujoco/mujoco_ddpg.py @@ -5,13 +5,12 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.exploration import GaussianNoise from tianshou.policy import DDPGPolicy from tianshou.trainer import offpolicy_trainer @@ -50,13 +49,15 @@ def get_args(): '--watch', default=False, action='store_true', - help='watch the play of pre-trained policy only' + help='watch the play of pre-trained policy only', ) return parser.parse_args() def test_ddpg(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=False + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = 
env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] @@ -64,22 +65,9 @@ def test_ddpg(args=get_args()): print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - if args.training_num > 1: - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)] - ) - else: - train_envs = gym.make(args.task) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)] - ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = Actor( @@ -91,7 +79,7 @@ def test_ddpg(args=get_args()): args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, - device=args.device + device=args.device, ) critic = Critic(net_c, device=args.device).to(args.device) critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr) @@ -104,7 +92,7 @@ def test_ddpg(args=get_args()): gamma=args.gamma, exploration_noise=GaussianNoise(sigma=args.exploration_noise), estimation_step=args.n_step, - action_space=env.action_space + action_space=env.action_space, ) # load a previous policy @@ -129,7 +117,7 @@ def test_ddpg(args=get_args()): logger = TensorboardLogger(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -145,7 +133,7 @@ def save_best_fn(policy): save_best_fn=save_best_fn, logger=logger, update_per_step=args.update_per_step, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) diff --git a/examples/mujoco/mujoco_env.py b/examples/mujoco/mujoco_env.py new file mode 100644 index 000000000..e6524a4ac --- /dev/null +++ b/examples/mujoco/mujoco_env.py @@ -0,0 +1,41 @@ +import warnings + +try: + import envpool +except ImportError: + envpool = None + +import gym + +from tianshou.env import ShmemVectorEnv, VectorEnvNormObs + + +def make_mujoco_env(task, seed, training_num, test_num, obs_norm): + """Wrapper function for Mujoco env. + + If EnvPool is installed, it will automatically switch to EnvPool's Mujoco env. + + :return: a tuple of (single env, training envs, test envs). + """ + if envpool is not None: + train_envs = env = envpool.make_gym(task, num_envs=training_num, seed=seed) + test_envs = envpool.make_gym(task, num_envs=training_num, seed=seed) + else: + warnings.warn( + "Recommend using envpool (pip install envpool) " + "to run Mujoco environments more efficiently." 
+ ) + env = gym.make(task) + train_envs = ShmemVectorEnv( + [lambda: gym.make(task) for _ in range(training_num)] + ) + test_envs = ShmemVectorEnv([lambda: gym.make(task) for _ in range(test_num)]) + env.seed(seed) + train_envs.seed(seed) + test_envs.seed(seed) + if obs_norm: + # obs norm wrapper + train_envs = VectorEnvNormObs(train_envs) + test_envs = VectorEnvNormObs(test_envs, update_obs_rms=False) + test_envs.set_obs_rms(train_envs.get_obs_rms()) + return env, train_envs, test_envs diff --git a/examples/mujoco/mujoco_npg.py b/examples/mujoco/mujoco_npg.py index fd681bca3..1319d9759 100755 --- a/examples/mujoco/mujoco_npg.py +++ b/examples/mujoco/mujoco_npg.py @@ -5,16 +5,15 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch import nn from torch.distributions import Independent, Normal from torch.optim.lr_scheduler import LambdaLR from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import NPGPolicy from tianshou.trainer import onpolicy_trainer from tianshou.utils import TensorboardLogger @@ -24,7 +23,7 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetah-v3') + parser.add_argument('--task', type=str, default='Ant-v3') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--buffer-size', type=int, default=4096) parser.add_argument( @@ -58,55 +57,43 @@ def get_args(): '--watch', default=False, action='store_true', - help='watch the play of pre-trained policy only' + help='watch the play of pre-trained policy only', ) return parser.parse_args() def test_npg(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=True + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True - ) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)], - norm_obs=True, - obs_rms=train_envs.obs_rms, - update_obs_rms=False - ) - # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, unbounded=True, - device=args.device + device=args.device, ).to(args.device) net_c = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) critic = Critic(net_c, device=args.device).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) @@ -157,7 +144,10 @@ def dist(*logits): # load a previous policy if args.resume_path: - policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) + ckpt = torch.load(args.resume_path, map_location=args.device) + 
policy.load_state_dict(ckpt["model"]) + train_envs.set_obs_rms(ckpt["obs_rms"]) + test_envs.set_obs_rms(ckpt["obs_rms"]) print("Loaded agent from: ", args.resume_path) # collector @@ -176,7 +166,8 @@ def dist(*logits): logger = TensorboardLogger(writer, update_interval=100, train_interval=100) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} + torch.save(state, os.path.join(log_path, "policy.pth")) if not args.watch: # trainer diff --git a/examples/mujoco/mujoco_ppo.py b/examples/mujoco/mujoco_ppo.py index 392f1c22a..5b5242049 100755 --- a/examples/mujoco/mujoco_ppo.py +++ b/examples/mujoco/mujoco_ppo.py @@ -5,16 +5,15 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch import nn from torch.distributions import Independent, Normal from torch.optim.lr_scheduler import LambdaLR from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import PPOPolicy from tianshou.trainer import onpolicy_trainer from tianshou.utils import TensorboardLogger @@ -24,7 +23,7 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetah-v3') + parser.add_argument('--task', type=str, default='Ant-v3') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--buffer-size', type=int, default=4096) parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64]) @@ -61,55 +60,43 @@ def get_args(): '--watch', default=False, action='store_true', - help='watch the play of pre-trained policy only' + help='watch the play of pre-trained policy only', ) return parser.parse_args() def test_ppo(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=True + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True - ) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)], - norm_obs=True, - obs_rms=train_envs.obs_rms, - update_obs_rms=False - ) - # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, unbounded=True, - device=args.device + device=args.device, ).to(args.device) net_c = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) critic = Critic(net_c, device=args.device).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) @@ -163,12 +150,15 @@ def dist(*logits): value_clip=args.value_clip, dual_clip=args.dual_clip, advantage_normalization=args.norm_adv, - 
recompute_advantage=args.recompute_adv + recompute_advantage=args.recompute_adv, ) # load a previous policy if args.resume_path: - policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) + ckpt = torch.load(args.resume_path, map_location=args.device) + policy.load_state_dict(ckpt["model"]) + train_envs.set_obs_rms(ckpt["obs_rms"]) + test_envs.set_obs_rms(ckpt["obs_rms"]) print("Loaded agent from: ", args.resume_path) # collector @@ -187,7 +177,8 @@ def dist(*logits): logger = TensorboardLogger(writer, update_interval=100, train_interval=100) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} + torch.save(state, os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -203,7 +194,7 @@ def save_best_fn(policy): step_per_collect=args.step_per_collect, save_best_fn=save_best_fn, logger=logger, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) diff --git a/examples/mujoco/mujoco_redq.py b/examples/mujoco/mujoco_redq.py index d580ef9b3..b6e7ec30c 100755 --- a/examples/mujoco/mujoco_redq.py +++ b/examples/mujoco/mujoco_redq.py @@ -5,13 +5,12 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import REDQPolicy from tianshou.trainer import offpolicy_trainer from tianshou.utils import TensorboardLogger @@ -56,35 +55,24 @@ def get_args(): '--watch', default=False, action='store_true', - help='watch the play of pre-trained policy only' + help='watch the play of pre-trained policy only', ) return parser.parse_args() def test_redq(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=False + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - if args.training_num > 1: - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)] - ) - else: - train_envs = gym.make(args.task) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)] - ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = ActorProb( @@ -93,7 +81,7 @@ def test_redq(args=get_args()): max_action=args.max_action, device=args.device, unbounded=True, - conditioned_sigma=True + conditioned_sigma=True, ).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) @@ -160,7 +148,7 @@ def linear(x, y): logger = TensorboardLogger(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -176,7 +164,7 @@ def save_best_fn(policy): 
save_best_fn=save_best_fn, logger=logger, update_per_step=args.update_per_step, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) diff --git a/examples/mujoco/mujoco_reinforce.py b/examples/mujoco/mujoco_reinforce.py index 7d331af43..1e17ebb9a 100755 --- a/examples/mujoco/mujoco_reinforce.py +++ b/examples/mujoco/mujoco_reinforce.py @@ -5,16 +5,15 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch import nn from torch.distributions import Independent, Normal from torch.optim.lr_scheduler import LambdaLR from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import PGPolicy from tianshou.trainer import onpolicy_trainer from tianshou.utils import TensorboardLogger @@ -24,7 +23,7 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetah-v3') + parser.add_argument('--task', type=str, default='Ant-v3') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--buffer-size', type=int, default=4096) parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64]) @@ -53,49 +52,37 @@ def get_args(): '--watch', default=False, action='store_true', - help='watch the play of pre-trained policy only' + help='watch the play of pre-trained policy only', ) return parser.parse_args() def test_reinforce(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=True + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True - ) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)], - norm_obs=True, - obs_rms=train_envs.obs_rms, - update_obs_rms=False - ) - # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, unbounded=True, - device=args.device + device=args.device, ).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) for m in actor.modules(): @@ -135,12 +122,15 @@ def dist(*logits): action_scaling=True, action_bound_method=args.action_bound_method, lr_scheduler=lr_scheduler, - action_space=env.action_space + action_space=env.action_space, ) # load a previous policy if args.resume_path: - policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) + ckpt = torch.load(args.resume_path, map_location=args.device) + policy.load_state_dict(ckpt["model"]) + train_envs.set_obs_rms(ckpt["obs_rms"]) + test_envs.set_obs_rms(ckpt["obs_rms"]) print("Loaded agent from: ", args.resume_path) # collector @@ -159,7 +149,8 @@ def dist(*logits): logger = TensorboardLogger(writer, update_interval=10, 
train_interval=100) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} + torch.save(state, os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -175,7 +166,7 @@ def save_best_fn(policy): step_per_collect=args.step_per_collect, save_best_fn=save_best_fn, logger=logger, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) diff --git a/examples/mujoco/mujoco_sac.py b/examples/mujoco/mujoco_sac.py index eb2afe70f..305ef842a 100755 --- a/examples/mujoco/mujoco_sac.py +++ b/examples/mujoco/mujoco_sac.py @@ -5,13 +5,12 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import SACPolicy from tianshou.trainer import offpolicy_trainer from tianshou.utils import TensorboardLogger @@ -51,35 +50,24 @@ def get_args(): '--watch', default=False, action='store_true', - help='watch the play of pre-trained policy only' + help='watch the play of pre-trained policy only', ) return parser.parse_args() def test_sac(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=False + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - if args.training_num > 1: - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)] - ) - else: - train_envs = gym.make(args.task) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)] - ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = ActorProb( @@ -88,7 +76,7 @@ def test_sac(args=get_args()): max_action=args.max_action, device=args.device, unbounded=True, - conditioned_sigma=True + conditioned_sigma=True, ).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) net_c1 = Net( @@ -96,14 +84,14 @@ def test_sac(args=get_args()): args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, - device=args.device + device=args.device, ) net_c2 = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, - device=args.device + device=args.device, ) critic1 = Critic(net_c1, device=args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) @@ -127,7 +115,7 @@ def test_sac(args=get_args()): gamma=args.gamma, alpha=args.alpha, estimation_step=args.n_step, - action_space=env.action_space + action_space=env.action_space, ) # load a previous policy @@ -152,7 +140,7 @@ def test_sac(args=get_args()): logger = TensorboardLogger(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + torch.save(policy.state_dict(), 
os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -168,7 +156,7 @@ def save_best_fn(policy): save_best_fn=save_best_fn, logger=logger, update_per_step=args.update_per_step, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) diff --git a/examples/mujoco/mujoco_td3.py b/examples/mujoco/mujoco_td3.py index d2a9bd7bc..1ebab8bef 100755 --- a/examples/mujoco/mujoco_td3.py +++ b/examples/mujoco/mujoco_td3.py @@ -5,13 +5,12 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.exploration import GaussianNoise from tianshou.policy import TD3Policy from tianshou.trainer import offpolicy_trainer @@ -53,13 +52,15 @@ def get_args(): '--watch', default=False, action='store_true', - help='watch the play of pre-trained policy only' + help='watch the play of pre-trained policy only', ) return parser.parse_args() def test_td3(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=False + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] @@ -69,22 +70,9 @@ def test_td3(args=get_args()): print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - if args.training_num > 1: - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)] - ) - else: - train_envs = gym.make(args.task) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)] - ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = Actor( @@ -96,14 +84,14 @@ def test_td3(args=get_args()): args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, - device=args.device + device=args.device, ) net_c2 = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, - device=args.device + device=args.device, ) critic1 = Critic(net_c1, device=args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) @@ -124,7 +112,7 @@ def test_td3(args=get_args()): update_actor_freq=args.update_actor_freq, noise_clip=args.noise_clip, estimation_step=args.n_step, - action_space=env.action_space + action_space=env.action_space, ) # load a previous policy @@ -149,7 +137,7 @@ def test_td3(args=get_args()): logger = TensorboardLogger(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -165,7 +153,7 @@ def save_best_fn(policy): save_best_fn=save_best_fn, logger=logger, update_per_step=args.update_per_step, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) diff --git a/examples/mujoco/mujoco_trpo.py b/examples/mujoco/mujoco_trpo.py index dd2ce5334..849f51e48 100755 --- a/examples/mujoco/mujoco_trpo.py +++ 
b/examples/mujoco/mujoco_trpo.py @@ -5,16 +5,15 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch import nn from torch.distributions import Independent, Normal from torch.optim.lr_scheduler import LambdaLR from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import TRPOPolicy from tianshou.trainer import onpolicy_trainer from tianshou.utils import TensorboardLogger @@ -24,7 +23,7 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetah-v3') + parser.add_argument('--task', type=str, default='Ant-v3') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--buffer-size', type=int, default=4096) parser.add_argument( @@ -67,49 +66,37 @@ def get_args(): def test_trpo(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=True + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True - ) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)], - norm_obs=True, - obs_rms=train_envs.obs_rms, - update_obs_rms=False - ) - # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, unbounded=True, - device=args.device + device=args.device, ).to(args.device) net_c = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) critic = Critic(net_c, device=args.device).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) @@ -157,12 +144,15 @@ def dist(*logits): optim_critic_iters=args.optim_critic_iters, max_kl=args.max_kl, backtrack_coeff=args.backtrack_coeff, - max_backtracks=args.max_backtracks + max_backtracks=args.max_backtracks, ) # load a previous policy if args.resume_path: - policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) + ckpt = torch.load(args.resume_path, map_location=args.device) + policy.load_state_dict(ckpt["model"]) + train_envs.set_obs_rms(ckpt["obs_rms"]) + test_envs.set_obs_rms(ckpt["obs_rms"]) print("Loaded agent from: ", args.resume_path) # collector @@ -181,7 +171,8 @@ def dist(*logits): logger = TensorboardLogger(writer, update_interval=100, train_interval=100) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} + torch.save(state, os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -197,7 +188,7 @@ def save_best_fn(policy): step_per_collect=args.step_per_collect, 
save_best_fn=save_best_fn, logger=logger, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) diff --git a/test/base/test_env.py b/test/base/test_env.py index 6aec96d47..3b3a74c31 100644 --- a/test/base/test_env.py +++ b/test/base/test_env.py @@ -260,9 +260,8 @@ def test_venv_norm_obs(): raw = DummyVectorEnv(env_fns) train_env = VectorEnvNormObs(DummyVectorEnv(env_fns)) print(train_env.observation_space) - test_env = VectorEnvNormObs( - DummyVectorEnv(env_fns), update_obs_rms=False, obs_rms=train_env.get_obs_rms() - ) + test_env = VectorEnvNormObs(DummyVectorEnv(env_fns), update_obs_rms=False) + test_env.set_obs_rms(train_env.get_obs_rms()) run_align_norm_obs(raw, train_env, test_env, action_list) @@ -271,10 +270,9 @@ def test_venv_wrapper_envpool(): raw = envpool.make_gym("Ant-v3", num_envs=4) train = VectorEnvNormObs(envpool.make_gym("Ant-v3", num_envs=4)) test = VectorEnvNormObs( - envpool.make_gym("Ant-v3", num_envs=4), - update_obs_rms=False, - obs_rms=train.get_obs_rms() + envpool.make_gym("Ant-v3", num_envs=4), update_obs_rms=False ) + test.set_obs_rms(train.get_obs_rms()) actions = [ np.array([raw.action_space.sample() for _ in range(4)]) for i in range(30) ] diff --git a/tianshou/env/venv_wrappers.py b/tianshou/env/venv_wrappers.py index d2bdcf822..0f6f28a6b 100644 --- a/tianshou/env/venv_wrappers.py +++ b/tianshou/env/venv_wrappers.py @@ -11,6 +11,7 @@ class VectorEnvWrapper(BaseVectorEnv): def __init__(self, venv: BaseVectorEnv) -> None: self.venv = venv + self.is_async = venv.is_async def __len__(self) -> int: return len(self.venv) @@ -65,9 +66,6 @@ def close(self) -> None: class VectorEnvNormObs(VectorEnvWrapper): """An observation normalization wrapper for vectorized environments. - :param obs_rms: class to track mean&std of observation. If not given, it will - initialize a new one. Usually in envs that is used to evaluate algorithm, - obs_rms should be passed in. Default to None. :param bool update_obs_rms: whether to update obs_rms. Default to True. :param float clip_obs: the maximum absolute value for observation. Default to 10.0. 
@@ -77,7 +75,6 @@ class VectorEnvNormObs(VectorEnvWrapper): def __init__( self, venv: BaseVectorEnv, - obs_rms: Optional[RunningMeanStd] = None, update_obs_rms: bool = True, clip_obs: float = 10.0, epsilon: float = np.finfo(np.float32).eps.item(), @@ -85,7 +82,7 @@ def __init__( super().__init__(venv) # initialize observation running mean/std self.update_obs_rms = update_obs_rms - self.obs_rms = RunningMeanStd() if obs_rms is None else obs_rms + self.obs_rms = RunningMeanStd() self.clip_max = clip_obs self.eps = epsilon @@ -114,6 +111,10 @@ def _norm_obs(self, obs: np.ndarray) -> np.ndarray: obs = np.clip(obs, -self.clip_max, self.clip_max) return obs + def set_obs_rms(self, obs_rms: RunningMeanStd) -> None: + """Set with given observation running mean/std.""" + self.obs_rms = obs_rms + def get_obs_rms(self) -> RunningMeanStd: """Return observation running mean/std.""" return self.obs_rms From b441433734e8d6b9ad18d7d11261bfb1fc2b8498 Mon Sep 17 00:00:00 2001 From: Jiayi Weng Date: Wed, 4 May 2022 22:28:32 -0400 Subject: [PATCH 3/5] add wandb in examples/mujoco --- examples/mujoco/analysis.py | 8 +-- examples/mujoco/gen_json.py | 4 +- examples/mujoco/mujoco_a2c.py | 89 ++++++++++++++++---------- examples/mujoco/mujoco_ddpg.py | 83 +++++++++++++++--------- examples/mujoco/mujoco_npg.py | 93 +++++++++++++++++---------- examples/mujoco/mujoco_ppo.py | 99 ++++++++++++++++++----------- examples/mujoco/mujoco_redq.py | 93 +++++++++++++++++---------- examples/mujoco/mujoco_reinforce.py | 81 ++++++++++++++--------- examples/mujoco/mujoco_sac.py | 87 +++++++++++++++---------- examples/mujoco/mujoco_td3.py | 89 ++++++++++++++++---------- examples/mujoco/mujoco_trpo.py | 93 +++++++++++++++++---------- examples/mujoco/plotter.py | 4 +- examples/mujoco/tools.py | 26 ++++---- 13 files changed, 529 insertions(+), 320 deletions(-) diff --git a/examples/mujoco/analysis.py b/examples/mujoco/analysis.py index 00df841eb..ea2c49696 100755 --- a/examples/mujoco/analysis.py +++ b/examples/mujoco/analysis.py @@ -10,7 +10,7 @@ def numerical_analysis(root_dir, xlim, norm=False): - file_pattern = re.compile(r".*/test_rew_\d+seeds.csv$") + file_pattern = re.compile(r".*/test_reward_\d+seeds.csv$") norm_group_pattern = re.compile(r"(/|^)\w+?\-v(\d|$)") output_group_pattern = re.compile(r".*?(?=(/|^)\w+?\-v\d)") csv_files = find_all_files(root_dir, file_pattern) @@ -23,13 +23,13 @@ def numerical_analysis(root_dir, xlim, norm=False): if norm: result = np.stack( [ - result['env_step'], result['rew'] - result['rew'][0], - result['rew:shaded'] + result['env_step'], result['reward'] - result['reward'][0], + result['reward:shaded'] ] ) else: result = np.stack( - [result['env_step'], result['rew'], result['rew:shaded']] + [result['env_step'], result['reward'], result['reward:shaded']] ) if result[0, -1] < xlim: diff --git a/examples/mujoco/gen_json.py b/examples/mujoco/gen_json.py index 99cad74a3..516cffd6c 100755 --- a/examples/mujoco/gen_json.py +++ b/examples/mujoco/gen_json.py @@ -22,8 +22,8 @@ def merge(rootdir): result.append( { 'env_step': int(row['env_step']), - 'rew': float(row['rew']), - 'rew_std': float(row['rew:shaded']), + 'rew': float(row['reward']), + 'rew_std': float(row['reward:shaded']), 'Agent': algo, } ) diff --git a/examples/mujoco/mujoco_a2c.py b/examples/mujoco/mujoco_a2c.py index d79d80445..19d596172 100755 --- a/examples/mujoco/mujoco_a2c.py +++ b/examples/mujoco/mujoco_a2c.py @@ -16,46 +16,54 @@ from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer from tianshou.policy import A2CPolicy 
from tianshou.trainer import onpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=4096) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64]) - parser.add_argument('--lr', type=float, default=7e-4) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=30000) - parser.add_argument('--step-per-collect', type=int, default=80) - parser.add_argument('--repeat-per-collect', type=int, default=1) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=4096) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[64, 64]) + parser.add_argument("--lr", type=float, default=7e-4) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--epoch", type=int, default=100) + parser.add_argument("--step-per-epoch", type=int, default=30000) + parser.add_argument("--step-per-collect", type=int, default=80) + parser.add_argument("--repeat-per-collect", type=int, default=1) # batch-size >> step-per-collect means calculating all data in one singe forward. - parser.add_argument('--batch-size', type=int, default=99999) - parser.add_argument('--training-num', type=int, default=16) - parser.add_argument('--test-num', type=int, default=10) + parser.add_argument("--batch-size", type=int, default=99999) + parser.add_argument("--training-num", type=int, default=16) + parser.add_argument("--test-num", type=int, default=10) # a2c special - parser.add_argument('--rew-norm', type=int, default=True) - parser.add_argument('--vf-coef', type=float, default=0.5) - parser.add_argument('--ent-coef', type=float, default=0.01) - parser.add_argument('--gae-lambda', type=float, default=0.95) - parser.add_argument('--bound-action-method', type=str, default="clip") - parser.add_argument('--lr-decay', type=int, default=True) - parser.add_argument('--max-grad-norm', type=float, default=0.5) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--rew-norm", type=int, default=True) + parser.add_argument("--vf-coef", type=float, default=0.5) + parser.add_argument("--ent-coef", type=float, default=0.01) + parser.add_argument("--gae-lambda", type=float, default=0.95) + parser.add_argument("--bound-action-method", type=str, default="clip") + parser.add_argument("--lr-decay", type=int, default=True) + parser.add_argument("--max-grad-norm", type=float, default=0.5) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only', + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() @@ -161,13 +169,28 @@ def dist(*logits): buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_a2c' - log_path = os.path.join(args.logdir, args.task, 'a2c', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "a2c" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer, update_interval=100, train_interval=100) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} @@ -199,5 +222,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_a2c() diff --git a/examples/mujoco/mujoco_ddpg.py b/examples/mujoco/mujoco_ddpg.py index d5d335649..3e70c5986 100755 --- a/examples/mujoco/mujoco_ddpg.py +++ b/examples/mujoco/mujoco_ddpg.py @@ -14,42 +14,50 @@ from tianshou.exploration import GaussianNoise from tianshou.policy import DDPGPolicy from tianshou.trainer import offpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import Actor, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=1000000) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[256, 256]) - parser.add_argument('--actor-lr', type=float, default=1e-3) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--exploration-noise', type=float, default=0.1) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=1000000) + parser.add_argument("--hidden-sizes", 
type=int, nargs="*", default=[256, 256]) + parser.add_argument("--actor-lr", type=float, default=1e-3) + parser.add_argument("--critic-lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--tau", type=float, default=0.005) + parser.add_argument("--exploration-noise", type=float, default=0.1) parser.add_argument("--start-timesteps", type=int, default=25000) - parser.add_argument('--epoch', type=int, default=200) - parser.add_argument('--step-per-epoch', type=int, default=5000) - parser.add_argument('--step-per-collect', type=int, default=1) - parser.add_argument('--update-per-step', type=int, default=1) - parser.add_argument('--n-step', type=int, default=1) - parser.add_argument('--batch-size', type=int, default=256) - parser.add_argument('--training-num', type=int, default=1) - parser.add_argument('--test-num', type=int, default=10) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--epoch", type=int, default=200) + parser.add_argument("--step-per-epoch", type=int, default=5000) + parser.add_argument("--step-per-collect", type=int, default=1) + parser.add_argument("--update-per-step", type=int, default=1) + parser.add_argument("--n-step", type=int, default=1) + parser.add_argument("--batch-size", type=int, default=256) + parser.add_argument("--training-num", type=int, default=1) + parser.add_argument("--test-num", type=int, default=10) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only', + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() @@ -108,13 +116,28 @@ def test_ddpg(args=get_args()): train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.start_timesteps, random=True) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_ddpg' - log_path = os.path.join(args.logdir, args.task, 'ddpg', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "ddpg" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): 
torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) @@ -145,5 +168,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_ddpg() diff --git a/examples/mujoco/mujoco_npg.py b/examples/mujoco/mujoco_npg.py index 1319d9759..1bd2d19dc 100755 --- a/examples/mujoco/mujoco_npg.py +++ b/examples/mujoco/mujoco_npg.py @@ -16,48 +16,56 @@ from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer from tianshou.policy import NPGPolicy from tianshou.trainer import onpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=4096) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=4096) parser.add_argument( - '--hidden-sizes', type=int, nargs='*', default=[64, 64] + "--hidden-sizes", type=int, nargs="*", default=[64, 64] ) # baselines [32, 32] - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=30000) - parser.add_argument('--step-per-collect', type=int, default=1024) - parser.add_argument('--repeat-per-collect', type=int, default=1) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--epoch", type=int, default=100) + parser.add_argument("--step-per-epoch", type=int, default=30000) + parser.add_argument("--step-per-collect", type=int, default=1024) + parser.add_argument("--repeat-per-collect", type=int, default=1) # batch-size >> step-per-collect means calculating all data in one singe forward. - parser.add_argument('--batch-size', type=int, default=99999) - parser.add_argument('--training-num', type=int, default=16) - parser.add_argument('--test-num', type=int, default=10) + parser.add_argument("--batch-size", type=int, default=99999) + parser.add_argument("--training-num", type=int, default=16) + parser.add_argument("--test-num", type=int, default=10) # npg special - parser.add_argument('--rew-norm', type=int, default=True) - parser.add_argument('--gae-lambda', type=float, default=0.95) - parser.add_argument('--bound-action-method', type=str, default="clip") - parser.add_argument('--lr-decay', type=int, default=True) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--norm-adv', type=int, default=1) - parser.add_argument('--optim-critic-iters', type=int, default=20) - parser.add_argument('--actor-step-size', type=float, default=0.1) + parser.add_argument("--rew-norm", type=int, default=True) + parser.add_argument("--gae-lambda", type=float, default=0.95) + parser.add_argument("--bound-action-method", type=str, default="clip") + parser.add_argument("--lr-decay", type=int, default=True) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
+ parser.add_argument("--norm-adv", type=int, default=1) + parser.add_argument("--optim-critic-iters", type=int, default=20) + parser.add_argument("--actor-step-size", type=float, default=0.1) parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only', + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() @@ -139,7 +147,7 @@ def dist(*logits): action_space=env.action_space, advantage_normalization=args.norm_adv, optim_critic_iters=args.optim_critic_iters, - actor_step_size=args.actor_step_size + actor_step_size=args.actor_step_size, ) # load a previous policy @@ -157,13 +165,28 @@ def dist(*logits): buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_npg' - log_path = os.path.join(args.logdir, args.task, 'npg', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "npg" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer, update_interval=100, train_interval=100) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} @@ -183,7 +206,7 @@ def save_best_fn(policy): step_per_collect=args.step_per_collect, save_best_fn=save_best_fn, logger=logger, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) @@ -195,5 +218,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_npg() diff --git a/examples/mujoco/mujoco_ppo.py b/examples/mujoco/mujoco_ppo.py index 5b5242049..14c5a429f 100755 --- a/examples/mujoco/mujoco_ppo.py +++ b/examples/mujoco/mujoco_ppo.py @@ -16,51 +16,59 @@ from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer from tianshou.policy import PPOPolicy from tianshou.trainer import onpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - 
parser.add_argument('--buffer-size', type=int, default=4096) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64]) - parser.add_argument('--lr', type=float, default=3e-4) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=30000) - parser.add_argument('--step-per-collect', type=int, default=2048) - parser.add_argument('--repeat-per-collect', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--training-num', type=int, default=64) - parser.add_argument('--test-num', type=int, default=10) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=4096) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[64, 64]) + parser.add_argument("--lr", type=float, default=3e-4) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--epoch", type=int, default=100) + parser.add_argument("--step-per-epoch", type=int, default=30000) + parser.add_argument("--step-per-collect", type=int, default=2048) + parser.add_argument("--repeat-per-collect", type=int, default=10) + parser.add_argument("--batch-size", type=int, default=64) + parser.add_argument("--training-num", type=int, default=64) + parser.add_argument("--test-num", type=int, default=10) # ppo special - parser.add_argument('--rew-norm', type=int, default=True) + parser.add_argument("--rew-norm", type=int, default=True) # In theory, `vf-coef` will not make any difference if using Adam optimizer. - parser.add_argument('--vf-coef', type=float, default=0.25) - parser.add_argument('--ent-coef', type=float, default=0.0) - parser.add_argument('--gae-lambda', type=float, default=0.95) - parser.add_argument('--bound-action-method', type=str, default="clip") - parser.add_argument('--lr-decay', type=int, default=True) - parser.add_argument('--max-grad-norm', type=float, default=0.5) - parser.add_argument('--eps-clip', type=float, default=0.2) - parser.add_argument('--dual-clip', type=float, default=None) - parser.add_argument('--value-clip', type=int, default=0) - parser.add_argument('--norm-adv', type=int, default=0) - parser.add_argument('--recompute-adv', type=int, default=1) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--vf-coef", type=float, default=0.25) + parser.add_argument("--ent-coef", type=float, default=0.0) + parser.add_argument("--gae-lambda", type=float, default=0.95) + parser.add_argument("--bound-action-method", type=str, default="clip") + parser.add_argument("--lr-decay", type=int, default=True) + parser.add_argument("--max-grad-norm", type=float, default=0.5) + parser.add_argument("--eps-clip", type=float, default=0.2) + parser.add_argument("--dual-clip", type=float, default=None) + parser.add_argument("--value-clip", type=int, default=0) + parser.add_argument("--norm-adv", type=int, default=0) + parser.add_argument("--recompute-adv", type=int, default=1) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only', + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() @@ -168,13 +176,28 @@ def dist(*logits): buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_ppo' - log_path = os.path.join(args.logdir, args.task, 'ppo', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "ppo" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer, update_interval=100, train_interval=100) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} @@ -206,5 +229,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_ppo() diff --git a/examples/mujoco/mujoco_redq.py b/examples/mujoco/mujoco_redq.py index b6e7ec30c..312d1d676 100755 --- a/examples/mujoco/mujoco_redq.py +++ b/examples/mujoco/mujoco_redq.py @@ -13,49 +13,57 @@ from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer from tianshou.policy import REDQPolicy from tianshou.trainer import offpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import EnsembleLinear, Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=1000000) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[256, 256]) - parser.add_argument('--ensemble-size', type=int, default=10) - parser.add_argument('--subset-size', type=int, default=2) - parser.add_argument('--actor-lr', type=float, default=1e-3) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--alpha', type=float, default=0.2) - parser.add_argument('--auto-alpha', default=False, 
action='store_true') - parser.add_argument('--alpha-lr', type=float, default=3e-4) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=1000000) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[256, 256]) + parser.add_argument("--ensemble-size", type=int, default=10) + parser.add_argument("--subset-size", type=int, default=2) + parser.add_argument("--actor-lr", type=float, default=1e-3) + parser.add_argument("--critic-lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--tau", type=float, default=0.005) + parser.add_argument("--alpha", type=float, default=0.2) + parser.add_argument("--auto-alpha", default=False, action="store_true") + parser.add_argument("--alpha-lr", type=float, default=3e-4) parser.add_argument("--start-timesteps", type=int, default=10000) - parser.add_argument('--epoch', type=int, default=200) - parser.add_argument('--step-per-epoch', type=int, default=5000) - parser.add_argument('--step-per-collect', type=int, default=1) - parser.add_argument('--update-per-step', type=int, default=20) - parser.add_argument('--n-step', type=int, default=1) - parser.add_argument('--batch-size', type=int, default=256) + parser.add_argument("--epoch", type=int, default=200) + parser.add_argument("--step-per-epoch", type=int, default=5000) + parser.add_argument("--step-per-collect", type=int, default=1) + parser.add_argument("--update-per-step", type=int, default=20) + parser.add_argument("--n-step", type=int, default=1) + parser.add_argument("--batch-size", type=int, default=256) parser.add_argument( - '--target-mode', type=str, choices=('min', 'mean'), default='min' + "--target-mode", type=str, choices=("min", "mean"), default="min" ) - parser.add_argument('--training-num', type=int, default=1) - parser.add_argument('--test-num', type=int, default=10) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--training-num", type=int, default=1) + parser.add_argument("--test-num", type=int, default=10) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only', + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() @@ -139,13 +147,28 @@ def linear(x, y): train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.start_timesteps, random=True) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_redq' - log_path = os.path.join(args.logdir, args.task, 'redq', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "redq" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) @@ -176,5 +199,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_redq() diff --git a/examples/mujoco/mujoco_reinforce.py b/examples/mujoco/mujoco_reinforce.py index 1e17ebb9a..40f0a1fb1 100755 --- a/examples/mujoco/mujoco_reinforce.py +++ b/examples/mujoco/mujoco_reinforce.py @@ -16,43 +16,51 @@ from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer from tianshou.policy import PGPolicy from tianshou.trainer import onpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=4096) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64]) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=30000) - parser.add_argument('--step-per-collect', type=int, default=2048) - parser.add_argument('--repeat-per-collect', type=int, default=1) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + 
parser.add_argument("--buffer-size", type=int, default=4096) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[64, 64]) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--epoch", type=int, default=100) + parser.add_argument("--step-per-epoch", type=int, default=30000) + parser.add_argument("--step-per-collect", type=int, default=2048) + parser.add_argument("--repeat-per-collect", type=int, default=1) # batch-size >> step-per-collect means calculating all data in one singe forward. - parser.add_argument('--batch-size', type=int, default=99999) - parser.add_argument('--training-num', type=int, default=64) - parser.add_argument('--test-num', type=int, default=10) + parser.add_argument("--batch-size", type=int, default=99999) + parser.add_argument("--training-num", type=int, default=64) + parser.add_argument("--test-num", type=int, default=10) # reinforce special - parser.add_argument('--rew-norm', type=int, default=True) + parser.add_argument("--rew-norm", type=int, default=True) # "clip" option also works well. - parser.add_argument('--action-bound-method', type=str, default="tanh") - parser.add_argument('--lr-decay', type=int, default=True) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--action-bound-method", type=str, default="tanh") + parser.add_argument("--lr-decay", type=int, default=True) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only', + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() @@ -140,13 +148,28 @@ def dist(*logits): buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_reinforce' - log_path = os.path.join(args.logdir, args.task, 'reinforce', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "reinforce" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer, update_interval=10, train_interval=100) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): 
state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} @@ -178,5 +201,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_reinforce() diff --git a/examples/mujoco/mujoco_sac.py b/examples/mujoco/mujoco_sac.py index 305ef842a..0450c586a 100755 --- a/examples/mujoco/mujoco_sac.py +++ b/examples/mujoco/mujoco_sac.py @@ -13,44 +13,52 @@ from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer from tianshou.policy import SACPolicy from tianshou.trainer import offpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=1000000) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[256, 256]) - parser.add_argument('--actor-lr', type=float, default=1e-3) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--alpha', type=float, default=0.2) - parser.add_argument('--auto-alpha', default=False, action='store_true') - parser.add_argument('--alpha-lr', type=float, default=3e-4) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=1000000) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[256, 256]) + parser.add_argument("--actor-lr", type=float, default=1e-3) + parser.add_argument("--critic-lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--tau", type=float, default=0.005) + parser.add_argument("--alpha", type=float, default=0.2) + parser.add_argument("--auto-alpha", default=False, action="store_true") + parser.add_argument("--alpha-lr", type=float, default=3e-4) parser.add_argument("--start-timesteps", type=int, default=10000) - parser.add_argument('--epoch', type=int, default=200) - parser.add_argument('--step-per-epoch', type=int, default=5000) - parser.add_argument('--step-per-collect', type=int, default=1) - parser.add_argument('--update-per-step', type=int, default=1) - parser.add_argument('--n-step', type=int, default=1) - parser.add_argument('--batch-size', type=int, default=256) - parser.add_argument('--training-num', type=int, default=1) - parser.add_argument('--test-num', type=int, default=10) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--epoch", type=int, default=200) + parser.add_argument("--step-per-epoch", type=int, default=5000) + parser.add_argument("--step-per-collect", type=int, default=1) + parser.add_argument("--update-per-step", type=int, default=1) + parser.add_argument("--n-step", type=int, default=1) + parser.add_argument("--batch-size", type=int, default=256) + parser.add_argument("--training-num", type=int, default=1) + parser.add_argument("--test-num", type=int, default=10) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only', + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() @@ -131,13 +139,28 @@ def test_sac(args=get_args()): train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.start_timesteps, random=True) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_sac' - log_path = os.path.join(args.logdir, args.task, 'sac', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "sac" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) @@ -168,5 +191,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_sac() diff --git a/examples/mujoco/mujoco_td3.py b/examples/mujoco/mujoco_td3.py index 1ebab8bef..b0b472b31 100755 --- a/examples/mujoco/mujoco_td3.py +++ b/examples/mujoco/mujoco_td3.py @@ -14,45 +14,53 @@ from tianshou.exploration import GaussianNoise from tianshou.policy import TD3Policy from tianshou.trainer import offpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import Actor, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=1000000) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[256, 256]) - parser.add_argument('--actor-lr', type=float, default=3e-4) - parser.add_argument('--critic-lr', type=float, default=3e-4) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--exploration-noise', type=float, default=0.1) - parser.add_argument('--policy-noise', type=float, default=0.2) - parser.add_argument('--noise-clip', type=float, default=0.5) - parser.add_argument('--update-actor-freq', type=int, default=2) + parser.add_argument("--task", 
type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=1000000) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[256, 256]) + parser.add_argument("--actor-lr", type=float, default=3e-4) + parser.add_argument("--critic-lr", type=float, default=3e-4) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--tau", type=float, default=0.005) + parser.add_argument("--exploration-noise", type=float, default=0.1) + parser.add_argument("--policy-noise", type=float, default=0.2) + parser.add_argument("--noise-clip", type=float, default=0.5) + parser.add_argument("--update-actor-freq", type=int, default=2) parser.add_argument("--start-timesteps", type=int, default=25000) - parser.add_argument('--epoch', type=int, default=200) - parser.add_argument('--step-per-epoch', type=int, default=5000) - parser.add_argument('--step-per-collect', type=int, default=1) - parser.add_argument('--update-per-step', type=int, default=1) - parser.add_argument('--n-step', type=int, default=1) - parser.add_argument('--batch-size', type=int, default=256) - parser.add_argument('--training-num', type=int, default=1) - parser.add_argument('--test-num', type=int, default=10) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--epoch", type=int, default=200) + parser.add_argument("--step-per-epoch", type=int, default=5000) + parser.add_argument("--step-per-collect", type=int, default=1) + parser.add_argument("--update-per-step", type=int, default=1) + parser.add_argument("--n-step", type=int, default=1) + parser.add_argument("--batch-size", type=int, default=256) + parser.add_argument("--training-num", type=int, default=1) + parser.add_argument("--test-num", type=int, default=10) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only', + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() @@ -128,13 +136,28 @@ def test_td3(args=get_args()): train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.start_timesteps, random=True) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_td3' - log_path = os.path.join(args.logdir, args.task, 'td3', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "td3" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) @@ -165,5 +188,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_td3() diff --git a/examples/mujoco/mujoco_trpo.py b/examples/mujoco/mujoco_trpo.py index 849f51e48..7754c0c8d 100755 --- a/examples/mujoco/mujoco_trpo.py +++ b/examples/mujoco/mujoco_trpo.py @@ -16,51 +16,59 @@ from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer from tianshou.policy import TRPOPolicy from tianshou.trainer import onpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=4096) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=4096) parser.add_argument( - '--hidden-sizes', type=int, nargs='*', default=[64, 64] + "--hidden-sizes", type=int, nargs="*", default=[64, 64] ) # baselines [32, 32] - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=30000) - 
parser.add_argument('--step-per-collect', type=int, default=1024) - parser.add_argument('--repeat-per-collect', type=int, default=1) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--epoch", type=int, default=100) + parser.add_argument("--step-per-epoch", type=int, default=30000) + parser.add_argument("--step-per-collect", type=int, default=1024) + parser.add_argument("--repeat-per-collect", type=int, default=1) # batch-size >> step-per-collect means calculating all data in one singe forward. - parser.add_argument('--batch-size', type=int, default=99999) - parser.add_argument('--training-num', type=int, default=16) - parser.add_argument('--test-num', type=int, default=10) + parser.add_argument("--batch-size", type=int, default=99999) + parser.add_argument("--training-num", type=int, default=16) + parser.add_argument("--test-num", type=int, default=10) # trpo special - parser.add_argument('--rew-norm', type=int, default=True) - parser.add_argument('--gae-lambda', type=float, default=0.95) + parser.add_argument("--rew-norm", type=int, default=True) + parser.add_argument("--gae-lambda", type=float, default=0.95) # TODO tanh support - parser.add_argument('--bound-action-method', type=str, default="clip") - parser.add_argument('--lr-decay', type=int, default=True) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--norm-adv', type=int, default=1) - parser.add_argument('--optim-critic-iters', type=int, default=20) - parser.add_argument('--max-kl', type=float, default=0.01) - parser.add_argument('--backtrack-coeff', type=float, default=0.8) - parser.add_argument('--max-backtracks', type=int, default=10) + parser.add_argument("--bound-action-method", type=str, default="clip") + parser.add_argument("--lr-decay", type=int, default=True) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
+ parser.add_argument("--norm-adv", type=int, default=1) + parser.add_argument("--optim-critic-iters", type=int, default=20) + parser.add_argument("--max-kl", type=float, default=0.01) + parser.add_argument("--backtrack-coeff", type=float, default=0.8) + parser.add_argument("--max-backtracks", type=int, default=10) parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only' + action="store_true", + help="watch the play of pre-trained policy only" ) return parser.parse_args() @@ -162,13 +170,28 @@ def dist(*logits): buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_trpo' - log_path = os.path.join(args.logdir, args.task, 'trpo', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "trpo" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer, update_interval=100, train_interval=100) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} @@ -200,5 +223,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_trpo() diff --git a/examples/mujoco/plotter.py b/examples/mujoco/plotter.py index e3e7057e4..8f16cd01a 100755 --- a/examples/mujoco/plotter.py +++ b/examples/mujoco/plotter.py @@ -91,8 +91,8 @@ def plot_ax( ylabel=None, title=None, xlim=None, - xkey='env_step', - ykey='rew', + xkey="env_step", + ykey="reward", smooth_radius=0, shaded_std=True, legend_outside=False, diff --git a/examples/mujoco/tools.py b/examples/mujoco/tools.py index 3ed4791fd..b5bd2e382 100755 --- a/examples/mujoco/tools.py +++ b/examples/mujoco/tools.py @@ -26,7 +26,7 @@ def group_files(file_list, pattern): res = defaultdict(list) for f in file_list: match = re.search(pattern, f) - key = match.group() if match else '' + key = match.group() if match else "" res[key].append(f) return res @@ -41,7 +41,7 @@ def csv2numpy(csv_file): def convert_tfevents_to_csv(root_dir, refresh=False): - """Recursively convert test/rew from all tfevent file under root_dir to csv. + """Recursively convert test/reward from all tfevent file under root_dir to csv. 
This function assumes that there is at most one tfevents file in each directory and will add suffix to that directory. @@ -54,10 +54,12 @@ def convert_tfevents_to_csv(root_dir, refresh=False): with tqdm.tqdm(tfevent_files) as t: for tfevent_file in t: t.set_postfix(file=tfevent_file) - output_file = os.path.join(os.path.split(tfevent_file)[0], "test_rew.csv") + output_file = os.path.join( + os.path.split(tfevent_file)[0], "test_reward.csv" + ) if os.path.exists(output_file) and not refresh: content = list(csv.reader(open(output_file, "r"))) - if content[0] == ["env_step", "rew", "time"]: + if content[0] == ["env_step", "reward", "time"]: for i in range(1, len(content)): content[i] = list(map(eval, content[i])) result[output_file] = content @@ -65,13 +67,13 @@ def convert_tfevents_to_csv(root_dir, refresh=False): ea = event_accumulator.EventAccumulator(tfevent_file) ea.Reload() initial_time = ea._first_event_timestamp - content = [["env_step", "rew", "time"]] - for test_rew in ea.scalars.Items("test/rew"): + content = [["env_step", "reward", "time"]] + for test_reward in ea.scalars.Items("test/reward"): content.append( [ - round(test_rew.step, 4), - round(test_rew.value, 4), - round(test_rew.wall_time - initial_time, 4), + round(test_reward.step, 4), + round(test_reward.value, 4), + round(test_reward.wall_time - initial_time, 4), ] ) csv.writer(open(output_file, 'w')).writerows(content) @@ -89,8 +91,8 @@ def merge_csv(csv_files, root_dir, remove_zero=False): sorted_keys = sorted(csv_files.keys()) sorted_values = [csv_files[k][1:] for k in sorted_keys] content = [ - ["env_step", "rew", "rew:shaded"] + - list(map(lambda f: "rew:" + os.path.relpath(f, root_dir), sorted_keys)) + ["env_step", "reward", "reward:shaded"] + + list(map(lambda f: "reward:" + os.path.relpath(f, root_dir), sorted_keys)) ] for rows in zip(*sorted_values): array = np.array(rows) @@ -98,7 +100,7 @@ def merge_csv(csv_files, root_dir, remove_zero=False): line = [rows[0][0], round(array[:, 1].mean(), 4), round(array[:, 1].std(), 4)] line += array[:, 1].tolist() content.append(line) - output_path = os.path.join(root_dir, f"test_rew_{len(csv_files)}seeds.csv") + output_path = os.path.join(root_dir, f"test_reward_{len(csv_files)}seeds.csv") print(f"Output merged csv file to {output_path} with {len(content[1:])} lines.") csv.writer(open(output_path, "w")).writerows(content) From 04d5306b6bb2b5552e102568cec5df96d8c748f6 Mon Sep 17 00:00:00 2001 From: Jiayi Weng Date: Wed, 4 May 2022 22:36:03 -0400 Subject: [PATCH 4/5] update readme --- examples/mujoco/README.md | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/examples/mujoco/README.md b/examples/mujoco/README.md index 8890466f8..12b480a9d 100644 --- a/examples/mujoco/README.md +++ b/examples/mujoco/README.md @@ -21,7 +21,19 @@ Supported algorithms are listed below: - [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf), [commit id](https://github.com/thu-ml/tianshou/tree/6426a39796db052bafb7cabe85c764db20a722b0) - [Trust Region Policy Optimization (TRPO)](https://arxiv.org/pdf/1502.05477.pdf), [commit id](https://github.com/thu-ml/tianshou/tree/5057b5c89e6168220272c9c28a15b758a72efc32) -#### Usage +## EnvPool + +We highly recommend using envpool to run the following experiments. To install, in a linux machine, type: + +```bash +pip install envpool +``` + +After that, `make_mujoco_env` will automatically switch to envpool's Mujoco env. 
EnvPool's implementation is much faster (about 2\~3x faster in pure execution speed and about 1.5x faster on average for the overall RL training pipeline) than the Python vectorized env implementation, and its behavior is consistent with gym's Mujoco env.
+
+For more information, please refer to EnvPool's [GitHub](https://github.com/sail-sg/envpool/) and [Docs](https://envpool.readthedocs.io/en/latest/api/mujoco.html).
+
-#### Usage
+## Usage
 
 Run
 
@@ -46,7 +58,7 @@ This will start 10 experiments with different seeds.
 
 Now that all the experiments are finished, we can convert all tfevent files into csv files and then try plotting the results.
 
 ```bash
-# geenrate csv
+# generate csv
 $ ./tools.py --root-dir ./results/Ant-v3/sac
 # generate figures
 $ ./plotter.py --root-dir ./results/Ant-v3 --shaded-std --legend-pattern "\\w+"
@@ -54,15 +66,16 @@ $ ./plotter.py --root-dir ./results/Ant-v3 --shaded-std --legend-pattern "\\w+"
 $ ./analysis.py --root-dir ./results --norm
 ```
 
-#### Example benchmark
+## Example benchmark
 
-Other graphs can be found under `/examples/mujuco/benchmark/`
+Other graphs can be found under `examples/mujoco/benchmark/`
 
 For pretrained agents, detailed graphs (single agent, single game) and log details, please refer to [https://cloud.tsinghua.edu.cn/d/f45fcfc5016043bc8fbc/](https://cloud.tsinghua.edu.cn/d/f45fcfc5016043bc8fbc/).
 
 ## Offpolicy algorithms
+
 #### Notes
 
 1. In offpolicy algorithms (DDPG, TD3, SAC), the shared hyperparameters are almost the same, and unless otherwise stated, hyperparameters are consistent with those used for benchmark in SpinningUp's implementations (e.g. we use batchsize 256 in DDPG/TD3/SAC while SpinningUp use 100. Minor difference also lies with `start-timesteps`, data loop method `step_per_collect`, method to deal with/bootstrap truncated steps because of timelimit and unfinished/collecting episodes (contribute to performance improvement), etc.).
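Note on the obs_rms checkpointing used in the on-policy example scripts above: `save_best_fn` stores the observation-normalization statistics next to the policy weights via `train_envs.get_obs_rms()`. The snippet below is a minimal sketch, not part of the patches themselves, of how such a checkpoint could be restored for evaluation with the `set_obs_rms`/`get_obs_rms` API introduced in this series; the checkpoint path and the number of test envs are made up for illustration.

```python
import gym
import torch

from tianshou.env import DummyVectorEnv, VectorEnvNormObs

# Hypothetical checkpoint written by save_best_fn in the on-policy examples
# (a dict with "model" and "obs_rms" entries).
ckpt = torch.load("log/Ant-v3/ppo/0/220504-223200/policy.pth", map_location="cpu")

# Wrap the evaluation envs with the normalization wrapper, but freeze its statistics.
test_envs = VectorEnvNormObs(
    DummyVectorEnv([lambda: gym.make("Ant-v3") for _ in range(10)]),
    update_obs_rms=False,
)
# Reuse the running mean/std collected by the training envs.
test_envs.set_obs_rms(ckpt["obs_rms"])
# policy.load_state_dict(ckpt["model"]) then restores the network weights as usual.
```

Keeping `update_obs_rms=False` at evaluation time prevents the test envs from drifting away from the statistics the policy was trained with.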
From 12d9e443ec0c2cefad8b4fc1750d9d0500e28310 Mon Sep 17 00:00:00 2001 From: Jiayi Weng Date: Wed, 4 May 2022 22:46:09 -0400 Subject: [PATCH 5/5] commas --- tianshou/env/venv_wrappers.py | 6 +++--- tianshou/env/venvs.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tianshou/env/venv_wrappers.py b/tianshou/env/venv_wrappers.py index 0f6f28a6b..860c390d9 100644 --- a/tianshou/env/venv_wrappers.py +++ b/tianshou/env/venv_wrappers.py @@ -46,13 +46,13 @@ def reset( def step( self, action: np.ndarray, - id: Optional[Union[int, List[int], np.ndarray]] = None + id: Optional[Union[int, List[int], np.ndarray]] = None, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: return self.venv.step(action, id) def seed( self, - seed: Optional[Union[int, List[int]]] = None + seed: Optional[Union[int, List[int]]] = None, ) -> List[Optional[List[int]]]: return self.venv.seed(seed) @@ -98,7 +98,7 @@ def reset( def step( self, action: np.ndarray, - id: Optional[Union[int, List[int], np.ndarray]] = None + id: Optional[Union[int, List[int], np.ndarray]] = None, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: obs, rew, done, info = self.venv.step(action, id) if self.obs_rms and self.update_obs_rms: diff --git a/tianshou/env/venvs.py b/tianshou/env/venvs.py index 0a714e564..93558d9ef 100644 --- a/tianshou/env/venvs.py +++ b/tianshou/env/venvs.py @@ -168,7 +168,7 @@ def set_env_attr( def _wrap_id( self, - id: Optional[Union[int, List[int], np.ndarray]] = None + id: Optional[Union[int, List[int], np.ndarray]] = None, ) -> Union[List[int], np.ndarray]: if id is None: return list(range(self.env_num)) @@ -208,7 +208,7 @@ def reset( def step( self, action: np.ndarray, - id: Optional[Union[int, List[int], np.ndarray]] = None + id: Optional[Union[int, List[int], np.ndarray]] = None, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Run one timestep of some environments' dynamics. @@ -286,7 +286,7 @@ def step( def seed( self, - seed: Optional[Union[int, List[int]]] = None + seed: Optional[Union[int, List[int]]] = None, ) -> List[Optional[List[int]]]: """Set the seed for all environments.