diff --git a/.github/workflows/extra_sys.yml b/.github/workflows/extra_sys.yml index 4e6ecdad0..89599525e 100644 --- a/.github/workflows/extra_sys.yml +++ b/.github/workflows/extra_sys.yml @@ -28,4 +28,4 @@ jobs: wandb login e2366d661b89f2bee877c40bee15502d67b7abef - name: Test with pytest run: | - pytest test/base test/continuous --cov=tianshou --durations=0 -v + pytest test/base test/continuous --cov=tianshou --durations=0 -v --color=yes diff --git a/.github/workflows/gputest.yml b/.github/workflows/gputest.yml index 8032bd3f9..5b409b8eb 100644 --- a/.github/workflows/gputest.yml +++ b/.github/workflows/gputest.yml @@ -24,4 +24,4 @@ jobs: - name: Test with pytest # ignore test/throughput which only profiles the code run: | - pytest test --ignore-glob='*profile.py' --cov=tianshou --cov-report=xml --durations=0 -v + pytest test --ignore-glob='*profile.py' --cov=tianshou --cov-report=xml --durations=0 -v --color=yes diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index 82c793e99..541ea0219 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -20,4 +20,4 @@ jobs: python -m pip install ".[dev]" --upgrade - name: Test with pytest run: | - pytest test/throughput --durations=0 -v + pytest test/throughput --durations=0 -v --color=yes diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index cd44457cd..f7ace0751 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -27,7 +27,7 @@ jobs: - name: Test with pytest # ignore test/throughput which only profiles the code run: | - pytest test --ignore-glob='*profile.py' --ignore="test/3rd_party" --cov=tianshou --cov-report=xml --cov-report=term-missing --durations=0 -v + pytest test --ignore-glob='*profile.py' --ignore="test/3rd_party" --cov=tianshou --cov-report=xml --cov-report=term-missing --durations=0 -v --color=yes - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 with: diff --git a/Makefile b/Makefile index b9967f886..5bc170698 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ pytest: $(call check_install, pytest) $(call check_install, pytest_cov) $(call check_install, pytest_xdist) - pytest test --cov ${PROJECT_PATH} --durations 0 -v --cov-report term-missing + pytest test --cov ${PROJECT_PATH} --durations 0 -v --cov-report term-missing --color=yes mypy: $(call check_install, mypy) diff --git a/README.md b/README.md index 46e3f9a15..0807fd8bb 100644 --- a/README.md +++ b/README.md @@ -4,16 +4,7 @@ --- -[![PyPI](https://img.shields.io/pypi/v/tianshou)](https://pypi.org/project/tianshou/) -[![Conda](https://img.shields.io/conda/vn/conda-forge/tianshou)](https://github.com/conda-forge/tianshou-feedstock) -[![Read the Docs](https://img.shields.io/readthedocs/tianshou)](https://tianshou.readthedocs.io/en/master) -[![Read the Docs](https://img.shields.io/readthedocs/tianshou-docs-zh-cn?label=%E4%B8%AD%E6%96%87%E6%96%87%E6%A1%A3)](https://tianshou.readthedocs.io/zh/master/) -[![Unittest](https://github.com/thu-ml/tianshou/workflows/Unittest/badge.svg?branch=master)](https://github.com/thu-ml/tianshou/actions) -[![codecov](https://img.shields.io/codecov/c/gh/thu-ml/tianshou)](https://codecov.io/gh/thu-ml/tianshou) -[![GitHub issues](https://img.shields.io/github/issues/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/issues) -[![GitHub stars](https://img.shields.io/github/stars/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/stargazers) -[![GitHub 
forks](https://img.shields.io/github/forks/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/network) -[![GitHub license](https://img.shields.io/github/license/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/blob/master/LICENSE) +[![PyPI](https://img.shields.io/pypi/v/tianshou)](https://pypi.org/project/tianshou/) [![Conda](https://img.shields.io/conda/vn/conda-forge/tianshou)](https://github.com/conda-forge/tianshou-feedstock) [![Read the Docs](https://img.shields.io/readthedocs/tianshou)](https://tianshou.readthedocs.io/en/master) [![Read the Docs](https://img.shields.io/readthedocs/tianshou-docs-zh-cn?label=%E4%B8%AD%E6%96%87%E6%96%87%E6%A1%A3)](https://tianshou.readthedocs.io/zh/master/) [![Unittest](https://github.com/thu-ml/tianshou/workflows/Unittest/badge.svg?branch=master)](https://github.com/thu-ml/tianshou/actions) [![codecov](https://img.shields.io/codecov/c/gh/thu-ml/tianshou)](https://codecov.io/gh/thu-ml/tianshou) [![GitHub issues](https://img.shields.io/github/issues/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/issues) [![GitHub stars](https://img.shields.io/github/stars/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/stargazers) [![GitHub forks](https://img.shields.io/github/forks/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/network) [![GitHub license](https://img.shields.io/github/license/thu-ml/tianshou)](https://github.com/thu-ml/tianshou/blob/master/LICENSE) **Tianshou** ([天授](https://baike.baidu.com/item/%E5%A4%A9%E6%8E%88)) is a reinforcement learning platform based on pure PyTorch. Unlike existing reinforcement learning libraries, which are mainly based on TensorFlow, have many nested classes, unfriendly API, or slow-speed, Tianshou provides a fast-speed modularized framework and pythonic API for building the deep reinforcement learning agent with the least number of lines of code. The supported interface algorithms currently include: @@ -48,7 +39,7 @@ - [Posterior Sampling Reinforcement Learning (PSRL)](https://www.ece.uvic.ca/~bctill/papers/learning/Strens_2000.pdf) - [Intrinsic Curiosity Module (ICM)](https://arxiv.org/pdf/1705.05363.pdf) -Here is Tianshou's other features: +Here are Tianshou's other features: - Elegant framework, using only ~4000 lines of code - State-of-the-art [MuJoCo benchmark](https://github.com/thu-ml/tianshou/tree/master/examples/mujoco) for REINFORCE/A2C/TRPO/PPO/DDPG/TD3/SAC algorithms @@ -132,7 +123,7 @@ The example scripts are under [test/](https://github.com/thu-ml/tianshou/blob/ma (2): not all algorithms support this feature -(3): TQC and QR-DQN in [sb3-contrib](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib) instead of main repo +(3): TQC and QR-DQN in [sb3-contrib](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib) instead of main repo (4): super fast APPO! diff --git a/examples/mujoco/README.md b/examples/mujoco/README.md index 8890466f8..12b480a9d 100644 --- a/examples/mujoco/README.md +++ b/examples/mujoco/README.md @@ -21,7 +21,19 @@ Supported algorithms are listed below: - [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf), [commit id](https://github.com/thu-ml/tianshou/tree/6426a39796db052bafb7cabe85c764db20a722b0) - [Trust Region Policy Optimization (TRPO)](https://arxiv.org/pdf/1502.05477.pdf), [commit id](https://github.com/thu-ml/tianshou/tree/5057b5c89e6168220272c9c28a15b758a72efc32) -#### Usage +## EnvPool + +We highly recommend using envpool to run the following experiments. 
To install it on a Linux machine, run: + +```bash +pip install envpool +``` + +After that, `make_mujoco_env` will automatically switch to envpool's Mujoco env. EnvPool's implementation is much faster (about 2\~3x faster in pure execution speed, and 1.5x faster for the overall RL training pipeline on average) than the Python vectorized env implementation, and its behavior is consistent with gym's Mujoco env. + +For more information, please refer to EnvPool's [GitHub](https://github.com/sail-sg/envpool/) and [Docs](https://envpool.readthedocs.io/en/latest/api/mujoco.html). + +## Usage Run @@ -46,7 +58,7 @@ This will start 10 experiments with different seeds. Now that all the experiments are finished, we can convert all tfevent files into csv files and then try plotting the results. ```bash -# geenrate csv +# generate csv $ ./tools.py --root-dir ./results/Ant-v3/sac # generate figures $ ./plotter.py --root-dir ./results/Ant-v3 --shaded-std --legend-pattern "\\w+" @@ -54,15 +66,16 @@ $ ./plotter.py --root-dir ./results/Ant-v3 --shaded-std --legend-pattern "\\w+" $ ./analysis.py --root-dir ./results --norm ``` -#### Example benchmark +## Example benchmark -Other graphs can be found under `/examples/mujuco/benchmark/` +Other graphs can be found under `examples/mujoco/benchmark/` For pretrained agents, detailed graphs (single agent, single game) and log details, please refer to [https://cloud.tsinghua.edu.cn/d/f45fcfc5016043bc8fbc/](https://cloud.tsinghua.edu.cn/d/f45fcfc5016043bc8fbc/). ## Offpolicy algorithms + #### Notes 1. In offpolicy algorithms (DDPG, TD3, SAC), the shared hyperparameters are almost the same, and unless otherwise stated, hyperparameters are consistent with those used for benchmark in SpinningUp's implementations (e.g. we use batchsize 256 in DDPG/TD3/SAC while SpinningUp use 100. Minor difference also lies with `start-timesteps`, data loop method `step_per_collect`, method to deal with/bootstrap truncated steps because of timelimit and unfinished/collecting episodes (contribute to performance improvement), etc.). diff --git a/examples/mujoco/analysis.py b/examples/mujoco/analysis.py index ed0bb6872..ea2c49696 100755 --- a/examples/mujoco/analysis.py +++ b/examples/mujoco/analysis.py @@ -9,8 +9,8 @@ from tools import csv2numpy, find_all_files, group_files -def numerical_anysis(root_dir, xlim, norm=False): - file_pattern = re.compile(r".*/test_rew_\d+seeds.csv$") +def numerical_analysis(root_dir, xlim, norm=False): + file_pattern = re.compile(r".*/test_reward_\d+seeds.csv$") norm_group_pattern = re.compile(r"(/|^)\w+?\-v(\d|$)") output_group_pattern = re.compile(r".*?(?=(/|^)\w+?\-v\d)") csv_files = find_all_files(root_dir, file_pattern) @@ -23,13 +23,13 @@ def numerical_anysis(root_dir, xlim, norm=False): if norm: result = np.stack( [ - result['env_step'], result['rew'] - result['rew'][0], - result['rew:shaded'] + result['env_step'], result['reward'] - result['reward'][0], + result['reward:shaded'] ] ) else: result = np.stack( - [result['env_step'], result['rew'], result['rew:shaded']] + [result['env_step'], result['reward'], result['reward:shaded']] ) if result[0, -1] < xlim: @@ -96,4 +96,4 @@ def numerical_anysis(root_dir, xlim, norm=False): help="Normalize all results according to environment." 
) args = parser.parse_args() - numerical_anysis(args.root_dir, args.xlim, norm=args.norm) + numerical_analysis(args.root_dir, args.xlim, norm=args.norm) diff --git a/examples/mujoco/gen_json.py b/examples/mujoco/gen_json.py index 99cad74a3..516cffd6c 100755 --- a/examples/mujoco/gen_json.py +++ b/examples/mujoco/gen_json.py @@ -22,8 +22,8 @@ def merge(rootdir): result.append( { 'env_step': int(row['env_step']), - 'rew': float(row['rew']), - 'rew_std': float(row['rew:shaded']), + 'rew': float(row['reward']), + 'rew_std': float(row['reward:shaded']), 'Agent': algo, } ) diff --git a/examples/mujoco/mujoco_a2c.py b/examples/mujoco/mujoco_a2c.py index b8fc3370c..19d596172 100755 --- a/examples/mujoco/mujoco_a2c.py +++ b/examples/mujoco/mujoco_a2c.py @@ -5,106 +5,101 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch import nn from torch.distributions import Independent, Normal from torch.optim.lr_scheduler import LambdaLR from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import A2CPolicy from tianshou.trainer import onpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetah-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=4096) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64]) - parser.add_argument('--lr', type=float, default=7e-4) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=30000) - parser.add_argument('--step-per-collect', type=int, default=80) - parser.add_argument('--repeat-per-collect', type=int, default=1) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=4096) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[64, 64]) + parser.add_argument("--lr", type=float, default=7e-4) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--epoch", type=int, default=100) + parser.add_argument("--step-per-epoch", type=int, default=30000) + parser.add_argument("--step-per-collect", type=int, default=80) + parser.add_argument("--repeat-per-collect", type=int, default=1) # batch-size >> step-per-collect means calculating all data in one singe forward. 
- parser.add_argument('--batch-size', type=int, default=99999) - parser.add_argument('--training-num', type=int, default=16) - parser.add_argument('--test-num', type=int, default=10) + parser.add_argument("--batch-size", type=int, default=99999) + parser.add_argument("--training-num", type=int, default=16) + parser.add_argument("--test-num", type=int, default=10) # a2c special - parser.add_argument('--rew-norm', type=int, default=True) - parser.add_argument('--vf-coef', type=float, default=0.5) - parser.add_argument('--ent-coef', type=float, default=0.01) - parser.add_argument('--gae-lambda', type=float, default=0.95) - parser.add_argument('--bound-action-method', type=str, default="clip") - parser.add_argument('--lr-decay', type=int, default=True) - parser.add_argument('--max-grad-norm', type=float, default=0.5) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--rew-norm", type=int, default=True) + parser.add_argument("--vf-coef", type=float, default=0.5) + parser.add_argument("--ent-coef", type=float, default=0.01) + parser.add_argument("--gae-lambda", type=float, default=0.95) + parser.add_argument("--bound-action-method", type=str, default="clip") + parser.add_argument("--lr-decay", type=int, default=True) + parser.add_argument("--max-grad-norm", type=float, default=0.5) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only' + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() def test_a2c(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=True + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True - ) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)], - norm_obs=True, - obs_rms=train_envs.obs_rms, - update_obs_rms=False - ) - # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, unbounded=True, - 
device=args.device + device=args.device, ).to(args.device) net_c = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) critic = Critic(net_c, device=args.device).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) @@ -125,7 +120,7 @@ def test_a2c(args=get_args()): list(actor.parameters()) + list(critic.parameters()), lr=args.lr, eps=1e-5, - alpha=0.99 + alpha=0.99, ) lr_scheduler = None @@ -156,12 +151,15 @@ def dist(*logits): action_scaling=True, action_bound_method=args.bound_action_method, lr_scheduler=lr_scheduler, - action_space=env.action_space + action_space=env.action_space, ) # load a previous policy if args.resume_path: - policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) + ckpt = torch.load(args.resume_path, map_location=args.device) + policy.load_state_dict(ckpt["model"]) + train_envs.set_obs_rms(ckpt["obs_rms"]) + test_envs.set_obs_rms(ckpt["obs_rms"]) print("Loaded agent from: ", args.resume_path) # collector @@ -171,16 +169,32 @@ def dist(*logits): buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_a2c' - log_path = os.path.join(args.logdir, args.task, 'a2c', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "a2c" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer, update_interval=100, train_interval=100) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} + torch.save(state, os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -196,7 +210,7 @@ def save_best_fn(policy): step_per_collect=args.step_per_collect, save_best_fn=save_best_fn, logger=logger, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) @@ -208,5 +222,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_a2c() diff --git a/examples/mujoco/mujoco_ddpg.py b/examples/mujoco/mujoco_ddpg.py index 51955fe71..3e70c5986 100755 --- a/examples/mujoco/mujoco_ddpg.py +++ b/examples/mujoco/mujoco_ddpg.py @@ -5,58 +5,67 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.exploration import GaussianNoise from tianshou.policy import DDPGPolicy from tianshou.trainer import offpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from 
tianshou.utils.net.continuous import Actor, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=1000000) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[256, 256]) - parser.add_argument('--actor-lr', type=float, default=1e-3) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--exploration-noise', type=float, default=0.1) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=1000000) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[256, 256]) + parser.add_argument("--actor-lr", type=float, default=1e-3) + parser.add_argument("--critic-lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--tau", type=float, default=0.005) + parser.add_argument("--exploration-noise", type=float, default=0.1) parser.add_argument("--start-timesteps", type=int, default=25000) - parser.add_argument('--epoch', type=int, default=200) - parser.add_argument('--step-per-epoch', type=int, default=5000) - parser.add_argument('--step-per-collect', type=int, default=1) - parser.add_argument('--update-per-step', type=int, default=1) - parser.add_argument('--n-step', type=int, default=1) - parser.add_argument('--batch-size', type=int, default=256) - parser.add_argument('--training-num', type=int, default=1) - parser.add_argument('--test-num', type=int, default=10) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--epoch", type=int, default=200) + parser.add_argument("--step-per-epoch", type=int, default=5000) + parser.add_argument("--step-per-collect", type=int, default=1) + parser.add_argument("--update-per-step", type=int, default=1) + parser.add_argument("--n-step", type=int, default=1) + parser.add_argument("--batch-size", type=int, default=256) + parser.add_argument("--training-num", type=int, default=1) + parser.add_argument("--test-num", type=int, default=10) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only' + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() def test_ddpg(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=False + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] @@ -64,22 +73,9 @@ def test_ddpg(args=get_args()): print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - if args.training_num > 1: - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)] - ) - else: - train_envs = gym.make(args.task) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)] - ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = Actor( @@ -91,7 +87,7 @@ def test_ddpg(args=get_args()): args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, - device=args.device + device=args.device, ) critic = Critic(net_c, device=args.device).to(args.device) critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr) @@ -104,7 +100,7 @@ def test_ddpg(args=get_args()): gamma=args.gamma, exploration_noise=GaussianNoise(sigma=args.exploration_noise), estimation_step=args.n_step, - action_space=env.action_space + action_space=env.action_space, ) # load a previous policy @@ -120,16 +116,31 @@ def test_ddpg(args=get_args()): train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.start_timesteps, random=True) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_ddpg' - log_path = os.path.join(args.logdir, args.task, 'ddpg', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "ddpg" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # 
wandb + logger.load(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -145,7 +156,7 @@ def save_best_fn(policy): save_best_fn=save_best_fn, logger=logger, update_per_step=args.update_per_step, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) @@ -157,5 +168,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_ddpg() diff --git a/examples/mujoco/mujoco_env.py b/examples/mujoco/mujoco_env.py new file mode 100644 index 000000000..e6524a4ac --- /dev/null +++ b/examples/mujoco/mujoco_env.py @@ -0,0 +1,41 @@ +import warnings + +try: + import envpool +except ImportError: + envpool = None + +import gym + +from tianshou.env import ShmemVectorEnv, VectorEnvNormObs + + +def make_mujoco_env(task, seed, training_num, test_num, obs_norm): + """Wrapper function for Mujoco env. + + If EnvPool is installed, it will automatically switch to EnvPool's Mujoco env. + + :return: a tuple of (single env, training envs, test envs). + """ + if envpool is not None: + train_envs = env = envpool.make_gym(task, num_envs=training_num, seed=seed) + test_envs = envpool.make_gym(task, num_envs=test_num, seed=seed) + else: + warnings.warn( + "Recommend using envpool (pip install envpool) " + "to run Mujoco environments more efficiently." + ) + env = gym.make(task) + train_envs = ShmemVectorEnv( + [lambda: gym.make(task) for _ in range(training_num)] + ) + test_envs = ShmemVectorEnv([lambda: gym.make(task) for _ in range(test_num)]) + env.seed(seed) + train_envs.seed(seed) + test_envs.seed(seed) + if obs_norm: + # obs norm wrapper + train_envs = VectorEnvNormObs(train_envs) + test_envs = VectorEnvNormObs(test_envs, update_obs_rms=False) + test_envs.set_obs_rms(train_envs.get_obs_rms()) + return env, train_envs, test_envs diff --git a/examples/mujoco/mujoco_npg.py b/examples/mujoco/mujoco_npg.py index fd681bca3..1bd2d19dc 100755 --- a/examples/mujoco/mujoco_npg.py +++ b/examples/mujoco/mujoco_npg.py @@ -5,108 +5,103 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch import nn from torch.distributions import Independent, Normal from torch.optim.lr_scheduler import LambdaLR from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import NPGPolicy from tianshou.trainer import onpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetah-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=4096) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=4096) parser.add_argument( - '--hidden-sizes', type=int, nargs='*', default=[64, 64] + "--hidden-sizes", type=int, nargs="*", default=[64, 64] ) # baselines [32, 32] - parser.add_argument('--lr', type=float, default=1e-3) - 
parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=30000) - parser.add_argument('--step-per-collect', type=int, default=1024) - parser.add_argument('--repeat-per-collect', type=int, default=1) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--epoch", type=int, default=100) + parser.add_argument("--step-per-epoch", type=int, default=30000) + parser.add_argument("--step-per-collect", type=int, default=1024) + parser.add_argument("--repeat-per-collect", type=int, default=1) # batch-size >> step-per-collect means calculating all data in one singe forward. - parser.add_argument('--batch-size', type=int, default=99999) - parser.add_argument('--training-num', type=int, default=16) - parser.add_argument('--test-num', type=int, default=10) + parser.add_argument("--batch-size", type=int, default=99999) + parser.add_argument("--training-num", type=int, default=16) + parser.add_argument("--test-num", type=int, default=10) # npg special - parser.add_argument('--rew-norm', type=int, default=True) - parser.add_argument('--gae-lambda', type=float, default=0.95) - parser.add_argument('--bound-action-method', type=str, default="clip") - parser.add_argument('--lr-decay', type=int, default=True) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--norm-adv', type=int, default=1) - parser.add_argument('--optim-critic-iters', type=int, default=20) - parser.add_argument('--actor-step-size', type=float, default=0.1) + parser.add_argument("--rew-norm", type=int, default=True) + parser.add_argument("--gae-lambda", type=float, default=0.95) + parser.add_argument("--bound-action-method", type=str, default="clip") + parser.add_argument("--lr-decay", type=int, default=True) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
+ parser.add_argument("--norm-adv", type=int, default=1) + parser.add_argument("--optim-critic-iters", type=int, default=20) + parser.add_argument("--actor-step-size", type=float, default=0.1) parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only' + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() def test_npg(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=True + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True - ) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)], - norm_obs=True, - obs_rms=train_envs.obs_rms, - update_obs_rms=False - ) - # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, unbounded=True, - device=args.device + device=args.device, ).to(args.device) net_c = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) critic = Critic(net_c, device=args.device).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) @@ -152,12 +147,15 @@ def dist(*logits): action_space=env.action_space, advantage_normalization=args.norm_adv, optim_critic_iters=args.optim_critic_iters, - actor_step_size=args.actor_step_size + actor_step_size=args.actor_step_size, ) # load a previous policy if args.resume_path: - policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) + ckpt = torch.load(args.resume_path, map_location=args.device) + policy.load_state_dict(ckpt["model"]) + train_envs.set_obs_rms(ckpt["obs_rms"]) + test_envs.set_obs_rms(ckpt["obs_rms"]) print("Loaded agent from: ", args.resume_path) # collector @@ -167,16 +165,32 @@ def dist(*logits): buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_npg' - log_path = os.path.join(args.logdir, args.task, 'npg', log_file) 
+ now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "npg" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer, update_interval=100, train_interval=100) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} + torch.save(state, os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -192,7 +206,7 @@ def save_best_fn(policy): step_per_collect=args.step_per_collect, save_best_fn=save_best_fn, logger=logger, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) @@ -204,5 +218,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_npg() diff --git a/examples/mujoco/mujoco_ppo.py b/examples/mujoco/mujoco_ppo.py index 392f1c22a..14c5a429f 100755 --- a/examples/mujoco/mujoco_ppo.py +++ b/examples/mujoco/mujoco_ppo.py @@ -5,111 +5,106 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch import nn from torch.distributions import Independent, Normal from torch.optim.lr_scheduler import LambdaLR from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import PPOPolicy from tianshou.trainer import onpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetah-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=4096) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64]) - parser.add_argument('--lr', type=float, default=3e-4) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=30000) - parser.add_argument('--step-per-collect', type=int, default=2048) - parser.add_argument('--repeat-per-collect', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--training-num', type=int, default=64) - parser.add_argument('--test-num', type=int, default=10) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=4096) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[64, 64]) + parser.add_argument("--lr", type=float, default=3e-4) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--epoch", type=int, default=100) + parser.add_argument("--step-per-epoch", type=int, 
default=30000) + parser.add_argument("--step-per-collect", type=int, default=2048) + parser.add_argument("--repeat-per-collect", type=int, default=10) + parser.add_argument("--batch-size", type=int, default=64) + parser.add_argument("--training-num", type=int, default=64) + parser.add_argument("--test-num", type=int, default=10) # ppo special - parser.add_argument('--rew-norm', type=int, default=True) + parser.add_argument("--rew-norm", type=int, default=True) # In theory, `vf-coef` will not make any difference if using Adam optimizer. - parser.add_argument('--vf-coef', type=float, default=0.25) - parser.add_argument('--ent-coef', type=float, default=0.0) - parser.add_argument('--gae-lambda', type=float, default=0.95) - parser.add_argument('--bound-action-method', type=str, default="clip") - parser.add_argument('--lr-decay', type=int, default=True) - parser.add_argument('--max-grad-norm', type=float, default=0.5) - parser.add_argument('--eps-clip', type=float, default=0.2) - parser.add_argument('--dual-clip', type=float, default=None) - parser.add_argument('--value-clip', type=int, default=0) - parser.add_argument('--norm-adv', type=int, default=0) - parser.add_argument('--recompute-adv', type=int, default=1) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--vf-coef", type=float, default=0.25) + parser.add_argument("--ent-coef", type=float, default=0.0) + parser.add_argument("--gae-lambda", type=float, default=0.95) + parser.add_argument("--bound-action-method", type=str, default="clip") + parser.add_argument("--lr-decay", type=int, default=True) + parser.add_argument("--max-grad-norm", type=float, default=0.5) + parser.add_argument("--eps-clip", type=float, default=0.2) + parser.add_argument("--dual-clip", type=float, default=None) + parser.add_argument("--value-clip", type=int, default=0) + parser.add_argument("--norm-adv", type=int, default=0) + parser.add_argument("--recompute-adv", type=int, default=1) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only' + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() def test_ppo(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=True + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True - ) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)], - norm_obs=True, - obs_rms=train_envs.obs_rms, - update_obs_rms=False - ) - # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, unbounded=True, - device=args.device + device=args.device, ).to(args.device) net_c = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) critic = Critic(net_c, device=args.device).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) @@ -163,12 +158,15 @@ def dist(*logits): value_clip=args.value_clip, dual_clip=args.dual_clip, advantage_normalization=args.norm_adv, - recompute_advantage=args.recompute_adv + recompute_advantage=args.recompute_adv, ) # load a previous policy if args.resume_path: - policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) + ckpt = torch.load(args.resume_path, map_location=args.device) + policy.load_state_dict(ckpt["model"]) + train_envs.set_obs_rms(ckpt["obs_rms"]) + test_envs.set_obs_rms(ckpt["obs_rms"]) print("Loaded agent from: ", args.resume_path) # collector @@ -178,16 +176,32 @@ def dist(*logits): buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_ppo' - log_path = os.path.join(args.logdir, args.task, 'ppo', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "ppo" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + 
# logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer, update_interval=100, train_interval=100) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} + torch.save(state, os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -203,7 +217,7 @@ def save_best_fn(policy): step_per_collect=args.step_per_collect, save_best_fn=save_best_fn, logger=logger, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) @@ -215,5 +229,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_ppo() diff --git a/examples/mujoco/mujoco_redq.py b/examples/mujoco/mujoco_redq.py index d580ef9b3..312d1d676 100755 --- a/examples/mujoco/mujoco_redq.py +++ b/examples/mujoco/mujoco_redq.py @@ -5,86 +5,82 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import REDQPolicy from tianshou.trainer import offpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import EnsembleLinear, Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=1000000) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[256, 256]) - parser.add_argument('--ensemble-size', type=int, default=10) - parser.add_argument('--subset-size', type=int, default=2) - parser.add_argument('--actor-lr', type=float, default=1e-3) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--alpha', type=float, default=0.2) - parser.add_argument('--auto-alpha', default=False, action='store_true') - parser.add_argument('--alpha-lr', type=float, default=3e-4) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=1000000) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[256, 256]) + parser.add_argument("--ensemble-size", type=int, default=10) + parser.add_argument("--subset-size", type=int, default=2) + parser.add_argument("--actor-lr", type=float, default=1e-3) + parser.add_argument("--critic-lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--tau", type=float, default=0.005) + parser.add_argument("--alpha", type=float, default=0.2) + parser.add_argument("--auto-alpha", default=False, action="store_true") + parser.add_argument("--alpha-lr", 
type=float, default=3e-4) parser.add_argument("--start-timesteps", type=int, default=10000) - parser.add_argument('--epoch', type=int, default=200) - parser.add_argument('--step-per-epoch', type=int, default=5000) - parser.add_argument('--step-per-collect', type=int, default=1) - parser.add_argument('--update-per-step', type=int, default=20) - parser.add_argument('--n-step', type=int, default=1) - parser.add_argument('--batch-size', type=int, default=256) + parser.add_argument("--epoch", type=int, default=200) + parser.add_argument("--step-per-epoch", type=int, default=5000) + parser.add_argument("--step-per-collect", type=int, default=1) + parser.add_argument("--update-per-step", type=int, default=20) + parser.add_argument("--n-step", type=int, default=1) + parser.add_argument("--batch-size", type=int, default=256) parser.add_argument( - '--target-mode', type=str, choices=('min', 'mean'), default='min' + "--target-mode", type=str, choices=("min", "mean"), default="min" ) - parser.add_argument('--training-num', type=int, default=1) - parser.add_argument('--test-num', type=int, default=10) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--training-num", type=int, default=1) + parser.add_argument("--test-num", type=int, default=10) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only' + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() def test_redq(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=False + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - if args.training_num > 1: - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)] - ) - else: - train_envs = gym.make(args.task) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)] - ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = ActorProb( @@ -93,7 +89,7 @@ def test_redq(args=get_args()): max_action=args.max_action, device=args.device, unbounded=True, - conditioned_sigma=True + conditioned_sigma=True, 
).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) @@ -151,16 +147,31 @@ def linear(x, y): train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.start_timesteps, random=True) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_redq' - log_path = os.path.join(args.logdir, args.task, 'redq', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "redq" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -176,7 +187,7 @@ def save_best_fn(policy): save_best_fn=save_best_fn, logger=logger, update_per_step=args.update_per_step, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) @@ -188,5 +199,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_redq() diff --git a/examples/mujoco/mujoco_reinforce.py b/examples/mujoco/mujoco_reinforce.py index 7d331af43..40f0a1fb1 100755 --- a/examples/mujoco/mujoco_reinforce.py +++ b/examples/mujoco/mujoco_reinforce.py @@ -5,97 +5,92 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch import nn from torch.distributions import Independent, Normal from torch.optim.lr_scheduler import LambdaLR from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import PGPolicy from tianshou.trainer import onpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetah-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=4096) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[64, 64]) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=30000) - parser.add_argument('--step-per-collect', type=int, default=2048) - parser.add_argument('--repeat-per-collect', type=int, default=1) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=4096) + parser.add_argument("--hidden-sizes", type=int, 
nargs="*", default=[64, 64]) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--epoch", type=int, default=100) + parser.add_argument("--step-per-epoch", type=int, default=30000) + parser.add_argument("--step-per-collect", type=int, default=2048) + parser.add_argument("--repeat-per-collect", type=int, default=1) # batch-size >> step-per-collect means calculating all data in one singe forward. - parser.add_argument('--batch-size', type=int, default=99999) - parser.add_argument('--training-num', type=int, default=64) - parser.add_argument('--test-num', type=int, default=10) + parser.add_argument("--batch-size", type=int, default=99999) + parser.add_argument("--training-num", type=int, default=64) + parser.add_argument("--test-num", type=int, default=10) # reinforce special - parser.add_argument('--rew-norm', type=int, default=True) + parser.add_argument("--rew-norm", type=int, default=True) # "clip" option also works well. - parser.add_argument('--action-bound-method', type=str, default="tanh") - parser.add_argument('--lr-decay', type=int, default=True) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--action-bound-method", type=str, default="tanh") + parser.add_argument("--lr-decay", type=int, default=True) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only' + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() def test_reinforce(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=True + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True - ) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)], - norm_obs=True, - obs_rms=train_envs.obs_rms, - update_obs_rms=False - ) - # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) actor = ActorProb( net_a, args.action_shape, 
max_action=args.max_action, unbounded=True, - device=args.device + device=args.device, ).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) for m in actor.modules(): @@ -135,12 +130,15 @@ def dist(*logits): action_scaling=True, action_bound_method=args.action_bound_method, lr_scheduler=lr_scheduler, - action_space=env.action_space + action_space=env.action_space, ) # load a previous policy if args.resume_path: - policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) + ckpt = torch.load(args.resume_path, map_location=args.device) + policy.load_state_dict(ckpt["model"]) + train_envs.set_obs_rms(ckpt["obs_rms"]) + test_envs.set_obs_rms(ckpt["obs_rms"]) print("Loaded agent from: ", args.resume_path) # collector @@ -150,16 +148,32 @@ def dist(*logits): buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_reinforce' - log_path = os.path.join(args.logdir, args.task, 'reinforce', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "reinforce" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer, update_interval=10, train_interval=100) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} + torch.save(state, os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -175,7 +189,7 @@ def save_best_fn(policy): step_per_collect=args.step_per_collect, save_best_fn=save_best_fn, logger=logger, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) @@ -187,5 +201,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_reinforce() diff --git a/examples/mujoco/mujoco_sac.py b/examples/mujoco/mujoco_sac.py index eb2afe70f..0450c586a 100755 --- a/examples/mujoco/mujoco_sac.py +++ b/examples/mujoco/mujoco_sac.py @@ -5,81 +5,77 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import SACPolicy from tianshou.trainer import offpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=1000000) - parser.add_argument('--hidden-sizes', type=int, 
nargs='*', default=[256, 256]) - parser.add_argument('--actor-lr', type=float, default=1e-3) - parser.add_argument('--critic-lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--alpha', type=float, default=0.2) - parser.add_argument('--auto-alpha', default=False, action='store_true') - parser.add_argument('--alpha-lr', type=float, default=3e-4) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=1000000) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[256, 256]) + parser.add_argument("--actor-lr", type=float, default=1e-3) + parser.add_argument("--critic-lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--tau", type=float, default=0.005) + parser.add_argument("--alpha", type=float, default=0.2) + parser.add_argument("--auto-alpha", default=False, action="store_true") + parser.add_argument("--alpha-lr", type=float, default=3e-4) parser.add_argument("--start-timesteps", type=int, default=10000) - parser.add_argument('--epoch', type=int, default=200) - parser.add_argument('--step-per-epoch', type=int, default=5000) - parser.add_argument('--step-per-collect', type=int, default=1) - parser.add_argument('--update-per-step', type=int, default=1) - parser.add_argument('--n-step', type=int, default=1) - parser.add_argument('--batch-size', type=int, default=256) - parser.add_argument('--training-num', type=int, default=1) - parser.add_argument('--test-num', type=int, default=10) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--epoch", type=int, default=200) + parser.add_argument("--step-per-epoch", type=int, default=5000) + parser.add_argument("--step-per-collect", type=int, default=1) + parser.add_argument("--update-per-step", type=int, default=1) + parser.add_argument("--n-step", type=int, default=1) + parser.add_argument("--batch-size", type=int, default=256) + parser.add_argument("--training-num", type=int, default=1) + parser.add_argument("--test-num", type=int, default=10) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
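Aside on the checkpoint change in mujoco_reinforce.py above (and mujoco_trpo.py later in this diff): since the observation statistics now live in the env wrapper rather than in the policy, `save_best_fn` stores a dict holding both the model weights and the running obs stats, and `--resume-path` has to restore both. A minimal sketch of that round-trip, factored into two hypothetical helpers; `policy`, `train_envs` and `test_envs` are assumed to be built by `make_mujoco_env(..., obs_norm=True)` exactly as in those scripts.

```python
import torch


def save_checkpoint(policy, train_envs, path):
    # mirrors save_best_fn in mujoco_reinforce.py / mujoco_trpo.py:
    # weights plus the running observation mean/std of the training envs
    torch.save(
        {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()}, path
    )


def resume_checkpoint(policy, train_envs, test_envs, path, device="cpu"):
    # mirrors the --resume-path branch: restore weights and keep the train/test
    # observation normalization in sync with the saved statistics
    ckpt = torch.load(path, map_location=device)
    policy.load_state_dict(ckpt["model"])
    train_envs.set_obs_rms(ckpt["obs_rms"])
    test_envs.set_obs_rms(ckpt["obs_rms"])
```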
parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only' + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() def test_sac(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=False + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - if args.training_num > 1: - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)] - ) - else: - train_envs = gym.make(args.task) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)] - ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = ActorProb( @@ -88,7 +84,7 @@ def test_sac(args=get_args()): max_action=args.max_action, device=args.device, unbounded=True, - conditioned_sigma=True + conditioned_sigma=True, ).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) net_c1 = Net( @@ -96,14 +92,14 @@ def test_sac(args=get_args()): args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, - device=args.device + device=args.device, ) net_c2 = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, - device=args.device + device=args.device, ) critic1 = Critic(net_c1, device=args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) @@ -127,7 +123,7 @@ def test_sac(args=get_args()): gamma=args.gamma, alpha=args.alpha, estimation_step=args.n_step, - action_space=env.action_space + action_space=env.action_space, ) # load a previous policy @@ -143,16 +139,31 @@ def test_sac(args=get_args()): train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.start_timesteps, random=True) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_sac' - log_path = os.path.join(args.logdir, args.task, 'sac', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "sac" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + 
save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -168,7 +179,7 @@ def save_best_fn(policy): save_best_fn=save_best_fn, logger=logger, update_per_step=args.update_per_step, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) @@ -180,5 +191,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_sac() diff --git a/examples/mujoco/mujoco_td3.py b/examples/mujoco/mujoco_td3.py index d2a9bd7bc..b0b472b31 100755 --- a/examples/mujoco/mujoco_td3.py +++ b/examples/mujoco/mujoco_td3.py @@ -5,61 +5,70 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.exploration import GaussianNoise from tianshou.policy import TD3Policy from tianshou.trainer import offpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import Actor, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='Ant-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=1000000) - parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[256, 256]) - parser.add_argument('--actor-lr', type=float, default=3e-4) - parser.add_argument('--critic-lr', type=float, default=3e-4) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--exploration-noise', type=float, default=0.1) - parser.add_argument('--policy-noise', type=float, default=0.2) - parser.add_argument('--noise-clip', type=float, default=0.5) - parser.add_argument('--update-actor-freq', type=int, default=2) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=1000000) + parser.add_argument("--hidden-sizes", type=int, nargs="*", default=[256, 256]) + parser.add_argument("--actor-lr", type=float, default=3e-4) + parser.add_argument("--critic-lr", type=float, default=3e-4) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--tau", type=float, default=0.005) + parser.add_argument("--exploration-noise", type=float, default=0.1) + parser.add_argument("--policy-noise", type=float, default=0.2) + parser.add_argument("--noise-clip", type=float, default=0.5) + parser.add_argument("--update-actor-freq", type=int, default=2) parser.add_argument("--start-timesteps", type=int, default=25000) - parser.add_argument('--epoch', type=int, default=200) - parser.add_argument('--step-per-epoch', type=int, default=5000) - 
parser.add_argument('--step-per-collect', type=int, default=1) - parser.add_argument('--update-per-step', type=int, default=1) - parser.add_argument('--n-step', type=int, default=1) - parser.add_argument('--batch-size', type=int, default=256) - parser.add_argument('--training-num', type=int, default=1) - parser.add_argument('--test-num', type=int, default=10) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) + parser.add_argument("--epoch", type=int, default=200) + parser.add_argument("--step-per-epoch", type=int, default=5000) + parser.add_argument("--step-per-collect", type=int, default=1) + parser.add_argument("--update-per-step", type=int, default=1) + parser.add_argument("--n-step", type=int, default=1) + parser.add_argument("--batch-size", type=int, default=256) + parser.add_argument("--training-num", type=int, default=1) + parser.add_argument("--test-num", type=int, default=10) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only' + action="store_true", + help="watch the play of pre-trained policy only", ) return parser.parse_args() def test_td3(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=False + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] @@ -69,22 +78,9 @@ def test_td3(args=get_args()): print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - if args.training_num > 1: - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)] - ) - else: - train_envs = gym.make(args.task) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)] - ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = Actor( @@ -96,14 +92,14 @@ def test_td3(args=get_args()): args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, - device=args.device + device=args.device, ) net_c2 = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, - device=args.device + device=args.device, ) critic1 = Critic(net_c1, device=args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) @@ -124,7 +120,7 @@ def test_td3(args=get_args()): 
update_actor_freq=args.update_actor_freq, noise_clip=args.noise_clip, estimation_step=args.n_step, - action_space=env.action_space + action_space=env.action_space, ) # load a previous policy @@ -140,16 +136,31 @@ def test_td3(args=get_args()): train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.start_timesteps, random=True) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_td3' - log_path = os.path.join(args.logdir, args.task, 'td3', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "td3" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -165,7 +176,7 @@ def save_best_fn(policy): save_best_fn=save_best_fn, logger=logger, update_per_step=args.update_per_step, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) @@ -177,5 +188,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_td3() diff --git a/examples/mujoco/mujoco_trpo.py b/examples/mujoco/mujoco_trpo.py index dd2ce5334..7754c0c8d 100755 --- a/examples/mujoco/mujoco_trpo.py +++ b/examples/mujoco/mujoco_trpo.py @@ -5,111 +5,106 @@ import os import pprint -import gym import numpy as np import torch +from mujoco_env import make_mujoco_env from torch import nn from torch.distributions import Independent, Normal from torch.optim.lr_scheduler import LambdaLR from torch.utils.tensorboard import SummaryWriter from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer -from tianshou.env import SubprocVectorEnv from tianshou.policy import TRPOPolicy from tianshou.trainer import onpolicy_trainer -from tianshou.utils import TensorboardLogger +from tianshou.utils import TensorboardLogger, WandbLogger from tianshou.utils.net.common import Net from tianshou.utils.net.continuous import ActorProb, Critic def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('--task', type=str, default='HalfCheetah-v3') - parser.add_argument('--seed', type=int, default=0) - parser.add_argument('--buffer-size', type=int, default=4096) + parser.add_argument("--task", type=str, default="Ant-v3") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--buffer-size", type=int, default=4096) parser.add_argument( - '--hidden-sizes', type=int, nargs='*', default=[64, 64] + "--hidden-sizes", type=int, nargs="*", default=[64, 64] ) # baselines [32, 32] - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--epoch', type=int, default=100) - parser.add_argument('--step-per-epoch', type=int, default=30000) - 
parser.add_argument('--step-per-collect', type=int, default=1024) - parser.add_argument('--repeat-per-collect', type=int, default=1) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--gamma", type=float, default=0.99) + parser.add_argument("--epoch", type=int, default=100) + parser.add_argument("--step-per-epoch", type=int, default=30000) + parser.add_argument("--step-per-collect", type=int, default=1024) + parser.add_argument("--repeat-per-collect", type=int, default=1) # batch-size >> step-per-collect means calculating all data in one singe forward. - parser.add_argument('--batch-size', type=int, default=99999) - parser.add_argument('--training-num', type=int, default=16) - parser.add_argument('--test-num', type=int, default=10) + parser.add_argument("--batch-size", type=int, default=99999) + parser.add_argument("--training-num", type=int, default=16) + parser.add_argument("--test-num", type=int, default=10) # trpo special - parser.add_argument('--rew-norm', type=int, default=True) - parser.add_argument('--gae-lambda', type=float, default=0.95) + parser.add_argument("--rew-norm", type=int, default=True) + parser.add_argument("--gae-lambda", type=float, default=0.95) # TODO tanh support - parser.add_argument('--bound-action-method', type=str, default="clip") - parser.add_argument('--lr-decay', type=int, default=True) - parser.add_argument('--logdir', type=str, default='log') - parser.add_argument('--render', type=float, default=0.) - parser.add_argument('--norm-adv', type=int, default=1) - parser.add_argument('--optim-critic-iters', type=int, default=20) - parser.add_argument('--max-kl', type=float, default=0.01) - parser.add_argument('--backtrack-coeff', type=float, default=0.8) - parser.add_argument('--max-backtracks', type=int, default=10) + parser.add_argument("--bound-action-method", type=str, default="clip") + parser.add_argument("--lr-decay", type=int, default=True) + parser.add_argument("--logdir", type=str, default="log") + parser.add_argument("--render", type=float, default=0.) 
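The same logger-selection block recurs in every script touched here (redq, reinforce, sac, td3, and trpo below): a `WandbLogger` is created first when `--logger wandb` is chosen, then the `SummaryWriter` is built and either wrapped in a `TensorboardLogger` or attached to the existing wandb logger via `logger.load(writer)`. A condensed sketch of that block, pulled into a hypothetical `build_logger` helper; `args`, `log_name` and `log_path` are assumed to be constructed exactly as in the scripts.

```python
import os

from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger, WandbLogger


def build_logger(args, log_name, log_path):
    # args.logger is restricted to {"tensorboard", "wandb"} by the CLI choices
    if args.logger == "wandb":
        logger = WandbLogger(
            save_interval=1,
            name=log_name.replace(os.path.sep, "__"),
            run_id=args.resume_id,
            config=args,
            project=args.wandb_project,
        )
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    if args.logger == "tensorboard":
        logger = TensorboardLogger(writer)
    else:  # wandb: attach the SummaryWriter to the logger created above
        logger.load(writer)
    return logger
```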
+ parser.add_argument("--norm-adv", type=int, default=1) + parser.add_argument("--optim-critic-iters", type=int, default=20) + parser.add_argument("--max-kl", type=float, default=0.01) + parser.add_argument("--backtrack-coeff", type=float, default=0.8) + parser.add_argument("--max-backtracks", type=int, default=10) parser.add_argument( - '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu' + "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu" ) - parser.add_argument('--resume-path', type=str, default=None) + parser.add_argument("--resume-path", type=str, default=None) + parser.add_argument("--resume-id", type=str, default=None) parser.add_argument( - '--watch', + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + ) + parser.add_argument("--wandb-project", type=str, default="mujoco.benchmark") + parser.add_argument( + "--watch", default=False, - action='store_true', - help='watch the play of pre-trained policy only' + action="store_true", + help="watch the play of pre-trained policy only" ) return parser.parse_args() def test_trpo(args=get_args()): - env = gym.make(args.task) + env, train_envs, test_envs = make_mujoco_env( + args.task, args.seed, args.training_num, args.test_num, obs_norm=True + ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) - # train_envs = gym.make(args.task) - train_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True - ) - # test_envs = gym.make(args.task) - test_envs = SubprocVectorEnv( - [lambda: gym.make(args.task) for _ in range(args.test_num)], - norm_obs=True, - obs_rms=train_envs.obs_rms, - update_obs_rms=False - ) - # seed np.random.seed(args.seed) torch.manual_seed(args.seed) - train_envs.seed(args.seed) - test_envs.seed(args.seed) # model net_a = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, unbounded=True, - device=args.device + device=args.device, ).to(args.device) net_c = Net( args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, - device=args.device + device=args.device, ) critic = Critic(net_c, device=args.device).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) @@ -157,12 +152,15 @@ def dist(*logits): optim_critic_iters=args.optim_critic_iters, max_kl=args.max_kl, backtrack_coeff=args.backtrack_coeff, - max_backtracks=args.max_backtracks + max_backtracks=args.max_backtracks, ) # load a previous policy if args.resume_path: - policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) + ckpt = torch.load(args.resume_path, map_location=args.device) + policy.load_state_dict(ckpt["model"]) + train_envs.set_obs_rms(ckpt["obs_rms"]) + test_envs.set_obs_rms(ckpt["obs_rms"]) print("Loaded agent from: ", args.resume_path) # collector @@ -172,16 +170,32 @@ def dist(*logits): buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) + # log - t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") - log_file = 
f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_trpo' - log_path = os.path.join(args.logdir, args.task, 'trpo', log_file) + now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") + args.algo_name = "trpo" + log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) + log_path = os.path.join(args.logdir, log_name) + + # logger + if args.logger == "wandb": + logger = WandbLogger( + save_interval=1, + name=log_name.replace(os.path.sep, "__"), + run_id=args.resume_id, + config=args, + project=args.wandb_project, + ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) - logger = TensorboardLogger(writer, update_interval=100, train_interval=100) + if args.logger == "tensorboard": + logger = TensorboardLogger(writer) + else: # wandb + logger.load(writer) def save_best_fn(policy): - torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) + state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()} + torch.save(state, os.path.join(log_path, "policy.pth")) if not args.watch: # trainer @@ -197,7 +211,7 @@ def save_best_fn(policy): step_per_collect=args.step_per_collect, save_best_fn=save_best_fn, logger=logger, - test_in_train=False + test_in_train=False, ) pprint.pprint(result) @@ -209,5 +223,5 @@ def save_best_fn(policy): print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}') -if __name__ == '__main__': +if __name__ == "__main__": test_trpo() diff --git a/examples/mujoco/plotter.py b/examples/mujoco/plotter.py index e3e7057e4..8f16cd01a 100755 --- a/examples/mujoco/plotter.py +++ b/examples/mujoco/plotter.py @@ -91,8 +91,8 @@ def plot_ax( ylabel=None, title=None, xlim=None, - xkey='env_step', - ykey='rew', + xkey="env_step", + ykey="reward", smooth_radius=0, shaded_std=True, legend_outside=False, diff --git a/examples/mujoco/tools.py b/examples/mujoco/tools.py index 3ed4791fd..b5bd2e382 100755 --- a/examples/mujoco/tools.py +++ b/examples/mujoco/tools.py @@ -26,7 +26,7 @@ def group_files(file_list, pattern): res = defaultdict(list) for f in file_list: match = re.search(pattern, f) - key = match.group() if match else '' + key = match.group() if match else "" res[key].append(f) return res @@ -41,7 +41,7 @@ def csv2numpy(csv_file): def convert_tfevents_to_csv(root_dir, refresh=False): - """Recursively convert test/rew from all tfevent file under root_dir to csv. + """Recursively convert test/reward from all tfevent file under root_dir to csv. This function assumes that there is at most one tfevents file in each directory and will add suffix to that directory. 
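The rename from `test/rew` to `test/reward` propagates through tools.py: the scalar key read from the tfevents files, the per-run CSV name (`test_reward.csv` instead of `test_rew.csv`), and the merged-CSV headers all change in the next two hunks. A small usage sketch, assuming it is run from `examples/mujoco/` and that the log root below is a placeholder:

```python
from tools import convert_tfevents_to_csv, merge_csv

root_dir = "log/Ant-v3/sac"  # placeholder: any directory containing tfevents runs
# rebuild one test_reward.csv per run from its tfevents file
csv_files = convert_tfevents_to_csv(root_dir, refresh=True)
# merge across runs into test_reward_<n>seeds.csv with "env_step", "reward"
# (mean), "reward:shaded" (std), and one "reward:<run>" column per run
merge_csv(csv_files, root_dir)
```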
@@ -54,10 +54,12 @@ def convert_tfevents_to_csv(root_dir, refresh=False): with tqdm.tqdm(tfevent_files) as t: for tfevent_file in t: t.set_postfix(file=tfevent_file) - output_file = os.path.join(os.path.split(tfevent_file)[0], "test_rew.csv") + output_file = os.path.join( + os.path.split(tfevent_file)[0], "test_reward.csv" + ) if os.path.exists(output_file) and not refresh: content = list(csv.reader(open(output_file, "r"))) - if content[0] == ["env_step", "rew", "time"]: + if content[0] == ["env_step", "reward", "time"]: for i in range(1, len(content)): content[i] = list(map(eval, content[i])) result[output_file] = content @@ -65,13 +67,13 @@ def convert_tfevents_to_csv(root_dir, refresh=False): ea = event_accumulator.EventAccumulator(tfevent_file) ea.Reload() initial_time = ea._first_event_timestamp - content = [["env_step", "rew", "time"]] - for test_rew in ea.scalars.Items("test/rew"): + content = [["env_step", "reward", "time"]] + for test_reward in ea.scalars.Items("test/reward"): content.append( [ - round(test_rew.step, 4), - round(test_rew.value, 4), - round(test_rew.wall_time - initial_time, 4), + round(test_reward.step, 4), + round(test_reward.value, 4), + round(test_reward.wall_time - initial_time, 4), ] ) csv.writer(open(output_file, 'w')).writerows(content) @@ -89,8 +91,8 @@ def merge_csv(csv_files, root_dir, remove_zero=False): sorted_keys = sorted(csv_files.keys()) sorted_values = [csv_files[k][1:] for k in sorted_keys] content = [ - ["env_step", "rew", "rew:shaded"] + - list(map(lambda f: "rew:" + os.path.relpath(f, root_dir), sorted_keys)) + ["env_step", "reward", "reward:shaded"] + + list(map(lambda f: "reward:" + os.path.relpath(f, root_dir), sorted_keys)) ] for rows in zip(*sorted_values): array = np.array(rows) @@ -98,7 +100,7 @@ def merge_csv(csv_files, root_dir, remove_zero=False): line = [rows[0][0], round(array[:, 1].mean(), 4), round(array[:, 1].std(), 4)] line += array[:, 1].tolist() content.append(line) - output_path = os.path.join(root_dir, f"test_rew_{len(csv_files)}seeds.csv") + output_path = os.path.join(root_dir, f"test_reward_{len(csv_files)}seeds.csv") print(f"Output merged csv file to {output_path} with {len(content[1:])} lines.") csv.writer(open(output_path, "w")).writerows(content) diff --git a/setup.py b/setup.py index 83ac36580..0ae409056 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ def get_extras_require() -> str: "pybullet": ["pybullet"], } if sys.platform == "linux": - req["dev"].append("envpool>=0.4.5") + req["dev"].append("envpool>=0.5.3") return req diff --git a/test/base/test_env.py b/test/base/test_env.py index 7e507c4ed..3b3a74c31 100644 --- a/test/base/test_env.py +++ b/test/base/test_env.py @@ -2,16 +2,29 @@ import time import numpy as np +import pytest from gym.spaces.discrete import Discrete from tianshou.data import Batch -from tianshou.env import DummyVectorEnv, RayVectorEnv, ShmemVectorEnv, SubprocVectorEnv +from tianshou.env import ( + DummyVectorEnv, + RayVectorEnv, + ShmemVectorEnv, + SubprocVectorEnv, + VectorEnvNormObs, +) +from tianshou.utils import RunningMeanStd if __name__ == '__main__': from env import MyTestEnv, NXEnv else: # pytest from test.base.env import MyTestEnv, NXEnv +try: + import envpool +except ImportError: + envpool = None + def has_ray(): try: @@ -195,7 +208,7 @@ def assert_get(v, expected): v.close() -def test_env_obs(): +def test_env_obs_dtype(): for obs_type in ["array", "object"]: envs = SubprocVectorEnv( [lambda i=x: NXEnv(i, obs_type) for x in [5, 10, 15, 20]] @@ -206,8 +219,70 @@ def 
test_env_obs(): assert obs.dtype == object +def run_align_norm_obs(raw_env, train_env, test_env, action_list): + eps = np.finfo(np.float32).eps.item() + raw_obs, train_obs = [raw_env.reset()], [train_env.reset()] + for action in action_list: + obs, rew, done, info = raw_env.step(action) + raw_obs.append(obs) + if np.any(done): + raw_obs.append(raw_env.reset(np.where(done)[0])) + obs, rew, done, info = train_env.step(action) + train_obs.append(obs) + if np.any(done): + train_obs.append(train_env.reset(np.where(done)[0])) + ref_rms = RunningMeanStd() + for ro, to in zip(raw_obs, train_obs): + ref_rms.update(ro) + no = (ro - ref_rms.mean) / np.sqrt(ref_rms.var + eps) + assert np.allclose(no, to) + assert np.allclose(ref_rms.mean, train_env.get_obs_rms().mean) + assert np.allclose(ref_rms.var, train_env.get_obs_rms().var) + assert np.allclose(ref_rms.mean, test_env.get_obs_rms().mean) + assert np.allclose(ref_rms.var, test_env.get_obs_rms().var) + test_obs = [test_env.reset()] + for action in action_list: + obs, rew, done, info = test_env.step(action) + test_obs.append(obs) + if np.any(done): + test_obs.append(test_env.reset(np.where(done)[0])) + for ro, to in zip(raw_obs, test_obs): + no = (ro - ref_rms.mean) / np.sqrt(ref_rms.var + eps) + assert np.allclose(no, to) + + +def test_venv_norm_obs(): + sizes = np.array([5, 10, 15, 20]) + action = np.array([1, 1, 1, 1]) + total_step = 30 + action_list = [action] * total_step + env_fns = [lambda i=x: MyTestEnv(size=i, array_state=True) for x in sizes] + raw = DummyVectorEnv(env_fns) + train_env = VectorEnvNormObs(DummyVectorEnv(env_fns)) + print(train_env.observation_space) + test_env = VectorEnvNormObs(DummyVectorEnv(env_fns), update_obs_rms=False) + test_env.set_obs_rms(train_env.get_obs_rms()) + run_align_norm_obs(raw, train_env, test_env, action_list) + + +@pytest.mark.skipif(envpool is None, reason="EnvPool doesn't support this platform") +def test_venv_wrapper_envpool(): + raw = envpool.make_gym("Ant-v3", num_envs=4) + train = VectorEnvNormObs(envpool.make_gym("Ant-v3", num_envs=4)) + test = VectorEnvNormObs( + envpool.make_gym("Ant-v3", num_envs=4), update_obs_rms=False + ) + test.set_obs_rms(train.get_obs_rms()) + actions = [ + np.array([raw.action_space.sample() for _ in range(4)]) for i in range(30) + ] + run_align_norm_obs(raw, train, test, actions) + + if __name__ == '__main__': - test_env_obs() + test_venv_norm_obs() + test_venv_wrapper_envpool() + test_env_obs_dtype() test_vecenv() test_async_env() test_async_check_id() diff --git a/tianshou/__init__.py b/tianshou/__init__.py index 5a9abbf1b..1d5ee077a 100644 --- a/tianshou/__init__.py +++ b/tianshou/__init__.py @@ -1,6 +1,6 @@ from tianshou import data, env, exploration, policy, trainer, utils -__version__ = "0.4.7" +__version__ = "0.4.8" __all__ = [ "env", diff --git a/tianshou/env/__init__.py b/tianshou/env/__init__.py index 3845f2691..66fbc6810 100644 --- a/tianshou/env/__init__.py +++ b/tianshou/env/__init__.py @@ -1,5 +1,6 @@ """Env package.""" +from tianshou.env.venv_wrappers import VectorEnvNormObs, VectorEnvWrapper from tianshou.env.venvs import ( BaseVectorEnv, DummyVectorEnv, @@ -19,5 +20,7 @@ "SubprocVectorEnv", "ShmemVectorEnv", "RayVectorEnv", + "VectorEnvWrapper", + "VectorEnvNormObs", "PettingZooEnv", ] diff --git a/tianshou/env/venv_wrappers.py b/tianshou/env/venv_wrappers.py new file mode 100644 index 000000000..860c390d9 --- /dev/null +++ b/tianshou/env/venv_wrappers.py @@ -0,0 +1,120 @@ +from typing import Any, List, Optional, Tuple, Union + +import numpy as np + 
+from tianshou.env.venvs import GYM_RESERVED_KEYS, BaseVectorEnv +from tianshou.utils import RunningMeanStd + + +class VectorEnvWrapper(BaseVectorEnv): + """Base class for vectorized environments wrapper.""" + + def __init__(self, venv: BaseVectorEnv) -> None: + self.venv = venv + self.is_async = venv.is_async + + def __len__(self) -> int: + return len(self.venv) + + def __getattribute__(self, key: str) -> Any: + if key in GYM_RESERVED_KEYS: # reserved keys in gym.Env + return getattr(self.venv, key) + else: + return super().__getattribute__(key) + + def get_env_attr( + self, + key: str, + id: Optional[Union[int, List[int], np.ndarray]] = None, + ) -> List[Any]: + return self.venv.get_env_attr(key, id) + + def set_env_attr( + self, + key: str, + value: Any, + id: Optional[Union[int, List[int], np.ndarray]] = None, + ) -> None: + return self.venv.set_env_attr(key, value, id) + + # TODO: compatible issue with reset -> (obs, info) + def reset( + self, id: Optional[Union[int, List[int], np.ndarray]] = None + ) -> np.ndarray: + return self.venv.reset(id) + + def step( + self, + action: np.ndarray, + id: Optional[Union[int, List[int], np.ndarray]] = None, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + return self.venv.step(action, id) + + def seed( + self, + seed: Optional[Union[int, List[int]]] = None, + ) -> List[Optional[List[int]]]: + return self.venv.seed(seed) + + def render(self, **kwargs: Any) -> List[Any]: + return self.venv.render(**kwargs) + + def close(self) -> None: + self.venv.close() + + +class VectorEnvNormObs(VectorEnvWrapper): + """An observation normalization wrapper for vectorized environments. + + :param bool update_obs_rms: whether to update obs_rms. Default to True. + :param float clip_obs: the maximum absolute value for observation. Default to + 10.0. + :param float epsilon: To avoid division by zero. 
+ """ + + def __init__( + self, + venv: BaseVectorEnv, + update_obs_rms: bool = True, + clip_obs: float = 10.0, + epsilon: float = np.finfo(np.float32).eps.item(), + ) -> None: + super().__init__(venv) + # initialize observation running mean/std + self.update_obs_rms = update_obs_rms + self.obs_rms = RunningMeanStd() + self.clip_max = clip_obs + self.eps = epsilon + + # TODO: compatible issue with reset -> (obs, info) + def reset( + self, id: Optional[Union[int, List[int], np.ndarray]] = None + ) -> np.ndarray: + obs = self.venv.reset(id) + if self.obs_rms and self.update_obs_rms: + self.obs_rms.update(obs) + return self._norm_obs(obs) + + def step( + self, + action: np.ndarray, + id: Optional[Union[int, List[int], np.ndarray]] = None, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + obs, rew, done, info = self.venv.step(action, id) + if self.obs_rms and self.update_obs_rms: + self.obs_rms.update(obs) + return self._norm_obs(obs), rew, done, info + + def _norm_obs(self, obs: np.ndarray) -> np.ndarray: + if self.obs_rms: + obs = (obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.eps) + obs = np.clip(obs, -self.clip_max, self.clip_max) + return obs + + def set_obs_rms(self, obs_rms: RunningMeanStd) -> None: + """Set with given observation running mean/std.""" + self.obs_rms = obs_rms + + def get_obs_rms(self) -> RunningMeanStd: + """Return observation running mean/std.""" + return self.obs_rms diff --git a/tianshou/env/venvs.py b/tianshou/env/venvs.py index 044ecaaa4..93558d9ef 100644 --- a/tianshou/env/venvs.py +++ b/tianshou/env/venvs.py @@ -9,11 +9,14 @@ RayEnvWorker, SubprocEnvWorker, ) -from tianshou.utils import RunningMeanStd + +GYM_RESERVED_KEYS = [ + "metadata", "reward_range", "spec", "action_space", "observation_space" +] class BaseVectorEnv(object): - """Base class for vectorized environments wrapper. + """Base class for vectorized environments. Usage: :: @@ -61,13 +64,6 @@ def seed(self, seed): :param float timeout: use in asynchronous simulation same as above, in each vectorized step it only deal with those environments spending time within ``timeout`` seconds. - :param bool norm_obs: Whether to track mean/std of data and normalize observation - on return. For now, observation normalization only support observation of - type np.ndarray. - :param obs_rms: class to track mean&std of observation. If not given, it will - initialize a new one. Usually in envs that is used to evaluate algorithm, - obs_rms should be passed in. Default to None. - :param bool update_obs_rms: Whether to update obs_rms. Default to True. """ def __init__( @@ -76,9 +72,6 @@ def __init__( worker_fn: Callable[[Callable[[], gym.Env]], EnvWorker], wait_num: Optional[int] = None, timeout: Optional[float] = None, - norm_obs: bool = False, - obs_rms: Optional[RunningMeanStd] = None, - update_obs_rms: bool = True, ) -> None: self._env_fns = env_fns # A VectorEnv contains a pool of EnvWorkers, which corresponds to @@ -106,12 +99,6 @@ def __init__( self.ready_id = list(range(self.env_num)) self.is_closed = False - # initialize observation running mean/std - self.norm_obs = norm_obs - self.update_obs_rms = update_obs_rms - self.obs_rms = RunningMeanStd() if obs_rms is None and norm_obs else obs_rms - self.__eps = np.finfo(np.float32).eps.item() - def _assert_is_not_closed(self) -> None: assert not self.is_closed, \ f"Methods of {self.__class__.__name__} cannot be called after close." @@ -127,9 +114,7 @@ def __getattribute__(self, key: str) -> Any: ``action_space``. 
However, we would like the attribute lookup to go straight into the worker (in fact, this vector env's action_space is always None). """ - if key in [ - 'metadata', 'reward_range', 'spec', 'action_space', 'observation_space' - ]: # reserved keys in gym.Env + if key in GYM_RESERVED_KEYS: # reserved keys in gym.Env return self.get_env_attr(key) else: return super().__getattribute__(key) @@ -137,7 +122,7 @@ def __getattribute__(self, key: str) -> Any: def get_env_attr( self, key: str, - id: Optional[Union[int, List[int], np.ndarray]] = None + id: Optional[Union[int, List[int], np.ndarray]] = None, ) -> List[Any]: """Get an attribute from the underlying environments. @@ -162,7 +147,7 @@ def set_env_attr( self, key: str, value: Any, - id: Optional[Union[int, List[int], np.ndarray]] = None + id: Optional[Union[int, List[int], np.ndarray]] = None, ) -> None: """Set an attribute in the underlying environments. @@ -183,7 +168,7 @@ def set_env_attr( def _wrap_id( self, - id: Optional[Union[int, List[int], np.ndarray]] = None + id: Optional[Union[int, List[int], np.ndarray]] = None, ) -> Union[List[int], np.ndarray]: if id is None: return list(range(self.env_num)) @@ -218,14 +203,12 @@ def reset( obs = np.stack(obs_list) except ValueError: # different len(obs) obs = np.array(obs_list, dtype=object) - if self.obs_rms and self.update_obs_rms: - self.obs_rms.update(obs) - return self.normalize_obs(obs) + return obs def step( self, action: np.ndarray, - id: Optional[Union[int, List[int], np.ndarray]] = None + id: Optional[Union[int, List[int], np.ndarray]] = None, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Run one timestep of some environments' dynamics. @@ -299,13 +282,11 @@ def step( rew_stack, done_stack, info_stack = map( np.stack, [rew_list, done_list, info_list] ) - if self.obs_rms and self.update_obs_rms: - self.obs_rms.update(obs_stack) - return self.normalize_obs(obs_stack), rew_stack, done_stack, info_stack + return obs_stack, rew_stack, done_stack, info_stack def seed( self, - seed: Optional[Union[int, List[int]]] = None + seed: Optional[Union[int, List[int]]] = None, ) -> List[Optional[List[int]]]: """Set the seed for all environments. @@ -347,15 +328,6 @@ def close(self) -> None: w.close() self.is_closed = True - def normalize_obs(self, obs: np.ndarray) -> np.ndarray: - """Normalize observations by statistics in obs_rms.""" - if self.obs_rms and self.norm_obs: - clip_max = 10.0 # this magic number is from openai baselines - # see baselines/common/vec_env/vec_normalize.py#L10 - obs = (obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.__eps) - obs = np.clip(obs, -clip_max, clip_max) - return obs - class DummyVectorEnv(BaseVectorEnv): """Dummy vectorized environment wrapper, implemented in for-loop.