diff --git a/docs/_static/images/Ant-v2.png b/docs/_static/images/Ant-v2.png new file mode 100644 index 000000000..b5497592a Binary files /dev/null and b/docs/_static/images/Ant-v2.png differ diff --git a/examples/ant_v2_ddpg.py b/examples/ant_v2_ddpg.py new file mode 100644 index 000000000..93f2d9826 --- /dev/null +++ b/examples/ant_v2_ddpg.py @@ -0,0 +1,105 @@ +import gym +import torch +import pprint +import argparse +import numpy as np +from torch.utils.tensorboard import SummaryWriter + +from tianshou.policy import DDPGPolicy +from tianshou.trainer import offpolicy_trainer +from tianshou.data import Collector, ReplayBuffer +from tianshou.env import VectorEnv, SubprocVectorEnv + +if __name__ == '__main__': + from continuous_net import Actor, Critic +else: # pytest + from test.continuous.net import Actor, Critic + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--task', type=str, default='Ant-v2') + parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--buffer-size', type=int, default=20000) + parser.add_argument('--actor-lr', type=float, default=1e-4) + parser.add_argument('--critic-lr', type=float, default=1e-3) + parser.add_argument('--gamma', type=float, default=0.99) + parser.add_argument('--tau', type=float, default=0.005) + parser.add_argument('--exploration-noise', type=float, default=0.1) + parser.add_argument('--epoch', type=int, default=100) + parser.add_argument('--step-per-epoch', type=int, default=2400) + parser.add_argument('--collect-per-step', type=int, default=4) + parser.add_argument('--batch-size', type=int, default=128) + parser.add_argument('--layer-num', type=int, default=1) + parser.add_argument('--training-num', type=int, default=8) + parser.add_argument('--test-num', type=int, default=100) + parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) 
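+    # --render is the pause (in seconds) between rendered frames when the
+    # trained policy is replayed at the end of the script; the default 0.
+    # skips rendering entirely.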
+ parser.add_argument( + '--device', type=str, + default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_known_args()[0] + return args + + +def test_ddpg(args=get_args()): + env = gym.make(args.task) + args.state_shape = env.observation_space.shape or env.observation_space.n + args.action_shape = env.action_space.shape or env.action_space.n + args.max_action = env.action_space.high[0] + # train_envs = gym.make(args.task) + train_envs = VectorEnv( + [lambda: gym.make(args.task) for _ in range(args.training_num)]) + # test_envs = gym.make(args.task) + test_envs = SubprocVectorEnv( + [lambda: gym.make(args.task) for _ in range(args.test_num)]) + # seed + np.random.seed(args.seed) + torch.manual_seed(args.seed) + train_envs.seed(args.seed) + test_envs.seed(args.seed) + # model + actor = Actor( + args.layer_num, args.state_shape, args.action_shape, + args.max_action, args.device + ).to(args.device) + actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) + critic = Critic( + args.layer_num, args.state_shape, args.action_shape, args.device + ).to(args.device) + critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr) + policy = DDPGPolicy( + actor, actor_optim, critic, critic_optim, + args.tau, args.gamma, args.exploration_noise, + [env.action_space.low[0], env.action_space.high[0]], + reward_normalization=True, ignore_done=True) + # collector + train_collector = Collector( + policy, train_envs, ReplayBuffer(args.buffer_size)) + test_collector = Collector(policy, test_envs) + # log + writer = SummaryWriter(args.logdir + '/' + 'ddpg') + + def stop_fn(x): + return x >= env.spec.reward_threshold + + # trainer + result = offpolicy_trainer( + policy, train_collector, test_collector, args.epoch, + args.step_per_epoch, args.collect_per_step, args.test_num, + args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) + assert stop_fn(result['best_reward']) + train_collector.close() + test_collector.close() + if __name__ == '__main__': + pprint.pprint(result) + # Let's watch its performance! 
+ env = gym.make(args.task) + collector = Collector(policy, env) + result = collector.collect(n_episode=1, render=args.render) + print(f'Final reward: {result["rew"]}, length: {result["len"]}') + collector.close() + + +if __name__ == '__main__': + test_ddpg() diff --git a/examples/ant_v2_sac.py b/examples/ant_v2_sac.py new file mode 100644 index 000000000..976847467 --- /dev/null +++ b/examples/ant_v2_sac.py @@ -0,0 +1,110 @@ +import gym +import torch +import pprint +import argparse +import numpy as np +from torch.utils.tensorboard import SummaryWriter + +from tianshou.policy import SACPolicy +from tianshou.trainer import offpolicy_trainer +from tianshou.data import Collector, ReplayBuffer +from tianshou.env import VectorEnv, SubprocVectorEnv + +if __name__ == '__main__': + from continuous_net import ActorProb, Critic +else: # pytest + from test.continuous.net import ActorProb, Critic + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--task', type=str, default='Ant-v2') + parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--buffer-size', type=int, default=20000) + parser.add_argument('--actor-lr', type=float, default=3e-4) + parser.add_argument('--critic-lr', type=float, default=1e-3) + parser.add_argument('--gamma', type=float, default=0.99) + parser.add_argument('--tau', type=float, default=0.005) + parser.add_argument('--alpha', type=float, default=0.2) + parser.add_argument('--epoch', type=int, default=100) + parser.add_argument('--step-per-epoch', type=int, default=2400) + parser.add_argument('--collect-per-step', type=int, default=10) + parser.add_argument('--batch-size', type=int, default=128) + parser.add_argument('--layer-num', type=int, default=1) + parser.add_argument('--training-num', type=int, default=8) + parser.add_argument('--test-num', type=int, default=100) + parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) 
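+    # SAC-specific: --alpha is the entropy regularization coefficient; it is
+    # passed to SACPolicy as a fixed constant, i.e. no automatic tuning here.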
+ parser.add_argument( + '--device', type=str, + default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_known_args()[0] + return args + + +def test_sac(args=get_args()): + env = gym.make(args.task) + args.state_shape = env.observation_space.shape or env.observation_space.n + args.action_shape = env.action_space.shape or env.action_space.n + args.max_action = env.action_space.high[0] + # train_envs = gym.make(args.task) + train_envs = VectorEnv( + [lambda: gym.make(args.task) for _ in range(args.training_num)]) + # test_envs = gym.make(args.task) + test_envs = SubprocVectorEnv( + [lambda: gym.make(args.task) for _ in range(args.test_num)]) + # seed + np.random.seed(args.seed) + torch.manual_seed(args.seed) + train_envs.seed(args.seed) + test_envs.seed(args.seed) + # model + actor = ActorProb( + args.layer_num, args.state_shape, args.action_shape, + args.max_action, args.device + ).to(args.device) + actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) + critic1 = Critic( + args.layer_num, args.state_shape, args.action_shape, args.device + ).to(args.device) + critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) + critic2 = Critic( + args.layer_num, args.state_shape, args.action_shape, args.device + ).to(args.device) + critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) + policy = SACPolicy( + actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, + args.tau, args.gamma, args.alpha, + [env.action_space.low[0], env.action_space.high[0]], + reward_normalization=True, ignore_done=True) + # collector + train_collector = Collector( + policy, train_envs, ReplayBuffer(args.buffer_size)) + test_collector = Collector(policy, test_envs) + # train_collector.collect(n_step=args.buffer_size) + # log + writer = SummaryWriter(args.logdir + '/' + 'sac') + + def stop_fn(x): + return x >= env.spec.reward_threshold + + # trainer + result = offpolicy_trainer( + policy, train_collector, test_collector, args.epoch, + args.step_per_epoch, args.collect_per_step, args.test_num, + args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) + assert stop_fn(result['best_reward']) + train_collector.close() + test_collector.close() + if __name__ == '__main__': + pprint.pprint(result) + # Let's watch its performance! 
+ env = gym.make(args.task) + collector = Collector(policy, env) + result = collector.collect(n_episode=1, render=args.render) + print(f'Final reward: {result["rew"]}, length: {result["len"]}') + collector.close() + + +if __name__ == '__main__': + test_sac() diff --git a/examples/ant_v2_td3.py b/examples/ant_v2_td3.py new file mode 100644 index 000000000..bd581c6e7 --- /dev/null +++ b/examples/ant_v2_td3.py @@ -0,0 +1,114 @@ +import gym +import torch +import pprint +import argparse +import numpy as np +from torch.utils.tensorboard import SummaryWriter + +from tianshou.policy import TD3Policy +from tianshou.trainer import offpolicy_trainer +from tianshou.data import Collector, ReplayBuffer +from tianshou.env import VectorEnv, SubprocVectorEnv + +if __name__ == '__main__': + from continuous_net import Actor, Critic +else: # pytest + from test.continuous.net import Actor, Critic + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--task', type=str, default='Ant-v2') + parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--buffer-size', type=int, default=20000) + parser.add_argument('--actor-lr', type=float, default=3e-4) + parser.add_argument('--critic-lr', type=float, default=1e-3) + parser.add_argument('--gamma', type=float, default=0.99) + parser.add_argument('--tau', type=float, default=0.005) + parser.add_argument('--exploration-noise', type=float, default=0.1) + parser.add_argument('--policy-noise', type=float, default=0.2) + parser.add_argument('--noise-clip', type=float, default=0.5) + parser.add_argument('--update-actor-freq', type=int, default=2) + parser.add_argument('--epoch', type=int, default=100) + parser.add_argument('--step-per-epoch', type=int, default=2400) + parser.add_argument('--collect-per-step', type=int, default=10) + parser.add_argument('--batch-size', type=int, default=128) + parser.add_argument('--layer-num', type=int, default=1) + parser.add_argument('--training-num', type=int, default=8) + parser.add_argument('--test-num', type=int, default=100) + parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) 
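+    # TD3-specific: --policy-noise / --noise-clip control target policy
+    # smoothing and --update-actor-freq delays the actor update; the defaults
+    # (0.2 / 0.5 / 2) match the TD3 paper.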
+ parser.add_argument( + '--device', type=str, + default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_known_args()[0] + return args + + +def test_td3(args=get_args()): + env = gym.make(args.task) + args.state_shape = env.observation_space.shape or env.observation_space.n + args.action_shape = env.action_space.shape or env.action_space.n + args.max_action = env.action_space.high[0] + # train_envs = gym.make(args.task) + train_envs = VectorEnv( + [lambda: gym.make(args.task) for _ in range(args.training_num)]) + # test_envs = gym.make(args.task) + test_envs = SubprocVectorEnv( + [lambda: gym.make(args.task) for _ in range(args.test_num)]) + # seed + np.random.seed(args.seed) + torch.manual_seed(args.seed) + train_envs.seed(args.seed) + test_envs.seed(args.seed) + # model + actor = Actor( + args.layer_num, args.state_shape, args.action_shape, + args.max_action, args.device + ).to(args.device) + actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) + critic1 = Critic( + args.layer_num, args.state_shape, args.action_shape, args.device + ).to(args.device) + critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) + critic2 = Critic( + args.layer_num, args.state_shape, args.action_shape, args.device + ).to(args.device) + critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) + policy = TD3Policy( + actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, + args.tau, args.gamma, args.exploration_noise, args.policy_noise, + args.update_actor_freq, args.noise_clip, + [env.action_space.low[0], env.action_space.high[0]], + reward_normalization=True, ignore_done=True) + # collector + train_collector = Collector( + policy, train_envs, ReplayBuffer(args.buffer_size)) + test_collector = Collector(policy, test_envs) + # train_collector.collect(n_step=args.buffer_size) + # log + writer = SummaryWriter(args.logdir + '/' + 'td3') + + def stop_fn(x): + return x >= env.spec.reward_threshold + + # trainer + result = offpolicy_trainer( + policy, train_collector, test_collector, args.epoch, + args.step_per_epoch, args.collect_per_step, args.test_num, + args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) + assert stop_fn(result['best_reward']) + train_collector.close() + test_collector.close() + if __name__ == '__main__': + pprint.pprint(result) + # Let's watch its performance! 
+ env = gym.make(args.task) + collector = Collector(policy, env) + result = collector.collect(n_episode=1, render=args.render) + print(f'Final reward: {result["rew"]}, length: {result["len"]}') + collector.close() + + +if __name__ == '__main__': + test_td3() diff --git a/examples/continuous_net.py b/examples/continuous_net.py new file mode 100644 index 000000000..1598a1ff5 --- /dev/null +++ b/examples/continuous_net.py @@ -0,0 +1,79 @@ +import torch +import numpy as np +from torch import nn + + +class Actor(nn.Module): + def __init__(self, layer_num, state_shape, action_shape, + max_action, device='cpu'): + super().__init__() + self.device = device + self.model = [ + nn.Linear(np.prod(state_shape), 128), + nn.ReLU(inplace=True)] + for i in range(layer_num): + self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)] + self.model += [nn.Linear(128, np.prod(action_shape))] + self.model = nn.Sequential(*self.model) + self._max = max_action + + def forward(self, s, **kwargs): + s = torch.tensor(s, device=self.device, dtype=torch.float) + batch = s.shape[0] + s = s.view(batch, -1) + logits = self.model(s) + logits = self._max * torch.tanh(logits) + return logits, None + + +class ActorProb(nn.Module): + def __init__(self, layer_num, state_shape, action_shape, + max_action, device='cpu'): + super().__init__() + self.device = device + self.model = [ + nn.Linear(np.prod(state_shape), 128), + nn.ReLU(inplace=True)] + for i in range(layer_num): + self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)] + self.model = nn.Sequential(*self.model) + self.mu = nn.Linear(128, np.prod(action_shape)) + self.sigma = nn.Linear(128, np.prod(action_shape)) + self._max = max_action + + def forward(self, s, **kwargs): + if not isinstance(s, torch.Tensor): + s = torch.tensor(s, device=self.device, dtype=torch.float) + batch = s.shape[0] + s = s.view(batch, -1) + logits = self.model(s) + mu = self._max * torch.tanh(self.mu(logits)) + sigma = torch.exp(self.sigma(logits)) + return (mu, sigma), None + + +class Critic(nn.Module): + def __init__(self, layer_num, state_shape, action_shape=0, device='cpu'): + super().__init__() + self.device = device + self.model = [ + nn.Linear(np.prod(state_shape) + np.prod(action_shape), 128), + nn.ReLU(inplace=True)] + for i in range(layer_num): + self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)] + self.model += [nn.Linear(128, 1)] + self.model = nn.Sequential(*self.model) + + def forward(self, s, a=None): + if not isinstance(s, torch.Tensor): + s = torch.tensor(s, device=self.device, dtype=torch.float) + if a is not None and not isinstance(a, torch.Tensor): + a = torch.tensor(a, device=self.device, dtype=torch.float) + batch = s.shape[0] + s = s.view(batch, -1) + if a is None: + logits = self.model(s) + else: + a = a.view(batch, -1) + logits = self.model(torch.cat([s, a], dim=1)) + return logits diff --git a/examples/discrete_net.py b/examples/discrete_net.py new file mode 100644 index 000000000..14d2c6e5e --- /dev/null +++ b/examples/discrete_net.py @@ -0,0 +1,81 @@ +import torch +import numpy as np +from torch import nn +import torch.nn.functional as F + + +class Net(nn.Module): + def __init__(self, layer_num, state_shape, action_shape=0, device='cpu'): + super().__init__() + self.device = device + self.model = [ + nn.Linear(np.prod(state_shape), 128), + nn.ReLU(inplace=True)] + for i in range(layer_num): + self.model += [nn.Linear(128, 128), nn.ReLU(inplace=True)] + if action_shape: + self.model += [nn.Linear(128, np.prod(action_shape))] + self.model = 
nn.Sequential(*self.model) + + def forward(self, s, state=None, info={}): + if not isinstance(s, torch.Tensor): + s = torch.tensor(s, device=self.device, dtype=torch.float) + batch = s.shape[0] + s = s.view(batch, -1) + logits = self.model(s) + return logits, state + + +class Actor(nn.Module): + def __init__(self, preprocess_net, action_shape): + super().__init__() + self.preprocess = preprocess_net + self.last = nn.Linear(128, np.prod(action_shape)) + + def forward(self, s, state=None, info={}): + logits, h = self.preprocess(s, state) + logits = F.softmax(self.last(logits), dim=-1) + return logits, h + + +class Critic(nn.Module): + def __init__(self, preprocess_net): + super().__init__() + self.preprocess = preprocess_net + self.last = nn.Linear(128, 1) + + def forward(self, s): + logits, h = self.preprocess(s, None) + logits = self.last(logits) + return logits + + +class DQN(nn.Module): + + def __init__(self, h, w, action_shape, device='cpu'): + super(DQN, self).__init__() + self.device = device + + self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=2) + self.bn1 = nn.BatchNorm2d(16) + self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2) + self.bn2 = nn.BatchNorm2d(32) + self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2) + self.bn3 = nn.BatchNorm2d(32) + + def conv2d_size_out(size, kernel_size=5, stride=2): + return (size - (kernel_size - 1) - 1) // stride + 1 + + convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) + convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) + linear_input_size = convw * convh * 32 + self.head = nn.Linear(linear_input_size, action_shape) + + def forward(self, x, state=None, info={}): + if not isinstance(x, torch.Tensor): + x = torch.tensor(x, device=self.device, dtype=torch.float) + x = x.permute(0, 3, 1, 2) + x = F.relu(self.bn1(self.conv1(x))) + x = F.relu(self.bn2(self.conv2(x))) + x = F.relu(self.bn3(self.conv3(x))) + return self.head(x.view(x.size(0), -1)), state diff --git a/examples/point_maze_td3.py b/examples/point_maze_td3.py new file mode 100644 index 000000000..5632ee486 --- /dev/null +++ b/examples/point_maze_td3.py @@ -0,0 +1,119 @@ +import gym +import torch +import pprint +import argparse +import numpy as np +from torch.utils.tensorboard import SummaryWriter + +from tianshou.policy import TD3Policy +from tianshou.trainer import offpolicy_trainer +from tianshou.data import Collector, ReplayBuffer +from tianshou.env import VectorEnv, SubprocVectorEnv + +if __name__ == '__main__': + from continuous_net import Actor, Critic +else: # pytest + from test.continuous.net import Actor, Critic + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--task', type=str, default='PointMaze-v0') + parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--buffer-size', type=int, default=20000) + parser.add_argument('--actor-lr', type=float, default=3e-5) + parser.add_argument('--critic-lr', type=float, default=1e-4) + parser.add_argument('--gamma', type=float, default=0.99) + parser.add_argument('--tau', type=float, default=0.005) + parser.add_argument('--exploration-noise', type=float, default=0.1) + parser.add_argument('--policy-noise', type=float, default=0.2) + parser.add_argument('--noise-clip', type=float, default=0.5) + parser.add_argument('--update-actor-freq', type=int, default=2) + parser.add_argument('--epoch', type=int, default=100) + parser.add_argument('--step-per-epoch', type=int, default=2400) + parser.add_argument('--collect-per-step', type=int, default=10) + 
parser.add_argument('--batch-size', type=int, default=128) + parser.add_argument('--layer-num', type=int, default=1) + parser.add_argument('--training-num', type=int, default=8) + parser.add_argument('--test-num', type=int, default=100) + parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) + parser.add_argument( + '--device', type=str, + default='cuda' if torch.cuda.is_available() else 'cpu') + parser.add_argument('--max_episode_steps', type=int, default=2000) + + args = parser.parse_known_args()[0] + return args + + +def test_td3(args=get_args()): + env = gym.make(args.task) + args.state_shape = env.observation_space.shape or env.observation_space.n + args.action_shape = env.action_space.shape or env.action_space.n + args.max_action = env.action_space.high[0] + # train_envs = gym.make(args.task) + train_envs = VectorEnv( + [lambda: gym.make(args.task) for _ in range(args.training_num)]) + # test_envs = gym.make(args.task) + test_envs = SubprocVectorEnv( + [lambda: gym.make(args.task) for _ in range(args.test_num)]) + # seed + np.random.seed(args.seed) + torch.manual_seed(args.seed) + train_envs.seed(args.seed) + test_envs.seed(args.seed) + # model + actor = Actor( + args.layer_num, args.state_shape, args.action_shape, + args.max_action, args.device + ).to(args.device) + actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) + critic1 = Critic( + args.layer_num, args.state_shape, args.action_shape, args.device + ).to(args.device) + critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) + critic2 = Critic( + args.layer_num, args.state_shape, args.action_shape, args.device + ).to(args.device) + critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) + policy = TD3Policy( + actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, + args.tau, args.gamma, args.exploration_noise, args.policy_noise, + args.update_actor_freq, args.noise_clip, + [env.action_space.low[0], env.action_space.high[0]], + reward_normalization=True, ignore_done=True) + # collector + train_collector = Collector( + policy, train_envs, ReplayBuffer(args.buffer_size)) + test_collector = Collector(policy, test_envs) + # train_collector.collect(n_step=args.buffer_size) + # log + writer = SummaryWriter(args.logdir + '/' + 'td3') + + def stop_fn(x): + if env.spec.reward_threshold: + return x >= env.spec.reward_threshold + else: + return False + + # trainer + result = offpolicy_trainer( + policy, train_collector, test_collector, args.epoch, + args.step_per_epoch, args.collect_per_step, args.test_num, + args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) + assert stop_fn(result['best_reward']) + train_collector.close() + test_collector.close() + if __name__ == '__main__': + pprint.pprint(result) + # Let's watch its performance! 
+ env = gym.make(args.task) + collector = Collector(policy, env) + result = collector.collect(n_step=1000, render=args.render) + print(f'Final reward: {result["rew"]}, length: {result["len"]}') + collector.close() + + +if __name__ == '__main__': + test_td3() diff --git a/examples/pong_a2c.py b/examples/pong_a2c.py new file mode 100644 index 000000000..ce55584c1 --- /dev/null +++ b/examples/pong_a2c.py @@ -0,0 +1,108 @@ +import gym +import torch +import pprint +import argparse +import numpy as np +from torch.utils.tensorboard import SummaryWriter + +from tianshou.policy import A2CPolicy +from tianshou.env import SubprocVectorEnv +from tianshou.trainer import onpolicy_trainer +from tianshou.data import Collector, ReplayBuffer +from tianshou.env.atari import create_atari_environment + +if __name__ == '__main__': + from discrete_net import Net, Actor, Critic +else: # pytest + from test.discrete.net import Net, Actor, Critic + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--task', type=str, default='Pong') + parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--buffer-size', type=int, default=20000) + parser.add_argument('--lr', type=float, default=3e-4) + parser.add_argument('--gamma', type=float, default=0.9) + parser.add_argument('--epoch', type=int, default=100) + parser.add_argument('--step-per-epoch', type=int, default=1000) + parser.add_argument('--collect-per-step', type=int, default=100) + parser.add_argument('--repeat-per-collect', type=int, default=1) + parser.add_argument('--batch-size', type=int, default=64) + parser.add_argument('--layer-num', type=int, default=2) + parser.add_argument('--training-num', type=int, default=8) + parser.add_argument('--test-num', type=int, default=8) + parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) 
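+    # Environments are built via create_atari_environment below; episode
+    # length is capped by --max_episode_steps, defined together with the
+    # A2C-specific options after --device.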
+ + parser.add_argument( + '--device', type=str, + default='cuda' if torch.cuda.is_available() else 'cpu') + # a2c special + parser.add_argument('--vf-coef', type=float, default=0.5) + parser.add_argument('--ent-coef', type=float, default=0.001) + parser.add_argument('--max-grad-norm', type=float, default=None) + parser.add_argument('--max_episode_steps', type=int, default=2000) + args = parser.parse_known_args()[0] + return args + + +def test_a2c(args=get_args()): + env = create_atari_environment(args.task, max_episode_steps=args.max_episode_steps) + args.state_shape = env.observation_space.shape or env.observation_space.n + args.action_shape = env.env.action_space.shape or env.env.action_space.n + # train_envs = gym.make(args.task) + train_envs = SubprocVectorEnv( + [lambda: create_atari_environment(args.task, max_episode_steps=args.max_episode_steps) for _ in + range(args.training_num)]) + # test_envs = gym.make(args.task) + test_envs = SubprocVectorEnv( + [lambda: create_atari_environment(args.task, max_episode_steps=args.max_episode_steps) for _ in + range(args.test_num)]) + # seed + np.random.seed(args.seed) + torch.manual_seed(args.seed) + train_envs.seed(args.seed) + test_envs.seed(args.seed) + # model + net = Net(args.layer_num, args.state_shape, device=args.device) + actor = Actor(net, args.action_shape).to(args.device) + critic = Critic(net).to(args.device) + optim = torch.optim.Adam(list( + actor.parameters()) + list(critic.parameters()), lr=args.lr) + dist = torch.distributions.Categorical + policy = A2CPolicy( + actor, critic, optim, dist, args.gamma, vf_coef=args.vf_coef, + ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm) + # collector + train_collector = Collector( + policy, train_envs, ReplayBuffer(args.buffer_size)) + test_collector = Collector(policy, test_envs) + # log + writer = SummaryWriter(args.logdir + '/' + 'a2c') + + def stop_fn(x): + if env.env.spec.reward_threshold: + return x >= env.spec.reward_threshold + else: + return False + + # trainer + result = onpolicy_trainer( + policy, train_collector, test_collector, args.epoch, + args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, + args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) + train_collector.close() + test_collector.close() + if __name__ == '__main__': + pprint.pprint(result) + # Let's watch its performance! 
+ env = create_atari_environment(args.task) + collector = Collector(policy, env) + result = collector.collect(n_episode=1, render=args.render) + print(f'Final reward: {result["rew"]}, length: {result["len"]}') + collector.close() + + +if __name__ == '__main__': + test_a2c() diff --git a/examples/pong_dqn.py b/examples/pong_dqn.py new file mode 100644 index 000000000..d32233b99 --- /dev/null +++ b/examples/pong_dqn.py @@ -0,0 +1,112 @@ +import gym +import torch +import pprint +import argparse +import numpy as np +from torch.utils.tensorboard import SummaryWriter + +from tianshou.policy import DQNPolicy +from tianshou.env import SubprocVectorEnv +from tianshou.trainer import offpolicy_trainer +from tianshou.data import Collector, ReplayBuffer +from tianshou.env.atari import create_atari_environment + +if __name__ == '__main__': + from discrete_net import DQN +else: # pytest + from test.discrete.net import DQN + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--task', type=str, default='Pong') + parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--eps-test', type=float, default=0.05) + parser.add_argument('--eps-train', type=float, default=0.1) + parser.add_argument('--buffer-size', type=int, default=20000) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--gamma', type=float, default=0.9) + parser.add_argument('--n-step', type=int, default=1) + parser.add_argument('--target-update-freq', type=int, default=320) + parser.add_argument('--epoch', type=int, default=100) + parser.add_argument('--step-per-epoch', type=int, default=1000) + parser.add_argument('--collect-per-step', type=int, default=10) + parser.add_argument('--batch-size', type=int, default=64) + parser.add_argument('--layer-num', type=int, default=3) + parser.add_argument('--training-num', type=int, default=8) + parser.add_argument('--test-num', type=int, default=8) + parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) 
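+    # --eps-train / --eps-test set the epsilon-greedy exploration rate applied
+    # by train_fn / test_fn below; --target-update-freq is the interval (in
+    # update steps) between hard target-network syncs.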
+ parser.add_argument( + '--device', type=str, + default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_known_args()[0] + return args + + +def test_dqn(args=get_args()): + env = create_atari_environment(args.task) + args.state_shape = env.observation_space.shape or env.observation_space.n + args.action_shape = env.env.action_space.shape or env.env.action_space.n + # train_envs = gym.make(args.task) + train_envs = SubprocVectorEnv( + [lambda: create_atari_environment(args.task) for _ in range(args.training_num)]) + # test_envs = gym.make(args.task) + test_envs = SubprocVectorEnv( + [lambda: create_atari_environment(args.task) for _ in range(args.test_num)]) + # seed + np.random.seed(args.seed) + torch.manual_seed(args.seed) + train_envs.seed(args.seed) + test_envs.seed(args.seed) + # model + net = DQN(args.state_shape[0], args.state_shape[1], args.action_shape, args.device) + net = net.to(args.device) + optim = torch.optim.Adam(net.parameters(), lr=args.lr) + policy = DQNPolicy( + net, optim, args.gamma, args.n_step, + use_target_network=args.target_update_freq > 0, + target_update_freq=args.target_update_freq) + # collector + train_collector = Collector( + policy, train_envs, ReplayBuffer(args.buffer_size)) + test_collector = Collector(policy, test_envs) + # policy.set_eps(1) + train_collector.collect(n_step=args.batch_size * 4) + print(len(train_collector.buffer)) + # log + writer = SummaryWriter(args.logdir + '/' + 'dqn') + + def stop_fn(x): + if env.env.spec.reward_threshold: + return x >= env.spec.reward_threshold + else: + return False + + def train_fn(x): + policy.set_eps(args.eps_train) + + def test_fn(x): + policy.set_eps(args.eps_test) + + # trainer + result = offpolicy_trainer( + policy, train_collector, test_collector, args.epoch, + args.step_per_epoch, args.collect_per_step, args.test_num, + args.batch_size, train_fn=train_fn, test_fn=test_fn, + stop_fn=stop_fn, writer=writer, task=args.task) + + train_collector.close() + test_collector.close() + if __name__ == '__main__': + pprint.pprint(result) + # Let's watch its performance! 
+ env = create_atari_environment(args.task) + collector = Collector(policy, env) + result = collector.collect(n_episode=1, render=args.render) + print(f'Final reward: {result["rew"]}, length: {result["len"]}') + collector.close() + + +if __name__ == '__main__': + test_dqn(get_args()) diff --git a/examples/pong_ppo.py b/examples/pong_ppo.py new file mode 100644 index 000000000..8374b9aed --- /dev/null +++ b/examples/pong_ppo.py @@ -0,0 +1,112 @@ +import gym +import torch +import pprint +import argparse +import numpy as np +from torch.utils.tensorboard import SummaryWriter + +from tianshou.policy import PPOPolicy +from tianshou.env import SubprocVectorEnv +from tianshou.trainer import onpolicy_trainer +from tianshou.data import Collector, ReplayBuffer +from tianshou.env.atari import create_atari_environment + +if __name__ == '__main__': + from discrete_net import Net, Actor, Critic +else: # pytest + from test.discrete.net import Net, Actor, Critic + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--task', type=str, default='Pong') + parser.add_argument('--seed', type=int, default=1626) + parser.add_argument('--buffer-size', type=int, default=20000) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--gamma', type=float, default=0.99) + parser.add_argument('--epoch', type=int, default=100) + parser.add_argument('--step-per-epoch', type=int, default=1000) + parser.add_argument('--collect-per-step', type=int, default=100) + parser.add_argument('--repeat-per-collect', type=int, default=2) + parser.add_argument('--batch-size', type=int, default=64) + parser.add_argument('--layer-num', type=int, default=1) + parser.add_argument('--training-num', type=int, default=8) + parser.add_argument('--test-num', type=int, default=8) + parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) 
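+    # PPO-specific options (--eps-clip, --vf-coef, --ent-coef,
+    # --max-grad-norm, --max_episode_steps) are defined together with
+    # --device below.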
+ parser.add_argument( + '--device', type=str, + default='cuda' if torch.cuda.is_available() else 'cpu') + # ppo special + parser.add_argument('--vf-coef', type=float, default=0.5) + parser.add_argument('--ent-coef', type=float, default=0.0) + parser.add_argument('--eps-clip', type=float, default=0.2) + parser.add_argument('--max-grad-norm', type=float, default=0.5) + parser.add_argument('--max_episode_steps', type=int, default=2000) + args = parser.parse_known_args()[0] + return args + + +def test_ppo(args=get_args()): + env = create_atari_environment(args.task, max_episode_steps=args.max_episode_steps) + args.state_shape = env.observation_space.shape or env.observation_space.n + args.action_shape = env.action_space().shape or env.action_space().n + # train_envs = gym.make(args.task) + train_envs = SubprocVectorEnv( + [lambda: create_atari_environment(args.task, max_episode_steps=args.max_episode_steps) for _ in + range(args.training_num)]) + # test_envs = gym.make(args.task) + test_envs = SubprocVectorEnv( + [lambda: create_atari_environment(args.task, max_episode_steps=args.max_episode_steps) for _ in + range(args.test_num)]) + # seed + np.random.seed(args.seed) + torch.manual_seed(args.seed) + train_envs.seed(args.seed) + test_envs.seed(args.seed) + # model + net = Net(args.layer_num, args.state_shape, device=args.device) + actor = Actor(net, args.action_shape).to(args.device) + critic = Critic(net).to(args.device) + optim = torch.optim.Adam(list( + actor.parameters()) + list(critic.parameters()), lr=args.lr) + dist = torch.distributions.Categorical + policy = PPOPolicy( + actor, critic, optim, dist, args.gamma, + max_grad_norm=args.max_grad_norm, + eps_clip=args.eps_clip, + vf_coef=args.vf_coef, + ent_coef=args.ent_coef, + action_range=None) + # collector + train_collector = Collector( + policy, train_envs, ReplayBuffer(args.buffer_size)) + test_collector = Collector(policy, test_envs) + # log + writer = SummaryWriter(args.logdir + '/' + 'ppo') + + def stop_fn(x): + if env.env.spec.reward_threshold: + return x >= env.spec.reward_threshold + else: + return False + + # trainer + result = onpolicy_trainer( + policy, train_collector, test_collector, args.epoch, + args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, + args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) + train_collector.close() + test_collector.close() + if __name__ == '__main__': + pprint.pprint(result) + # Let's watch its performance! + env = create_atari_environment(args.task) + collector = Collector(policy, env) + result = collector.collect(n_step=2000, render=args.render) + print(f'Final reward: {result["rew"]}, length: {result["len"]}') + collector.close() + + +if __name__ == '__main__': + test_ppo() diff --git a/setup.py b/setup.py index 590d5fa4d..646d14627 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ extras_require={ 'atari': [ 'atari_py', + 'cv2' ], 'mujoco': [ 'mujoco_py', diff --git a/test/continuous/test_ddpg.py b/test/continuous/test_ddpg.py index cb100da4c..a1ac3abfe 100644 --- a/test/continuous/test_ddpg.py +++ b/test/continuous/test_ddpg.py @@ -34,6 +34,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) 
parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -79,7 +80,7 @@ def test_ddpg(args=get_args()): policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log - writer = SummaryWriter(args.logdir) + writer = SummaryWriter(args.logdir + '/' + 'ddpg') def stop_fn(x): return x >= env.spec.reward_threshold @@ -88,7 +89,7 @@ def stop_fn(x): result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, - args.batch_size, stop_fn=stop_fn, writer=writer) + args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() @@ -97,7 +98,7 @@ def stop_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/continuous/test_ppo.py b/test/continuous/test_ppo.py index 3cfb9f289..c447bbd59 100644 --- a/test/continuous/test_ppo.py +++ b/test/continuous/test_ppo.py @@ -32,6 +32,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=16) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -87,7 +88,7 @@ def _test_ppo(args=get_args()): test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.step_per_epoch) # log - writer = SummaryWriter(args.logdir) + writer = SummaryWriter(args.logdir + '/' + 'ppo') def stop_fn(x): return x >= env.spec.reward_threshold @@ -96,7 +97,7 @@ def stop_fn(x): result = onpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, - args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer) + args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() @@ -105,7 +106,7 @@ def stop_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/continuous/test_sac.py b/test/continuous/test_sac.py index a900940d3..948b5e399 100644 --- a/test/continuous/test_sac.py +++ b/test/continuous/test_sac.py @@ -34,6 +34,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) 
parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -84,7 +85,7 @@ def test_sac(args=get_args()): test_collector = Collector(policy, test_envs) # train_collector.collect(n_step=args.buffer_size) # log - writer = SummaryWriter(args.logdir) + writer = SummaryWriter(args.logdir + '/' + 'sac') def stop_fn(x): return x >= env.spec.reward_threshold @@ -93,7 +94,7 @@ def stop_fn(x): result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, - args.batch_size, stop_fn=stop_fn, writer=writer) + args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() @@ -102,7 +103,7 @@ def stop_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/continuous/test_td3.py b/test/continuous/test_td3.py index 78db4934e..fb3dd728f 100644 --- a/test/continuous/test_td3.py +++ b/test/continuous/test_td3.py @@ -37,6 +37,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -88,7 +89,7 @@ def test_td3(args=get_args()): test_collector = Collector(policy, test_envs) # train_collector.collect(n_step=args.buffer_size) # log - writer = SummaryWriter(args.logdir) + writer = SummaryWriter(args.logdir + '/' + 'td3') def stop_fn(x): return x >= env.spec.reward_threshold @@ -97,7 +98,7 @@ def stop_fn(x): result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, - args.batch_size, stop_fn=stop_fn, writer=writer) + args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() @@ -106,7 +107,7 @@ def stop_fn(x): # Let's watch its performance! 
env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/discrete/net.py b/test/discrete/net.py index 71c72724b..7f62c8d23 100644 --- a/test/discrete/net.py +++ b/test/discrete/net.py @@ -48,3 +48,33 @@ def forward(self, s): logits, h = self.preprocess(s, None) logits = self.last(logits) return logits + + +class DQN(nn.Module): + + def __init__(self, h, w, action_shape, device='cpu'): + super(DQN, self).__init__() + self.device = device + + self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2) + self.bn1 = nn.BatchNorm2d(16) + self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2) + self.bn2 = nn.BatchNorm2d(32) + self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2) + self.bn3 = nn.BatchNorm2d(32) + + def conv2d_size_out(size, kernel_size=5, stride=2): + return (size - (kernel_size - 1) - 1) // stride + 1 + + convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) + convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) + linear_input_size = convw * convh * 32 + self.head = nn.Linear(linear_input_size, action_shape) + + def forward(self, x, state=None, info={}): + if not isinstance(x, torch.Tensor): + s = torch.tensor(x, device=self.device, dtype=torch.float) + x = F.relu(self.bn1(self.conv1(x))) + x = F.relu(self.bn2(self.conv2(x))) + x = F.relu(self.bn3(self.conv3(x))) + return self.head(x.view(x.size(0), -1)), state diff --git a/test/discrete/test_a2c.py b/test/discrete/test_a2c.py index 20ef0efc0..5c9269c3b 100644 --- a/test/discrete/test_a2c.py +++ b/test/discrete/test_a2c.py @@ -32,6 +32,8 @@ def get_args(): parser.add_argument('--training-num', type=int, default=32) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) + parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -73,7 +75,7 @@ def test_a2c(args=get_args()): policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log - writer = SummaryWriter(args.logdir) + writer = SummaryWriter(args.logdir + '/' + 'ppo') def stop_fn(x): return x >= env.spec.reward_threshold @@ -82,7 +84,7 @@ def stop_fn(x): result = onpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, - args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer) + args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() @@ -91,7 +93,7 @@ def stop_fn(x): # Let's watch its performance! 
env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/discrete/test_dqn.py b/test/discrete/test_dqn.py index 3a119936f..f5951eb41 100644 --- a/test/discrete/test_dqn.py +++ b/test/discrete/test_dqn.py @@ -11,9 +11,9 @@ from tianshou.data import Collector, ReplayBuffer if __name__ == '__main__': - from net import Net + from net import DQN else: # pytest - from test.discrete.net import Net + from test.discrete.net import DQN def get_args(): @@ -35,6 +35,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -58,7 +59,7 @@ def test_dqn(args=get_args()): train_envs.seed(args.seed) test_envs.seed(args.seed) # model - net = Net(args.layer_num, args.state_shape, args.action_shape, args.device) + net = DQN(args.layer_num, args.state_shape, args.action_shape, args.device) net = net.to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy( @@ -73,7 +74,7 @@ def test_dqn(args=get_args()): train_collector.collect(n_step=args.batch_size) print(len(train_collector.buffer)) # log - writer = SummaryWriter(args.logdir) + writer = SummaryWriter(args.logdir + '/' + 'ppo') def stop_fn(x): return x >= env.spec.reward_threshold @@ -89,7 +90,7 @@ def test_fn(x): policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, train_fn=train_fn, test_fn=test_fn, - stop_fn=stop_fn, writer=writer) + stop_fn=stop_fn, writer=writer, task=args.task) assert stop_fn(result['best_reward']) train_collector.close() @@ -99,7 +100,7 @@ def test_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/discrete/test_pg.py b/test/discrete/test_pg.py index e0f4a083b..b00896c73 100644 --- a/test/discrete/test_pg.py +++ b/test/discrete/test_pg.py @@ -86,6 +86,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=8) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) 
parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -121,7 +122,7 @@ def test_pg(args=get_args()): policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log - writer = SummaryWriter(args.logdir) + writer = SummaryWriter(args.logdir + '/' + 'ppo') def stop_fn(x): return x >= env.spec.reward_threshold @@ -130,7 +131,7 @@ def stop_fn(x): result = onpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, - args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer) + args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() @@ -139,7 +140,7 @@ def stop_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/test/discrete/test_ppo.py b/test/discrete/test_ppo.py index d21dd6c02..d2b889caf 100644 --- a/test/discrete/test_ppo.py +++ b/test/discrete/test_ppo.py @@ -32,6 +32,7 @@ def get_args(): parser.add_argument('--training-num', type=int, default=32) parser.add_argument('--test-num', type=int, default=100) parser.add_argument('--logdir', type=str, default='log') + parser.add_argument('--render', type=float, default=0.) parser.add_argument( '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') @@ -78,7 +79,7 @@ def test_ppo(args=get_args()): policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log - writer = SummaryWriter(args.logdir) + writer = SummaryWriter(args.logdir + '/' + 'ppo') def stop_fn(x): return x >= env.spec.reward_threshold @@ -87,7 +88,7 @@ def stop_fn(x): result = onpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, - args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer) + args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() @@ -96,7 +97,7 @@ def stop_fn(x): # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) - result = collector.collect(n_episode=1, render=1 / 35) + result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() diff --git a/tianshou/data/batch.py b/tianshou/data/batch.py index 13777abbd..d2af6b552 100644 --- a/tianshou/data/batch.py +++ b/tianshou/data/batch.py @@ -37,7 +37,7 @@ def append(self, batch): else: raise TypeError( 'No support for append with type {} in class Batch.' 
- .format(type(batch.__dict__[k]))) + .format(type(batch.__dict__[k]))) def split(self, size=None, permute=True): length = min([ diff --git a/tianshou/data/collector.py b/tianshou/data/collector.py index 0673ac7ac..00b3dafee 100644 --- a/tianshou/data/collector.py +++ b/tianshou/data/collector.py @@ -2,7 +2,7 @@ import torch import numpy as np from copy import deepcopy - +import warnings from tianshou.env import BaseVectorEnv from tianshou.data import Batch, ReplayBuffer from tianshou.utils import MovAvg @@ -87,6 +87,7 @@ def _make_batch(self, data): return np.array([data]) def collect(self, n_step=0, n_episode=0, render=0): + warning_count = 0 if not self._multi_env: n_episode = np.sum(n_episode) start_time = time.time() @@ -97,6 +98,10 @@ def collect(self, n_step=0, n_episode=0, render=0): reward_sum = 0 length_sum = 0 while True: + if warning_count >= 100000: + warnings.warn( + 'There are already many steps in an episode. You should add a time limitation to your environment!', + Warning) if self._multi_env: batch_data = Batch( obs=self._obs, act=self._act, rew=self._rew, @@ -131,11 +136,14 @@ def collect(self, n_step=0, n_episode=0, render=0): 'rew': self._rew[i], 'done': self._done[i], 'obs_next': obs_next[i], 'info': self._info[i]} if self._cached_buf: + warning_count += 1 self._cached_buf[i].add(**data) elif self._multi_buf: + warning_count += 1 self.buffer[i].add(**data) cur_step += 1 else: + warning_count += 1 self.buffer.add(**data) cur_step += 1 if self._done[i]: diff --git a/tianshou/exploration/random.py b/tianshou/exploration/random.py index 011afbed8..12d92eaa0 100644 --- a/tianshou/exploration/random.py +++ b/tianshou/exploration/random.py @@ -14,7 +14,7 @@ def __call__(self, size, mu=.1): if self.x is None or self.x.shape != size: self.x = 0 self.x = self.x + self.alpha * (mu - self.x) + \ - self.beta * np.random.normal(size=size) + self.beta * np.random.normal(size=size) return self.x def reset(self): diff --git a/tianshou/policy/a2c.py b/tianshou/policy/a2c.py index 93337d41d..79601fd3f 100644 --- a/tianshou/policy/a2c.py +++ b/tianshou/policy/a2c.py @@ -39,6 +39,7 @@ def learn(self, batch, batch_size=None, repeat=1): a_loss = -(dist.log_prob(a) * (r - v).detach()).mean() vf_loss = F.mse_loss(r[:, None], v) ent_loss = dist.entropy().mean() + loss = a_loss + self._w_vf * vf_loss - self._w_ent * ent_loss loss.backward() if self._grad_norm: diff --git a/tianshou/policy/pg.py b/tianshou/policy/pg.py index c5e3b70ee..33ee27312 100644 --- a/tianshou/policy/pg.py +++ b/tianshou/policy/pg.py @@ -34,6 +34,9 @@ def __call__(self, batch, state=None): def learn(self, batch, batch_size=None, repeat=1): losses = [] + + batch.returns = (batch.returns - batch.returns.mean()) \ + / (batch.returns.std() + self._eps) r = batch.returns batch.returns = (r - r.mean()) / (r.std() + self._eps) for _ in range(repeat): diff --git a/tianshou/policy/ppo.py b/tianshou/policy/ppo.py index 01270eff8..53389feaf 100644 --- a/tianshou/policy/ppo.py +++ b/tianshou/policy/ppo.py @@ -58,6 +58,9 @@ def sync_weight(self): def learn(self, batch, batch_size=None, repeat=1): losses, clip_losses, vf_losses, ent_losses = [], [], [], [] + + batch.returns = (batch.returns - batch.returns.mean()) \ + / (batch.returns.std() + self._eps) r = batch.returns batch.returns = (r - r.mean()) / (r.std() + self._eps) batch.act = torch.tensor(batch.act) @@ -79,6 +82,7 @@ def learn(self, batch, batch_size=None, repeat=1): clip_losses.append(clip_loss.detach().cpu().numpy()) vf_loss = F.smooth_l1_loss(self.critic(b.obs), target_v) 
vf_losses.append(vf_loss.detach().cpu().numpy()) + e_loss = dist.entropy().mean() ent_losses.append(e_loss.detach().cpu().numpy()) loss = clip_loss + self._w_vf * vf_loss - self._w_ent * e_loss @@ -87,7 +91,7 @@ def learn(self, batch, batch_size=None, repeat=1): loss.backward() nn.utils.clip_grad_norm_(list( self.actor.parameters()) + list(self.critic.parameters()), - self._max_grad_norm) + self._max_grad_norm) self.optim.step() self.sync_weight() return { diff --git a/tianshou/trainer/offpolicy.py b/tianshou/trainer/offpolicy.py index a095061e9..731910860 100644 --- a/tianshou/trainer/offpolicy.py +++ b/tianshou/trainer/offpolicy.py @@ -8,7 +8,7 @@ def offpolicy_trainer(policy, train_collector, test_collector, max_epoch, step_per_epoch, collect_per_step, episode_per_test, batch_size, train_fn=None, test_fn=None, stop_fn=None, - writer=None, verbose=True): + writer=None, verbose=True, task=''): global_step = 0 best_epoch, best_reward = -1, -1 stat = {} @@ -47,7 +47,7 @@ def offpolicy_trainer(policy, train_collector, test_collector, max_epoch, data[k] = f'{result[k]:.2f}' if writer: writer.add_scalar( - k, result[k], global_step=global_step) + k + '_' + task, result[k], global_step=global_step) for k in losses.keys(): if stat.get(k) is None: stat[k] = MovAvg() @@ -55,7 +55,7 @@ def offpolicy_trainer(policy, train_collector, test_collector, max_epoch, data[k] = f'{stat[k].get():.6f}' if writer: writer.add_scalar( - k, stat[k].get(), global_step=global_step) + k + '_' + task, stat[k].get(), global_step=global_step) t.update(1) t.set_postfix(**data) if t.n <= t.total: diff --git a/tianshou/trainer/onpolicy.py b/tianshou/trainer/onpolicy.py index 79cad82f6..6548b7693 100644 --- a/tianshou/trainer/onpolicy.py +++ b/tianshou/trainer/onpolicy.py @@ -9,7 +9,7 @@ def onpolicy_trainer(policy, train_collector, test_collector, max_epoch, step_per_epoch, collect_per_step, repeat_per_collect, episode_per_test, batch_size, train_fn=None, test_fn=None, stop_fn=None, - writer=None, verbose=True): + writer=None, verbose=True, task=''): global_step = 0 best_epoch, best_reward = -1, -1 stat = {} @@ -52,15 +52,15 @@ def onpolicy_trainer(policy, train_collector, test_collector, max_epoch, data[k] = f'{result[k]:.2f}' if writer: writer.add_scalar( - k, result[k], global_step=global_step) + k + '_' + task, result[k], global_step=global_step) for k in losses.keys(): if stat.get(k) is None: stat[k] = MovAvg() stat[k].add(losses[k]) data[k] = f'{stat[k].get():.6f}' - if writer: + if writer and global_step: writer.add_scalar( - k, stat[k].get(), global_step=global_step) + k + '_' + task, stat[k].get(), global_step=global_step) t.update(step) t.set_postfix(**data) if t.n <= t.total:
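Note on the new task keyword threaded through offpolicy_trainer and onpolicy_trainer above: it only affects TensorBoard logging, by appending '_' + task to every scalar key before add_scalar is called. A minimal sketch of the resulting tag layout (the log directory 'log/td3' and the task name 'Ant-v2' are the argparse defaults from examples/ant_v2_td3.py; the 'rew' key and the reward values are illustrative placeholders):

    from torch.utils.tensorboard import SummaryWriter

    # Mirrors what the patched trainers do: each scalar key k is written under
    # the tag f'{k}_{task}', so runs on different tasks can share one logdir
    # without their curves colliding under the same tag.
    writer = SummaryWriter('log' + '/' + 'td3')
    task = 'Ant-v2'
    for global_step, rew in enumerate([112.4, 857.9, 2431.0]):  # placeholder rewards
        writer.add_scalar('rew' + '_' + task, rew, global_step=global_step)
    writer.close()

Running one of the new examples and inspecting these curves then looks roughly like (paths assume the repository root and a working mujoco_py installation):

    python examples/ant_v2_td3.py --task Ant-v2
    tensorboard --logdir log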