
add args render; change the writer of tensorboard to separately write for different envs and algorithms #3


Closed
wants to merge 12 commits into from
9 changes: 5 additions & 4 deletions test/continuous/test_ddpg.py
@@ -18,7 +18,7 @@

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v0')
parser.add_argument('--task', type=str, default='Ant-v2')
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=1e-4)
@@ -34,6 +34,7 @@ def get_args():
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -79,7 +80,7 @@ def test_ddpg(args=get_args()):
policy, train_envs, ReplayBuffer(args.buffer_size))
test_collector = Collector(policy, test_envs)
# log
writer = SummaryWriter(args.logdir)
writer = SummaryWriter(args.logdir + '/' + 'ddpg')

def stop_fn(x):
return x >= env.spec.reward_threshold
@@ -88,7 +89,7 @@ def stop_fn(x):
result = offpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.test_num,
args.batch_size, stop_fn=stop_fn, writer=writer)
args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
@@ -97,7 +98,7 @@ def stop_fn(x):
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=1 / 35)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()

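Taken together, the changes to each test script follow one pattern: a new --render flag (a delay in seconds between rendered frames, with 0. meaning no rendering), a per-algorithm subdirectory for the TensorBoard writer, and a task keyword forwarded to the trainer. A minimal sketch of that pattern, assuming the SummaryWriter import from torch.utils.tensorboard (the scripts' own import lines are not shown in this excerpt) and using os.path.join as an alternative to the string concatenation in the diff:

import argparse
import os

from torch.utils.tensorboard import SummaryWriter


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='Ant-v2')
    parser.add_argument('--logdir', type=str, default='log')
    # 0. disables rendering; any positive value is the pause (in seconds)
    # between frames when watching the trained policy at the end of a run.
    parser.add_argument('--render', type=float, default=0.)
    return parser.parse_args()


args = get_args()
# one log directory per algorithm, e.g. log/ddpg, so runs of different
# algorithms do not overwrite each other's event files
writer = SummaryWriter(os.path.join(args.logdir, 'ddpg'))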
9 changes: 5 additions & 4 deletions test/continuous/test_ppo.py
@@ -18,7 +18,7 @@

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v0')
parser.add_argument('--task', type=str, default='Ant-v2')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--lr', type=float, default=3e-4)
@@ -32,6 +32,7 @@ def get_args():
parser.add_argument('--training-num', type=int, default=16)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -87,7 +88,7 @@ def _test_ppo(args=get_args()):
test_collector = Collector(policy, test_envs)
train_collector.collect(n_step=args.step_per_epoch)
# log
writer = SummaryWriter(args.logdir)
writer = SummaryWriter(args.logdir + '/' + 'ppo')

def stop_fn(x):
return x >= env.spec.reward_threshold
@@ -96,7 +97,7 @@ def stop_fn(x):
result = onpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
@@ -105,7 +106,7 @@ def stop_fn(x):
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=1 / 35)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()

9 changes: 5 additions & 4 deletions test/continuous/test_sac.py
@@ -18,7 +18,7 @@

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v0')
parser.add_argument('--task', type=str, default='Ant-v2')
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=3e-4)
@@ -34,6 +34,7 @@ def get_args():
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -84,7 +85,7 @@ def test_sac(args=get_args()):
test_collector = Collector(policy, test_envs)
# train_collector.collect(n_step=args.buffer_size)
# log
writer = SummaryWriter(args.logdir)
writer = SummaryWriter(args.logdir + '/' + 'sac')

def stop_fn(x):
return x >= env.spec.reward_threshold
@@ -93,7 +94,7 @@ def stop_fn(x):
result = offpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.test_num,
args.batch_size, stop_fn=stop_fn, writer=writer)
args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
@@ -102,7 +103,7 @@ def stop_fn(x):
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=1 / 35)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()

9 changes: 5 additions & 4 deletions test/continuous/test_td3.py
@@ -18,7 +18,7 @@

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='Pendulum-v0')
parser.add_argument('--task', type=str, default='Ant-v2')
parser.add_argument('--seed', type=int, default=1626)
parser.add_argument('--buffer-size', type=int, default=20000)
parser.add_argument('--actor-lr', type=float, default=3e-4)
@@ -37,6 +37,7 @@ def get_args():
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -88,7 +89,7 @@ def test_td3(args=get_args()):
test_collector = Collector(policy, test_envs)
# train_collector.collect(n_step=args.buffer_size)
# log
writer = SummaryWriter(args.logdir)
writer = SummaryWriter(args.logdir + '/' + 'td3')

def stop_fn(x):
return x >= env.spec.reward_threshold
@@ -97,7 +98,7 @@ def stop_fn(x):
result = offpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.test_num,
args.batch_size, stop_fn=stop_fn, writer=writer)
args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
@@ -106,7 +107,7 @@ def stop_fn(x):
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=1 / 35)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()

6 changes: 4 additions & 2 deletions test/discrete/test_a2c.py
@@ -32,6 +32,8 @@ def get_args():
parser.add_argument('--training-num', type=int, default=32)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)

parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -82,7 +84,7 @@ def stop_fn(x):
result = onpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
@@ -91,7 +93,7 @@ def stop_fn(x):
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=1 / 35)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()

5 changes: 3 additions & 2 deletions test/discrete/test_dqn.py
@@ -35,6 +35,7 @@ def get_args():
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -89,7 +90,7 @@ def test_fn(x):
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.test_num,
args.batch_size, train_fn=train_fn, test_fn=test_fn,
stop_fn=stop_fn, writer=writer)
stop_fn=stop_fn, writer=writer, task=args.task)

assert stop_fn(result['best_reward'])
train_collector.close()
@@ -99,7 +100,7 @@ def test_fn(x):
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=1 / 35)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()

5 changes: 3 additions & 2 deletions test/discrete/test_pg.py
@@ -86,6 +86,7 @@ def get_args():
parser.add_argument('--training-num', type=int, default=8)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -130,7 +131,7 @@ def stop_fn(x):
result = onpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
@@ -139,7 +140,7 @@ def stop_fn(x):
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=1 / 35)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()

5 changes: 3 additions & 2 deletions test/discrete/test_ppo.py
@@ -32,6 +32,7 @@ def get_args():
parser.add_argument('--training-num', type=int, default=32)
parser.add_argument('--test-num', type=int, default=100)
parser.add_argument('--logdir', type=str, default='log')
parser.add_argument('--render', type=float, default=0.)
parser.add_argument(
'--device', type=str,
default='cuda' if torch.cuda.is_available() else 'cpu')
@@ -87,7 +88,7 @@ def stop_fn(x):
result = onpolicy_trainer(
policy, train_collector, test_collector, args.epoch,
args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
@@ -96,7 +97,7 @@ def stop_fn(x):
# Let's watch its performance!
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=1 / 35)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()

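Every trainer call above now also receives task=args.task. The trainer side of this change is not part of this excerpt, but the PR title ("separately write for different envs and algorithms") suggests the task name is used to namespace the TensorBoard tags. A hedged sketch of what such tagging could look like; the helper, tag layout, and fallback value here are assumptions, not the actual tianshou trainer code:

# Hypothetical helper: prefix scalar tags with the task name so that runs
# on different environments land under separate tags in TensorBoard.
def write_scalars(writer, task, global_step, rew, length):
    prefix = task if task else 'default'
    writer.add_scalar(prefix + '/reward', rew, global_step=global_step)
    writer.add_scalar(prefix + '/length', length, global_step=global_step)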
2 changes: 1 addition & 1 deletion tianshou/data/batch.py
@@ -37,7 +37,7 @@ def append(self, batch):
else:
raise TypeError(
'No support for append with type {} in class Batch.'
.format(type(batch.__dict__[k])))
.format(type(batch.__dict__[k])))

def split(self, size=None, permute=True):
length = min([
3 changes: 2 additions & 1 deletion tianshou/data/collector.py
@@ -109,6 +109,7 @@ def collect(self, n_step=0, n_episode=0, render=0):
done=self._make_batch(self._done),
obs_next=None,
info=self._make_batch(self._info))

result = self.policy(batch_data, self.state)
self.state = result.state if hasattr(result, 'state') else None
if isinstance(result.act, torch.Tensor):
@@ -120,7 +121,7 @@ def stop_fn(x):
obs_next, self._rew, self._done, self._info = self.env.step(
self._act if self._multi_env else self._act[0])
if render > 0:
self.env.render()
# self.env.render()
time.sleep(render)
self.length += 1
self.reward += self._rew
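In the collector, the env.render() call is commented out while time.sleep(render) stays, so a positive render value now only slows down stepping during evaluation. A small sketch of a variant that keeps both the frame rendering and the delay behind the same flag; this is a hypothetical alternative for illustration, not what the PR merges:

import time


def maybe_render(env, render):
    # render <= 0 means no visualisation at all; a positive value both
    # draws a frame and pauses for that many seconds between steps.
    if render > 0:
        env.render()
        time.sleep(render)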
2 changes: 1 addition & 1 deletion tianshou/exploration/random.py
@@ -14,7 +14,7 @@ def __call__(self, size, mu=.1):
if self.x is None or self.x.shape != size:
self.x = 0
self.x = self.x + self.alpha * (mu - self.x) + \
self.beta * np.random.normal(size=size)
self.beta * np.random.normal(size=size)
return self.x

def reset(self):
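The hunk in tianshou/exploration/random.py only re-indents the continuation line of the Ornstein-Uhlenbeck-style update x <- x + alpha * (mu - x) + beta * N(0, 1). A self-contained sketch of the same update for reference; the alpha and beta defaults are placeholders, not values taken from the repository:

import numpy as np


class OUNoiseSketch:
    # Minimal Ornstein-Uhlenbeck-style noise mirroring the update in the diff.
    def __init__(self, alpha=0.15, beta=0.3):
        # alpha/beta defaults are placeholders, not taken from this PR
        self.alpha, self.beta, self.x = alpha, beta, None

    def __call__(self, size, mu=0.1):
        # size is expected to be a shape tuple; reset the state on shape change
        if self.x is None or self.x.shape != size:
            self.x = np.zeros(size)
        self.x = self.x + self.alpha * (mu - self.x) + \
            self.beta * np.random.normal(size=size)
        return self.x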
21 changes: 11 additions & 10 deletions tianshou/policy/a2c.py
@@ -39,16 +39,17 @@ def learn(self, batch, batch_size=None, repeat=1):
a_loss = -(dist.log_prob(a) * (r - v).detach()).mean()
vf_loss = F.mse_loss(r[:, None], v)
ent_loss = dist.entropy().mean()
loss = a_loss + self._w_vf * vf_loss - self._w_ent * ent_loss
loss.backward()
if self._grad_norm:
nn.utils.clip_grad_norm_(
self.model.parameters(), max_norm=self._grad_norm)
self.optim.step()
actor_losses.append(a_loss.detach().cpu().numpy())
vf_losses.append(vf_loss.detach().cpu().numpy())
ent_losses.append(ent_loss.detach().cpu().numpy())
losses.append(loss.detach().cpu().numpy())

loss = a_loss + self._w_vf * vf_loss - self._w_ent * ent_loss
loss.backward()
if self._grad_norm:
nn.utils.clip_grad_norm_(
self.model.parameters(), max_norm=self._grad_norm)
self.optim.step()
actor_losses.append(a_loss.detach().cpu().numpy())
vf_losses.append(vf_loss.detach().cpu().numpy())
ent_losses.append(ent_loss.detach().cpu().numpy())
losses.append(loss.detach().cpu().numpy())
return {
'loss': losses,
'loss/actor': actor_losses,
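The a2c.py hunk de-indents the block that assembles the combined loss, runs the backward pass, and records the per-minibatch statistics; the enclosing loop structure is not visible in this excerpt. For reference, the objective being assembled is loss = a_loss + w_vf * vf_loss - w_ent * ent_loss, as in this small sketch (the default weights are common choices, not values from this PR):

def a2c_loss(a_loss, vf_loss, ent_loss, w_vf=0.5, w_ent=0.01):
    # actor term plus weighted critic (value) loss minus an entropy bonus,
    # matching the expression built from self._w_vf / self._w_ent above
    return a_loss + w_vf * vf_loss - w_ent * ent_loss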