update save_fn in trainer #459

Merged 8 commits on Oct 13, 2021

setup.py: 1 addition & 1 deletion

@@ -66,7 +66,7 @@ def get_version() -> str:
"isort",
"pytest",
"pytest-cov",
"ray>=1.0.0",
"ray>=1.0.0,<1.7.0",
"wandb>=0.12.0",
"networkx",
"mypy",

tianshou/data/collector.py: 22 additions & 0 deletions

@@ -167,6 +167,10 @@ def collect(
 * ``rews`` array of episode reward over collected episodes.
 * ``lens`` array of episode length over collected episodes.
 * ``idxs`` array of episode start index in buffer over collected episodes.
+* ``rew`` mean of episodic rewards.
+* ``len`` mean of episodic lengths.
+* ``rew_std`` standard error of episodic rewards.
+* ``len_std`` standard error of episodic lengths.
 """
 assert not self.env.is_async, "Please use AsyncCollector if using async venv."
 if n_step is not None:

@@ -311,15 +315,22 @@ def collect(
 [episode_rews, episode_lens, episode_start_indices]
 )
 )
+rew_mean, rew_std = rews.mean(), rews.std()
+len_mean, len_std = lens.mean(), lens.std()
 else:
 rews, lens, idxs = np.array([]), np.array([], int), np.array([], int)
+rew_mean = rew_std = len_mean = len_std = 0
 
 return {
 "n/ep": episode_count,
 "n/st": step_count,
 "rews": rews,
 "lens": lens,
 "idxs": idxs,
+"rew": rew_mean,
+"len": len_mean,
+"rew_std": rew_std,
+"len_std": len_std,
 }


@@ -380,6 +391,10 @@ def collect(
 * ``rews`` array of episode reward over collected episodes.
 * ``lens`` array of episode length over collected episodes.
 * ``idxs`` array of episode start index in buffer over collected episodes.
+* ``rew`` mean of episodic rewards.
+* ``len`` mean of episodic lengths.
+* ``rew_std`` standard error of episodic rewards.
+* ``len_std`` standard error of episodic lengths.
 """
 # collect at least n_step or n_episode
 if n_step is not None:

@@ -530,13 +545,20 @@ def collect(
 [episode_rews, episode_lens, episode_start_indices]
 )
 )
+rew_mean, rew_std = rews.mean(), rews.std()
+len_mean, len_std = lens.mean(), lens.std()
 else:
 rews, lens, idxs = np.array([]), np.array([], int), np.array([], int)
+rew_mean = rew_std = len_mean = len_std = 0
 
 return {
 "n/ep": episode_count,
 "n/st": step_count,
 "rews": rews,
 "lens": lens,
 "idxs": idxs,
+"rew": rew_mean,
+"len": len_mean,
+"rew_std": rew_std,
+"len_std": len_std,
 }
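
Taken together, both ``Collector.collect`` and ``AsyncCollector.collect`` now return the aggregate statistics next to the raw per-episode arrays, so callers no longer have to compute the means themselves. A minimal usage sketch, assuming ``collector`` is a ``tianshou.data.Collector`` built elsewhere:

```python
import numpy as np

# `collector` is assumed to be a tianshou.data.Collector set up elsewhere.
result = collector.collect(n_episode=10)

# Raw per-episode arrays (pre-existing keys):
episode_returns = result["rews"]   # shape (n_episode,)
episode_lengths = result["lens"]

# Aggregate keys added by this PR:
print(f'reward: {result["rew"]:.2f} +/- {result["rew_std"]:.2f}')
print(f'length: {result["len"]:.1f} +/- {result["len_std"]:.1f}')

# They match the hand-computed values:
assert np.isclose(result["rew"], episode_returns.mean())
assert np.isclose(result["len"], episode_lengths.mean())
```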

tianshou/policy/modelfree/ppo.py: 2 additions & 0 deletions

@@ -6,6 +6,7 @@

 from tianshou.data import Batch, ReplayBuffer, to_torch_as
 from tianshou.policy import A2CPolicy
+from tianshou.utils.net.common import ActorCritic
 
 
 class PPOPolicy(A2CPolicy):

@@ -83,6 +84,7 @@ def __init__(
"value clip is available only when `reward_normalization` is True"
self._norm_adv = advantage_normalization
self._recompute_adv = recompute_advantage
self._actor_critic: ActorCritic

def process_fn(
self, batch: Batch, buffer: ReplayBuffer, indices: np.ndarray
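
The bare annotation added to ``__init__`` carries no runtime behaviour; it is presumably there for static type checking (``mypy`` sits in the dev dependencies above), since ``_actor_critic`` is assigned in the parent ``A2CPolicy`` and re-declaring its type lets attribute access on it type-check in this module. A small sketch of the pattern, with hypothetical class names:

```python
import torch.nn as nn


class ParentPolicy:
    def __init__(self, actor: nn.Module, critic: nn.Module) -> None:
        # The attribute is created in the parent, as A2CPolicy does.
        self._actor_critic = nn.ModuleList([actor, critic])


class ChildPolicy(ParentPolicy):
    def __init__(self, actor: nn.Module, critic: nn.Module) -> None:
        super().__init__(actor, critic)
        # Bare annotation, no assignment: a hint for the type checker only,
        # mirroring the `self._actor_critic: ActorCritic` line in this PR.
        self._actor_critic: nn.ModuleList
```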

tianshou/trainer/offline.py: 2 additions & 0 deletions

@@ -81,6 +81,8 @@ def offline_trainer(
 )
 best_epoch = start_epoch
 best_reward, best_reward_std = test_result["rew"], test_result["rew_std"]
+if save_fn:
+    save_fn(policy)
 
 for epoch in range(1 + start_epoch, 1 + max_epoch):
 policy.train()
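
With this change every trainer (offline, off-policy, on-policy) calls ``save_fn`` on the policy right after the initial evaluation, so a checkpoint exists even when no later epoch beats the starting reward. A typical ``save_fn`` is a one-liner around ``torch.save``; a sketch, with an illustrative output path:

```python
import os

import torch

log_path = "log/CartPole-v0/dqn"  # illustrative output directory


def save_fn(policy):
    # Called by the trainer whenever best_reward is updated, and (after this
    # PR) once more right after the initial test, before training starts.
    torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))
```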

tianshou/trainer/offpolicy.py: 4 additions & 1 deletion

@@ -98,6 +98,8 @@ def offpolicy_trainer(
 )
 best_epoch = start_epoch
 best_reward, best_reward_std = test_result["rew"], test_result["rew_std"]
+if save_fn:
+    save_fn(policy)
 
 for epoch in range(1 + start_epoch, 1 + max_epoch):
 # train

@@ -110,7 +112,8 @@
 train_fn(epoch, env_step)
 result = train_collector.collect(n_step=step_per_collect)
 if result["n/ep"] > 0 and reward_metric:
-    result["rews"] = reward_metric(result["rews"])
+    rew = reward_metric(result["rews"])
+    result.update(rews=rew, rew=rew.mean(), rew_std=rew.std())
 env_step += int(result["n/st"])
 t.update(result["n/st"])
 logger.log_train_data(result, env_step)
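
Since the collector now pre-fills ``rew``/``rew_std``, applying ``reward_metric`` to ``rews`` alone would leave those aggregates stale, which is why the trainers call ``result.update(...)`` after it. ``reward_metric`` itself maps the collected per-episode rewards to the scalar per episode you actually want to track; a sketch for a multi-agent setting (the agent index is illustrative):

```python
import numpy as np


def reward_metric(rews: np.ndarray) -> np.ndarray:
    # In a multi-agent environment the collector returns rewards with shape
    # (n_episode, n_agent); reduce them to one value per episode, here by
    # tracking a single learning agent (index 0 is an illustrative choice).
    return rews[:, 0]
```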

tianshou/trainer/onpolicy.py: 4 additions & 1 deletion

@@ -104,6 +104,8 @@ def onpolicy_trainer(
 )
 best_epoch = start_epoch
 best_reward, best_reward_std = test_result["rew"], test_result["rew_std"]
+if save_fn:
+    save_fn(policy)
 
 for epoch in range(1 + start_epoch, 1 + max_epoch):
 # train

@@ -118,7 +120,8 @@
 n_step=step_per_collect, n_episode=episode_per_collect
 )
 if result["n/ep"] > 0 and reward_metric:
-    result["rews"] = reward_metric(result["rews"])
+    rew = reward_metric(result["rews"])
+    result.update(rews=rew, rew=rew.mean(), rew_std=rew.std())
 env_step += int(result["n/st"])
 t.update(result["n/st"])
 logger.log_train_data(result, env_step)

tianshou/trainer/utils.py: 2 additions & 1 deletion

@@ -26,7 +26,8 @@ def test_episode(
 test_fn(epoch, global_step)
 result = collector.collect(n_episode=n_episode)
 if reward_metric:
-    result["rews"] = reward_metric(result["rews"])
+    rew = reward_metric(result["rews"])
+    result.update(rews=rew, rew=rew.mean(), rew_std=rew.std())
 if logger and global_step is not None:
 logger.log_test_data(result, global_step)
 return result

tianshou/utils/logger/base.py: 4 additions & 18 deletions

@@ -47,14 +47,8 @@ def log_train_data(self, collect_result: dict, step: int) -> None:
 :param collect_result: a dict containing information of data collected in
 training stage, i.e., returns of collector.collect().
 :param int step: stands for the timestep the collect_result being logged.
-
-.. note::
-
-    ``collect_result`` will be modified in-place with "rew" and "len" keys.
 """
 if collect_result["n/ep"] > 0:
-    collect_result["rew"] = collect_result["rews"].mean()
-    collect_result["len"] = collect_result["lens"].mean()
 if step - self.last_log_train_step >= self.train_interval:
 log_data = {
 "train/episode": collect_result["n/ep"],

@@ -70,23 +64,15 @@ def log_test_data(self, collect_result: dict, step: int) -> None:
 :param collect_result: a dict containing information of data collected in
 evaluating stage, i.e., returns of collector.collect().
 :param int step: stands for the timestep the collect_result being logged.
-
-.. note::
-
-    ``collect_result`` will be modified in-place with "rew", "rew_std", "len",
-    and "len_std" keys.
 """
 assert collect_result["n/ep"] > 0
-rews, lens = collect_result["rews"], collect_result["lens"]
-rew, rew_std, len_, len_std = rews.mean(), rews.std(), lens.mean(), lens.std()
-collect_result.update(rew=rew, rew_std=rew_std, len=len_, len_std=len_std)
 if step - self.last_log_test_step >= self.test_interval:
 log_data = {
 "test/env_step": step,
-    "test/reward": rew,
-    "test/length": len_,
-    "test/reward_std": rew_std,
-    "test/length_std": len_std,
+    "test/reward": collect_result["rew"],
+    "test/length": collect_result["len"],
+    "test/reward_std": collect_result["rew_std"],
+    "test/length_std": collect_result["len_std"],
 }
 self.write("test/env_step", step, log_data)
 self.last_log_test_step = step
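
After this change ``log_test_data`` is a pure consumer of the aggregates: it reads ``rew``, ``len``, ``rew_std`` and ``len_std`` instead of computing them and mutating ``collect_result`` in place. A custom logger backend can rely on the same contract; a minimal sketch that only implements ``write`` (other ``BaseLogger`` hooks are assumed to keep their defaults):

```python
from tianshou.utils.logger.base import BaseLogger


class PrintLogger(BaseLogger):
    """Minimal sketch of a custom backend: the collector and trainer supply
    the aggregate keys up front, so write() only serializes what it gets."""

    def write(self, step_type: str, step: int, data: dict) -> None:
        print(f"[{step_type} @ {step}] {data}")


# Usage sketch (assuming `test_result` comes from Collector.collect(n_episode=...)):
#   logger = PrintLogger(test_interval=1)
#   logger.log_test_data(test_result, step)
```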