diff --git a/setup.py b/setup.py
index 80209edcb..963fb79e5 100644
--- a/setup.py
+++ b/setup.py
@@ -66,7 +66,7 @@ def get_version() -> str:
             "isort",
             "pytest",
             "pytest-cov",
-            "ray>=1.0.0",
+            "ray>=1.0.0,<1.7.0",
             "wandb>=0.12.0",
             "networkx",
             "mypy",
diff --git a/tianshou/data/collector.py b/tianshou/data/collector.py
index 8aaef2d55..5e5006dee 100644
--- a/tianshou/data/collector.py
+++ b/tianshou/data/collector.py
@@ -167,6 +167,10 @@ def collect(
             * ``rews`` array of episode reward over collected episodes.
             * ``lens`` array of episode length over collected episodes.
             * ``idxs`` array of episode start index in buffer over collected episodes.
+            * ``rew`` mean of episodic rewards.
+            * ``len`` mean of episodic lengths.
+            * ``rew_std`` standard deviation of episodic rewards.
+            * ``len_std`` standard deviation of episodic lengths.
         """
         assert not self.env.is_async, "Please use AsyncCollector if using async venv."
         if n_step is not None:
@@ -311,8 +315,11 @@ def collect(
                     [episode_rews, episode_lens, episode_start_indices]
                 )
             )
+            rew_mean, rew_std = rews.mean(), rews.std()
+            len_mean, len_std = lens.mean(), lens.std()
         else:
             rews, lens, idxs = np.array([]), np.array([], int), np.array([], int)
+            rew_mean = rew_std = len_mean = len_std = 0

         return {
             "n/ep": episode_count,
@@ -320,6 +327,10 @@ def collect(
             "rews": rews,
             "lens": lens,
             "idxs": idxs,
+            "rew": rew_mean,
+            "len": len_mean,
+            "rew_std": rew_std,
+            "len_std": len_std,
         }
@@ -380,6 +391,10 @@ def collect(
             * ``rews`` array of episode reward over collected episodes.
             * ``lens`` array of episode length over collected episodes.
             * ``idxs`` array of episode start index in buffer over collected episodes.
+            * ``rew`` mean of episodic rewards.
+            * ``len`` mean of episodic lengths.
+            * ``rew_std`` standard deviation of episodic rewards.
+            * ``len_std`` standard deviation of episodic lengths.
""" # collect at least n_step or n_episode if n_step is not None: @@ -530,8 +545,11 @@ def collect( [episode_rews, episode_lens, episode_start_indices] ) ) + rew_mean, rew_std = rews.mean(), rews.std() + len_mean, len_std = lens.mean(), lens.std() else: rews, lens, idxs = np.array([]), np.array([], int), np.array([], int) + rew_mean = rew_std = len_mean = len_std = 0 return { "n/ep": episode_count, @@ -539,4 +557,8 @@ def collect( "rews": rews, "lens": lens, "idxs": idxs, + "rew": rew_mean, + "len": len_mean, + "rew_std": rew_std, + "len_std": len_std, } diff --git a/tianshou/policy/modelfree/ppo.py b/tianshou/policy/modelfree/ppo.py index f27463fdf..45439ede3 100644 --- a/tianshou/policy/modelfree/ppo.py +++ b/tianshou/policy/modelfree/ppo.py @@ -6,6 +6,7 @@ from tianshou.data import Batch, ReplayBuffer, to_torch_as from tianshou.policy import A2CPolicy +from tianshou.utils.net.common import ActorCritic class PPOPolicy(A2CPolicy): @@ -83,6 +84,7 @@ def __init__( "value clip is available only when `reward_normalization` is True" self._norm_adv = advantage_normalization self._recompute_adv = recompute_advantage + self._actor_critic: ActorCritic def process_fn( self, batch: Batch, buffer: ReplayBuffer, indices: np.ndarray diff --git a/tianshou/trainer/offline.py b/tianshou/trainer/offline.py index 72cd00d06..f5b454fa2 100644 --- a/tianshou/trainer/offline.py +++ b/tianshou/trainer/offline.py @@ -81,6 +81,8 @@ def offline_trainer( ) best_epoch = start_epoch best_reward, best_reward_std = test_result["rew"], test_result["rew_std"] + if save_fn: + save_fn(policy) for epoch in range(1 + start_epoch, 1 + max_epoch): policy.train() diff --git a/tianshou/trainer/offpolicy.py b/tianshou/trainer/offpolicy.py index 922646197..54b92fdc5 100644 --- a/tianshou/trainer/offpolicy.py +++ b/tianshou/trainer/offpolicy.py @@ -98,6 +98,8 @@ def offpolicy_trainer( ) best_epoch = start_epoch best_reward, best_reward_std = test_result["rew"], test_result["rew_std"] + if save_fn: + save_fn(policy) for epoch in range(1 + start_epoch, 1 + max_epoch): # train @@ -110,7 +112,8 @@ def offpolicy_trainer( train_fn(epoch, env_step) result = train_collector.collect(n_step=step_per_collect) if result["n/ep"] > 0 and reward_metric: - result["rews"] = reward_metric(result["rews"]) + rew = reward_metric(result["rews"]) + result.update(rews=rew, rew=rew.mean(), rew_std=rew.std()) env_step += int(result["n/st"]) t.update(result["n/st"]) logger.log_train_data(result, env_step) diff --git a/tianshou/trainer/onpolicy.py b/tianshou/trainer/onpolicy.py index 6788e8a65..ad1b4a691 100644 --- a/tianshou/trainer/onpolicy.py +++ b/tianshou/trainer/onpolicy.py @@ -104,6 +104,8 @@ def onpolicy_trainer( ) best_epoch = start_epoch best_reward, best_reward_std = test_result["rew"], test_result["rew_std"] + if save_fn: + save_fn(policy) for epoch in range(1 + start_epoch, 1 + max_epoch): # train @@ -118,7 +120,8 @@ def onpolicy_trainer( n_step=step_per_collect, n_episode=episode_per_collect ) if result["n/ep"] > 0 and reward_metric: - result["rews"] = reward_metric(result["rews"]) + rew = reward_metric(result["rews"]) + result.update(rews=rew, rew=rew.mean(), rew_std=rew.std()) env_step += int(result["n/st"]) t.update(result["n/st"]) logger.log_train_data(result, env_step) diff --git a/tianshou/trainer/utils.py b/tianshou/trainer/utils.py index a39a12fff..6ad2f0f2a 100644 --- a/tianshou/trainer/utils.py +++ b/tianshou/trainer/utils.py @@ -26,7 +26,8 @@ def test_episode( test_fn(epoch, global_step) result = collector.collect(n_episode=n_episode) 
     if reward_metric:
-        result["rews"] = reward_metric(result["rews"])
+        rew = reward_metric(result["rews"])
+        result.update(rews=rew, rew=rew.mean(), rew_std=rew.std())
     if logger and global_step is not None:
         logger.log_test_data(result, global_step)
     return result
diff --git a/tianshou/utils/logger/base.py b/tianshou/utils/logger/base.py
index 9b89d5e88..e6d15a14c 100644
--- a/tianshou/utils/logger/base.py
+++ b/tianshou/utils/logger/base.py
@@ -47,14 +47,8 @@ def log_train_data(self, collect_result: dict, step: int) -> None:
         :param collect_result: a dict containing information of data collected in
             training stage, i.e., returns of collector.collect().
         :param int step: stands for the timestep the collect_result being logged.
-
-        .. note::
-
-            ``collect_result`` will be modified in-place with "rew" and "len" keys.
         """
         if collect_result["n/ep"] > 0:
-            collect_result["rew"] = collect_result["rews"].mean()
-            collect_result["len"] = collect_result["lens"].mean()
             if step - self.last_log_train_step >= self.train_interval:
                 log_data = {
                     "train/episode": collect_result["n/ep"],
@@ -70,23 +64,15 @@ def log_test_data(self, collect_result: dict, step: int) -> None:
         :param collect_result: a dict containing information of data collected in
             evaluating stage, i.e., returns of collector.collect().
         :param int step: stands for the timestep the collect_result being logged.
-
-        .. note::
-
-            ``collect_result`` will be modified in-place with "rew", "rew_std", "len",
-            and "len_std" keys.
         """
         assert collect_result["n/ep"] > 0
-        rews, lens = collect_result["rews"], collect_result["lens"]
-        rew, rew_std, len_, len_std = rews.mean(), rews.std(), lens.mean(), lens.std()
-        collect_result.update(rew=rew, rew_std=rew_std, len=len_, len_std=len_std)
         if step - self.last_log_test_step >= self.test_interval:
             log_data = {
                 "test/env_step": step,
-                "test/reward": rew,
-                "test/length": len_,
-                "test/reward_std": rew_std,
-                "test/length_std": len_std,
+                "test/reward": collect_result["rew"],
+                "test/length": collect_result["len"],
+                "test/reward_std": collect_result["rew_std"],
+                "test/length_std": collect_result["len_std"],
             }
             self.write("test/env_step", step, log_data)
             self.last_log_test_step = step
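
Note for reviewers: a minimal consumer-side sketch of the behavior after this patch. It assumes the Tianshou 0.4.x API and a throwaway CartPole setup (the environment name, network sizes, and learning rate below are illustrative, not part of the diff); the point is only that ``rew``, ``len``, ``rew_std``, and ``len_std`` are now precomputed by Collector.collect() instead of being patched into the result dict by the logger.

import gym
import torch

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.policy import PGPolicy
from tianshou.utils.net.common import Net
from tianshou.utils.net.discrete import Actor

# Illustrative setup: a tiny untrained policy on CartPole with 4 parallel envs.
envs = DummyVectorEnv([lambda: gym.make("CartPole-v0") for _ in range(4)])
net = Net(state_shape=4, hidden_sizes=[64])
actor = Actor(net, action_shape=2)
optim = torch.optim.Adam(actor.parameters(), lr=1e-3)
policy = PGPolicy(actor, optim, torch.distributions.Categorical)

collector = Collector(policy, envs, VectorReplayBuffer(20000, 4))
result = collector.collect(n_episode=8)

# The per-episode arrays are unchanged ...
print(result["rews"], result["lens"], result["idxs"])
# ... and the aggregate statistics now come straight from the collector,
# so trainers and loggers read them instead of recomputing mean/std in-place.
print(result["rew"], result["rew_std"], result["len"], result["len_std"])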