diff --git a/examples/mujoco/mujoco_env.py b/examples/mujoco/mujoco_env.py
index 3a4812108..fd0313555 100644
--- a/examples/mujoco/mujoco_env.py
+++ b/examples/mujoco/mujoco_env.py
@@ -6,6 +6,7 @@
 from tianshou.env import ShmemVectorEnv, VectorEnvNormObs
 from tianshou.highlevel.env import ContinuousEnvironments, EnvFactory
+from tianshou.highlevel.experiment import TrainSeedMechanism
 from tianshou.highlevel.persistence import Persistence, PersistEvent, RestoreEvent
 from tianshou.highlevel.world import World
 
@@ -17,7 +18,11 @@
 log = logging.getLogger(__name__)
 
 
-def make_mujoco_env(task: str, seed: int, num_train_envs: int, num_test_envs: int, obs_norm: bool):
+def make_mujoco_env(task: str, seed: int, num_train_envs: int,
+                    num_test_envs: int, obs_norm: bool,
+                    train_seed_mechanism: TrainSeedMechanism = TrainSeedMechanism.NONE,
+                    test_seeds: tuple[int, ...] | None = None,
+                    ):
     """Wrapper function for Mujoco env.
 
     If EnvPool is installed, it will automatically switch to EnvPool's Mujoco env.
@@ -27,6 +32,7 @@ def make_mujoco_env(task: str, seed: int, num_train_envs: int, num_test_envs: int, obs_norm: bool):
     if envpool is not None:
         train_envs = env = envpool.make_gymnasium(task, num_envs=num_train_envs, seed=seed)
         test_envs = envpool.make_gymnasium(task, num_envs=num_test_envs, seed=seed)
+        # TODO: check how envpool seeds its workers here; train_seed_mechanism and test_seeds are not yet applied on this branch
     else:
         warnings.warn(
             "Recommend using envpool (pip install envpool) "
@@ -35,8 +41,21 @@ def make_mujoco_env(task: str, seed: int, num_train_envs: int, num_test_envs: int, obs_norm: bool):
         env = gym.make(task)
         train_envs = ShmemVectorEnv([lambda: gym.make(task) for _ in range(num_train_envs)])
         test_envs = ShmemVectorEnv([lambda: gym.make(task) for _ in range(num_test_envs)])
-        train_envs.seed(seed)
-        test_envs.seed(seed)
+        if train_seed_mechanism.is_consecutive():
+            train_envs.seed([seed + i for i in range(num_train_envs)])
+        elif train_seed_mechanism.is_repeat():
+            train_envs.seed([seed] * num_train_envs)
+        elif train_seed_mechanism.is_none():
+            train_envs.seed(seed)
+        else:
+            raise NotImplementedError(f"train_seed_mechanism {train_seed_mechanism} not implemented")
+
+        if test_seeds is None:
+            test_envs.seed(seed)
+        else:
+            assert len(test_seeds) == num_test_envs
+            test_envs.seed(test_seeds)
+
     if obs_norm:
         # obs norm wrapper
         train_envs = VectorEnvNormObs(train_envs)
@@ -69,10 +88,14 @@ def restore(self, event: RestoreEvent, world: World):
 
 
 class MujocoEnvFactory(EnvFactory):
-    def __init__(self, task: str, seed: int, obs_norm=True):
+    def __init__(self, task: str, seed: int, obs_norm=True,
+                 train_seed_mechanism: TrainSeedMechanism = TrainSeedMechanism.NONE,
+                 test_seeds: tuple[int, ...] | None = None):
        self.task = task
        self.seed = seed
        self.obs_norm = obs_norm
+        self.train_seed_mechanism = train_seed_mechanism
+        self.test_seeds = test_seeds
 
     def create_envs(self, num_training_envs: int, num_test_envs: int) -> ContinuousEnvironments:
         env, train_envs, test_envs = make_mujoco_env(
@@ -81,8 +104,10 @@ def create_envs(self, num_training_envs: int, num_test_envs: int) -> ContinuousEnvironments:
             num_train_envs=num_training_envs,
             num_test_envs=num_test_envs,
             obs_norm=self.obs_norm,
+            train_seed_mechanism=self.train_seed_mechanism,
+            test_seeds=self.test_seeds,
         )
-        envs = ContinuousEnvironments(env=env, train_envs=train_envs, test_envs=test_envs)
+        envs = ContinuousEnvironments(env=env, train_envs=train_envs, test_envs=test_envs, test_seeds=self.test_seeds)
         if self.obs_norm:
             envs.set_persistence(MujocoEnvObsRmsPersistence())
         return envs
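For reference, a minimal sketch (not part of the patch) of the per-worker seeds that the three `TrainSeedMechanism` variants hand to `train_envs.seed()` in the non-envpool branch above:

```python
# Illustrative only: the seed lists each mechanism produces,
# assuming seed=42 and num_train_envs=3 as in make_mujoco_env above.
seed, num_train_envs = 42, 3

consecutive = [seed + i for i in range(num_train_envs)]  # CONSECUTIVE -> [42, 43, 44]
repeat = [seed] * num_train_envs                         # REPEAT      -> [42, 42, 42]
# NONE passes the scalar 42; BaseVectorEnv.seed (venvs.py) then expands an int
# to [seed + i for i in range(env_num)], i.e. the same result as CONSECUTIVE
# for Tianshou's own vector envs.
print(consecutive, repeat)
```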
diff --git a/examples/mujoco/mujoco_ppo_hl_experiment.py b/examples/mujoco/mujoco_ppo_hl_experiment.py
new file mode 100644
index 000000000..1b0244bf8
--- /dev/null
+++ b/examples/mujoco/mujoco_ppo_hl_experiment.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+import os
+from collections.abc import Sequence
+from typing import Literal
+
+import torch
+
+from examples.mujoco.mujoco_env import MujocoEnvFactory
+from tianshou.highlevel.config import SamplingConfig
+from tianshou.highlevel.experiment import (
+    EvaluationProtocolExperimentConfig,
+    PPOExperimentBuilder,
+)
+from tianshou.highlevel.params.dist_fn import (
+    DistributionFunctionFactoryIndependentGaussians,
+)
+from tianshou.highlevel.params.lr_scheduler import LRSchedulerFactoryLinear
+from tianshou.highlevel.params.policy_params import PPOParams
+from tianshou.utils import logging
+from tianshou.utils.logging import datetime_tag
+
+
+def main(
+    experiment_config: EvaluationProtocolExperimentConfig,
+    task: str = "Ant-v4",
+    buffer_size: int = 4096,
+    hidden_sizes: Sequence[int] = (64, 64),
+    lr: float = 3e-4,
+    gamma: float = 0.99,
+    epoch: int = 100,
+    step_per_epoch: int = 30000,
+    step_per_collect: int = 2048,
+    repeat_per_collect: int = 10,
+    batch_size: int = 64,
+    training_num: int = 3,  # reduced from the usual 64 for quick runs
+    test_num: int = 2,  # reduced from the usual 10 for quick runs
+    rew_norm: bool = True,
+    vf_coef: float = 0.25,
+    ent_coef: float = 0.0,
+    gae_lambda: float = 0.95,
+    bound_action_method: Literal["clip", "tanh"] | None = "clip",
+    lr_decay: bool = True,
+    max_grad_norm: float = 0.5,
+    eps_clip: float = 0.2,
+    dual_clip: float | None = None,
+    value_clip: bool = False,
+    norm_adv: bool = False,
+    recompute_adv: bool = True,
+):
+    log_name = os.path.join(task, "ppo", str(experiment_config.seed), datetime_tag())
+
+    sampling_config = SamplingConfig(
+        num_epochs=epoch,
+        step_per_epoch=step_per_epoch,
+        batch_size=batch_size,
+        num_train_envs=training_num,
+        num_test_envs=test_num,
+        buffer_size=buffer_size,
+        step_per_collect=step_per_collect,
+        repeat_per_collect=repeat_per_collect,
+    )
+
+    env_factory = MujocoEnvFactory(task,
+                                   experiment_config.seed,
+                                   obs_norm=True,
+                                   train_seed_mechanism=experiment_config.train_seed_mechanism,
+                                   test_seeds=experiment_config.test_seeds)
+
+    experiment = (
+        PPOExperimentBuilder(env_factory, experiment_config, sampling_config)
+        .with_ppo_params(
+            PPOParams(
+                discount_factor=gamma,
+                gae_lambda=gae_lambda,
+                action_bound_method=bound_action_method,
+                reward_normalization=rew_norm,
+                ent_coef=ent_coef,
+                vf_coef=vf_coef,
+                max_grad_norm=max_grad_norm,
+                value_clip=value_clip,
+                advantage_normalization=norm_adv,
+                eps_clip=eps_clip,
+                dual_clip=dual_clip,
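A hypothetical direct invocation of the script's `main` (normally `logging.run_cli` drives it from the command line); the config values mirror the tests below, and `test_num` must equal `len(test_seeds)` because of the assertion in `make_mujoco_env`:

```python
# Hypothetical usage sketch; not part of the patch.
from examples.mujoco.mujoco_ppo_hl_experiment import main
from tianshou.highlevel.experiment import (
    EvaluationProtocolExperimentConfig,
    TrainSeedMechanism,
)

config = EvaluationProtocolExperimentConfig(
    persistence_enabled=False,
    train_seed_mechanism=TrainSeedMechanism.CONSECUTIVE,
    test_seeds=(2, 3),  # must be disjoint from the train seed (enforced in __post_init__)
)
# two test envs to match the two test seeds; one epoch for a smoke run
main(config, epoch=1, training_num=2, test_num=2)
```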
+                recompute_advantage=recompute_adv,
+                lr=lr,
+                lr_scheduler_factory=LRSchedulerFactoryLinear(sampling_config)
+                if lr_decay
+                else None,
+                dist_fn=DistributionFunctionFactoryIndependentGaussians(),
+            ),
+        )
+        .with_actor_factory_default(hidden_sizes, torch.nn.Tanh, continuous_unbounded=True)
+        .with_critic_factory_default(hidden_sizes, torch.nn.Tanh)
+        .build()
+    )
+    experiment.run(
+        log_name,
+        record_seed_of_transition_to_buffer_test=experiment_config.record_seed_of_transition_to_buffer_test,
+    )
+
+
+if __name__ == "__main__":
+    logging.run_cli(main)
diff --git a/test/highlevel/test_eval_procedure.py b/test/highlevel/test_eval_procedure.py
new file mode 100644
index 000000000..de4ce6f53
--- /dev/null
+++ b/test/highlevel/test_eval_procedure.py
@@ -0,0 +1,71 @@
+import pytest
+
+from examples.mujoco.mujoco_env import MujocoEnvFactory
+from tianshou.highlevel.config import SamplingConfig
+from tianshou.highlevel.experiment import (
+    EvaluationProtocolExperimentConfig,
+    ExperimentConfig,
+    PPOExperimentBuilder,
+    TrainSeedMechanism,
+)
+
+
+def test_standard_experiment_config():
+    experiment_config = ExperimentConfig()
+    sampling_config = SamplingConfig(
+        num_epochs=1,
+        step_per_epoch=100,
+        num_train_envs=2,
+        num_test_envs=2,
+    )
+    env_factory = MujocoEnvFactory(task="Ant-v4", seed=experiment_config.seed, obs_norm=True)
+
+    ppo = PPOExperimentBuilder(
+        experiment_config=experiment_config,
+        env_factory=env_factory,
+        sampling_config=sampling_config,
+    )
+    experiment = ppo.build()
+    experiment.run("test")
+    print(experiment)
+
+
+@pytest.mark.parametrize(
+    "experiment_config",
+    [
+        EvaluationProtocolExperimentConfig(
+            persistence_enabled=False,
+            train_seed_mechanism=TrainSeedMechanism.CONSECUTIVE,
+            test_seeds=(2, 3),
+        ),
+        EvaluationProtocolExperimentConfig(
+            persistence_enabled=False,
+            train_seed_mechanism=TrainSeedMechanism.REPEAT,
+            test_seeds=(2, 3),
+        ),
+        EvaluationProtocolExperimentConfig(
+            persistence_enabled=False,
+            train_seed_mechanism=TrainSeedMechanism.NONE,
+            test_seeds=(2, 3),
+        ),
+    ],
+)
+def test_experiment_builder_continuous_default_params(experiment_config):
+    sampling_config = SamplingConfig(
+        num_epochs=1,
+        step_per_epoch=100,
+        num_train_envs=2,
+        num_test_envs=2,
+    )
+    env_factory = MujocoEnvFactory(
+        task="Ant-v4",
+        seed=experiment_config.seed,
+        obs_norm=True,
+        train_seed_mechanism=experiment_config.train_seed_mechanism,
+        test_seeds=experiment_config.test_seeds,
+    )
+
+    ppo = PPOExperimentBuilder(
+        experiment_config=experiment_config,
+        env_factory=env_factory,
+        sampling_config=sampling_config,
+    )
+    experiment = ppo.build()
+    experiment.run(
+        "test",
+        record_seed_of_transition_to_buffer_test=experiment_config.record_seed_of_transition_to_buffer_test,
+    )
+    print(experiment)
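The parametrized cases can be run selectively with pytest's keyword filter, e.g. (a usage sketch):

```python
# Run only the parametrized evaluation-protocol tests.
import pytest

pytest.main(["test/highlevel/test_eval_procedure.py", "-k", "experiment_builder"])
```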
diff --git a/tianshou/data/collector.py b/tianshou/data/collector.py
index f188f3ca0..d096823e2 100644
--- a/tianshou/data/collector.py
+++ b/tianshou/data/collector.py
@@ -94,6 +94,8 @@ def __init__(
         buffer: ReplayBuffer | None = None,
         preprocess_fn: Callable[..., RolloutBatchProtocol] | None = None,
         exploration_noise: bool = False,
+        record_seed_of_transition_to_buffer_test: bool = False,
+        test_seeds: tuple[int, ...] | None = None,
     ) -> None:
         super().__init__()
         if isinstance(env, gym.Env) and not hasattr(env, "__len__"):
@@ -111,6 +113,8 @@ def __init__(
 
         self.data: RolloutBatchProtocol  # avoid creating attribute outside __init__
         self.reset(False)
+        self.record_seed_of_transition_to_buffer_test = record_seed_of_transition_to_buffer_test
+        self.test_seeds = test_seeds
 
     def _assign_buffer(self, buffer: ReplayBuffer | None) -> None:
         """Check if the buffer matches the constraint."""
@@ -317,7 +321,9 @@ def collect(
                 ready_env_ids,
             )
             done = np.logical_or(terminated, truncated)
-
+            if self.record_seed_of_transition_to_buffer_test:
+                # ready_env_ids maps batch position -> global env id; tag each
+                # transition's info dict with the seed of the env that produced it
+                for batch_idx, env_id in enumerate(ready_env_ids):
+                    info[batch_idx]["seed"] = self.test_seeds[env_id]
             self.data.update(
                 obs_next=obs_next,
                 rew=rew,
diff --git a/tianshou/env/venvs.py b/tianshou/env/venvs.py
index a9b0aff74..f96255d0a 100644
--- a/tianshou/env/venvs.py
+++ b/tianshou/env/venvs.py
@@ -315,7 +315,7 @@ def step(
             np.stack(info_list),
         )
 
-    def seed(self, seed: int | list[int] | None = None) -> list[list[int] | None]:
+    def seed(self, seed: int | list[int] | tuple[int, ...] | None = None) -> list[list[int] | None]:
         """Set the seed for all environments.
 
         Accept ``None``, an int (which will extend ``i`` to
@@ -323,11 +323,11 @@ def seed(self, seed: int | list[int] | None = None) -> list[list[int] | None]:
 
         :return: The list of seeds used in this env's random number generators.
         The first value in the list should be the "main" seed, or the value
-        which a reproducer pass to "seed".
+        which a reproducer passes to "seed".
         """
         self._assert_is_not_closed()
         seed_list: list[None] | list[int]
-        if seed is None:
+        if seed is None:  # TODO: verify this branch is reachable; passing None via the config currently results in an error
             seed_list = [seed] * self.env_num
         elif isinstance(seed, int):
             seed_list = [seed + i for i in range(self.env_num)]
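A short sketch (illustrative, not part of the patch; assumes a 4-worker `DummyVectorEnv`) of the seed forms `BaseVectorEnv.seed` now accepts:

```python
# Illustrative: how BaseVectorEnv.seed expands its argument (see venvs.py above).
import gymnasium as gym

from tianshou.env import DummyVectorEnv

venv = DummyVectorEnv([lambda: gym.make("CartPole-v1") for _ in range(4)])

venv.seed(7)             # int -> per-worker seeds [7, 8, 9, 10]
venv.seed([3, 1, 4, 1])  # list -> used verbatim, one seed per worker
venv.seed((2, 3, 5, 8))  # tuple now accepted too, after the change above
venv.seed(None)          # None -> each worker seeds itself nondeterministically
```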
diff --git a/tianshou/highlevel/agent.py b/tianshou/highlevel/agent.py
index 727bfb244..61639c827 100644
--- a/tianshou/highlevel/agent.py
+++ b/tianshou/highlevel/agent.py
@@ -93,6 +93,8 @@ def create_train_test_collector(
         self,
         policy: BasePolicy,
         envs: Environments,
+        record_seed_of_transition_to_buffer_test: bool = False,
+        test_seeds: tuple[int, ...] | None = None,
     ) -> tuple[Collector, Collector]:
         buffer_size = self.sampling_config.buffer_size
         train_envs = envs.train_envs
@@ -113,7 +115,10 @@ def create_train_test_collector(
             ignore_obs_next=self.sampling_config.replay_buffer_ignore_obs_next,
         )
         train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
-        test_collector = Collector(policy, envs.test_envs)
+        test_collector = Collector(policy, envs.test_envs,
+                                   record_seed_of_transition_to_buffer_test=record_seed_of_transition_to_buffer_test,
+                                   test_seeds=test_seeds)
+        self.test_seeds = test_seeds
         if self.sampling_config.start_timesteps > 0:
             log.info(
                 f"Collecting {self.sampling_config.start_timesteps} initial environment steps before training (random={self.sampling_config.start_timesteps_random})",
diff --git a/tianshou/highlevel/env.py b/tianshou/highlevel/env.py
index c78b72c6c..21fc9f8fb 100644
--- a/tianshou/highlevel/env.py
+++ b/tianshou/highlevel/env.py
@@ -147,9 +147,11 @@ def get_type(self) -> EnvType:
 class ContinuousEnvironments(Environments):
     """Represents (vectorized) continuous environments."""
 
-    def __init__(self, env: gym.Env, train_envs: BaseVectorEnv, test_envs: BaseVectorEnv):
+    def __init__(self, env: gym.Env, train_envs: BaseVectorEnv, test_envs: BaseVectorEnv,
+                 test_seeds: tuple[int, ...] | None = None):
         super().__init__(env, train_envs, test_envs)
         self.state_shape, self.action_shape, self.max_action = self._get_continuous_env_info(env)
+        self.test_seeds = test_seeds
 
     @staticmethod
     def from_factory(
diff --git a/tianshou/highlevel/experiment.py b/tianshou/highlevel/experiment.py
index 3989d3583..e0b18aaaf 100644
--- a/tianshou/highlevel/experiment.py
+++ b/tianshou/highlevel/experiment.py
@@ -3,8 +3,9 @@
 from abc import abstractmethod
 from collections.abc import Sequence
 from dataclasses import dataclass
+from enum import Enum
 from pprint import pformat
 from typing import Self
 
 import numpy as np
 import torch
@@ -84,7 +85,21 @@
 log = logging.getLogger(__name__)
 
 
+class TrainSeedMechanism(Enum):
+    """Enumeration of mechanisms used to derive the set of train seeds."""
+
+    REPEAT = "repeat"
+    CONSECUTIVE = "consecutive"
+    NONE = "none"
+
+    def is_repeat(self) -> bool:
+        return self == TrainSeedMechanism.REPEAT
+
+    def is_consecutive(self) -> bool:
+        return self == TrainSeedMechanism.CONSECUTIVE
+
+    def is_none(self) -> bool:
+        return self == TrainSeedMechanism.NONE
+
+
 @dataclass
 class ExperimentConfig:
     """Generic config for setting up the experiment, not RL or training specific."""
@@ -114,6 +129,18 @@ class ExperimentConfig:
     policy_persistence_mode: PolicyPersistence.Mode = PolicyPersistence.Mode.POLICY
     """Controls the way in which the policy is persisted"""
 
+
+@dataclass(kw_only=True)
+class EvaluationProtocolExperimentConfig(ExperimentConfig):
+    train_seed_mechanism: TrainSeedMechanism = TrainSeedMechanism.CONSECUTIVE
+    """Whether the train seeds are all identical (REPEAT), consecutive numbers
+    (CONSECUTIVE), or expanded by the default mechanism (NONE); compare the seed
+    function in venvs.py"""
+    test_seeds: tuple[int, ...] = (4, 22)
+    """Set of seeds to use during testing, one per test environment"""
+    record_seed_of_transition_to_buffer_test: bool = True
+    """Whether to record, in each test transition's info dict, the seed of the env that produced it"""
+
+    def __post_init__(self):
+        # TODO: adapt this check for more complex versions of train and test seed sets
+        assert set(self.test_seeds).isdisjoint({self.seed})
+
 
 @dataclass
 class ExperimentResult:
@@ -187,6 +214,7 @@ def run(
         self,
         experiment_name: str | None = None,
         logger_run_id: str | None = None,
+        record_seed_of_transition_to_buffer_test: bool = False,
     ) -> ExperimentResult:
         """Run the experiment and return the results.
@@ -257,6 +285,8 @@ def run(
         train_collector, test_collector = self.agent_factory.create_train_test_collector(
             policy,
             envs,
+            record_seed_of_transition_to_buffer_test=record_seed_of_transition_to_buffer_test,
+            test_seeds=envs.test_seeds,  # TODO: test_seeds is currently only defined on ContinuousEnvironments
         )
 
         # create context object with all relevant instances (except trainer; added later)
diff --git a/tianshou/hpo_sweepers/__init__.py b/tianshou/hpo_sweepers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tianshou/utils/evaluation_protocol_analysis.py b/tianshou/utils/evaluation_protocol_analysis.py
new file mode 100644
index 000000000..b7819a900
--- /dev/null
+++ b/tianshou/utils/evaluation_protocol_analysis.py
@@ -0,0 +1,12 @@
+from tbparse import SummaryReader
+
+
+def read_data(log_dir):
+    """Read the TensorBoard logs under log_dir and print the scalar tags."""
+    reader = SummaryReader(log_dir)
+    print(reader)
+    df = reader.scalars
+    print(df["tag"])
+
+
+if __name__ == "__main__":
+    log_dir = "examples/mujoco/log/Ant-v4/ppo/42/20240126-105848"
+    read_data(log_dir)
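Building on `read_data`, a sketch of how per-seed test results could be pulled out of such logs; the tag name is an assumption, and tbparse exposes scalars as a pandas DataFrame with step/tag/value columns:

```python
# Sketch, not part of the patch. The tag "test/reward" is hypothetical;
# inspect df["tag"].unique() to find the tags your run actually logged.
from tbparse import SummaryReader

reader = SummaryReader("examples/mujoco/log/Ant-v4/ppo/42/20240126-105848")
df = reader.scalars  # DataFrame with columns: step, tag, value

print(df["tag"].unique())                 # discover the logged scalar tags
rewards = df[df["tag"] == "test/reward"]  # filter one series (assumed tag)
print(rewards[["step", "value"]].describe())
```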