
Issues on loading a policy #443

@kiwi-sherbet

Description

  • I have marked all applicable categories:
    • exception-raising bug
    • RL algorithm bug
    • documentation request (i.e. "X is missing from the documentation.")
    • new feature request
  • I have visited the source website
  • I have searched through the issue tracker for duplicates
  • I have mentioned version numbers, operating system and environment, where applicable:
    import tianshou, torch, numpy, sys
    print(tianshou.__version__, torch.__version__, numpy.__version__, sys.version, sys.platform)

Issue:
I am currently training a policy on my own environment with Tianshou's PPO (my code is based on the example tianshou/examples/mujoco/mujoco_ppo.py). The RL experiments look fine when I check the training logs and the videos recorded during training. However, when I load the trained policy from a checkpoint, the agent's behavior differs from what the recorded videos show, and the episode returns and other metrics do not match at all. For comparison, the same environment is used both for the videos recorded during training and for the loaded policy after training. I may have made a mistake in my evaluation code for loading the saved policy. Could you advise me on this issue? I have attached the code I used below.
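
As a quick sanity check on the saving and loading itself, here is a minimal sketch (not taken from my actual code) that verifies the state_dict round-trips, using the policy, actor, env and device objects constructed in the training code below; roundtrip.pth is just a placeholder file name:

  import torch

  # one batched observation; assumes myNet accepts a float tensor, like tianshou's Net
  obs = torch.as_tensor(
      env.observation_space.sample()[None], dtype=torch.float32, device=device)

  actor.eval()
  with torch.no_grad():
      (mu_before, _), _ = actor(obs)  # ActorProb returns (mu, sigma), state

  torch.save(policy.state_dict(), "roundtrip.pth")
  policy.load_state_dict(torch.load("roundtrip.pth", map_location=device))

  with torch.no_grad():
      (mu_after, _), _ = actor(obs)

  print(torch.allclose(mu_before, mu_after))  # True if the weights round-trip correctly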

Version:

  • tianshou 0.4.1
  • torch 1.8.1+cu111
  • numpy 1.20.3
  • python 3.8.5 (default, May 12 2021, 03:26:47) [GCC 9.3.0], platform linux

Training code:

def train(experiment_config, ppo_config, reward_config):

  device = experiment_config["Device"] if torch.cuda.is_available() else "cpu"

  save_path = ...
  policy_path = ...
  save_test_video_path = ...

  os.makedirs(save_path, exist_ok=True)
  os.makedirs(policy_path, exist_ok=True)
  os.makedirs(save_test_video_path, exist_ok=True)

  env = myEnv()
  state_shape = env.observation_space.shape or env.observation_space.n
  action_shape = env.action_space.shape or env.action_space.n
  max_action = env.action_space.high[0]

  # the training envs normalize observations with running statistics (obs_rms)
  train_envs = SubprocVectorEnv(
      [lambda: myEnv() for _ in range(ppo_config["Training Envs"])],
      norm_obs=True)

  # the test envs reuse the training statistics and do not update them
  test_envs = SubprocVectorEnv(
      [lambda: myEnv() for _ in range(ppo_config["Test Envs"]-1)]
      + [lambda: myEnv(recording_path=save_test_video_path)],
      norm_obs=True, obs_rms=train_envs.obs_rms, update_obs_rms=False)

  seed = experiment_config["Seed"]
  np.random.seed(seed)
  torch.manual_seed(seed)
  train_envs.seed()
  test_envs.seed()

  net_a = myNet(device=device)
  net_c = myNet(device=device)

  actor = ActorProb(net_a, action_shape, max_action=max_action, unbounded=True, device=device).to(device)
  critic = Critic(net_c, device=device).to(device)
  

  torch.nn.init.constant_(actor.sigma_param, ppo_config["Initial Sigma"])
  for m in list(actor.modules()) + list(critic.modules()):
      if isinstance(m, torch.nn.Linear):
          # orthogonal initialization
          torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
          torch.nn.init.zeros_(m.bias)

  for m in actor.mu.modules():
      if isinstance(m, torch.nn.Linear):
          torch.nn.init.zeros_(m.bias)
          m.weight.data.copy_(0.01 * m.weight.data)
  
  optim = torch.optim.Adam(
      list(actor.parameters()) + list(critic.parameters()), lr=ppo_config["Learning Rate"])

  lr_scheduler = None
  if ppo_config["Learning Rate Decay"]:
      max_update_num = np.ceil(
          ppo_config["Step/Epoch"] / ppo_config["Step/Collect"]) * ppo_config["Epoch"]

      lr_scheduler = LambdaLR(
          optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num)

  def dist(*logits):
      return Independent(Normal(*logits), 1)

  print("\r\nExperiment Info")
  pprint.pprint(experiment_config)
  print("\r\nPPO CONFIG")
  pprint.pprint(ppo_config)
  print("\r\nReward CONFIG")
  pprint.pprint(reward_config)

  policy = PPOPolicy( actor, critic, optim, dist, 
                      discount_factor=ppo_config["Gamma"],
                      gae_lambda=ppo_config["GAE Lambda"],
                      max_grad_norm=ppo_config["Max Grad Norm"],
                      vf_coef=ppo_config["Value Coefficient"], 
                      ent_coef=ppo_config["Entropy Coefficient"],
                      reward_normalization=ppo_config["Reward Nomalization"], 
                      action_scaling=True,
                      action_bound_method="clip",
                      lr_scheduler=lr_scheduler, 
                      action_space=env.action_space,
                      eps_clip=ppo_config["Epsilon Clip"],
                      value_clip=ppo_config["Value Clip"],
                      dual_clip=ppo_config["Dual Clip"], 
                      advantage_normalization=ppo_config["Advantage Normalization"],
                      recompute_advantage=ppo_config["Recompute Advantage"])

  buffer = VectorReplayBuffer(ppo_config["Buffer Size"], len(train_envs))
  train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
  test_collector = Collector(policy, test_envs)

  logger = myLogger()

  def save_fn(policy):
      torch.save(policy.state_dict(), os.path.join(policy_path, 'policy.pth'))

  def save_checkpoint_fn(epoch, env_step, gradient_step):
      torch.save({
          'model': policy.state_dict(),
          'optim': optim.state_dict(),
      }, os.path.join(policy_path, 'checkpoint_{}.pth'.format(epoch)))

  result = onpolicy_trainer(
      policy, train_collector, test_collector, ppo_config["Epoch"], ppo_config["Step/Epoch"],
      ppo_config["Repeat/Collect"], ppo_config["Test Envs"], ppo_config["Batch Size"],
      step_per_collect=ppo_config["Step/Collect"], save_fn=save_fn, save_checkpoint_fn=save_checkpoint_fn, logger=logger,
      test_in_train=False)

  pprint.pprint(result)
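
One detail worth flagging in the training setup above: the vector environments are created with norm_obs=True, so the policy always sees observations normalized by the running statistics in train_envs.obs_rms, and those statistics are not part of policy.state_dict(). A minimal sketch, assuming the checkpoint format above, of a save_checkpoint_fn variant that also persists them (the obs_rms_{}.pkl file name is just a placeholder):

  import pickle

  def save_checkpoint_fn(epoch, env_step, gradient_step):
      torch.save({
          'model': policy.state_dict(),
          'optim': optim.state_dict(),
      }, os.path.join(policy_path, 'checkpoint_{}.pth'.format(epoch)))
      # the observation-normalization statistics live in the vector env, not in
      # the policy, so they have to be saved separately
      with open(os.path.join(policy_path, 'obs_rms_{}.pkl'.format(epoch)), 'wb') as f:
          pickle.dump(train_envs.obs_rms, f)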

Evaluation code:

def eval(policy_path):
  # device and ppo_config are assumed to be defined in the surrounding script (not shown here)

  save_test_video_path = ...

  env = myEnv(recording_path=save_test_video_path)
  state_shape = env.observation_space.shape or env.observation_space.n
  action_shape = env.action_space.shape or env.action_space.n
  max_action = env.action_space.high[0]

  net_a = myNet(device=device)
  net_c = myNet(device=device)
  actor = ActorProb(net_a, action_shape, max_action=max_action, unbounded=True, device=device).to(device)
  critic = Critic(net_c, device=device).to(device)

  optim = torch.optim.Adam(
      list(actor.parameters()) + list(critic.parameters()), lr=ppo_config["Learning Rate"])

  lr_scheduler = None
  if ppo_config["Learning Rate Decay"]:
      max_update_num = np.ceil(
          ppo_config["Step/Epoch"] / ppo_config["Step/Collect"]) * ppo_config["Epoch"]

      lr_scheduler = LambdaLR(
          optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num)

  def dist(*logits):
      return Independent(Normal(*logits), 1)

  policy = PPOPolicy(actor, critic, optim, dist)  # all other PPO arguments are left at their defaults here

  checkpoint = torch.load(policy_path, map_location=device)
  policy.load_state_dict(checkpoint['model'])
  optim.load_state_dict(checkpoint['optim'])
  print("\rLoaded agent from: ", policy_path)


  test_collector = Collector(policy, env)

  policy.eval()
  env.seed(0)
  env.reset()
  result = test_collector.collect(n_episode=1)
  print(f'\rFinal reward: {result["rews"].mean()}, length: {result["lens"].mean()}')
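
Since both training and the recorded test episodes go through vector environments with norm_obs=True, the evaluation above feeds raw (unnormalized) observations to the policy. A minimal sketch of how saved statistics could be applied at evaluation time, reusing the same norm_obs / obs_rms / update_obs_rms keyword arguments as in the training code (obs_rms_path is a placeholder for wherever the statistics were pickled):

  import pickle
  from tianshou.env import DummyVectorEnv

  with open(obs_rms_path, 'rb') as f:  # hypothetical path to the pickled statistics
      obs_rms = pickle.load(f)

  eval_envs = DummyVectorEnv(
      [lambda: myEnv(recording_path=save_test_video_path)],
      norm_obs=True, obs_rms=obs_rms, update_obs_rms=False)
  eval_envs.seed(0)

  test_collector = Collector(policy, eval_envs)
  policy.eval()
  result = test_collector.collect(n_episode=1)
  print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}')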

Labels

bug (Something isn't working)
