
Issues on loading a policy #443

@kiwi-sherbet

Description

  • I have marked all applicable categories:
    • exception-raising bug
    • RL algorithm bug
    • documentation request (i.e. "X is missing from the documentation.")
    • new feature request
  • I have visited the source website
  • I have searched through the issue tracker for duplicates
  • I have mentioned version numbers, operating system and environment, where applicable:
    import tianshou, torch, numpy, sys
    print(tianshou.__version__, torch.__version__, numpy.__version__, sys.version, sys.platform)

Issue:
I am currently training a policy on my own environment with Tianshou's PPO (my code is based on the example tianshou/examples/mujoco/mujoco_ppo.py). The RL experiments look fine when I check the training logs and the videos recorded during training. However, when I load the trained policy from a checkpoint, the agent's behavior differs from what the recorded videos show, and the episode returns and other metrics do not match at all. For comparison, the same environment is used both for the videos recorded during training and for the loaded policy after training. I may have made a mistake in my evaluation code for loading the saved policy. Could you advise me on this issue? I have attached the code I used below.
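
As a quick sanity check on the saving and loading itself, here is a minimal sketch (not taken from my actual code) that verifies the state_dict round-trips, using the policy, actor, env and device objects constructed in the training code below; roundtrip.pth is just a placeholder file name:

  import torch

  # one batched observation; assumes myNet accepts a float tensor, like tianshou's Net
  obs = torch.as_tensor(
      env.observation_space.sample()[None], dtype=torch.float32, device=device)

  actor.eval()
  with torch.no_grad():
      (mu_before, _), _ = actor(obs)  # ActorProb returns (mu, sigma), state

  torch.save(policy.state_dict(), "roundtrip.pth")
  policy.load_state_dict(torch.load("roundtrip.pth", map_location=device))

  with torch.no_grad():
      (mu_after, _), _ = actor(obs)

  print(torch.allclose(mu_before, mu_after))  # True if the weights round-trip correctly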

Version:

  • tianshou 0.4.1
  • torch 1.8.1+cu111
  • numpy 1.20.3
  • python 3.8.5 (default, May 12 2021, 03:26:47) [GCC 9.3.0], platform linux

Training code:

def train(experiment_config, ppo_config, reward_config):

  device = experiment_config["Device"] if torch.cuda.is_available() else "cpu"

  save_path = ...
  policy_path = ...
  save_test_video_path = ...

  os.makedirs(save_path, exist_ok=True)
  os.makedirs(policy_path, exist_ok=True)
  os.makedirs(save_test_video_path, exist_ok=True)

  env = myEnv()
  state_shape = env.observation_space.shape or env.observation_space.n
  action_shape = env.action_space.shape or env.action_space.n
  max_action = env.action_space.high[0]

  # the training envs normalize observations with running statistics (obs_rms)
  train_envs = SubprocVectorEnv(
      [lambda: myEnv() for _ in range(ppo_config["Training Envs"])],
      norm_obs=True)

  # the test envs reuse the training statistics and do not update them
  test_envs = SubprocVectorEnv(
      [lambda: myEnv() for _ in range(ppo_config["Test Envs"]-1)]
      + [lambda: myEnv(recording_path=save_test_video_path)],
      norm_obs=True, obs_rms=train_envs.obs_rms, update_obs_rms=False)

  seed = experiment_config["Seed"]
  np.random.seed(seed)
  torch.manual_seed(seed)
  train_envs.seed()
  test_envs.seed()

  net_a = myNet(device=device)
  net_c = myNet(device=device)

  actor = ActorProb(net_a, action_shape, max_action=max_action, unbounded=True, device=device).to(device)
  critic = Critic(net_c, device=device).to(device)
  

  torch.nn.init.constant_(actor.sigma_param, ppo_config["Initial Sigma"])
  for m in list(actor.modules()) + list(critic.modules()):
      if isinstance(m, torch.nn.Linear):
          # orthogonal initialization
          torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
          torch.nn.init.zeros_(m.bias)

  for m in actor.mu.modules():
      if isinstance(m, torch.nn.Linear):
          torch.nn.init.zeros_(m.bias)
          m.weight.data.copy_(0.01 * m.weight.data)
  
  optim = torch.optim.Adam(
      list(actor.parameters()) + list(critic.parameters()), lr=ppo_config["Learning Rate"])

  lr_scheduler = None
  if ppo_config["Learning Rate Decay"]:
      max_update_num = np.ceil(
          ppo_config["Step/Epoch"] / ppo_config["Step/Collect"]) * ppo_config["Epoch"]

      lr_scheduler = LambdaLR(
          optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num)

  def dist(*logits):
      return Independent(Normal(*logits), 1)

  print("\r\nExperiment Info")
  pprint.pprint(experiment_config)
  print("\r\nPPO CONFIG")
  pprint.pprint(ppo_config)
  print("\r\nReward CONFIG")
  pprint.pprint(reward_config)

  policy = PPOPolicy( actor, critic, optim, dist, 
                      discount_factor=ppo_config["Gamma"],
                      gae_lambda=ppo_config["GAE Lambda"],
                      max_grad_norm=ppo_config["Max Grad Norm"],
                      vf_coef=ppo_config["Value Coefficient"], 
                      ent_coef=ppo_config["Entropy Coefficient"],
                      reward_normalization=ppo_config["Reward Nomalization"], 
                      action_scaling=True,
                      action_bound_method="clip",
                      lr_scheduler=lr_scheduler, 
                      action_space=env.action_space,
                      eps_clip=ppo_config["Epsilon Clip"],
                      value_clip=ppo_config["Value Clip"],
                      dual_clip=ppo_config["Dual Clip"], 
                      advantage_normalization=ppo_config["Advantage Normalization"],
                      recompute_advantage=ppo_config["Recompute Advantage"])

  buffer = VectorReplayBuffer(ppo_config["Buffer Size"], len(train_envs))
  train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
  test_collector = Collector(policy, test_envs)

  logger = myLogger()

  def save_fn(policy):
      torch.save(policy.state_dict(), os.path.join(policy_path, 'policy.pth'))

  def save_checkpoint_fn(epoch, env_step, gradient_step):
      torch.save({
          'model': policy.state_dict(),
          'optim': optim.state_dict(),
      }, os.path.join(policy_path, 'checkpoint_{}.pth'.format(epoch)))

  result = onpolicy_trainer(
      policy, train_collector, test_collector, ppo_config["Epoch"], ppo_config["Step/Epoch"],
      ppo_config["Repeat/Collect"], ppo_config["Test Envs"], ppo_config["Batch Size"],
      step_per_collect=ppo_config["Step/Collect"], save_fn=save_fn, save_checkpoint_fn=save_checkpoint_fn, logger=logger,
      test_in_train=False)

  pprint.pprint(result)
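
One detail worth flagging in the training setup above: the vector environments are created with norm_obs=True, so the policy always sees observations normalized by the running statistics in train_envs.obs_rms, and those statistics are not part of policy.state_dict(). A minimal sketch, assuming the checkpoint format above, of a save_checkpoint_fn variant that also persists them (the obs_rms_{}.pkl file name is just a placeholder):

  import pickle

  def save_checkpoint_fn(epoch, env_step, gradient_step):
      torch.save({
          'model': policy.state_dict(),
          'optim': optim.state_dict(),
      }, os.path.join(policy_path, 'checkpoint_{}.pth'.format(epoch)))
      # the observation-normalization statistics live in the vector env, not in
      # the policy, so they have to be saved separately
      with open(os.path.join(policy_path, 'obs_rms_{}.pkl'.format(epoch)), 'wb') as f:
          pickle.dump(train_envs.obs_rms, f)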

Evaluation code:

def eval(policy_path):
  # device and ppo_config are assumed to be defined in the surrounding script (not shown here)

  save_test_video_path = ...

  env = myEnv(recording_path=save_test_video_path)
  state_shape = env.observation_space.shape or env.observation_space.n
  action_shape = env.action_space.shape or env.action_space.n
  max_action = env.action_space.high[0]

  net_a = myNet(device=device)
  net_c = myNet(device=device)
  actor = ActorProb(net_a, action_shape, max_action=max_action, unbounded=True, device=device).to(device)
  critic = Critic(net_c, device=device).to(device)

  optim = torch.optim.Adam(
      list(actor.parameters()) + list(critic.parameters()), lr=ppo_config["Learning Rate"])

  lr_scheduler = None
  if ppo_config["Learning Rate Decay"]:
      max_update_num = np.ceil(
          ppo_config["Step/Epoch"] / ppo_config["Step/Collect"]) * ppo_config["Epoch"]

      lr_scheduler = LambdaLR(
          optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num)

  def dist(*logits):
      return Independent(Normal(*logits), 1)

  policy = PPOPolicy(actor, critic, optim, dist)  # all other PPO arguments are left at their defaults here

  checkpoint = torch.load(policy_path, map_location=device)
  policy.load_state_dict(checkpoint['model'])
  optim.load_state_dict(checkpoint['optim'])
  print("\rLoaded agent from: ", policy_path)


  test_collector = Collector(policy, env)

  policy.eval()
  env.seed(0)
  env.reset()
  result = test_collector.collect(n_episode=1)
  print(f'\rFinal reward: {result["rews"].mean()}, length: {result["lens"].mean()}')
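
Since both training and the recorded test episodes go through vector environments with norm_obs=True, the evaluation above feeds raw (unnormalized) observations to the policy. A minimal sketch of how saved statistics could be applied at evaluation time, reusing the same norm_obs / obs_rms / update_obs_rms keyword arguments as in the training code (obs_rms_path is a placeholder for wherever the statistics were pickled):

  import pickle
  from tianshou.env import DummyVectorEnv

  with open(obs_rms_path, 'rb') as f:  # hypothetical path to the pickled statistics
      obs_rms = pickle.load(f)

  eval_envs = DummyVectorEnv(
      [lambda: myEnv(recording_path=save_test_video_path)],
      norm_obs=True, obs_rms=obs_rms, update_obs_rms=False)
  eval_envs.seed(0)

  test_collector = Collector(policy, eval_envs)
  policy.eval()
  result = test_collector.collect(n_episode=1)
  print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}')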

Labels

bug (Something isn't working)
