Description
- I have marked all applicable categories:
- exception-raising bug
- RL algorithm bug
- documentation request (i.e. "X is missing from the documentation.")
- new feature request
- I have visited the source website
- I have searched through the issue tracker for duplicates
- I have mentioned version numbers, operating system and environment, where applicable:
import tianshou, torch, numpy, sys
print(tianshou.__version__, torch.__version__, numpy.__version__, sys.version, sys.platform)
Issue:
I am currently training a policy on my own environment with Tianshou's PPO. (My code is based on the example code tianshou/examples/mujoco/mujoco_ppo.py.) The RL experiments look fine when I check the training logs and the videos recorded during training. However, when I load the trained policy from a checkpoint, the agent behaves differently from what the recorded videos show, and the episode returns and other metrics do not match either. For comparison, the same environment was used both for the videos recorded during training and for the policy loaded after training. I may have made a mistake in my evaluation code for loading the saved policy. Could I ask for advice on this issue? I have attached the code I used below.
Version:
- tianshou 0.4.1
- torch 1.8.1+cu111
- numpy 1.20.3
- python 3.8.5 (default, May 12 2021, 03:26:47) [GCC 9.3.0]
- system linux
Training code:
import os
import pprint

import numpy as np
import torch
from torch.distributions import Independent, Normal
from torch.optim.lr_scheduler import LambdaLR

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import SubprocVectorEnv
from tianshou.policy import PPOPolicy
from tianshou.trainer import onpolicy_trainer
from tianshou.utils.net.continuous import ActorProb, Critic


def train(experiment_config, ppo_config, reward_config):
    device = experiment_config["Device"] if torch.cuda.is_available() else "cpu"
    save_path = ...
    policy_path = ...
    save_test_video_path = ...
    os.makedirs(save_path, exist_ok=True)
    os.makedirs(policy_path, exist_ok=True)
    os.makedirs(save_test_video_path, exist_ok=True)

    # environment setup with running observation normalization
    env = myEnv()
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n
    max_action = env.action_space.high[0]
    train_envs = SubprocVectorEnv(
        [lambda: myEnv() for _ in range(ppo_config["Training Envs"])],
        norm_obs=True)
    test_envs = SubprocVectorEnv(
        [lambda: myEnv() for _ in range(ppo_config["Test Envs"] - 1)]
        + [lambda: myEnv(recording_path=save_test_video_path)],
        norm_obs=True, obs_rms=train_envs.obs_rms, update_obs_rms=False)

    # seeding
    seed = experiment_config["Seed"]
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed()
    test_envs.seed()

    # actor-critic model
    net_a = myNet(device=device)
    net_c = myNet(device=device)
    actor = ActorProb(net_a, action_shape, max_action=max_action,
                      unbounded=True, device=device).to(device)
    critic = Critic(net_c, device=device).to(device)
    torch.nn.init.constant_(actor.sigma_param, ppo_config["Initial Sigma"])
    for m in list(actor.modules()) + list(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            # orthogonal initialization
            torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
            torch.nn.init.zeros_(m.bias)
    # scale down the last policy layer so initial actions stay close to zero
    for m in actor.mu.modules():
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.zeros_(m.bias)
            m.weight.data.copy_(0.01 * m.weight.data)

    optim = torch.optim.Adam(
        list(actor.parameters()) + list(critic.parameters()),
        lr=ppo_config["Learning Rate"])
    lr_scheduler = None
    if ppo_config["Learning Rate Decay"]:
        # decay learning rate to 0 linearly over the total number of updates
        max_update_num = np.ceil(
            ppo_config["Step/Epoch"] / ppo_config["Step/Collect"]) * ppo_config["Epoch"]
        lr_scheduler = LambdaLR(
            optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num)

    def dist(*logits):
        return Independent(Normal(*logits), 1)

    print("\r\nExperiment Info")
    pprint.pprint(experiment_config)
    print("\r\nPPO CONFIG")
    pprint.pprint(ppo_config)
    print("\r\nReward CONFIG")
    pprint.pprint(reward_config)

    policy = PPOPolicy(
        actor, critic, optim, dist,
        discount_factor=ppo_config["Gamma"],
        gae_lambda=ppo_config["GAE Lambda"],
        max_grad_norm=ppo_config["Max Grad Norm"],
        vf_coef=ppo_config["Value Coefficient"],
        ent_coef=ppo_config["Entropy Coefficient"],
        reward_normalization=ppo_config["Reward Normalization"],
        action_scaling=True,
        action_bound_method="clip",
        lr_scheduler=lr_scheduler,
        action_space=env.action_space,
        eps_clip=ppo_config["Epsilon Clip"],
        value_clip=ppo_config["Value Clip"],
        dual_clip=ppo_config["Dual Clip"],
        advantage_normalization=ppo_config["Advantage Normalization"],
        recompute_advantage=ppo_config["Recompute Advantage"])

    # collectors and logger
    buffer = VectorReplayBuffer(ppo_config["Buffer Size"], len(train_envs))
    train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    logger = myLogger()

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(policy_path, 'policy.pth'))

    def save_checkpoint_fn(epoch, env_step, gradient_step):
        torch.save({
            'model': policy.state_dict(),
            'optim': optim.state_dict(),
        }, os.path.join(policy_path, 'checkpoint_{}.pth'.format(epoch)))

    result = onpolicy_trainer(
        policy, train_collector, test_collector, ppo_config["Epoch"],
        ppo_config["Step/Epoch"], ppo_config["Repeat/Collect"],
        ppo_config["Test Envs"], ppo_config["Batch Size"],
        step_per_collect=ppo_config["Step/Collect"], save_fn=save_fn,
        save_checkpoint_fn=save_checkpoint_fn, logger=logger,
        test_in_train=False)
    pprint.pprint(result)
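One thing I was not sure about: the checkpoint above only stores the policy and optimizer state, while the vector envs also keep running observation-normalization statistics (norm_obs=True). If those statistics matter at evaluation time, I imagine they would need to be saved as well. A rough sketch of what I mean, reusing the names from the training code above (this is not part of my current code, and it assumes train_envs.obs_rms can be pickled):

import pickle

def save_checkpoint_fn(epoch, env_step, gradient_step):
    torch.save({
        'model': policy.state_dict(),
        'optim': optim.state_dict(),
    }, os.path.join(policy_path, 'checkpoint_{}.pth'.format(epoch)))
    # also persist the running mean/std used for observation normalization
    with open(os.path.join(policy_path, 'obs_rms_{}.pkl'.format(epoch)), 'wb') as f:
        pickle.dump(train_envs.obs_rms, f)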
Evaluation code:
# `device` and `ppo_config` are expected to be available here (e.g. module-level
# globals) with the same values used for training.
def eval(policy_path):
    save_test_video_path = ...
    env = myEnv(recording_path=save_test_video_path)
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n
    max_action = env.action_space.high[0]

    # rebuild the same actor-critic architecture as in training
    net_a = myNet(device=device)
    net_c = myNet(device=device)
    actor = ActorProb(net_a, action_shape, max_action=max_action,
                      unbounded=True, device=device).to(device)
    critic = Critic(net_c, device=device).to(device)
    optim = torch.optim.Adam(
        list(actor.parameters()) + list(critic.parameters()),
        lr=ppo_config["Learning Rate"])
    lr_scheduler = None
    if ppo_config["Learning Rate Decay"]:
        max_update_num = np.ceil(
            ppo_config["Step/Epoch"] / ppo_config["Step/Collect"]) * ppo_config["Epoch"]
        lr_scheduler = LambdaLR(
            optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num)

    def dist(*logits):
        return Independent(Normal(*logits), 1)

    policy = PPOPolicy(actor, critic, optim, dist)

    # restore policy and optimizer state from the checkpoint
    checkpoint = torch.load(policy_path, map_location=device)
    policy.load_state_dict(checkpoint['model'])
    optim.load_state_dict(checkpoint['optim'])
    print("\rLoaded agent from: ", policy_path)

    # evaluate for one episode
    test_collector = Collector(policy, env)
    policy.eval()
    env.seed(0)
    env.reset()
    result = test_collector.collect(n_episode=1)
    print(f'\rFinal reward: {result["rews"].mean()}, length: {result["lens"].mean()}')
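If missing normalization statistics turn out to be the problem, I guess the evaluation env would also have to be wrapped the same way as the test envs during training. An untested sketch (the file obs_rms.pkl is hypothetical and would come from a checkpoint function like the one sketched above; DummyVectorEnv is from tianshou.env):

import pickle

from tianshou.env import DummyVectorEnv

with open('obs_rms.pkl', 'rb') as f:
    saved_obs_rms = pickle.load(f)
# mirror the training-time observation normalization, keeping the statistics frozen
eval_envs = DummyVectorEnv(
    [lambda: myEnv(recording_path=save_test_video_path)],
    norm_obs=True, obs_rms=saved_obs_rms, update_obs_rms=False)
test_collector = Collector(policy, eval_envs)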