diff --git a/sandbox/multicolumn/train_ppo_operator.py b/sandbox/multicolumn/train_ppo_operator.py
index 809201599cd7d63e883441a600f3a64b8b20a91e..128c06dbac7d166710b8c24eaaf2b0a89683ce8b 100644
--- a/sandbox/multicolumn/train_ppo_operator.py
+++ b/sandbox/multicolumn/train_ppo_operator.py
@@ -136,12 +136,14 @@ class TrialEvalCallback(EvalCallback):
 
 
 if __name__ == "__main__":
-    params = {'n_step_pow': 7.0, 'batches_pow': 7.0, 'gamma': 0.0, 'lr':
-              0.0002916406263715553, 'lr_schedule': 'constant', 'ent_coef':
-              0.005743227072532813, 'clip_range': 0.4, 'n_epochs': 10,
-              'gae_lambda': 0.99, 'max_grad_norm': 0.5, 'vf_coef':
-              0.8088573261336596, 'net_arch': 'medium', 'shared_arch': True,
-              'activation_fn': 'relu'}
+
+    # Best PPO Operator 3.6
+    params = {'n_step_pow': 7.0, 'batches_pow': 6.0, 'gamma': 0.0, 'lr':
+              0.0007141880569765198, 'lr_schedule': 'constant', 'ent_coef':
+              0.002919157761809425, 'clip_range': 0.4, 'n_epochs': 1,
+              'gae_lambda': 1.0, 'max_grad_norm': 0.3, 'vf_coef':
+              0.5683112652410534, 'net_arch': 'small', 'shared_arch': True,
+              'activation_fn': 'tanh'}
 
     kwargs = get_args(params)
 
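
Note for reviewers: get_args is defined elsewhere in train_ppo_operator.py and is not part of this hunk. As context, below is a minimal sketch of how a helper like this typically decodes such tuned parameters into stable-baselines3 PPO keyword arguments; the power-of-two decoding of n_step_pow/batches_pow, the width table behind the 'small'/'medium' labels, and the schedule handling are all assumptions, not the repository's actual implementation.

    # Hypothetical sketch of get_args(params); the real helper may differ.
    from torch import nn

    def get_args(params: dict) -> dict:
        n_steps = int(2 ** params['n_step_pow'])      # assumed: 2**7 = 128 rollout steps
        batch_size = int(2 ** params['batches_pow'])  # assumed: 2**6 = 64 minibatch size

        # Assumed width table for the 'small'/'medium' architecture labels.
        width = {'small': 64, 'medium': 256}[params['net_arch']]
        net_arch = [width, width] if params['shared_arch'] else dict(
            pi=[width, width], vf=[width, width])

        activation_fn = {'tanh': nn.Tanh, 'relu': nn.ReLU}[params['activation_fn']]

        # SB3 accepts either a float or a callable of the remaining progress (1 -> 0).
        if params['lr_schedule'] == 'linear':
            learning_rate = lambda progress_remaining: progress_remaining * params['lr']
        else:  # 'constant'
            learning_rate = params['lr']

        return dict(
            n_steps=n_steps,
            batch_size=batch_size,
            gamma=params['gamma'],
            learning_rate=learning_rate,
            ent_coef=params['ent_coef'],
            clip_range=params['clip_range'],
            n_epochs=params['n_epochs'],
            gae_lambda=params['gae_lambda'],
            max_grad_norm=params['max_grad_norm'],
            vf_coef=params['vf_coef'],
            policy_kwargs=dict(net_arch=net_arch, activation_fn=activation_fn),
        )

Under those assumptions the result would be consumed as PPO("MlpPolicy", env, **get_args(params)).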