diff --git a/sandbox/fractions/train_ppo_operator.py b/sandbox/fractions/train_ppo_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..1702ed5d287a7d6d67c58f110d3e3b2de38a3610
--- /dev/null
+++ b/sandbox/fractions/train_ppo_operator.py
@@ -0,0 +1,195 @@
+from typing import Dict
+from typing import Any
+
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.vec_env import VecEnv
+
+from tutorenvs.utils import linear_schedule
+
+
+def get_args(params: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Build PPO kwargs from a dict of sampled hyperparameters.
+    :param params:
+    :return:
+    """
+    batch_size = int(2**params['batches_pow'])
+    n_steps = int(2**params['n_step_pow'])
+    gamma = params['gamma']
+    learning_rate = params['lr']
+    lr_schedule = params['lr_schedule']
+    ent_coef = params['ent_coef']
+    clip_range = params['clip_range']
+    n_epochs = params['n_epochs']
+    gae_lambda = params['gae_lambda']
+    max_grad_norm = params['max_grad_norm']
+    vf_coef = params['vf_coef']
+    net_arch = params['net_arch']
+    shared_arch = params['shared_arch']
+    activation_fn = params['activation_fn']
+
+    # TODO: account for this when using multiple envs
+    if batch_size > n_steps:
+        batch_size = n_steps
+
+    if lr_schedule == "linear":
+        learning_rate = linear_schedule(learning_rate)
+
+    # Independent networks usually work best
+    # when not working with images
+    net_arch = {
+        True: {
+            "tiny": [32, dict(pi=[32], vf=[32])],
+            "small": [64, dict(pi=[64], vf=[64])],
+            "medium": [128, dict(pi=[128], vf=[128])],
+        },
+        False: {
+            "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+            "small": [dict(pi=[64, 64], vf=[64, 64])],
+            "medium": [dict(pi=[128, 128], vf=[128, 128])],
+        }
+    }[shared_arch][net_arch]
+
+    activation_fn = {
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leaky_relu": nn.LeakyReLU
+    }[activation_fn]
+
+    ortho_init = False
+
+    return {
+        "n_steps":
+        n_steps,
+        "batch_size":
+        batch_size,
+        "gamma":
+        gamma,
+        "learning_rate":
+        learning_rate,
+        "ent_coef":
+        ent_coef,
+        "clip_range":
+        clip_range,
+        "n_epochs":
+        n_epochs,
+        "gae_lambda":
+        gae_lambda,
+        "max_grad_norm":
+        max_grad_norm,
+        "vf_coef":
+        vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs":
+        dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialEvalCallback(EvalCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(
+        self,
+        eval_env: VecEnv,
+        trial: optuna.Trial,
+        n_eval_episodes: int = 5,
+        eval_freq: int = 10000,
+        deterministic: bool = True,
+        verbose: int = 0,
+    ):
+
+        super(TrialEvalCallback, self).__init__(
+            eval_env=eval_env,
+            n_eval_episodes=n_eval_episodes,
+            eval_freq=eval_freq,
+            deterministic=deterministic,
+            verbose=verbose,
+        )
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+
+    def _on_step(self) -> bool:
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+            super(TrialEvalCallback, self)._on_step()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(self.last_mean_reward, self.eval_idx)
+            # Prune trial if needed
+            if self.trial.should_prune():
+                self.is_pruned = True
+                return False
+        return True
+
+
+if __name__ == "__main__":
+    # params = {
+    #     'batch_size': 32,
+    #     'n_steps': 16,
+    #     'gamma': 0.0,
+    #     'lr': 0.00017980950834568327,
+    #     'lr_schedule': 'constant',
+    #     'ent_coef': 0.07439893598338435,
+    #     'clip_range': 0.4,
+    #     'n_epochs': 10,
+    #     'gae_lambda': 0.95,
+    #     'max_grad_norm': 0.8,
+    #     'vf_coef': 0.13214811411452415,
+    #     'net_arch': 'medium',
+    #     'shared_arch': False,
+    #     'activation_fn': 'tanh'
+    # }
+
+    # params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
+    #           'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
+    #           0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
+    #           'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
+    #           'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
+
+    params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
+              0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
+              0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
+              'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
+              0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
+              'activation_fn': 'relu'}
+
+    kwargs = get_args(params)
+
+    # multiprocess environment
+    env = make_vec_env('FractionArith-v1', n_envs=1)
+    model = PPO(
+        MlpPolicy,
+        env,
+        verbose=1,
+        tensorboard_log="./tensorboard_ppo/",
+        **kwargs
+    )
+    # gamma=0.1,
+    # tensorboard_log="./tensorboard/v0/")
+
+    # while True:
+    # Train
+    model.learn(total_timesteps=1000000)
+
+    # Test
+    # obs = env.reset()
+    # rwd = 0
+    # for _ in range(10000):
+    #     action, _states = model.predict(obs)
+    #     obs, rewards, dones, info = env.step(action)
+    #     rwd += np.sum(rewards)
+    #     env.render()
+    # print(rwd)
diff --git a/sandbox/fractions/tune_ppo_operator.py b/sandbox/fractions/tune_ppo_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..17f6d8db7e03ab3e8dfe35217716cf4f64f9403a
--- /dev/null
+++ b/sandbox/fractions/tune_ppo_operator.py
@@ -0,0 +1,230 @@
+from typing import Dict
+from typing import Any
+import tempfile
+
+import gym
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+# from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import BaseCallback
+from stable_baselines3.common.vec_env import DummyVecEnv
+# from stable_baselines3.common.vec_env import VecEnv
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.monitor import load_results
+
+import tutorenvs  # noqa: F401
+from tutorenvs.utils import linear_schedule
+
+
+def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
+    """
+    Sampler for PPO hyperparams.
+
+    :param trial:
+    :return:
+    """
+    n_step_pow = trial.suggest_discrete_uniform('n_step_pow', 3, 11, 1)
+    n_steps = int(2**n_step_pow)
+
+    # possible_n_steps = [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+    # n_steps = trial.suggest_categorical("n_steps",
+    #                                     possible_n_steps)
+
+    batches_pow = trial.suggest_discrete_uniform('batches_pow', 3,
+                                                 n_step_pow, 1)
+    batch_size = int(2**batches_pow)
+
+    # possible_batches = [8, 16, 32, 64, 128, 256, 512]
+    # batch_size = trial.suggest_categorical("batch_size",
+    #                                        possible_batches)
+
+    gamma = trial.suggest_categorical("gamma", [0.0])
+    # 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
+    learning_rate = trial.suggest_loguniform("lr", 1e-8, 1)
+    # lr_schedule = "constant"
+    # Uncomment to enable learning rate schedule
+    lr_schedule = trial.suggest_categorical('lr_schedule',
+                                            ['linear', 'constant'])
+    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000000001, 0.1)
+    clip_range = trial.suggest_categorical("clip_range",
+                                           [0.05, 0.1, 0.2, 0.3, 0.4])
+    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
+    gae_lambda = trial.suggest_categorical(
+        "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
+    max_grad_norm = trial.suggest_categorical(
+        "max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
+    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
+    net_arch = trial.suggest_categorical("net_arch",
+                                         ["tiny", "small", "medium"])
+    shared_arch = trial.suggest_categorical("shared_arch", [True, False])
+    ortho_init = False
+    activation_fn = trial.suggest_categorical("activation_fn",
+                                              ["tanh", "relu"])
+
+    # TODO: account for this when using multiple envs
+    if batch_size > n_steps:
+        batch_size = n_steps
+
+    if lr_schedule == "linear":
+        learning_rate = linear_schedule(learning_rate)
+
+    # Independent networks usually work best
+    # when not working with images
+    net_arch = {
+        True: {
+            "tiny": [32, dict(pi=[32], vf=[32])],
+            "small": [64, dict(pi=[64], vf=[64])],
+            "medium": [128, dict(pi=[128], vf=[128])],
+        },
+        False: {
+            "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+            "small": [dict(pi=[64, 64], vf=[64, 64])],
+            "medium": [dict(pi=[128, 128], vf=[128, 128])],
+        }
+    }[shared_arch][net_arch]
+
+    activation_fn = {
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leaky_relu": nn.LeakyReLU
+    }[activation_fn]
+
+    return {
+        "n_steps":
+        n_steps,
+        "batch_size":
+        batch_size,
+        "gamma":
+        gamma,
+        "learning_rate":
+        learning_rate,
+        "ent_coef":
+        ent_coef,
+        "clip_range":
+        clip_range,
+        "n_epochs":
+        n_epochs,
+        "gae_lambda":
+        gae_lambda,
+        "max_grad_norm":
+        max_grad_norm,
+        "vf_coef":
+        vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs":
+        dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialCallback(BaseCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(
+        self,
+        trial: optuna.Trial,
+        log_dir: str,
+        n_eval_episodes: int = 10,
+        eval_freq: int = 10000,
+        min_eval: float = -1500,
+        verbose: int = 0,
+    ):
+        super(TrialCallback, self).__init__(verbose)
+
+        self.eval_freq = eval_freq
+        self.n_eval_episodes = n_eval_episodes
+        self.log_dir = log_dir
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+        self.min_eval = min_eval
+
+    def _on_step(self) -> bool:
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+            results = load_results(self.log_dir)
+            if len(results) < self.n_eval_episodes:
+                return True
+            avg_last_n = results['r'][-self.n_eval_episodes:].mean()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(avg_last_n, self.eval_idx)
+            # print('Idx:', self.eval_idx, 'Avg_last_n', avg_last_n)
+
+            # Prune trial if needed
+            if avg_last_n < self.min_eval or self.trial.should_prune():
+                self.is_pruned = True
+                return False
+
+        return True
+
+
+def objective(trial: optuna.Trial) -> float:
+    n_eval_episodes = 15
+    eval_freq = 5000
+    n_steps = 350000
+
+    with tempfile.TemporaryDirectory() as log_dir:
+        env = DummyVecEnv([
+            lambda: Monitor(gym.make('FractionArith-v1'), log_dir)])
+
+        ppo_args = sample_ppo_params(trial)
+
+        model = PPO(MlpPolicy, env,
+                    # tensorboard_log="./tensorboard_ppo_multi/",
+                    **ppo_args)
+        # gamma=0.1,
+        # tensorboard_log="./tensorboard/v0/")
+        callback = TrialCallback(trial, log_dir, verbose=1,
+                                 n_eval_episodes=n_eval_episodes,
+                                 eval_freq=eval_freq)
+
+        try:
+            model.learn(total_timesteps=n_steps, callback=callback)
+            model.env.close()
+        except Exception as e:
+            model.env.close()
+            print(e)
+            raise optuna.exceptions.TrialPruned()
+
+        is_pruned = callback.is_pruned
+        del model.env
+        del model
+
+        if is_pruned:
+            raise optuna.exceptions.TrialPruned()
+
+        results = load_results(log_dir)
+        avg_last_n = results['r'][-n_eval_episodes:].mean()
+        # print('Final avg_last_n:', avg_last_n)
+        return avg_last_n
+
+
+if __name__ == "__main__":
+
+    # multiprocess environment
+    # env = make_vec_env('MulticolumnArithSymbolic-v0', n_envs=1)
+
+    pruner = optuna.pruners.MedianPruner(n_warmup_steps=20000)
+
+    study = optuna.create_study(study_name="ppo-operator",
+                                pruner=pruner,
+                                direction="maximize",
+                                storage='sqlite:///study.db',
+                                load_if_exists=True
+                                )
+    try:
+        study.optimize(objective, n_trials=1000, n_jobs=1)
+    except Exception as e:
+        print(e)
+    finally:
+        print("BEST")
+        print(study.best_params)
diff --git a/sandbox/multicolumn/train_ppo_operator.py b/sandbox/multicolumn/train_ppo_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..969f159c559f2d4e111f7f84e4e8ec35ceb2c2b7
--- /dev/null
+++ b/sandbox/multicolumn/train_ppo_operator.py
@@ -0,0 +1,195 @@
+from typing import Dict
+from typing import Any
+
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.vec_env import VecEnv
+
+from tutorenvs.utils import linear_schedule
+
+
+def get_args(params: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Build PPO kwargs from a dict of sampled hyperparameters.
+    :param params:
+    :return:
+    """
+    batch_size = int(2**params['batches_pow'])
+    n_steps = int(2**params['n_step_pow'])
+    gamma = params['gamma']
+    learning_rate = params['lr']
+    lr_schedule = params['lr_schedule']
+    ent_coef = params['ent_coef']
+    clip_range = params['clip_range']
+    n_epochs = params['n_epochs']
+    gae_lambda = params['gae_lambda']
+    max_grad_norm = params['max_grad_norm']
+    vf_coef = params['vf_coef']
+    net_arch = params['net_arch']
+    shared_arch = params['shared_arch']
+    activation_fn = params['activation_fn']
+
+    # TODO: account for this when using multiple envs
+    if batch_size > n_steps:
+        batch_size = n_steps
+
+    if lr_schedule == "linear":
+        learning_rate = linear_schedule(learning_rate)
+
+    # Independent networks usually work best
+    # when not working with images
+    net_arch = {
+        True: {
+            "tiny": [32, dict(pi=[32], vf=[32])],
+            "small": [64, dict(pi=[64], vf=[64])],
+            "medium": [128, dict(pi=[128], vf=[128])],
+        },
+        False: {
+            "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+            "small": [dict(pi=[64, 64], vf=[64, 64])],
+            "medium": [dict(pi=[128, 128], vf=[128, 128])],
+        }
+    }[shared_arch][net_arch]
+
+    activation_fn = {
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leaky_relu": nn.LeakyReLU
+    }[activation_fn]
+
+    ortho_init = False
+
+    return {
+        "n_steps":
+        n_steps,
+        "batch_size":
+        batch_size,
+        "gamma":
+        gamma,
+        "learning_rate":
+        learning_rate,
+        "ent_coef":
+        ent_coef,
+        "clip_range":
+        clip_range,
+        "n_epochs":
+        n_epochs,
+        "gae_lambda":
+        gae_lambda,
+        "max_grad_norm":
+        max_grad_norm,
+        "vf_coef":
+        vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs":
+        dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialEvalCallback(EvalCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(
+        self,
+        eval_env: VecEnv,
+        trial: optuna.Trial,
+        n_eval_episodes: int = 5,
+        eval_freq: int = 10000,
+        deterministic: bool = True,
+        verbose: int = 0,
+    ):
+
+        super(TrialEvalCallback, self).__init__(
+            eval_env=eval_env,
+            n_eval_episodes=n_eval_episodes,
+            eval_freq=eval_freq,
+            deterministic=deterministic,
+            verbose=verbose,
+        )
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+
+    def _on_step(self) -> bool:
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+            super(TrialEvalCallback, self)._on_step()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(self.last_mean_reward, self.eval_idx)
+            # Prune trial if needed
+            if self.trial.should_prune():
+                self.is_pruned = True
+                return False
+        return True
+
+
+if __name__ == "__main__":
+    # params = {
+    #     'batch_size': 32,
+    #     'n_steps': 16,
+    #     'gamma': 0.0,
+    #     'lr': 0.00017980950834568327,
+    #     'lr_schedule': 'constant',
+    #     'ent_coef': 0.07439893598338435,
+    #     'clip_range': 0.4,
+    #     'n_epochs': 10,
+    #     'gae_lambda': 0.95,
+    #     'max_grad_norm': 0.8,
+    #     'vf_coef': 0.13214811411452415,
+    #     'net_arch': 'medium',
+    #     'shared_arch': False,
+    #     'activation_fn': 'tanh'
+    # }
+
+    # params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
+    #           'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
+    #           0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
+    #           'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
+    #           'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
+
+    params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
+              0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
+              0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
+              'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
+              0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
+              'activation_fn': 'relu'}
+
+    kwargs = get_args(params)
+
+    # multiprocess environment
+    env = make_vec_env('MulticolumnArithSymbolic-v1', n_envs=1)
+    model = PPO(
+        MlpPolicy,
+        env,
+        verbose=1,
+        tensorboard_log="./tensorboard_ppo_multi/",
+        **kwargs
+    )
+    # gamma=0.1,
+    # tensorboard_log="./tensorboard/v0/")
+
+    # while True:
+    # Train
+    model.learn(total_timesteps=1000000)
+
+    # Test
+    # obs = env.reset()
+    # rwd = 0
+    # for _ in range(10000):
+    #     action, _states = model.predict(obs)
+    #     obs, rewards, dones, info = env.step(action)
+    #     rwd += np.sum(rewards)
+    #     env.render()
+    # print(rwd)
diff --git a/sandbox/multicolumn/tune_ppo_operator.py b/sandbox/multicolumn/tune_ppo_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..44aeffb02d71f3ca536beb0bd12521f10149dd54
--- /dev/null
+++ b/sandbox/multicolumn/tune_ppo_operator.py
@@ -0,0 +1,230 @@
+from typing import Dict
+from typing import Any
+import tempfile
+
+import gym
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+# from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import BaseCallback
+from stable_baselines3.common.vec_env import DummyVecEnv
+# from stable_baselines3.common.vec_env import VecEnv
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.monitor import load_results
+
+import tutorenvs  # noqa: F401
+from tutorenvs.utils import linear_schedule
+
+
+def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
+    """
+    Sampler for PPO hyperparams.
+
+    :param trial:
+    :return:
+    """
+    n_step_pow = trial.suggest_discrete_uniform('n_step_pow', 3, 11, 1)
+    n_steps = int(2**n_step_pow)
+
+    # possible_n_steps = [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+    # n_steps = trial.suggest_categorical("n_steps",
+    #                                     possible_n_steps)
+
+    batches_pow = trial.suggest_discrete_uniform('batches_pow', 3,
+                                                 n_step_pow, 1)
+    batch_size = int(2**batches_pow)
+
+    # possible_batches = [8, 16, 32, 64, 128, 256, 512]
+    # batch_size = trial.suggest_categorical("batch_size",
+    #                                        possible_batches)
+
+    gamma = trial.suggest_categorical("gamma", [0.0])
+    # 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
+    learning_rate = trial.suggest_loguniform("lr", 1e-8, 1)
+    # lr_schedule = "constant"
+    # Uncomment to enable learning rate schedule
+    lr_schedule = trial.suggest_categorical('lr_schedule',
+                                            ['linear', 'constant'])
+    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000000001, 0.1)
+    clip_range = trial.suggest_categorical("clip_range",
+                                           [0.05, 0.1, 0.2, 0.3, 0.4])
+    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
+    gae_lambda = trial.suggest_categorical(
+        "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
+    max_grad_norm = trial.suggest_categorical(
+        "max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
+    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
+    net_arch = trial.suggest_categorical("net_arch",
+                                         ["tiny", "small", "medium"])
+    shared_arch = trial.suggest_categorical("shared_arch", [True, False])
+    ortho_init = False
+    activation_fn = trial.suggest_categorical("activation_fn",
+                                              ["tanh", "relu"])
+
+    # TODO: account for this when using multiple envs
+    if batch_size > n_steps:
+        batch_size = n_steps
+
+    if lr_schedule == "linear":
+        learning_rate = linear_schedule(learning_rate)
+
+    # Independent networks usually work best
+    # when not working with images
+    net_arch = {
+        True: {
+            "tiny": [32, dict(pi=[32], vf=[32])],
+            "small": [64, dict(pi=[64], vf=[64])],
+            "medium": [128, dict(pi=[128], vf=[128])],
+        },
+        False: {
+            "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+            "small": [dict(pi=[64, 64], vf=[64, 64])],
+            "medium": [dict(pi=[128, 128], vf=[128, 128])],
+        }
+    }[shared_arch][net_arch]
+
+    activation_fn = {
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leaky_relu": nn.LeakyReLU
+    }[activation_fn]
+
+    return {
+        "n_steps":
+        n_steps,
+        "batch_size":
+        batch_size,
+        "gamma":
+        gamma,
+        "learning_rate":
+        learning_rate,
+        "ent_coef":
+        ent_coef,
+        "clip_range":
+        clip_range,
+        "n_epochs":
+        n_epochs,
+        "gae_lambda":
+        gae_lambda,
+        "max_grad_norm":
+        max_grad_norm,
+        "vf_coef":
+        vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs":
+        dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialCallback(BaseCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(
+        self,
+        trial: optuna.Trial,
+        log_dir: str,
+        n_eval_episodes: int = 10,
+        eval_freq: int = 10000,
+        min_eval: float = -1000,
+        verbose: int = 0,
+    ):
+        super(TrialCallback, self).__init__(verbose)
+
+        self.eval_freq = eval_freq
+        self.n_eval_episodes = n_eval_episodes
+        self.log_dir = log_dir
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+        self.min_eval = min_eval
+
+    def _on_step(self) -> bool:
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+            results = load_results(self.log_dir)
+            if len(results) < self.n_eval_episodes:
+                return True
+            avg_last_n = results['r'][-self.n_eval_episodes:].mean()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(avg_last_n, self.eval_idx)
+            # print('Idx:', self.eval_idx, 'Avg_last_n', avg_last_n)
+
+            # Prune trial if needed
+            if avg_last_n < self.min_eval or self.trial.should_prune():
+                self.is_pruned = True
+                return False
+
+        return True
+
+
+def objective(trial: optuna.Trial) -> float:
+    n_eval_episodes = 15
+    eval_freq = 5000
+    n_steps = 350000
+
+    with tempfile.TemporaryDirectory() as log_dir:
+        env = DummyVecEnv([
+            lambda: Monitor(gym.make('MulticolumnArithSymbolic-v1'), log_dir)])
+
+        ppo_args = sample_ppo_params(trial)
+
+        model = PPO(MlpPolicy, env,
+                    # tensorboard_log="./tensorboard_ppo_multi/",
+                    **ppo_args)
+        # gamma=0.1,
+        # tensorboard_log="./tensorboard/v0/")
+        callback = TrialCallback(trial, log_dir, verbose=1,
+                                 n_eval_episodes=n_eval_episodes,
+                                 eval_freq=eval_freq)
+
+        try:
+            model.learn(total_timesteps=n_steps, callback=callback)
+            model.env.close()
+        except Exception as e:
+            model.env.close()
+            print(e)
+            raise optuna.exceptions.TrialPruned()
+
+        is_pruned = callback.is_pruned
+        del model.env
+        del model
+
+        if is_pruned:
+            raise optuna.exceptions.TrialPruned()
+
+        results = load_results(log_dir)
+        avg_last_n = results['r'][-n_eval_episodes:].mean()
+        # print('Final avg_last_n:', avg_last_n)
+        return avg_last_n
+
+
+if __name__ == "__main__":
+
+    # multiprocess environment
+    # env = make_vec_env('MulticolumnArithSymbolic-v0', n_envs=1)
+
+    pruner = optuna.pruners.MedianPruner(n_warmup_steps=20000)
+
+    study = optuna.create_study(study_name="ppo-multicolumn-operator",
+                                pruner=pruner,
+                                direction="maximize",
+                                storage='sqlite:///study.db',
+                                load_if_exists=True
+                                )
+    try:
+        study.optimize(objective, n_trials=1000, n_jobs=1)
+    except Exception as e:
+        print(e)
+    finally:
+        print("BEST")
+        print(study.best_params)
diff --git a/tutorenvs/__init__.py b/tutorenvs/__init__.py
index 04a1fef8ba481e8fd183da3f5eca58660dda109c..7e2a09c19c154677a054017bf2158d5c8a89d1c6 100644
--- a/tutorenvs/__init__.py
+++ b/tutorenvs/__init__.py
@@ -5,6 +5,7 @@ from tutorenvs.fractions import FractionArithOppEnv  # noqa: F401
 from tutorenvs.multicolumn import MultiColumnAdditionDigitsEnv  # noqa: F401
 from tutorenvs.multicolumn import MultiColumnAdditionPixelEnv  # noqa: F401
 from tutorenvs.multicolumn import MultiColumnAdditionPerceptEnv  # noqa: F401
+from tutorenvs.multicolumn import MultiColumnAdditionOppEnv  # noqa: F401
 
 register(
     id='FractionArith-v0',
@@ -32,6 +33,11 @@ register(
     entry_point='tutorenvs:MultiColumnAdditionDigitsEnv',
 )
 
+register(
+    id='MulticolumnArithSymbolic-v1',
+    entry_point='tutorenvs:MultiColumnAdditionOppEnv',
+)
+
 register(
     id='MulticolumnArithPixel-v0',
     entry_point='tutorenvs:MultiColumnAdditionPixelEnv',
diff --git a/tutorenvs/multicolumn.py b/tutorenvs/multicolumn.py
index 1d7194d3f31419ed90eb8cd374893deff1d32a23..f97a9b01b14ca0e77329269cc19ba28cb2713f81 100644
--- a/tutorenvs/multicolumn.py
+++ b/tutorenvs/multicolumn.py
@@ -513,6 +513,191 @@ class MultiColumnAdditionDigitsEnv(gym.Env):
 
     def render(self, mode='human', close=False):
         self.tutor.render()
+
+def int2_float_add_then_ones(x, y):
+    z = float(x) + float(y)
+    z = z % 10
+    if z.is_integer():
+        z = int(z)
+    return str(z)
+
+
+def int2_float_add_then_tens(x, y):
+    z = float(x) + float(y)
+    z = z // 10
+    if z.is_integer():
+        z = int(z)
+    return str(z)
+
+
+def int3_float_add_then_ones(x, y, w):
+    z = float(x) + float(y) + float(w)
+    z = z % 10
+    if z.is_integer():
+        z = int(z)
+    return str(z)
+
+
+def int3_float_add_then_tens(x, y, w):
+    z = float(x) + float(y) + float(w)
+    z = z // 10
+    if z.is_integer():
+        z = int(z)
+    return str(z)
+
+
+def add_tens(x, y, w):
+    if w is None:
+        return int2_float_add_then_tens(x, y)
+    return int3_float_add_then_tens(x, y, w)
+
+
+def add_ones(x, y, w):
+    if w is None:
+        return int2_float_add_then_ones(x, y)
+    return int3_float_add_then_ones(x, y, w)
+
+
+class MultiColumnAdditionOppEnv(gym.Env):
+    metadata = {'render.modes': ['human']}
+
+    def __init__(self):
+        self.tutor = MultiColumnAdditionSymbolic()
+        n_selections = len(self.tutor.get_possible_selections())
+        n_features = 2000
+        n_operators = len(self.get_rl_operators())
+        n_args = len(self.tutor.get_possible_args())
+        self.dv = OnlineDictVectorizer(n_features)
+        self.observation_space = spaces.Box(
+            low=0.0, high=1.0, shape=(1, n_features), dtype=np.float32)
+        self.action_space = spaces.MultiDiscrete([n_selections, n_operators,
+                                                  n_args, n_args, n_args])
+        self.n_steps = 0
+        self.max_steps = 100000
+
+    def get_rl_operators(self):
+        return ['copy',
+                'add2-tens',
+                'add2-ones',
+                'add3-tens',
+                'add3-ones',
+                ]
+
+    def get_rl_state(self):
+        state = self.tutor.state.copy()
+        for attr in self.tutor.state:
+            if attr == "operator":
+                continue
+            for attr2 in self.tutor.state:
+                if attr2 == "operator":
+                    continue
+                if attr >= attr2:
+                    continue
+
+                try:
+                    ones2 = int2_float_add_then_ones(state[attr], state[attr2])
+                    state['add2-ones(%s,%s)' % (attr, attr2)] = ones2
+                except Exception:
+                    pass
+                try:
+                    tens2 = int2_float_add_then_tens(state[attr], state[attr2])
+                    state['add2-tens(%s,%s)' % (attr, attr2)] = tens2
+                except Exception:
+                    pass
+
+                for attr3 in self.tutor.state:
+                    if attr3 == "operator":
+                        continue
+                    if attr2 >= attr3:
+                        continue
+
+                    try:
+                        ones3 = int3_float_add_then_ones(state[attr], state[attr2],
+                                                         state[attr3])
+                        state['add3-ones(%s,%s,%s)' % (attr, attr2, attr3)] = ones3
+                    except Exception:
+                        pass
+                    try:
+                        tens3 = int3_float_add_then_tens(state[attr], state[attr2],
+                                                         state[attr3])
+                        state['add3-tens(%s,%s,%s)' % (attr, attr2, attr3)] = tens3
+                    except Exception:
+                        pass
+
+        return state
+
+    def step(self, action):
+        try:
+            s, a, i = self.decode(action)
+            reward = self.tutor.apply_sai(s, a, i)
+            done = (s == 'done' and reward == 1.0)
+        except ValueError:
+            reward = -1
+            done = False
+
+        # print(s, a, i)
+        # print()
+        # print(reward)
+
+        state = self.get_rl_state()
+        # pprint(state)
+        obs = self.dv.fit_transform([state])[0]
+        info = {}
+
+        return obs, reward, done, info
+
+    def apply_rl_op(self, op, arg1, arg2, arg3):
+        if op == "copy":
+            return self.tutor.state[arg1]
+        elif op == "add2-tens":
+            return int2_float_add_then_tens(self.tutor.state[arg1],
+                                            self.tutor.state[arg2])
+        elif op == "add2-ones":
+            return int2_float_add_then_ones(self.tutor.state[arg1],
+                                            self.tutor.state[arg2])
+        elif op == "add3-tens":
+            return int3_float_add_then_tens(self.tutor.state[arg1],
+                                            self.tutor.state[arg2],
+                                            self.tutor.state[arg3])
+        elif op == "add3-ones":
+            return int3_float_add_then_ones(self.tutor.state[arg1],
+                                            self.tutor.state[arg2],
+                                            self.tutor.state[arg3])
+
+    def decode(self, action):
+        # print(action)
+        s = self.tutor.get_possible_selections()[action[0]]
+        op = self.get_rl_operators()[action[1]]
+        arg1 = self.tutor.get_possible_args()[action[2]]
+        arg2 = self.tutor.get_possible_args()[action[3]]
+        arg3 = self.tutor.get_possible_args()[action[4]]
+
+        if s == "done":
+            a = "ButtonPressed"
+        else:
+            a = "UpdateField"
+
+        if s == "done":
+            v = -1
+        elif s == "check_convert":
+            v = "x"
+        else:
+            v = self.apply_rl_op(op, arg1, arg2, arg3)
+
+        i = {'value': str(v)}
+
+        return s, a, i
+
+    def reset(self):
+        self.tutor.set_random_problem()
+        state = self.get_rl_state()
+        obs = self.dv.fit_transform([state])[0]
+        return obs
+
+    def render(self, mode='human', close=False):
+        self.tutor.render()
+
+
+
 
 class MultiColumnAdditionPixelEnv(gym.Env):
     metadata = {'render.modes': ['human']}
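Usage note (not part of the patch): a quick way to sanity-check the newly registered MulticolumnArithSymbolic-v1 id is a short random rollout. This is a minimal sketch, assuming the old-style Gym API used throughout the diff (reset() returning obs, step() returning obs/reward/done/info); the step count is illustrative only.

    import gym
    import tutorenvs  # noqa: F401 -- importing the package registers the env ids

    env = gym.make('MulticolumnArithSymbolic-v1')
    obs = env.reset()
    for _ in range(100):
        # sample a random (selection, operator, arg1, arg2, arg3) action
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
    env.close()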