diff --git a/sandbox/fractions/train_ppo_operator.py b/sandbox/fractions/train_ppo_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..1702ed5d287a7d6d67c58f110d3e3b2de38a3610
--- /dev/null
+++ b/sandbox/fractions/train_ppo_operator.py
@@ -0,0 +1,195 @@
+from typing import Dict
+from typing import Any
+
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.vec_env import VecEnv
+
+from tutorenvs.utils import linear_schedule
+
+
+def get_args(params: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Build PPO kwargs from a dict of sampled hyperparameters.
+    :param params:
+    :return:
+    """
+    batch_size = int(2**params['batches_pow'])
+    n_steps = int(2**params['n_step_pow'])
+    gamma = params['gamma']
+    learning_rate = params['lr']
+    lr_schedule = params['lr_schedule']
+    ent_coef = params['ent_coef']
+    clip_range = params['clip_range']
+    n_epochs = params['n_epochs']
+    gae_lambda = params['gae_lambda']
+    max_grad_norm = params['max_grad_norm']
+    vf_coef = params['vf_coef']
+    net_arch = params['net_arch']
+    shared_arch = params['shared_arch']
+    activation_fn = params['activation_fn']
+
+    # TODO: account for this when using multiple envs
+    if batch_size > n_steps:
+        batch_size = n_steps
+
+    if lr_schedule == "linear":
+        learning_rate = linear_schedule(learning_rate)
+
+    # Independent networks usually work best
+    # when not working with images
+    net_arch = {
+        True: {
+            "tiny": [32, dict(pi=[32], vf=[32])],
+            "small": [64, dict(pi=[64], vf=[64])],
+            "medium": [128, dict(pi=[128], vf=[128])],
+        },
+        False: {
+            "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+            "small": [dict(pi=[64, 64], vf=[64, 64])],
+            "medium": [dict(pi=[128, 128], vf=[128, 128])],
+        }
+    }[shared_arch][net_arch]
+
+    activation_fn = {
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leaky_relu": nn.LeakyReLU
+    }[activation_fn]
+
+    ortho_init = False
+
+    return {
+        "n_steps":
+        n_steps,
+        "batch_size":
+        batch_size,
+        "gamma":
+        gamma,
+        "learning_rate":
+        learning_rate,
+        "ent_coef":
+        ent_coef,
+        "clip_range":
+        clip_range,
+        "n_epochs":
+        n_epochs,
+        "gae_lambda":
+        gae_lambda,
+        "max_grad_norm":
+        max_grad_norm,
+        "vf_coef":
+        vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs":
+        dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialEvalCallback(EvalCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(
+        self,
+        eval_env: VecEnv,
+        trial: optuna.Trial,
+        n_eval_episodes: int = 5,
+        eval_freq: int = 10000,
+        deterministic: bool = True,
+        verbose: int = 0,
+    ):
+
+        super(TrialEvalCallback, self).__init__(
+            eval_env=eval_env,
+            n_eval_episodes=n_eval_episodes,
+            eval_freq=eval_freq,
+            deterministic=deterministic,
+            verbose=verbose,
+        )
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+
+    def _on_step(self) -> bool:
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+            super(TrialEvalCallback, self)._on_step()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(self.last_mean_reward, self.eval_idx)
+            # Prune trial if needed
+            if self.trial.should_prune():
+                self.is_pruned = True
+                return False
+        return True
+
+
+if __name__ == "__main__":
+    # params = {
+    #     'batch_size': 32,
+    #     'n_steps': 16,
+    #     'gamma': 0.0,
+    #     'lr': 0.00017980950834568327,
+    #     'lr_schedule': 'constant',
+    #     'ent_coef': 0.07439893598338435,
+    #     'clip_range': 0.4,
+    #     'n_epochs': 10,
+    #     'gae_lambda': 0.95,
+    #     'max_grad_norm': 0.8,
+    #     'vf_coef': 0.13214811411452415,
+    #     'net_arch': 'medium',
+    #     'shared_arch': False,
+    #     'activation_fn': 'tanh'
+    # }
+
+    # params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
+    #           'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
+    #           0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
+    #           'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
+    #           'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
+
+    params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
+              0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
+              0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
+              'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
+              0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
+              'activation_fn': 'relu'}
+
+    kwargs = get_args(params)
+
+    # multiprocess environment
+    env = make_vec_env('FractionArith-v1', n_envs=1)
+    model = PPO(
+        MlpPolicy,
+        env,
+        verbose=1,
+        tensorboard_log="./tensorboard_ppo/",
+        **kwargs
+    )
+    # gamma=0.1,
+    # tensorboard_log="./tensorboard/v0/")
+
+    # while True:
+    # Train
+    model.learn(total_timesteps=1000000)
+
+    # Test
+    # obs = env.reset()
+    # rwd = 0
+    # for _ in range(10000):
+    #     action, _states = model.predict(obs)
+    #     obs, rewards, dones, info = env.step(action)
+    #     rwd += np.sum(rewards)
+    #     env.render()
+    # print(rwd)
diff --git a/sandbox/fractions/tune_ppo_operator.py b/sandbox/fractions/tune_ppo_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..17f6d8db7e03ab3e8dfe35217716cf4f64f9403a
--- /dev/null
+++ b/sandbox/fractions/tune_ppo_operator.py
@@ -0,0 +1,230 @@
+from typing import Dict
+from typing import Any
+import tempfile
+
+import gym
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+# from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import BaseCallback
+from stable_baselines3.common.vec_env import DummyVecEnv
+# from stable_baselines3.common.vec_env import VecEnv
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.monitor import load_results
+
+import tutorenvs  # noqa: F401
+from tutorenvs.utils import linear_schedule
+
+
+def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
+    """
+    Sampler for PPO hyperparams.
+
+    :param trial:
+    :return:
+    """
+    n_step_pow = trial.suggest_discrete_uniform('n_step_pow', 3, 11, 1)
+    n_steps = int(2**n_step_pow)
+
+    # possible_n_steps = [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+    # n_steps = trial.suggest_categorical("n_steps",
+    #                                     possible_n_steps)
+
+    batches_pow = trial.suggest_discrete_uniform('batches_pow', 3,
+                                                 n_step_pow, 1)
+    batch_size = int(2**batches_pow)
+
+    # possible_batches = [8, 16, 32, 64, 128, 256, 512]
+    # batch_size = trial.suggest_categorical("batch_size",
+    #                                        possible_batches)
+
+    gamma = trial.suggest_categorical("gamma", [0.0])
+    # 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
+    learning_rate = trial.suggest_loguniform("lr", 1e-8, 1)
+    # lr_schedule = "constant"
+    # Uncomment to enable learning rate schedule
+    lr_schedule = trial.suggest_categorical('lr_schedule',
+                                            ['linear', 'constant'])
+    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000000001, 0.1)
+    clip_range = trial.suggest_categorical("clip_range",
+                                           [0.05, 0.1, 0.2, 0.3, 0.4])
+    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
+    gae_lambda = trial.suggest_categorical(
+        "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
+    max_grad_norm = trial.suggest_categorical(
+        "max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
+    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
+    net_arch = trial.suggest_categorical("net_arch",
+                                         ["tiny", "small", "medium"])
+    shared_arch = trial.suggest_categorical("shared_arch", [True, False])
+    ortho_init = False
+    activation_fn = trial.suggest_categorical("activation_fn",
+                                              ["tanh", "relu"])
+
+    # TODO: account for this when using multiple envs
+    if batch_size > n_steps:
+        batch_size = n_steps
+
+    if lr_schedule == "linear":
+        learning_rate = linear_schedule(learning_rate)
+
+    # Independent networks usually work best
+    # when not working with images
+    net_arch = {
+        True: {
+            "tiny": [32, dict(pi=[32], vf=[32])],
+            "small": [64, dict(pi=[64], vf=[64])],
+            "medium": [128, dict(pi=[128], vf=[128])],
+        },
+        False: {
+            "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+            "small": [dict(pi=[64, 64], vf=[64, 64])],
+            "medium": [dict(pi=[128, 128], vf=[128, 128])],
+        }
+    }[shared_arch][net_arch]
+
+    activation_fn = {
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leaky_relu": nn.LeakyReLU
+    }[activation_fn]
+
+    return {
+        "n_steps":
+        n_steps,
+        "batch_size":
+        batch_size,
+        "gamma":
+        gamma,
+        "learning_rate":
+        learning_rate,
+        "ent_coef":
+        ent_coef,
+        "clip_range":
+        clip_range,
+        "n_epochs":
+        n_epochs,
+        "gae_lambda":
+        gae_lambda,
+        "max_grad_norm":
+        max_grad_norm,
+        "vf_coef":
+        vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs":
+        dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialCallback(BaseCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(
+        self,
+        trial: optuna.Trial,
+        log_dir: str,
+        n_eval_episodes: int = 10,
+        eval_freq: int = 10000,
+        min_eval: float = -1500,
+        verbose: int = 0,
+    ):
+        super(TrialCallback, self).__init__(verbose)
+
+        self.eval_freq = eval_freq
+        self.n_eval_episodes = n_eval_episodes
+        self.log_dir = log_dir
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+        self.min_eval = min_eval
+
+    def _on_step(self) -> bool:
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+            results = load_results(self.log_dir)
+            if len(results) < self.n_eval_episodes:
+                return True
+            avg_last_n = results['r'][-self.n_eval_episodes:].mean()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(avg_last_n, self.eval_idx)
+            # print('Idx:', self.eval_idx, 'Avg_last_n', avg_last_n)
+
+            # Prune trial if needed
+            if avg_last_n < self.min_eval or self.trial.should_prune():
+                self.is_pruned = True
+                return False
+
+        return True
+
+
+def objective(trial: optuna.Trial) -> float:
+    n_eval_episodes = 15
+    eval_freq = 5000
+    n_steps = 350000
+
+    with tempfile.TemporaryDirectory() as log_dir:
+        env = DummyVecEnv([
+            lambda: Monitor(gym.make('FractionArith-v1'), log_dir)])
+
+        ppo_args = sample_ppo_params(trial)
+
+        model = PPO(MlpPolicy, env,
+                    # tensorboard_log="./tensorboard_ppo_multi/",
+                    **ppo_args)
+        # gamma=0.1,
+        # tensorboard_log="./tensorboard/v0/")
+        callback = TrialCallback(trial, log_dir, verbose=1,
+                                 n_eval_episodes=n_eval_episodes,
+                                 eval_freq=eval_freq)
+
+        try:
+            model.learn(total_timesteps=n_steps, callback=callback)
+            model.env.close()
+        except Exception as e:
+            model.env.close()
+            print(e)
+            raise optuna.exceptions.TrialPruned()
+
+        is_pruned = callback.is_pruned
+        del model.env
+        del model
+
+        if is_pruned:
+            raise optuna.exceptions.TrialPruned()
+
+        results = load_results(log_dir)
+        avg_last_n = results['r'][-n_eval_episodes:].mean()
+        # print('Final avg_last_n:', avg_last_n)
+        return avg_last_n
+
+
+if __name__ == "__main__":
+
+    # multiprocess environment
+    # env = make_vec_env('MulticolumnArithSymbolic-v0', n_envs=1)
+
+    pruner = optuna.pruners.MedianPruner(n_warmup_steps=20000)
+
+    study = optuna.create_study(study_name="ppo-operator",
+                                pruner=pruner,
+                                direction="maximize",
+                                storage='sqlite:///study.db',
+                                load_if_exists=True
+                                )
+    try:
+        study.optimize(objective, n_trials=1000, n_jobs=1)
+    except Exception as e:
+        print(e)
+    finally:
+        print("BEST")
+        print(study.best_params)
diff --git a/sandbox/multicolumn/train_ppo_operator.py b/sandbox/multicolumn/train_ppo_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..969f159c559f2d4e111f7f84e4e8ec35ceb2c2b7
--- /dev/null
+++ b/sandbox/multicolumn/train_ppo_operator.py
@@ -0,0 +1,195 @@
+from typing import Dict
+from typing import Any
+
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.vec_env import VecEnv
+
+from tutorenvs.utils import linear_schedule
+
+
+def get_args(params: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Build PPO kwargs from a dict of sampled hyperparameters.
+    :param params:
+    :return:
+    """
+    batch_size = int(2**params['batches_pow'])
+    n_steps = int(2**params['n_step_pow'])
+    gamma = params['gamma']
+    learning_rate = params['lr']
+    lr_schedule = params['lr_schedule']
+    ent_coef = params['ent_coef']
+    clip_range = params['clip_range']
+    n_epochs = params['n_epochs']
+    gae_lambda = params['gae_lambda']
+    max_grad_norm = params['max_grad_norm']
+    vf_coef = params['vf_coef']
+    net_arch = params['net_arch']
+    shared_arch = params['shared_arch']
+    activation_fn = params['activation_fn']
+
+    # TODO: account for this when using multiple envs
+    if batch_size > n_steps:
+        batch_size = n_steps
+
+    if lr_schedule == "linear":
+        learning_rate = linear_schedule(learning_rate)
+
+    # Independent networks usually work best
+    # when not working with images
+    net_arch = {
+        True: {
+            "tiny": [32, dict(pi=[32], vf=[32])],
+            "small": [64, dict(pi=[64], vf=[64])],
+            "medium": [128, dict(pi=[128], vf=[128])],
+        },
+        False: {
+            "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+            "small": [dict(pi=[64, 64], vf=[64, 64])],
+            "medium": [dict(pi=[128, 128], vf=[128, 128])],
+        }
+    }[shared_arch][net_arch]
+
+    activation_fn = {
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leaky_relu": nn.LeakyReLU
+    }[activation_fn]
+
+    ortho_init = False
+
+    return {
+        "n_steps":
+        n_steps,
+        "batch_size":
+        batch_size,
+        "gamma":
+        gamma,
+        "learning_rate":
+        learning_rate,
+        "ent_coef":
+        ent_coef,
+        "clip_range":
+        clip_range,
+        "n_epochs":
+        n_epochs,
+        "gae_lambda":
+        gae_lambda,
+        "max_grad_norm":
+        max_grad_norm,
+        "vf_coef":
+        vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs":
+        dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialEvalCallback(EvalCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(
+        self,
+        eval_env: VecEnv,
+        trial: optuna.Trial,
+        n_eval_episodes: int = 5,
+        eval_freq: int = 10000,
+        deterministic: bool = True,
+        verbose: int = 0,
+    ):
+
+        super(TrialEvalCallback, self).__init__(
+            eval_env=eval_env,
+            n_eval_episodes=n_eval_episodes,
+            eval_freq=eval_freq,
+            deterministic=deterministic,
+            verbose=verbose,
+        )
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+
+    def _on_step(self) -> bool:
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+            super(TrialEvalCallback, self)._on_step()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(self.last_mean_reward, self.eval_idx)
+            # Prune trial if needed
+            if self.trial.should_prune():
+                self.is_pruned = True
+                return False
+        return True
+
+
+if __name__ == "__main__":
+    # params = {
+    #     'batch_size': 32,
+    #     'n_steps': 16,
+    #     'gamma': 0.0,
+    #     'lr': 0.00017980950834568327,
+    #     'lr_schedule': 'constant',
+    #     'ent_coef': 0.07439893598338435,
+    #     'clip_range': 0.4,
+    #     'n_epochs': 10,
+    #     'gae_lambda': 0.95,
+    #     'max_grad_norm': 0.8,
+    #     'vf_coef': 0.13214811411452415,
+    #     'net_arch': 'medium',
+    #     'shared_arch': False,
+    #     'activation_fn': 'tanh'
+    # }
+
+    # params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
+    #           'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
+    #           0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
+    #           'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
+    #           'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
+
+    params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
+              0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
+              0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
+              'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
+              0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
+              'activation_fn': 'relu'}
+
+    kwargs = get_args(params)
+
+    # multiprocess environment
+    env = make_vec_env('MulticolumnArithSymbolic-v1', n_envs=1)
+    model = PPO(
+        MlpPolicy,
+        env,
+        verbose=1,
+        tensorboard_log="./tensorboard_ppo_multi/",
+        **kwargs
+    )
+    # gamma=0.1,
+    # tensorboard_log="./tensorboard/v0/")
+
+    # while True:
+    # Train
+    model.learn(total_timesteps=1000000)
+
+    # Test
+    # obs = env.reset()
+    # rwd = 0
+    # for _ in range(10000):
+    #     action, _states = model.predict(obs)
+    #     obs, rewards, dones, info = env.step(action)
+    #     rwd += np.sum(rewards)
+    #     env.render()
+    # print(rwd)
diff --git a/sandbox/multicolumn/tune_ppo_operator.py b/sandbox/multicolumn/tune_ppo_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..44aeffb02d71f3ca536beb0bd12521f10149dd54
--- /dev/null
+++ b/sandbox/multicolumn/tune_ppo_operator.py
@@ -0,0 +1,230 @@
+from typing import Dict
+from typing import Any
+import tempfile
+
+import gym
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+# from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import BaseCallback
+from stable_baselines3.common.vec_env import DummyVecEnv
+# from stable_baselines3.common.vec_env import VecEnv
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.monitor import load_results
+
+import tutorenvs  # noqa: F401
+from tutorenvs.utils import linear_schedule
+
+
+def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
+    """
+    Sampler for PPO hyperparams.
+
+    :param trial:
+    :return:
+    """
+    n_step_pow = trial.suggest_discrete_uniform('n_step_pow', 3, 11, 1)
+    n_steps = int(2**n_step_pow)
+
+    # possible_n_steps = [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+    # n_steps = trial.suggest_categorical("n_steps",
+    #                                     possible_n_steps)
+
+    batches_pow = trial.suggest_discrete_uniform('batches_pow', 3,
+                                                 n_step_pow, 1)
+    batch_size = int(2**batches_pow)
+
+    # possible_batches = [8, 16, 32, 64, 128, 256, 512]
+    # batch_size = trial.suggest_categorical("batch_size",
+    #                                        possible_batches)
+
+    gamma = trial.suggest_categorical("gamma", [0.0])
+    # 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
+    learning_rate = trial.suggest_loguniform("lr", 1e-8, 1)
+    # lr_schedule = "constant"
+    # Uncomment to enable learning rate schedule
+    lr_schedule = trial.suggest_categorical('lr_schedule',
+                                            ['linear', 'constant'])
+    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000000001, 0.1)
+    clip_range = trial.suggest_categorical("clip_range",
+                                           [0.05, 0.1, 0.2, 0.3, 0.4])
+    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
+    gae_lambda = trial.suggest_categorical(
+        "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
+    max_grad_norm = trial.suggest_categorical(
+        "max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
+    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
+    net_arch = trial.suggest_categorical("net_arch",
+                                         ["tiny", "small", "medium"])
+    shared_arch = trial.suggest_categorical("shared_arch", [True, False])
+    ortho_init = False
+    activation_fn = trial.suggest_categorical("activation_fn",
+                                              ["tanh", "relu"])
+
+    # TODO: account for this when using multiple envs
+    if batch_size > n_steps:
+        batch_size = n_steps
+
+    if lr_schedule == "linear":
+        learning_rate = linear_schedule(learning_rate)
+
+    # Independent networks usually work best
+    # when not working with images
+    net_arch = {
+        True: {
+            "tiny": [32, dict(pi=[32], vf=[32])],
+            "small": [64, dict(pi=[64], vf=[64])],
+            "medium": [128, dict(pi=[128], vf=[128])],
+        },
+        False: {
+            "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+            "small": [dict(pi=[64, 64], vf=[64, 64])],
+            "medium": [dict(pi=[128, 128], vf=[128, 128])],
+        }
+    }[shared_arch][net_arch]
+
+    activation_fn = {
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leaky_relu": nn.LeakyReLU
+    }[activation_fn]
+
+    return {
+        "n_steps":
+        n_steps,
+        "batch_size":
+        batch_size,
+        "gamma":
+        gamma,
+        "learning_rate":
+        learning_rate,
+        "ent_coef":
+        ent_coef,
+        "clip_range":
+        clip_range,
+        "n_epochs":
+        n_epochs,
+        "gae_lambda":
+        gae_lambda,
+        "max_grad_norm":
+        max_grad_norm,
+        "vf_coef":
+        vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs":
+        dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialCallback(BaseCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(
+        self,
+        trial: optuna.Trial,
+        log_dir: str,
+        n_eval_episodes: int = 10,
+        eval_freq: int = 10000,
+        min_eval: float = -1000,
+        verbose: int = 0,
+    ):
+        super(TrialCallback, self).__init__(verbose)
+
+        self.eval_freq = eval_freq
+        self.n_eval_episodes = n_eval_episodes
+        self.log_dir = log_dir
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+        self.min_eval = min_eval
+
+    def _on_step(self) -> bool:
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+            results = load_results(self.log_dir)
+            if len(results) < self.n_eval_episodes:
+                return True
+            avg_last_n = results['r'][-self.n_eval_episodes:].mean()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(avg_last_n, self.eval_idx)
+            # print('Idx:', self.eval_idx, 'Avg_last_n', avg_last_n)
+
+            # Prune trial if needed
+            if avg_last_n < self.min_eval or self.trial.should_prune():
+                self.is_pruned = True
+                return False
+
+        return True
+
+
+def objective(trial: optuna.Trial) -> float:
+    n_eval_episodes = 15
+    eval_freq = 5000
+    n_steps = 350000
+
+    with tempfile.TemporaryDirectory() as log_dir:
+        env = DummyVecEnv([
+            lambda: Monitor(gym.make('MulticolumnArithSymbolic-v1'), log_dir)])
+
+        ppo_args = sample_ppo_params(trial)
+
+        model = PPO(MlpPolicy, env,
+                    # tensorboard_log="./tensorboard_ppo_multi/",
+                    **ppo_args)
+        # gamma=0.1,
+        # tensorboard_log="./tensorboard/v0/")
+        callback = TrialCallback(trial, log_dir, verbose=1,
+                                 n_eval_episodes=n_eval_episodes,
+                                 eval_freq=eval_freq)
+
+        try:
+            model.learn(total_timesteps=n_steps, callback=callback)
+            model.env.close()
+        except Exception as e:
+            model.env.close()
+            print(e)
+            raise optuna.exceptions.TrialPruned()
+
+        is_pruned = callback.is_pruned
+        del model.env
+        del model
+
+        if is_pruned:
+            raise optuna.exceptions.TrialPruned()
+
+        results = load_results(log_dir)
+        avg_last_n = results['r'][-n_eval_episodes:].mean()
+        # print('Final avg_last_n:', avg_last_n)
+        return avg_last_n
+
+
+if __name__ == "__main__":
+
+    # multiprocess environment
+    # env = make_vec_env('MulticolumnArithSymbolic-v0', n_envs=1)
+
+    pruner = optuna.pruners.MedianPruner(n_warmup_steps=20000)
+
+    study = optuna.create_study(study_name="ppo-multicolumn-operator",
+                                pruner=pruner,
+                                direction="maximize",
+                                storage='sqlite:///study.db',
+                                load_if_exists=True
+                                )
+    try:
+        study.optimize(objective, n_trials=1000, n_jobs=1)
+    except Exception as e:
+        print(e)
+    finally:
+        print("BEST")
+        print(study.best_params)
diff --git a/tutorenvs/__init__.py b/tutorenvs/__init__.py
index 04a1fef8ba481e8fd183da3f5eca58660dda109c..7e2a09c19c154677a054017bf2158d5c8a89d1c6 100644
--- a/tutorenvs/__init__.py
+++ b/tutorenvs/__init__.py
@@ -5,6 +5,7 @@ from tutorenvs.fractions import FractionArithOppEnv  # noqa: F401
 from tutorenvs.multicolumn import MultiColumnAdditionDigitsEnv  # noqa: F401
 from tutorenvs.multicolumn import MultiColumnAdditionPixelEnv  # noqa: F401
 from tutorenvs.multicolumn import MultiColumnAdditionPerceptEnv  # noqa: F401
+from tutorenvs.multicolumn import MultiColumnAdditionOppEnv  # noqa: F401
 
 register(
     id='FractionArith-v0',
@@ -32,6 +33,11 @@ register(
     entry_point='tutorenvs:MultiColumnAdditionDigitsEnv',
 )
 
+register(
+    id='MulticolumnArithSymbolic-v1',
+    entry_point='tutorenvs:MultiColumnAdditionOppEnv',
+)
+
 register(
     id='MulticolumnArithPixel-v0',
     entry_point='tutorenvs:MultiColumnAdditionPixelEnv',
diff --git a/tutorenvs/multicolumn.py b/tutorenvs/multicolumn.py
index 1d7194d3f31419ed90eb8cd374893deff1d32a23..f97a9b01b14ca0e77329269cc19ba28cb2713f81 100644
--- a/tutorenvs/multicolumn.py
+++ b/tutorenvs/multicolumn.py
@@ -513,6 +513,191 @@ class MultiColumnAdditionDigitsEnv(gym.Env):
 
     def render(self, mode='human', close=False):
         self.tutor.render()
+
+def int2_float_add_then_ones(x, y):
+    z = float(x) + float(y)
+    z = z % 10
+    if z.is_integer():
+        z = int(z)
+    return str(z)
+
+
+def int2_float_add_then_tens(x, y):
+    z = float(x) + float(y)
+    z = z // 10
+    if z.is_integer():
+        z = int(z)
+    return str(z)
+
+
+def int3_float_add_then_ones(x, y, w):
+    z = float(x) + float(y) + float(w)
+    z = z % 10
+    if z.is_integer():
+        z = int(z)
+    return str(z)
+
+
+def int3_float_add_then_tens(x, y, w):
+    z = float(x) + float(y) + float(w)
+    z = z // 10
+    if z.is_integer():
+        z = int(z)
+    return str(z)
+
+
+def add_tens(x, y, w):
+    if w is None:
+        return int2_float_add_then_tens(x, y)
+    return int3_float_add_then_tens(x, y, w)
+
+
+def add_ones(x, y, w):
+    if w is None:
+        return int2_float_add_then_ones(x, y)
+    return int3_float_add_then_ones(x, y, w)
+
+
+class MultiColumnAdditionOppEnv(gym.Env):
+    metadata = {'render.modes': ['human']}
+
+    def __init__(self):
+        self.tutor = MultiColumnAdditionSymbolic()
+        n_selections = len(self.tutor.get_possible_selections())
+        n_features = 2000
+        n_operators = len(self.get_rl_operators())
+        n_args = len(self.tutor.get_possible_args())
+        self.dv = OnlineDictVectorizer(n_features)
+        self.observation_space = spaces.Box(
+            low=0.0, high=1.0, shape=(1, n_features), dtype=np.float32)
+        self.action_space = spaces.MultiDiscrete([n_selections, n_operators,
+                                                  n_args, n_args, n_args])
+        self.n_steps = 0
+        self.max_steps = 100000
+
+    def get_rl_operators(self):
+        return ['copy',
+                'add2-tens',
+                'add2-ones',
+                'add3-tens',
+                'add3-ones',
+                ]
+
+    def get_rl_state(self):
+        state = self.tutor.state.copy()
+        for attr in self.tutor.state:
+            if attr == "operator":
+                continue
+            for attr2 in self.tutor.state:
+                if attr2 == "operator":
+                    continue
+                if attr >= attr2:
+                    continue
+
+                try:
+                    ones2 = int2_float_add_then_ones(state[attr], state[attr2])
+                    state['add2-ones(%s,%s)' % (attr, attr2)] = ones2
+                except Exception:
+                    pass
+                try:
+                    tens2 = int2_float_add_then_tens(state[attr], state[attr2])
+                    state['add2-tens(%s,%s)' % (attr, attr2)] = tens2
+                except Exception:
+                    pass
+
+                for attr3 in self.tutor.state:
+                    if attr3 == "operator":
+                        continue
+                    if attr2 >= attr3:
+                        continue
+
+                    try:
+                        ones3 = int3_float_add_then_ones(state[attr], state[attr2],
+                                                         state[attr3])
+                        state['add3-ones(%s,%s,%s)' % (attr, attr2, attr3)] = ones3
+                    except Exception:
+                        pass
+                    try:
+                        tens3 = int3_float_add_then_tens(state[attr], state[attr2],
+                                                         state[attr3])
+                        state['add3-tens(%s,%s,%s)' % (attr, attr2, attr3)] = tens3
+                    except Exception:
+                        pass
+
+        return state
+
+    def step(self, action):
+        try:
+            s, a, i = self.decode(action)
+            reward = self.tutor.apply_sai(s, a, i)
+            done = (s == 'done' and reward == 1.0)
+        except ValueError:
+            reward = -1
+            done = False
+
+        # print(s, a, i)
+        # print()
+        # print(reward)
+
+        state = self.get_rl_state()
+        # pprint(state)
+        obs = self.dv.fit_transform([state])[0]
+        info = {}
+
+        return obs, reward, done, info
+
+    def apply_rl_op(self, op, arg1, arg2, arg3):
+        if op == "copy":
+            return self.tutor.state[arg1]
+        elif op == "add2-tens":
+            return int2_float_add_then_tens(self.tutor.state[arg1],
+                                            self.tutor.state[arg2])
+        elif op == "add2-ones":
+            return int2_float_add_then_ones(self.tutor.state[arg1],
+                                            self.tutor.state[arg2])
+        elif op == "add3-tens":
+            return int3_float_add_then_tens(self.tutor.state[arg1],
+                                            self.tutor.state[arg2],
+                                            self.tutor.state[arg3])
+        elif op == "add3-ones":
+            return int3_float_add_then_ones(self.tutor.state[arg1],
+                                            self.tutor.state[arg2],
+                                            self.tutor.state[arg3])
+
+    def decode(self, action):
+        # print(action)
+        s = self.tutor.get_possible_selections()[action[0]]
+        op = self.get_rl_operators()[action[1]]
+        arg1 = self.tutor.get_possible_args()[action[2]]
+        arg2 = self.tutor.get_possible_args()[action[3]]
+        arg3 = self.tutor.get_possible_args()[action[4]]
+
+        if s == "done":
+            a = "ButtonPressed"
+        else:
+            a = "UpdateField"
+
+        if s == "done":
+            v = -1
+        elif s == "check_convert":
+            v = "x"
+        else:
+            v = self.apply_rl_op(op, arg1, arg2, arg3)
+
+        i = {'value': str(v)}
+
+        return s, a, i
+
+    def reset(self):
+        self.tutor.set_random_problem()
+        state = self.get_rl_state()
+        obs = self.dv.fit_transform([state])[0]
+        return obs
+
+    def render(self, mode='human', close=False):
+        self.tutor.render()
+
+
+
 
 class MultiColumnAdditionPixelEnv(gym.Env):
     metadata = {'render.modes': ['human']}
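Usage note (not part of the patch): a quick way to sanity-check the newly registered MulticolumnArithSymbolic-v1 id is a short random rollout. This is a minimal sketch, assuming the old-style Gym API used throughout the diff (reset() returning obs, step() returning obs/reward/done/info); the step count is illustrative only.

    import gym
    import tutorenvs  # noqa: F401 -- importing the package registers the env ids

    env = gym.make('MulticolumnArithSymbolic-v1')
    obs = env.reset()
    for _ in range(100):
        # sample a random (selection, operator, arg1, arg2, arg3) action
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
    env.close()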