From bf4ba0ba925ebcfe115d37da0ae60ccf1eb1c737 Mon Sep 17 00:00:00 2001
From: Chris MacLellan <2348-cm3786@users.noreply.gitlab.cci.drexel.edu>
Date: Fri, 15 Jan 2021 15:03:00 -0500
Subject: [PATCH] Trying to get PPO model working with fractions; had to
 reduce problem difficulty

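run_al_fractions.py now uses the WhereWhenHowNoFoa agent and trains on 500
problems. The new train_ppo.py trains a stable-baselines3 PPO agent on the
FractionArith-v0 gym environment with hyperparameters that appear to come
from an Optuna search; the new tune_ppo.py runs that search, pruning weak
trials and storing results in sqlite:///study.db. To keep the RL problem
tractable, fraction operands are now drawn from 1-5 (numerators) and 2-5
(denominators) instead of 1-7 and 2-7, and the input action space shrinks
from 98 to 50 values.

Rough workflow sketch (assuming tutorenvs registers FractionArith-v0 on
import, as both scripts expect):

    python tune_ppo.py    # search PPO hyperparameters with Optuna
    python train_ppo.py   # train PPO with the best parameters found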
---
 sandbox/fractions/run_al_fractions.py |  35 ++--
 sandbox/fractions/train_ppo.py        | 196 ++++++++++++++++++++++
 sandbox/fractions/tune_ppo.py         | 235 ++++++++++++++++++++++++++
 tutorenvs/fractions.py                |  20 ++-
 tutorenvs/multicolumn.py              |   6 +-
 5 files changed, 460 insertions(+), 32 deletions(-)
 create mode 100644 sandbox/fractions/train_ppo.py
 create mode 100644 sandbox/fractions/tune_ppo.py

diff --git a/sandbox/fractions/run_al_fractions.py b/sandbox/fractions/run_al_fractions.py
index 824866f..eba7a1b 100644
--- a/sandbox/fractions/run_al_fractions.py
+++ b/sandbox/fractions/run_al_fractions.py
@@ -1,10 +1,9 @@
-from apprentice.agents.ModularAgent import ModularAgent
+from apprentice.agents.WhereWhenHowNoFoa import WhereWhenHowNoFoa
 from apprentice.working_memory.representation import Sai
 
 from tutorenvs.fractions import FractionArithSymbolic
 
 
-
 def run_training(agent, n=10):
 
     env = FractionArithSymbolic()
@@ -19,36 +18,30 @@ def run_training(agent, n=10):
         if response == {}:
             print('hint')
             selection, action, inputs = env.request_demo()
-            sai = Sai(selection=selection,
-                           action=action,
-                           inputs=inputs)
+            sai = Sai(selection=selection, action=action, inputs=inputs)
 
         else:
             sai = Sai(selection=response['selection'],
-                    action=response['action'],
-                    inputs=response['inputs'])
+                      action=response['action'],
+                      inputs=response['inputs'])
 
         reward = env.apply_sai(sai.selection, sai.action, sai.inputs)
         print('reward', reward)
 
-        agent.train(state, sai, reward)
+        next_state = env.get_state()
+
+        agent.train(state, sai, reward, next_state=next_state,
+                    skill_label="fractions",
+                    foci_of_attention=[])
 
         if sai.selection == "done" and reward == 1.0:
+            print('Finished problem {} of {}'.format(p, n))
             p += 1
 
-if __name__ == "__main__":
-    args = {"function_set" : ["RipFloatValue","Add",
-        'Multiply',
-        "Subtract",
-        # "Numerator_Multiply", "Cross_Multiply",
-        "Divide"],
 
-        "feature_set" : ["Equals"], "planner" : "numba", "search_depth" : 2,
-        "when_learner": "trestle", "where_learner": "FastMostSpecific",
-        "state_variablization" : "whereappend", "strip_attrs" :
-        ["to_left","to_right","above","below","type","id","offsetParent","dom_class"],
-        "when_args" : { "cross_rhs_inference" : "none" } }
+if __name__ == "__main__":
 
-    agent = ModularAgent(**args)
+    agent = WhereWhenHowNoFoa('fraction arith', 'fraction arith',
+                              search_depth=1)
 
-    run_training(agent, n = 100)
+    run_training(agent, n=500)
diff --git a/sandbox/fractions/train_ppo.py b/sandbox/fractions/train_ppo.py
new file mode 100644
index 0000000..079f259
--- /dev/null
+++ b/sandbox/fractions/train_ppo.py
@@ -0,0 +1,196 @@
+from typing import Dict
+from typing import Any
+
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.vec_env import VecEnv
+
+from tutorenvs.utils import linear_schedule
+
+
+def get_args(params: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Sampler for PPO hyperparams.
+    :param trial:
+    :return:
+    """
+    batch_size = int(2**params['batches_pow'])
+    n_steps = int(2**params['n_step_pow'])
+    gamma = params['gamma']
+    learning_rate = params['lr']
+    lr_schedule = params['lr_schedule']
+    ent_coef = params['ent_coef']
+    clip_range = params['clip_range']
+    n_epochs = params['n_epochs']
+    gae_lambda = params['gae_lambda']
+    max_grad_norm = params['max_grad_norm']
+    vf_coef = params['vf_coef']
+    net_arch = params['net_arch']
+    shared_arch = params['shared_arch']
+    activation_fn = params['activation_fn']
+
+    # TODO: account for multiple envs (buffer size is n_steps * n_envs)
+    if batch_size > n_steps:
+        batch_size = n_steps
+
+    if lr_schedule == "linear":
+        learning_rate = linear_schedule(learning_rate)
+
+    # Independent networks usually work best
+    # when not working with images
+    net_arch = {
+        True: {
+            "tiny": [32, dict(pi=[32], vf=[32])],
+            "small": [64, dict(pi=[64], vf=[64])],
+            "medium": [128, dict(pi=[128], vf=[128])],
+        },
+        False: {
+            "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+            "small": [dict(pi=[64, 64], vf=[64, 64])],
+            "medium": [dict(pi=[128, 128], vf=[128, 128])],
+        }
+    }[shared_arch][net_arch]
+
+    activation_fn = {
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leaky_relu": nn.LeakyReLU
+    }[activation_fn]
+
+    ortho_init = False
+
+    return {
+        "n_steps":
+        n_steps,
+        "batch_size":
+        batch_size,
+        "gamma":
+        gamma,
+        "learning_rate":
+        learning_rate,
+        "ent_coef":
+        ent_coef,
+        "clip_range":
+        clip_range,
+        "n_epochs":
+        n_epochs,
+        "gae_lambda":
+        gae_lambda,
+        "max_grad_norm":
+        max_grad_norm,
+        "vf_coef":
+        vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs":
+        dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialEvalCallback(EvalCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(
+        self,
+        eval_env: VecEnv,
+        trial: optuna.Trial,
+        n_eval_episodes: int = 5,
+        eval_freq: int = 10000,
+        deterministic: bool = True,
+        verbose: int = 0,
+    ):
+
+        super(TrialEvalCallback, self).__init__(
+            eval_env=eval_env,
+            n_eval_episodes=n_eval_episodes,
+            eval_freq=eval_freq,
+            deterministic=deterministic,
+            verbose=verbose,
+        )
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+
+    def _on_step(self) -> bool:
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+            super(TrialEvalCallback, self)._on_step()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(self.last_mean_reward, self.eval_idx)
+            # Prune the trial if needed
+            if self.trial.should_prune():
+                self.is_pruned = True
+                return False
+        return True
+
+
+if __name__ == "__main__":
+    # params = {
+    #     'batch_size': 32,
+    #     'n_steps': 16,
+    #     'gamma': 0.0,
+    #     'lr': 0.00017980950834568327,
+    #     'lr_schedule': 'constant',
+    #     'ent_coef': 0.07439893598338435,
+    #     'clip_range': 0.4,
+    #     'n_epochs': 10,
+    #     'gae_lambda': 0.95,
+    #     'max_grad_norm': 0.8,
+    #     'vf_coef': 0.13214811411452415,
+    #     'net_arch': 'medium',
+    #     'shared_arch': False,
+    #     'activation_fn': 'tanh'
+    # }
+
+    # params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
+    #           'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
+    #           0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
+    #           'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
+    #           'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
+
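+    # Hyperparameters below appear to come from a tune_ppo.py Optuna run.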
+    params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
+              0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
+              0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
+              'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
+              0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
+              'activation_fn': 'relu'}
+
+    kwargs = get_args(params)
+
+    # multiprocess environment
+    env = make_vec_env('FractionArith-v0', n_envs=1)
+    model = PPO(
+        MlpPolicy,
+        env,
+        verbose=1,
+        tensorboard_log="./tensorboard_ppo/",
+        **kwargs
+    )
+    # gamma=0.1,
+    # tensorboard_log="./tensorboard/v0/")
+
+    # while True:
+    # Train
+    model.learn(total_timesteps=1000000)
+
+    # Test
+    # obs = env.reset()
+    # rwd = 0
+    # for _ in range(10000):
+    #     action, _states = model.predict(obs)
+    #     obs, rewards, dones, info = env.step(action)
+    #     rwd += np.sum(rewards)
+    #     env.render()
+    # print(rwd)
diff --git a/sandbox/fractions/tune_ppo.py b/sandbox/fractions/tune_ppo.py
new file mode 100644
index 0000000..0a4e1f6
--- /dev/null
+++ b/sandbox/fractions/tune_ppo.py
@@ -0,0 +1,235 @@
+from typing import Dict
+from typing import Any
+import tempfile
+
+import gym
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+# from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import BaseCallback
+from stable_baselines3.common.vec_env import DummyVecEnv
+# from stable_baselines3.common.vec_env import VecEnv
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.monitor import load_results
+
+import tutorenvs  # noqa: F401
+from tutorenvs.utils import linear_schedule
+
+
+def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
+    """
+    Sampler for PPO hyperparams.
+
+    :param trial: the Optuna trial used to sample hyperparameters
+    :return: kwargs for the PPO constructor
+    """
+    n_step_pow = trial.suggest_discrete_uniform('n_step_pow', 3, 11, 1)
+    n_steps = int(2**n_step_pow)
+
+    # possible_n_steps = [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+    # n_steps = trial.suggest_categorical("n_steps",
+    #                                     possible_n_steps)
+
+    batches_pow = trial.suggest_discrete_uniform('batches_pow', 3,
+                                                 n_step_pow, 1)
+    batch_size = int(2**batches_pow)
+
+    # possible_batches = [8, 16, 32, 64, 128, 256, 512]
+    # batch_size = trial.suggest_categorical("batch_size",
+    #                                        possible_batches)
+
+    gamma = trial.suggest_categorical("gamma", [0.0])
+    # 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
+    learning_rate = trial.suggest_loguniform("lr", 1e-8, 1)
+    # lr_schedule = "constant"
+    # Uncomment to enable learning rate schedule
+    lr_schedule = trial.suggest_categorical('lr_schedule',
+                                            ['linear', 'constant'])
+    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000000001, 0.1)
+    clip_range = trial.suggest_categorical("clip_range",
+                                           [0.05, 0.1, 0.2, 0.3, 0.4])
+    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
+    gae_lambda = trial.suggest_categorical(
+        "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
+    max_grad_norm = trial.suggest_categorical(
+        "max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
+    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
+    net_arch = trial.suggest_categorical("net_arch",
+                                         ["tiny", "small", "medium"])
+    shared_arch = trial.suggest_categorical("shared_arch", [True, False])
+    ortho_init = False
+    activation_fn = trial.suggest_categorical("activation_fn",
+                                              ["tanh", "relu"])
+
+    # TODO: account for multiple envs (buffer size is n_steps * n_envs)
+    if batch_size > n_steps:
+        batch_size = n_steps
+
+    if lr_schedule == "linear":
+        learning_rate = linear_schedule(learning_rate)
+
+    # Independent networks usually work best
+    # when not working with images
+    net_arch = {
+        True: {
+            "tiny": [32, dict(pi=[32], vf=[32])],
+            "small": [64, dict(pi=[64], vf=[64])],
+            "medium": [128, dict(pi=[128], vf=[128])],
+        },
+        False: {
+            "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+            "small": [dict(pi=[64, 64], vf=[64, 64])],
+            "medium": [dict(pi=[128, 128], vf=[128, 128])],
+        }
+    }[shared_arch][net_arch]
+
+    activation_fn = {
+        "tanh": nn.Tanh,
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leaky_relu": nn.LeakyReLU
+    }[activation_fn]
+
+    return {
+        "n_steps":
+        n_steps,
+        "batch_size":
+        batch_size,
+        "gamma":
+        gamma,
+        "learning_rate":
+        learning_rate,
+        "ent_coef":
+        ent_coef,
+        "clip_range":
+        clip_range,
+        "n_epochs":
+        n_epochs,
+        "gae_lambda":
+        gae_lambda,
+        "max_grad_norm":
+        max_grad_norm,
+        "vf_coef":
+        vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs":
+        dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialCallback(BaseCallback):
+    """
+    Callback used for evaluating and reporting a trial.
+    """
+    def __init__(
+        self,
+        trial: optuna.Trial,
+        log_dir: str,
+        n_eval_episodes: int = 10,
+        eval_freq: int = 10000,
+        min_eval: float = -600,
+        verbose: int = 0,
+    ):
+        super(TrialCallback, self).__init__(verbose)
+
+        self.eval_freq = eval_freq
+        self.n_eval_episodes = n_eval_episodes
+        self.log_dir = log_dir
+        self.trial = trial
+        self.eval_idx = 0
+        self.is_pruned = False
+        self.min_eval = min_eval
+
+    def _on_step(self) -> bool:
+        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
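+            # load_results reads the Monitor csv into a DataFrame; the 'r'
+            # column holds per-episode rewards.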
+            results = load_results(self.log_dir)
+            if len(results) < self.n_eval_episodes:
+                return True
+            avg_last_n = results['r'][-self.n_eval_episodes:].mean()
+            self.eval_idx += 1
+            # report best or report current ?
+            # report num_timesteps or elapsed time ?
+            self.trial.report(avg_last_n, self.eval_idx)
+            # print('Idx:', self.eval_idx, 'Avg_last_n', avg_last_n)
+
+            # Prune on low recent reward or when Optuna's pruner says to stop
+            if avg_last_n < self.min_eval or self.trial.should_prune():
+                self.is_pruned = True
+                return False
+
+        return True
+
+
+def objective(trial: optuna.Trial) -> float:
+    n_eval_episodes = 15
+    eval_freq = 5000
+    n_steps = 350000
+
+    with tempfile.TemporaryDirectory() as log_dir:
+        env = DummyVecEnv([
+            lambda: Monitor(gym.make('FractionArith-v0'), log_dir)])
+
+        ppo_args = sample_ppo_params(trial)
+
+        model = PPO(MlpPolicy, env,
+                    # tensorboard_log="./tensorboard_ppo_multi/",
+                    **ppo_args)
+        # gamma=0.1,
+        # tensorboard_log="./tensorboard/v0/")
+        callback = TrialCallback(trial, log_dir, verbose=1,
+                                 n_eval_episodes=n_eval_episodes,
+                                 eval_freq=eval_freq)
+
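+        # Treat a training crash (e.g. from an unstable hyperparameter set)
+        # as a pruned trial rather than failing the whole study.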
+        try:
+            model.learn(total_timesteps=n_steps, callback=callback)
+            model.env.close()
+        except Exception as e:
+            model.env.close()
+            print(e)
+            raise optuna.exceptions.TrialPruned()
+
+        is_pruned = callback.is_pruned
+        del model.env
+        del model
+
+        if is_pruned:
+            raise optuna.exceptions.TrialPruned()
+
+        results = load_results(log_dir)
+        avg_last_n = results['r'][-n_eval_episodes:].mean()
+        # print('Final avg_last_n:', avg_last_n)
+        return avg_last_n
+
+
+if __name__ == "__main__":
+
+    # multiprocess environment
+    # env = make_vec_env('MulticolumnArithSymbolic-v0', n_envs=1)
+
+    pruner = optuna.pruners.MedianPruner(n_warmup_steps=20000)
+
+    study = optuna.create_study(pruner=pruner,
+                                direction="maximize",
+                                storage='sqlite:///study.db',
+                                load_if_exists=True
+                                )
+    try:
+        study.optimize(objective, n_trials=1000, n_jobs=1)
+    except Exception as e:
+        print(e)
+    finally:
+        print("BEST")
+        print(study.best_params)
diff --git a/tutorenvs/fractions.py b/tutorenvs/fractions.py
index fd325d8..af81014 100644
--- a/tutorenvs/fractions.py
+++ b/tutorenvs/fractions.py
@@ -1,6 +1,7 @@
 from random import randint
 from random import choice
 from pprint import pprint
+import logging
 
 import cv2  # pytype:disable=import-error
 from PIL import Image, ImageDraw
@@ -15,6 +16,9 @@ import numpy as np
 from tutorenvs.utils import DataShopLogger
 from tutorenvs.utils import StubLogger
 
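+# Silence PIL's debug log output (the env renders images with PIL).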
+pil_logger = logging.getLogger('PIL')
+pil_logger.setLevel(logging.INFO)
 
 class FractionArithSymbolic:
 
@@ -110,9 +113,9 @@ class FractionArithSymbolic:
 
         # append correct/incorrect counts
         if add_counts:
-            d.text((100, 0), str(self.num_hints), fill="yellow")
-            d.text((100, 10), str(self.num_incorrect_steps), fill="red")
-            d.text((100, 20), str(self.num_correct_steps), fill="green")
+            d.text((95, 0), "h:{}".format(self.num_hints), fill=(0,0,0))
+            d.text((95, 10), "-:{}".format(self.num_incorrect_steps), fill=(0,0,0))
+            d.text((95, 20), "+:{}".format(self.num_correct_steps), fill=(0,0,0))
 
         # for eyes :)
         # if add_dot:
@@ -149,10 +152,10 @@ class FractionArithSymbolic:
         return state_output
 
     def set_random_problem(self):
-        num1 = str(randint(1, 7))
-        num2 = str(randint(1, 7))
-        denom1 = str(randint(2, 7))
-        denom2 = str(randint(2, 7))
+        num1 = str(randint(1, 5))
+        num2 = str(randint(1, 5))
+        denom1 = str(randint(2, 5))
+        denom2 = str(randint(2, 5))
         operator = choice(['+', '*'])
 
         self.reset(num1, denom1, operator, num2, denom2)
@@ -386,7 +389,7 @@ class FractionArithNumberEnv(gym.Env):
         self.dv = OnlineDictVectorizer(n_features)
         self.observation_space = spaces.Box(
             low=0.0, high=1.0, shape=(1, n_features), dtype=np.float32)
-        self.action_space = spaces.MultiDiscrete([n_selections, 98])
+        self.action_space = spaces.MultiDiscrete([n_selections, 50])
         self.n_steps = 0
         self.max_steps = 100000
 
diff --git a/tutorenvs/multicolumn.py b/tutorenvs/multicolumn.py
index 42281e9..088f7c8 100644
--- a/tutorenvs/multicolumn.py
+++ b/tutorenvs/multicolumn.py
@@ -197,9 +197,9 @@ class MultiColumnAdditionSymbolic:
 
         # append correct/incorrect counts
         if add_counts:
-            d.text((0, 0), str(self.num_incorrect_steps), fill="red")
-            d.text((0, 10), str(self.num_correct_steps), fill="green")
-            d.text((0, 20), str(self.num_hints), fill="blue")
+            d.text((0, 0), "h:{}".format(self.num_hints), fill=(0,0,0))
+            d.text((0, 10), "-:{}".format(self.num_incorrect_steps), fill=(0,0,0))
+            d.text((0, 20), "+:{}".format(self.num_correct_steps), fill=(0,0,0))
 
         if add_dot:
             d.ellipse((add_dot[0]-3, add_dot[1]-3, add_dot[0]+3, add_dot[1]+3),
-- 
GitLab