From bf4ba0ba925ebcfe115d37da0ae60ccf1eb1c737 Mon Sep 17 00:00:00 2001
From: Chris MacLellan <2348-cm3786@users.noreply.gitlab.cci.drexel.edu>
Date: Fri, 15 Jan 2021 15:03:00 -0500
Subject: [PATCH] Trying to get the PPO model working with fractions; had to
 reduce problem difficulty
---
sandbox/fractions/run_al_fractions.py | 35 ++--
sandbox/fractions/train_ppo.py | 195 ++++++++++++++++++++++
sandbox/fractions/tune_ppo.py | 229 ++++++++++++++++++++++++++
tutorenvs/fractions.py | 19 ++-
tutorenvs/multicolumn.py | 6 +-
5 files changed, 452 insertions(+), 32 deletions(-)
create mode 100644 sandbox/fractions/train_ppo.py
create mode 100644 sandbox/fractions/tune_ppo.py
diff --git a/sandbox/fractions/run_al_fractions.py b/sandbox/fractions/run_al_fractions.py
index 824866f..eba7a1b 100644
--- a/sandbox/fractions/run_al_fractions.py
+++ b/sandbox/fractions/run_al_fractions.py
@@ -1,10 +1,9 @@
-from apprentice.agents.ModularAgent import ModularAgent
+from apprentice.agents.WhereWhenHowNoFoa import WhereWhenHowNoFoa
from apprentice.working_memory.representation import Sai
from tutorenvs.fractions import FractionArithSymbolic
-
def run_training(agent, n=10):
env = FractionArithSymbolic()
@@ -19,36 +18,30 @@ def run_training(agent, n=10):
if response == {}:
print('hint')
selection, action, inputs = env.request_demo()
- sai = Sai(selection=selection,
- action=action,
- inputs=inputs)
+ sai = Sai(selection=selection, action=action, inputs=inputs)
else:
sai = Sai(selection=response['selection'],
- action=response['action'],
- inputs=response['inputs'])
+ action=response['action'],
+ inputs=response['inputs'])
reward = env.apply_sai(sai.selection, sai.action, sai.inputs)
print('reward', reward)
- agent.train(state, sai, reward)
+ next_state = env.get_state()
+
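+        # Train on the full transition; this agent variant takes no foci of attention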
+ agent.train(state, sai, reward, next_state=next_state,
+ skill_label="fractions",
+ foci_of_attention=[])
if sai.selection == "done" and reward == 1.0:
+ print('Finished problem {} of {}'.format(p, n))
p += 1
-if __name__ == "__main__":
- args = {"function_set" : ["RipFloatValue","Add",
- 'Multiply',
- "Subtract",
- # "Numerator_Multiply", "Cross_Multiply",
- "Divide"],
- "feature_set" : ["Equals"], "planner" : "numba", "search_depth" : 2,
- "when_learner": "trestle", "where_learner": "FastMostSpecific",
- "state_variablization" : "whereappend", "strip_attrs" :
- ["to_left","to_right","above","below","type","id","offsetParent","dom_class"],
- "when_args" : { "cross_rhs_inference" : "none" } }
+if __name__ == "__main__":
- agent = ModularAgent(**args)
+ agent = WhereWhenHowNoFoa('fraction arith', 'fraction arith',
+ search_depth=1)
- run_training(agent, n = 100)
+ run_training(agent, n=500)
diff --git a/sandbox/fractions/train_ppo.py b/sandbox/fractions/train_ppo.py
new file mode 100644
index 0000000..079f259
--- /dev/null
+++ b/sandbox/fractions/train_ppo.py
@@ -0,0 +1,195 @@
+from typing import Dict
+from typing import Any
+
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.vec_env import VecEnv
+
+from tutorenvs.utils import linear_schedule
+
+
+def get_args(params: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Build PPO keyword arguments from previously sampled hyperparameters.
+
+    :param params: dict of sampled hyperparameter values
+    :return: keyword arguments for the PPO constructor
+    """
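+    # Hyperparameters tuned on a log2 scale are converted back to actual sizes here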
+ batch_size = int(2**params['batches_pow'])
+ n_steps = int(2**params['n_step_pow'])
+ gamma = params['gamma']
+ learning_rate = params['lr']
+ lr_schedule = params['lr_schedule']
+ ent_coef = params['ent_coef']
+ clip_range = params['clip_range']
+ n_epochs = params['n_epochs']
+ gae_lambda = params['gae_lambda']
+ max_grad_norm = params['max_grad_norm']
+ vf_coef = params['vf_coef']
+ net_arch = params['net_arch']
+ shared_arch = params['shared_arch']
+ activation_fn = params['activation_fn']
+
+    # TODO: account for multiple envs (the rollout buffer holds n_steps * n_envs samples)
+ if batch_size > n_steps:
+ batch_size = n_steps
+
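+    # Optionally decay the learning rate linearly over training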
+ if lr_schedule == "linear":
+ learning_rate = linear_schedule(learning_rate)
+
+ # Independent networks usually work best
+ # when not working with images
+ net_arch = {
+ True: {
+ "tiny": [32, dict(pi=[32], vf=[32])],
+ "small": [64, dict(pi=[64], vf=[64])],
+ "medium": [128, dict(pi=[128], vf=[128])],
+ },
+ False: {
+ "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+ "small": [dict(pi=[64, 64], vf=[64, 64])],
+ "medium": [dict(pi=[128, 128], vf=[128, 128])],
+ }
+ }[shared_arch][net_arch]
+
+ activation_fn = {
+ "tanh": nn.Tanh,
+ "relu": nn.ReLU,
+ "elu": nn.ELU,
+ "leaky_relu": nn.LeakyReLU
+ }[activation_fn]
+
+ ortho_init = False
+
+    return {
+        "n_steps": n_steps,
+        "batch_size": batch_size,
+        "gamma": gamma,
+        "learning_rate": learning_rate,
+        "ent_coef": ent_coef,
+        "clip_range": clip_range,
+        "n_epochs": n_epochs,
+        "gae_lambda": gae_lambda,
+        "max_grad_norm": max_grad_norm,
+        "vf_coef": vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs": dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialEvalCallback(EvalCallback):
+ """
+ Callback used for evaluating and reporting a trial.
+ """
+ def __init__(
+ self,
+ eval_env: VecEnv,
+ trial: optuna.Trial,
+ n_eval_episodes: int = 5,
+ eval_freq: int = 10000,
+ deterministic: bool = True,
+ verbose: int = 0,
+ ):
+
+ super(TrialEvalCallback, self).__init__(
+ eval_env=eval_env,
+ n_eval_episodes=n_eval_episodes,
+ eval_freq=eval_freq,
+ deterministic=deterministic,
+ verbose=verbose,
+ )
+ self.trial = trial
+ self.eval_idx = 0
+ self.is_pruned = False
+
+ def _on_step(self) -> bool:
+ if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+ super(TrialEvalCallback, self)._on_step()
+ self.eval_idx += 1
+            # report best or report current?
+            # report num_timesteps or elapsed time?
+ self.trial.report(self.last_mean_reward, self.eval_idx)
+            # Prune trial if needed
+ if self.trial.should_prune():
+ self.is_pruned = True
+ return False
+ return True
+
+
+if __name__ == "__main__":
+ # params = {
+ # 'batch_size': 32,
+ # 'n_steps': 16,
+ # 'gamma': 0.0,
+ # 'lr': 0.00017980950834568327,
+ # 'lr_schedule': 'constant',
+ # 'ent_coef': 0.07439893598338435,
+ # 'clip_range': 0.4,
+ # 'n_epochs': 10,
+ # 'gae_lambda': 0.95,
+ # 'max_grad_norm': 0.8,
+ # 'vf_coef': 0.13214811411452415,
+ # 'net_arch': 'medium',
+ # 'shared_arch': False,
+ # 'activation_fn': 'tanh'
+ # }
+
+ # params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
+ # 'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
+ # 0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
+ # 'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
+ # 'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
+
+    params = {
+        'n_step_pow': 5.0,
+        'batches_pow': 5.0,
+        'gamma': 0.0,
+        'lr': 0.0014291278312354846,
+        'lr_schedule': 'linear',
+        'ent_coef': 0.042102094710275415,
+        'clip_range': 0.2,
+        'n_epochs': 5,
+        'gae_lambda': 0.92,
+        'max_grad_norm': 0.7,
+        'vf_coef': 0.40158288555773314,
+        'net_arch': 'medium',
+        'shared_arch': False,
+        'activation_fn': 'relu'}
+
+ kwargs = get_args(params)
+
+    # vectorized environment (single copy of FractionArith-v0)
+ env = make_vec_env('FractionArith-v0', n_envs=1)
+ model = PPO(
+ MlpPolicy,
+ env,
+ verbose=1,
+ tensorboard_log="./tensorboard_ppo/",
+ **kwargs
+ )
+ # gamma=0.1,
+ # tensorboard_log="./tensorboard/v0/")
+
+ # while True:
+ # Train
+ model.learn(total_timesteps=1000000)
+
+ # Test
+ # obs = env.reset()
+ # rwd = 0
+ # for _ in range(10000):
+ # action, _states = model.predict(obs)
+ # obs, rewards, dones, info = env.step(action)
+ # rwd += np.sum(rewards)
+ # env.render()
+ # print(rwd)
diff --git a/sandbox/fractions/tune_ppo.py b/sandbox/fractions/tune_ppo.py
new file mode 100644
index 0000000..0a4e1f6
--- /dev/null
+++ b/sandbox/fractions/tune_ppo.py
@@ -0,0 +1,229 @@
+from typing import Dict
+from typing import Any
+import tempfile
+
+import gym
+import optuna
+from torch import nn as nn
+from stable_baselines3 import PPO
+from stable_baselines3.ppo import MlpPolicy
+# from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.callbacks import BaseCallback
+from stable_baselines3.common.vec_env import DummyVecEnv
+# from stable_baselines3.common.vec_env import VecEnv
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.monitor import load_results
+
+import tutorenvs # noqa: F401
+from tutorenvs.utils import linear_schedule
+
+
+def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
+ """
+ Sampler for PPO hyperparams.
+
+    :param trial: Optuna trial used to sample hyperparameter values
+    :return: keyword arguments for the PPO constructor
+ """
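+    # The rollout length is sampled on a log2 scale (2**3 to 2**11 steps)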
+ n_step_pow = trial.suggest_discrete_uniform('n_step_pow', 3, 11, 1)
+ n_steps = int(2**n_step_pow)
+
+ # possible_n_steps = [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+ # n_steps = trial.suggest_categorical("n_steps",
+ # possible_n_steps)
+
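+    # Batch size is sampled the same way and never exceeds the rollout length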
+ batches_pow = trial.suggest_discrete_uniform('batches_pow', 3,
+ n_step_pow, 1)
+ batch_size = int(2**batches_pow)
+
+ # possible_batches = [8, 16, 32, 64, 128, 256, 512]
+ # batch_size = trial.suggest_categorical("batch_size",
+ # possible_batches)
+
+ gamma = trial.suggest_categorical("gamma", [0.0])
+ # 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
+ learning_rate = trial.suggest_loguniform("lr", 1e-8, 1)
+ # lr_schedule = "constant"
+ # Uncomment to enable learning rate schedule
+ lr_schedule = trial.suggest_categorical('lr_schedule',
+ ['linear', 'constant'])
+ ent_coef = trial.suggest_loguniform("ent_coef", 0.00000000001, 0.1)
+ clip_range = trial.suggest_categorical("clip_range",
+ [0.05, 0.1, 0.2, 0.3, 0.4])
+ n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
+ gae_lambda = trial.suggest_categorical(
+ "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
+ max_grad_norm = trial.suggest_categorical(
+ "max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
+ vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
+ net_arch = trial.suggest_categorical("net_arch",
+ ["tiny", "small", "medium"])
+ shared_arch = trial.suggest_categorical("shared_arch", [True, False])
+ ortho_init = False
+ activation_fn = trial.suggest_categorical("activation_fn",
+ ["tanh", "relu"])
+
+    # TODO: account for multiple envs (the rollout buffer holds n_steps * n_envs samples)
+ if batch_size > n_steps:
+ batch_size = n_steps
+
+ if lr_schedule == "linear":
+ learning_rate = linear_schedule(learning_rate)
+
+ # Independent networks usually work best
+ # when not working with images
+ net_arch = {
+ True: {
+ "tiny": [32, dict(pi=[32], vf=[32])],
+ "small": [64, dict(pi=[64], vf=[64])],
+ "medium": [128, dict(pi=[128], vf=[128])],
+ },
+ False: {
+ "tiny": [dict(pi=[32, 32], vf=[32, 32])],
+ "small": [dict(pi=[64, 64], vf=[64, 64])],
+ "medium": [dict(pi=[128, 128], vf=[128, 128])],
+ }
+ }[shared_arch][net_arch]
+
+ activation_fn = {
+ "tanh": nn.Tanh,
+ "relu": nn.ReLU,
+ "elu": nn.ELU,
+ "leaky_relu": nn.LeakyReLU
+ }[activation_fn]
+
+    return {
+        "n_steps": n_steps,
+        "batch_size": batch_size,
+        "gamma": gamma,
+        "learning_rate": learning_rate,
+        "ent_coef": ent_coef,
+        "clip_range": clip_range,
+        "n_epochs": n_epochs,
+        "gae_lambda": gae_lambda,
+        "max_grad_norm": max_grad_norm,
+        "vf_coef": vf_coef,
+        # "sde_sample_freq": sde_sample_freq,
+        "policy_kwargs": dict(
+            # log_std_init=log_std_init,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            ortho_init=ortho_init,
+        ),
+    }
+
+
+class TrialCallback(BaseCallback):
+ """
+ Callback used for evaluating and reporting a trial.
+ """
+ def __init__(
+ self,
+ trial: optuna.Trial,
+ log_dir: str,
+ n_eval_episodes: int = 10,
+ eval_freq: int = 10000,
+ min_eval: float = -600,
+ verbose: int = 0,
+ ):
+ super(TrialCallback, self).__init__(verbose)
+
+ self.eval_freq = eval_freq
+ self.n_eval_episodes = n_eval_episodes
+ self.log_dir = log_dir
+ self.trial = trial
+ self.eval_idx = 0
+ self.is_pruned = False
+ self.min_eval = min_eval
+
+ def _on_step(self) -> bool:
+ if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
+ results = load_results(self.log_dir)
+ if len(results) < self.n_eval_episodes:
+ return True
+ avg_last_n = results['r'][-self.n_eval_episodes:].mean()
+ self.eval_idx += 1
+            # report best or report current?
+            # report num_timesteps or elapsed time?
+ self.trial.report(avg_last_n, self.eval_idx)
+ # print('Idx:', self.eval_idx, 'Avg_last_n', avg_last_n)
+
+            # Prune trial if needed
+ if avg_last_n < self.min_eval or self.trial.should_prune():
+ self.is_pruned = True
+ return False
+
+ return True
+
+
+def objective(trial: optuna.Trial) -> float:
+ n_eval_episodes = 15
+ eval_freq = 5000
+ n_steps = 350000
+
+ with tempfile.TemporaryDirectory() as log_dir:
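+        # Monitor writes per-episode rewards to log_dir so TrialCallback can read them via load_results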
+ env = DummyVecEnv([
+ lambda: Monitor(gym.make('FractionArith-v0'), log_dir)])
+
+ ppo_args = sample_ppo_params(trial)
+
+ model = PPO(MlpPolicy, env,
+ # tensorboard_log="./tensorboard_ppo_multi/",
+ **ppo_args)
+ # gamma=0.1,
+ # tensorboard_log="./tensorboard/v0/")
+ callback = TrialCallback(trial, log_dir, verbose=1,
+ n_eval_episodes=n_eval_episodes,
+ eval_freq=eval_freq)
+
+ try:
+ model.learn(total_timesteps=n_steps, callback=callback)
+ model.env.close()
+ except Exception as e:
+ model.env.close()
+ print(e)
+ raise optuna.exceptions.TrialPruned()
+
+ is_pruned = callback.is_pruned
+ del model.env
+ del model
+
+ if is_pruned:
+ raise optuna.exceptions.TrialPruned()
+
+ results = load_results(log_dir)
+ avg_last_n = results['r'][-n_eval_episodes:].mean()
+ # print('Final avg_last_n:', avg_last_n)
+ return avg_last_n
+
+
+if __name__ == "__main__":
+
+ # multiprocess environment
+ # env = make_vec_env('MulticolumnArithSymbolic-v0', n_envs=1)
+
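+    # NOTE: TrialCallback reports eval_idx (not timesteps) as the step, so n_warmup_steps=20000
+    # keeps the median pruner from firing here; trials end early via the min_eval check instead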
+ pruner = optuna.pruners.MedianPruner(n_warmup_steps=20000)
+
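+    # Without an explicit study_name, each run creates a new study even though load_if_exists=True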
+ study = optuna.create_study(pruner=pruner,
+ direction="maximize",
+ storage='sqlite:///study.db',
+ load_if_exists=True
+ )
+ try:
+ study.optimize(objective, n_trials=1000, n_jobs=1)
+ except Exception as e:
+ print(e)
+ finally:
+ print("BEST")
+ print(study.best_params)
diff --git a/tutorenvs/fractions.py b/tutorenvs/fractions.py
index fd325d8..af81014 100644
--- a/tutorenvs/fractions.py
+++ b/tutorenvs/fractions.py
@@ -1,6 +1,7 @@
from random import randint
from random import choice
from pprint import pprint
+import logging
import cv2 # pytype:disable=import-error
from PIL import Image, ImageDraw
@@ -15,6 +16,8 @@ import numpy as np
from tutorenvs.utils import DataShopLogger
from tutorenvs.utils import StubLogger
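+# Quiet PIL's per-image debug logging so it doesn't flood the console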
+pil_logger = logging.getLogger('PIL')
+pil_logger.setLevel(logging.INFO)
class FractionArithSymbolic:
@@ -110,9 +113,9 @@ class FractionArithSymbolic:
# append correct/incorrect counts
if add_counts:
- d.text((100, 0), str(self.num_hints), fill="yellow")
- d.text((100, 10), str(self.num_incorrect_steps), fill="red")
- d.text((100, 20), str(self.num_correct_steps), fill="green")
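+            # Prefix the counts (h: hints, -: incorrect, +: correct) and draw them in black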
+ d.text((95, 0), "h:{}".format(self.num_hints), fill=(0,0,0))
+ d.text((95, 10), "-:{}".format(self.num_incorrect_steps), fill=(0,0,0))
+ d.text((95, 20), "+:{}".format(self.num_correct_steps), fill=(0,0,0))
# for eyes :)
# if add_dot:
@@ -149,10 +152,10 @@ class FractionArithSymbolic:
return state_output
def set_random_problem(self):
- num1 = str(randint(1, 7))
- num2 = str(randint(1, 7))
- denom1 = str(randint(2, 7))
- denom2 = str(randint(2, 7))
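+        # Smaller operand ranges reduce the problem difficulty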
+ num1 = str(randint(1, 5))
+ num2 = str(randint(1, 5))
+ denom1 = str(randint(2, 5))
+ denom2 = str(randint(2, 5))
operator = choice(['+', '*'])
self.reset(num1, denom1, operator, num2, denom2)
@@ -386,7 +389,7 @@ class FractionArithNumberEnv(gym.Env):
self.dv = OnlineDictVectorizer(n_features)
self.observation_space = spaces.Box(
low=0.0, high=1.0, shape=(1, n_features), dtype=np.float32)
- self.action_space = spaces.MultiDiscrete([n_selections, 98])
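+        # Fewer possible input values are needed with the smaller operands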
+ self.action_space = spaces.MultiDiscrete([n_selections, 50])
self.n_steps = 0
self.max_steps = 100000
diff --git a/tutorenvs/multicolumn.py b/tutorenvs/multicolumn.py
index 42281e9..088f7c8 100644
--- a/tutorenvs/multicolumn.py
+++ b/tutorenvs/multicolumn.py
@@ -197,9 +197,9 @@ class MultiColumnAdditionSymbolic:
# append correct/incorrect counts
if add_counts:
- d.text((0, 0), str(self.num_incorrect_steps), fill="red")
- d.text((0, 10), str(self.num_correct_steps), fill="green")
- d.text((0, 20), str(self.num_hints), fill="blue")
+ d.text((0, 0), "h:{}".format(self.num_hints), fill=(0,0,0))
+ d.text((0, 10), "-:{}".format(self.num_incorrect_steps), fill=(0,0,0))
+ d.text((0, 20), "+:{}".format(self.num_correct_steps), fill=(0,0,0))
if add_dot:
d.ellipse((add_dot[0]-3, add_dot[1]-3, add_dot[0]+3, add_dot[1]+3),
--
GitLab