Commit bf4ba0ba authored by Chris MacLellan

Trying to get the PPO model working with fractions; had to reduce problem difficulty.

parent 8e3ed132
-from apprentice.agents.ModularAgent import ModularAgent
+from apprentice.agents.WhereWhenHowNoFoa import WhereWhenHowNoFoa
from apprentice.working_memory.representation import Sai
from tutorenvs.fractions import FractionArithSymbolic
def run_training(agent, n=10):
env = FractionArithSymbolic()
@@ -19,9 +18,7 @@ def run_training(agent, n=10):
if response == {}:
print('hint')
selection, action, inputs = env.request_demo()
-sai = Sai(selection=selection,
-action=action,
-inputs=inputs)
+sai = Sai(selection=selection, action=action, inputs=inputs)
else:
sai = Sai(selection=response['selection'],
@@ -31,24 +28,20 @@ def run_training(agent, n=10):
reward = env.apply_sai(sai.selection, sai.action, sai.inputs)
print('reward', reward)
-agent.train(state, sai, reward)
+next_state = env.get_state()
+agent.train(state, sai, reward, next_state=next_state,
+skill_label="fractions",
+foci_of_attention=[])
if sai.selection == "done" and reward == 1.0:
print('Finished problem {} of {}'.format(p, n))
p += 1
-if __name__ == "__main__":
-args = {"function_set" : ["RipFloatValue","Add",
-'Multiply',
-"Subtract",
-# "Numerator_Multiply", "Cross_Multiply",
-"Divide"],
-"feature_set" : ["Equals"], "planner" : "numba", "search_depth" : 2,
-"when_learner": "trestle", "where_learner": "FastMostSpecific",
-"state_variablization" : "whereappend", "strip_attrs" :
-["to_left","to_right","above","below","type","id","offsetParent","dom_class"],
-"when_args" : { "cross_rhs_inference" : "none" } }
+if __name__ == "__main__":
-agent = ModularAgent(**args)
+agent = WhereWhenHowNoFoa('fraction arith', 'fraction arith',
+search_depth=1)
-run_training(agent, n = 100)
+run_training(agent, n=500)
from typing import Dict
from typing import Any
import optuna
from torch import nn as nn
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import VecEnv
from tutorenvs.utils import linear_schedule
def get_args(params: Dict[str, Any]) -> Dict[str, Any]:
"""
Sampler for PPO hyperparams.
:param trial:
:return:
"""
batch_size = int(2**params['batches_pow'])
n_steps = int(2**params['n_step_pow'])
gamma = params['gamma']
learning_rate = params['lr']
lr_schedule = params['lr_schedule']
ent_coef = params['ent_coef']
clip_range = params['clip_range']
n_epochs = params['n_epochs']
gae_lambda = params['gae_lambda']
max_grad_norm = params['max_grad_norm']
vf_coef = params['vf_coef']
net_arch = params['net_arch']
shared_arch = params['shared_arch']
activation_fn = params['activation_fn']
# TODO: account for the case when using multiple envs
if batch_size > n_steps:
batch_size = n_steps
if lr_schedule == "linear":
learning_rate = linear_schedule(learning_rate)
# Independent networks usually work best
# when not working with images
net_arch = {
True: {
"tiny": [32, dict(pi=[32], vf=[32])],
"small": [64, dict(pi=[64], vf=[64])],
"medium": [128, dict(pi=[128], vf=[128])],
},
False: {
"tiny": [dict(pi=[32, 32], vf=[32, 32])],
"small": [dict(pi=[64, 64], vf=[64, 64])],
"medium": [dict(pi=[128, 128], vf=[128, 128])],
}
}[shared_arch][net_arch]
activation_fn = {
"tanh": nn.Tanh,
"relu": nn.ReLU,
"elu": nn.ELU,
"leaky_relu": nn.LeakyReLU
}[activation_fn]
ortho_init = False
return {
"n_steps":
n_steps,
"batch_size":
batch_size,
"gamma":
gamma,
"learning_rate":
learning_rate,
"ent_coef":
ent_coef,
"clip_range":
clip_range,
"n_epochs":
n_epochs,
"gae_lambda":
gae_lambda,
"max_grad_norm":
max_grad_norm,
"vf_coef":
vf_coef,
# "sde_sample_freq": sde_sample_freq,
"policy_kwargs":
dict(
# log_std_init=log_std_init,
net_arch=net_arch,
activation_fn=activation_fn,
ortho_init=ortho_init,
),
}
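get_args wraps the raw learning rate in linear_schedule whenever a linear schedule is requested. That helper lives in tutorenvs.utils and is not part of this diff; a minimal sketch of what it presumably looks like (assuming the usual Stable-Baselines3 convention of a callable over the remaining training progress) is:

from typing import Callable


def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """Sketch of the helper imported from tutorenvs.utils (assumed, not
    taken from this repo): decay linearly from initial_value to 0.

    Stable-Baselines3 calls the schedule with progress_remaining, which
    goes from 1.0 at the start of training to 0.0 at the end.
    """
    def schedule(progress_remaining: float) -> float:
        return progress_remaining * initial_value

    return schedule


# e.g. a learning rate that starts at 1e-3 and anneals to 0 over training
lr = linear_schedule(1e-3)
assert lr(1.0) == 1e-3 and lr(0.0) == 0.0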
class TrialEvalCallback(EvalCallback):
"""
Callback used for evaluating and reporting a trial.
"""
def __init__(
self,
eval_env: VecEnv,
trial: optuna.Trial,
n_eval_episodes: int = 5,
eval_freq: int = 10000,
deterministic: bool = True,
verbose: int = 0,
):
super(TrialEvalCallback, self).__init__(
eval_env=eval_env,
n_eval_episodes=n_eval_episodes,
eval_freq=eval_freq,
deterministic=deterministic,
verbose=verbose,
)
self.trial = trial
self.eval_idx = 0
self.is_pruned = False
def _on_step(self) -> bool:
if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
super(TrialEvalCallback, self)._on_step()
self.eval_idx += 1
# report best or report current?
# report num_timesteps or elapsed time?
self.trial.report(self.last_mean_reward, self.eval_idx)
# Prune trial if needed
if self.trial.should_prune():
self.is_pruned = True
return False
return True
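TrialEvalCallback is defined above but never constructed in the __main__ block below; a hedged sketch of how it could be wired into an optuna objective (the environment id comes from the scripts in this commit, the function name and budgets are illustrative) is:

import optuna
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env


def tune_objective(trial: optuna.Trial) -> float:
    # Separate train/eval environments for the same tutor task
    # (assumes FractionArith-v0 is registered, e.g. via importing tutorenvs).
    train_env = make_vec_env('FractionArith-v0', n_envs=1)
    eval_env = make_vec_env('FractionArith-v0', n_envs=1)

    model = PPO(MlpPolicy, train_env, verbose=0)
    callback = TrialEvalCallback(eval_env, trial,
                                 n_eval_episodes=5, eval_freq=10000)
    model.learn(total_timesteps=100000, callback=callback)

    if callback.is_pruned:
        raise optuna.exceptions.TrialPruned()
    return callback.last_mean_reward

# study = optuna.create_study(direction="maximize")
# study.optimize(tune_objective, n_trials=20)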
if __name__ == "__main__":
# params = {
# 'batch_size': 32,
# 'n_steps': 16,
# 'gamma': 0.0,
# 'lr': 0.00017980950834568327,
# 'lr_schedule': 'constant',
# 'ent_coef': 0.07439893598338435,
# 'clip_range': 0.4,
# 'n_epochs': 10,
# 'gae_lambda': 0.95,
# 'max_grad_norm': 0.8,
# 'vf_coef': 0.13214811411452415,
# 'net_arch': 'medium',
# 'shared_arch': False,
# 'activation_fn': 'tanh'
# }
# params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
# 'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
# 0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
# 'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
# 'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
'activation_fn': 'relu'}
kwargs = get_args(params)
# multiprocess environment
env = make_vec_env('FractionArith-v0', n_envs=1)
model = PPO(
MlpPolicy,
env,
verbose=1,
tensorboard_log="./tensorboard_ppo/",
**kwargs
)
# gamma=0.1,
# tensorboard_log="./tensorboard/v0/")
# while True:
# Train
model.learn(total_timesteps=1000000)
# Test
# obs = env.reset()
# rwd = 0
# for _ in range(10000):
# action, _states = model.predict(obs)
# obs, rewards, dones, info = env.step(action)
# rwd += np.sum(rewards)
# env.render()
# print(rwd)
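The commented-out block above sketches a rollout of the trained policy; a runnable version of that test loop (same Gym VecEnv step API as the rest of this script, rollout length illustrative) would be roughly:

import numpy as np

# Roll the trained policy out and tally the reward it collects.
obs = env.reset()
total_reward = 0.0
for _ in range(10000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    total_reward += np.sum(rewards)
    # env.render()  # uncomment to watch the tutor interface
print('total rollout reward:', total_reward)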
from typing import Dict
from typing import Any
import tempfile
import gym
import optuna
from torch import nn as nn
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
# from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv
# from stable_baselines3.common.vec_env import VecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.monitor import load_results
import tutorenvs # noqa: F401
from tutorenvs.utils import linear_schedule
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
"""
Sampler for PPO hyperparams.
:param trial: optuna trial used to sample the hyperparameters
:return: kwargs for stable_baselines3.PPO
"""
n_step_pow = trial.suggest_discrete_uniform('n_step_pow', 3, 11, 1)
n_steps = int(2**n_step_pow)
# possible_n_steps = [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
# n_steps = trial.suggest_categorical("n_steps",
# possible_n_steps)
batches_pow = trial.suggest_discrete_uniform('batches_pow', 3,
n_step_pow, 1)
batch_size = int(2**batches_pow)
# possible_batches = [8, 16, 32, 64, 128, 256, 512]
# batch_size = trial.suggest_categorical("batch_size",
# possible_batches)
gamma = trial.suggest_categorical("gamma", [0.0])
# 0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
learning_rate = trial.suggest_loguniform("lr", 1e-8, 1)
# lr_schedule = "constant"
# Uncomment to enable learning rate schedule
lr_schedule = trial.suggest_categorical('lr_schedule',
['linear', 'constant'])
ent_coef = trial.suggest_loguniform("ent_coef", 0.00000000001, 0.1)
clip_range = trial.suggest_categorical("clip_range",
[0.05, 0.1, 0.2, 0.3, 0.4])
n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
gae_lambda = trial.suggest_categorical(
"gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
max_grad_norm = trial.suggest_categorical(
"max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
net_arch = trial.suggest_categorical("net_arch",
["tiny", "small", "medium"])
shared_arch = trial.suggest_categorical("shared_arch", [True, False])
ortho_init = False
activation_fn = trial.suggest_categorical("activation_fn",
["tanh", "relu"])
# TODO: account for the case when using multiple envs
if batch_size > n_steps:
batch_size = n_steps
if lr_schedule == "linear":
learning_rate = linear_schedule(learning_rate)
# Independent networks usually work best
# when not working with images
net_arch = {
True: {
"tiny": [32, dict(pi=[32], vf=[32])],
"small": [64, dict(pi=[64], vf=[64])],
"medium": [128, dict(pi=[128], vf=[128])],
},
False: {
"tiny": [dict(pi=[32, 32], vf=[32, 32])],
"small": [dict(pi=[64, 64], vf=[64, 64])],
"medium": [dict(pi=[128, 128], vf=[128, 128])],
}
}[shared_arch][net_arch]
activation_fn = {
"tanh": nn.Tanh,
"relu": nn.ReLU,
"elu": nn.ELU,
"leaky_relu": nn.LeakyReLU
}[activation_fn]
return {
"n_steps":
n_steps,
"batch_size":
batch_size,
"gamma":
gamma,
"learning_rate":
learning_rate,
"ent_coef":
ent_coef,
"clip_range":
clip_range,
"n_epochs":
n_epochs,
"gae_lambda":
gae_lambda,
"max_grad_norm":
max_grad_norm,
"vf_coef":
vf_coef,
# "sde_sample_freq": sde_sample_freq,
"policy_kwargs":
dict(
# log_std_init=log_std_init,
net_arch=net_arch,
activation_fn=activation_fn,
ortho_init=ortho_init,
),
}
class TrialCallback(BaseCallback):
"""
Callback used for evaluating and reporting a trial.
"""
def __init__(
self,
trial: optuna.Trial,
log_dir: str,
n_eval_episodes: int = 10,
eval_freq: int = 10000,
min_eval: float = -600,
verbose: int = 0,
):
super(TrialCallback, self).__init__(verbose)
self.eval_freq = eval_freq
self.n_eval_episodes = n_eval_episodes
self.log_dir = log_dir
self.trial = trial
self.eval_idx = 0
self.is_pruned = False
self.min_eval = min_eval
def _on_step(self) -> bool:
if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
results = load_results(self.log_dir)
if len(results) < self.n_eval_episodes:
return True
avg_last_n = results['r'][-self.n_eval_episodes:].mean()
self.eval_idx += 1
# report best or report current?
# report num_timesteps or elapsed time?
self.trial.report(avg_last_n, self.eval_idx)
# print('Idx:', self.eval_idx, 'Avg_last_n', avg_last_n)
# Prune trial if needed
if avg_last_n < self.min_eval or self.trial.should_prune():
self.is_pruned = True
return False
return True
def objective(trial: optuna.Trial) -> float:
n_eval_episodes = 15
eval_freq = 5000
n_steps = 350000
with tempfile.TemporaryDirectory() as log_dir:
env = DummyVecEnv([
lambda: Monitor(gym.make('FractionArith-v0'), log_dir)])
ppo_args = sample_ppo_params(trial)
model = PPO(MlpPolicy, env,
# tensorboard_log="./tensorboard_ppo_multi/",
**ppo_args)
# gamma=0.1,
# tensorboard_log="./tensorboard/v0/")
callback = TrialCallback(trial, log_dir, verbose=1,
n_eval_episodes=n_eval_episodes,
eval_freq=eval_freq)
try:
model.learn(total_timesteps=n_steps, callback=callback)
model.env.close()
except Exception as e:
model.env.close()
print(e)
raise optuna.exceptions.TrialPruned()
is_pruned = callback.is_pruned
del model.env
del model
if is_pruned:
raise optuna.exceptions.TrialPruned()
results = load_results(log_dir)
avg_last_n = results['r'][-n_eval_episodes:].mean()
# print('Final avg_last_n:', avg_last_n)
return avg_last_n
if __name__ == "__main__":
# multiprocess environment
# env = make_vec_env('MulticolumnArithSymbolic-v0', n_envs=1)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=20000)
study = optuna.create_study(pruner=pruner,
direction="maximize",
storage='sqlite:///study.db',
load_if_exists=True
)
try:
study.optimize(objective, n_trials=1000, n_jobs=1)
except Exception as e:
print(e)
finally:
print("BEST")
print(study.best_params)
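After a study finishes, the best configuration can be replayed through the same sampler without re-running the search; a short sketch using optuna's FixedTrial (the values shown are the ones hard-coded in the single-run PPO script earlier in this commit) is:

import optuna

# Re-derive PPO kwargs for a known configuration without creating a study.
fixed_params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0,
                'lr': 0.0014291278312354846, 'lr_schedule': 'linear',
                'ent_coef': 0.042102094710275415, 'clip_range': 0.2,
                'n_epochs': 5, 'gae_lambda': 0.92, 'max_grad_norm': 0.7,
                'vf_coef': 0.40158288555773314, 'net_arch': 'medium',
                'shared_arch': False, 'activation_fn': 'relu'}
ppo_args = sample_ppo_params(optuna.trial.FixedTrial(fixed_params))
# ppo_args can then be passed straight to PPO(MlpPolicy, env, **ppo_args).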
from random import randint
from random import choice
from pprint import pprint
import logging
import cv2 # pytype:disable=import-error
from PIL import Image, ImageDraw
@@ -15,6 +16,8 @@ import numpy as np
from tutorenvs.utils import DataShopLogger
from tutorenvs.utils import StubLogger
+pil_logger = logging.getLogger('PIL')
+pil_logger.setLevel(logging.INFO)
class FractionArithSymbolic:
@@ -110,9 +113,9 @@ class FractionArithSymbolic:
# append correct/incorrect counts
if add_counts:
-d.text((100, 0), str(self.num_hints), fill="yellow")
-d.text((100, 10), str(self.num_incorrect_steps), fill="red")
-d.text((100, 20), str(self.num_correct_steps), fill="green")
+d.text((95, 0), "h:{}".format(self.num_hints), fill=(0,0,0))
+d.text((95, 10), "-:{}".format(self.num_incorrect_steps), fill=(0,0,0))
+d.text((95, 20), "+:{}".format(self.num_correct_steps), fill=(0,0,0))
# for eyes :)
# if add_dot:
@@ -149,10 +152,10 @@ class FractionArithSymbolic:
return state_output
def set_random_problem(self):
-num1 = str(randint(1, 7))
-num2 = str(randint(1, 7))
-denom1 = str(randint(2, 7))
-denom2 = str(randint(2, 7))
+num1 = str(randint(1, 5))
+num2 = str(randint(1, 5))
+denom1 = str(randint(2, 5))
+denom2 = str(randint(2, 5))
operator = choice(['+', '*'])
self.reset(num1, denom1, operator, num2, denom2)
@@ -386,7 +389,7 @@ class FractionArithNumberEnv(gym.Env):
self.dv = OnlineDictVectorizer(n_features)
self.observation_space = spaces.Box(
low=0.0, high=1.0, shape=(1, n_features), dtype=np.float32)
-self.action_space = spaces.MultiDiscrete([n_selections, 98])
+self.action_space = spaces.MultiDiscrete([n_selections, 50])
self.n_steps = 0
self.max_steps = 100000
......
@@ -197,9 +197,9 @@ class MultiColumnAdditionSymbolic:
# append correct/incorrect counts
if add_counts:
-d.text((0, 0), str(self.num_incorrect_steps), fill="red")
-d.text((0, 10), str(self.num_correct_steps), fill="green")
-d.text((0, 20), str(self.num_hints), fill="blue")
+d.text((0, 0), "h:{}".format(self.num_hints), fill=(0,0,0))
+d.text((0, 10), "-:{}".format(self.num_incorrect_steps), fill=(0,0,0))
+d.text((0, 20), "+:{}".format(self.num_correct_steps), fill=(0,0,0))
if add_dot:
d.ellipse((add_dot[0]-3, add_dot[1]-3, add_dot[0]+3, add_dot[1]+3),
......
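Both tutors now render the hint/incorrect/correct counters as black 'h:', '-:' and '+:' labels instead of raw colored numbers; a standalone sketch of that overlay (image size, coordinates and counts are illustrative) is:

from PIL import Image, ImageDraw

# Illustrative counts; in the environments these come from num_hints,
# num_incorrect_steps and num_correct_steps on the tutor instance.
num_hints, num_incorrect, num_correct = 3, 1, 7

img = Image.new('RGB', (160, 40), color='white')
d = ImageDraw.Draw(img)
d.text((95, 0), "h:{}".format(num_hints), fill=(0, 0, 0))
d.text((95, 10), "-:{}".format(num_incorrect), fill=(0, 0, 0))
d.text((95, 20), "+:{}".format(num_correct), fill=(0, 0, 0))
img.save('counts_overlay.png')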