diff --git a/sandbox/multicolumn/train_ppo.py b/sandbox/multicolumn/train_ppo.py
index bf39a402aaa0e99ccafcdac444063a97a9b307f4..41e745029174c7d4b7f3e36a1b3685dcf2a11816 100644
--- a/sandbox/multicolumn/train_ppo.py
+++ b/sandbox/multicolumn/train_ppo.py
@@ -19,7 +19,7 @@ def get_args(params: Dict[str, Any]) -> Dict[str, Any]:
:return:
"""
batch_size = int(2**params['batches_pow'])
- n_steps = int(params['n_step_pow'])
+ n_steps = int(2**params['n_step_pow'])
gamma = params['gamma']
learning_rate = params['lr']
lr_schedule = params['lr_schedule']
@@ -153,11 +153,19 @@ if __name__ == "__main__":
# 'activation_fn': 'tanh'
# }
- params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
- 'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
- 0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
- 'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
- 'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
+ # params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
+ # 'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
+ # 0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
+ # 'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
+ # 'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
+
+ params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
+ 0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
+ 0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
+ 'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
+ 0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
+ 'activation_fn': 'relu'}
+
kwargs = get_args(params)
# multiprocess environment
@@ -174,7 +182,7 @@ if __name__ == "__main__":
# while True:
# Train
- model.learn(total_timesteps=5000000)
+ model.learn(total_timesteps=1000000)
# Test
# obs = env.reset()
diff --git a/sandbox/multicolumn/tune_ppo.py b/sandbox/multicolumn/tune_ppo.py
index 95c87c44102bb4a3c5f3f70dd89847b53051af2d..d28f03b3b3ef4d29e1113cdd5786b560e4b45eb3 100644
--- a/sandbox/multicolumn/tune_ppo.py
+++ b/sandbox/multicolumn/tune_ppo.py
@@ -170,7 +170,7 @@ class TrialCallback(BaseCallback):
def objective(trial: optuna.Trial) -> float:
n_eval_episodes = 15
eval_freq = 5000
- n_steps = 10000
+ n_steps = 350000
with tempfile.TemporaryDirectory() as log_dir:
env = DummyVecEnv([
diff --git a/tutorenvs/__init__.py b/tutorenvs/__init__.py
index 0e7c94d2264ddf3256a4ca6793c48da219e23c17..5f23777d8fbf804a565a7bd682f0e37953e74a26 100644
--- a/tutorenvs/__init__.py
+++ b/tutorenvs/__init__.py
@@ -1,13 +1,14 @@
from gym.envs.registration import register
-from tutorenvs.fractions import FractionArithDigitsEnv
-from tutorenvs.fractions import FractionArithOppEnv
-from tutorenvs.multicolumn import MultiColumnAdditionDigitsEnv
-from tutorenvs.multicolumn import MultiColumnAdditionPixelEnv
-from tutorenvs.multicolumn import MultiColumnAdditionPerceptEnv
+from tutorenvs.fractions import FractionArithNumberEnv # noqa: F401
+from tutorenvs.fractions import FractionArithDigitsEnv # noqa: F401
+from tutorenvs.fractions import FractionArithOppEnv # noqa: F401
+from tutorenvs.multicolumn import MultiColumnAdditionDigitsEnv # noqa: F401
+from tutorenvs.multicolumn import MultiColumnAdditionPixelEnv # noqa: F401
+from tutorenvs.multicolumn import MultiColumnAdditionPerceptEnv # noqa: F401
register(
id='FractionArith-v0',
- entry_point='tutorenvs:FractionArithOppEnv',
+ entry_point='tutorenvs:FractionArithNumberEnv',
)
register(
@@ -15,6 +16,11 @@ register(
entry_point='tutorenvs:FractionArithDigitsEnv',
)
+register(
+ id='FractionArith-v2',
+ entry_point='tutorenvs:FractionArithOppEnv',
+)
+
# TODO no pixel fractions yet.
# register(
# id='FractionArith-v2',
diff --git a/tutorenvs/fractions.py b/tutorenvs/fractions.py
index 1fc9d4428b7cd1a68a2b9751a0fe958ef0985b6e..fd325d8cd05d57e41fc52f002cb316c4f1f9a2da 100644
--- a/tutorenvs/fractions.py
+++ b/tutorenvs/fractions.py
@@ -9,9 +9,11 @@ from gym import error, spaces, utils
from gym.utils import seeding
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction import DictVectorizer
+from tutorenvs.utils import OnlineDictVectorizer
import numpy as np
from tutorenvs.utils import DataShopLogger
+from tutorenvs.utils import StubLogger
class FractionArithSymbolic:
@@ -20,11 +22,8 @@ class FractionArithSymbolic:
"""
Creates a state and sets a random problem.
"""
- self.num_correct_steps = 0
- self.num_incorrect_steps = 0
- self.num_hints = 0
-
- self.logger = DataShopLogger('FractionsTutor', extra_kcs=['ptype_field'])
+ # self.logger = DataShopLogger('FractionsTutor', extra_kcs=['ptype_field'])
+ self.logger = StubLogger()
self.logger.set_student()
self.set_random_problem()
# self.reset("", "", "", "", "")
@@ -35,6 +34,10 @@ class FractionArithSymbolic:
provided arguments.
"""
self.steps = 0
+ self.num_correct_steps = 0
+ self.num_incorrect_steps = 0
+ self.num_hints = 0
+
self.state = {
'initial_num_left': num1,
'initial_denom_left': denom1,
@@ -146,10 +149,10 @@ class FractionArithSymbolic:
return state_output
def set_random_problem(self):
- num1 = str(randint(1, 15))
- num2 = str(randint(1, 15))
- denom1 = str(randint(2, 15))
- denom2 = str(randint(2, 15))
+ num1 = str(randint(1, 7))
+ num2 = str(randint(1, 7))
+ denom1 = str(randint(2, 7))
+ denom2 = str(randint(2, 7))
operator = choice(['+', '*'])
self.reset(num1, denom1, operator, num2, denom2)
@@ -164,11 +167,12 @@ class FractionArithSymbolic:
def apply_sai(self, selection, action, inputs):
"""
- Give a SAI, it applies it. This method returns feedback (i.e., -1 or 1).
+        Given an SAI, applies it. This method returns feedback
+        (i.e., -1 or 1).
"""
self.steps += 1
reward = self.evaluate_sai(selection, action, inputs)
-
+
if reward > 0:
outcome = "CORRECT"
self.num_correct_steps += 1
@@ -177,11 +181,11 @@ class FractionArithSymbolic:
self.num_incorrect_steps += 1
self.logger.log_step(selection, action, inputs['value'], outcome,
- step_name=self.ptype + '_' + demo[0],
+ step_name=self.ptype + '_' + selection,
kcs=[self.ptype + '_' + selection])
# Render output?
- self.render()
+ # self.render()
if reward == -1.0:
return reward
@@ -371,6 +375,79 @@ class FractionArithSymbolic:
raise Exception("request demo - logic missing")
+
+class FractionArithNumberEnv(gym.Env):
+ metadata = {'render.modes': ['human']}
+
+ def __init__(self):
+ self.tutor = FractionArithSymbolic()
+ n_selections = len(self.tutor.get_possible_selections())
+ n_features = 2000
+ self.dv = OnlineDictVectorizer(n_features)
+ self.observation_space = spaces.Box(
+ low=0.0, high=1.0, shape=(1, n_features), dtype=np.float32)
+ self.action_space = spaces.MultiDiscrete([n_selections, 98])
+ self.n_steps = 0
+ self.max_steps = 100000
+
+ def get_rl_state(self):
+ return self.tutor.state
+
+ def step(self, action):
+ self.n_steps += 1
+
+ s, a, i = self.decode(action)
+ # print(s, a, i)
+ # print()
+ reward = self.tutor.apply_sai(s, a, i)
+ # self.render()
+ # print(reward)
+ state = self.tutor.state
+ # pprint(state)
+ obs = self.dv.fit_transform([state])[0]
+ done = (s == 'done' and reward == 1.0)
+
+ # have a max steps for a given problem.
+ # When we hit that we're done regardless.
+ if self.n_steps > self.max_steps:
+ done = True
+
+ info = {}
+
+ return obs, reward, done, info
+
+ def decode(self, action):
+ # print(action)
+ s = self.tutor.get_possible_selections()[action[0]]
+
+ if s == "done":
+ a = "ButtonPressed"
+ else:
+ a = "UpdateField"
+
+ if s == "done":
+ v = -1
+        elif s == "check_convert":
+ v = "x"
+ else:
+ v = action[1] + 1
+
+ i = {'value': str(v)}
+
+ return s, a, i
+
+ def reset(self):
+ self.n_steps = 0
+ self.tutor.set_random_problem()
+ # self.render()
+ state = self.get_rl_state()
+ obs = self.dv.fit_transform([state])[0]
+ return obs
+
+ def render(self, mode='human', close=False):
+ self.tutor.render()
+
+
class FractionArithDigitsEnv(gym.Env):
metadata = {'render.modes': ['human']}
diff --git a/tutorenvs/multicolumn.py b/tutorenvs/multicolumn.py
index 940fc0e08ffffe1e2d4a58a81665528ce7ecd984..92df12ece40ae633cb40fe72bbba5b93b821c13e 100644
--- a/tutorenvs/multicolumn.py
+++ b/tutorenvs/multicolumn.py
@@ -29,8 +29,8 @@ class MultiColumnAdditionSymbolic:
Creates a state and sets a random problem.
"""
if logger is None:
- # self.logger = DataShopLogger('MulticolumnAdditionTutor', extra_kcs=['field'])
- self.logger = StubLogger()
+ self.logger = DataShopLogger('MulticolumnAdditionTutor', extra_kcs=['field'])
+ # self.logger = StubLogger()
else:
self.logger = logger
self.logger.set_student()