From 07b125ab0494ee318dfa4ffcf4c9f9a6f39bfb28 Mon Sep 17 00:00:00 2001
From: Chris MacLellan <2348-cm3786@users.noreply.gitlab.cci.drexel.edu>
Date: Thu, 21 Jan 2021 17:42:18 -0500
Subject: [PATCH] set fractions back to normal difficulty

---
 sandbox/fractions/train_ppo_number.py     | 35 +++--------
 sandbox/fractions/train_ppo_operator.py   | 36 ++++-------------
 sandbox/multicolumn/train_ppo_operator.py | 35 +++--------------
 tutorenvs/fractions.py                    | 24 +++++++-----
 tutorenvs/multicolumn.py                  | 47 +++++++++++------------
 5 files changed, 57 insertions(+), 120 deletions(-)

diff --git a/sandbox/fractions/train_ppo_number.py b/sandbox/fractions/train_ppo_number.py
index 079f259..c266f67 100644
--- a/sandbox/fractions/train_ppo_number.py
+++ b/sandbox/fractions/train_ppo_number.py
@@ -136,35 +136,12 @@ class TrialEvalCallback(EvalCallback):


 if __name__ == "__main__":
-    # params = {
-    #     'batch_size': 32,
-    #     'n_steps': 16,
-    #     'gamma': 0.0,
-    #     'lr': 0.00017980950834568327,
-    #     'lr_schedule': 'constant',
-    #     'ent_coef': 0.07439893598338435,
-    #     'clip_range': 0.4,
-    #     'n_epochs': 10,
-    #     'gae_lambda': 0.95,
-    #     'max_grad_norm': 0.8,
-    #     'vf_coef': 0.13214811411452415,
-    #     'net_arch': 'medium',
-    #     'shared_arch': False,
-    #     'activation_fn': 'tanh'
-    # }
-
-    # params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
-    #           'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
-    #           0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
-    #           'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
-    #           'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
-
-    params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
-            0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
-            0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
-            'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
-            0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
-            'activation_fn': 'relu'}
+
+    params = {'activation_fn': 'relu', 'batches_pow': 9.0, 'clip_range': 0.2,
+              'ent_coef': 0.013748019113722395, 'gae_lambda': 0.99, 'gamma':
+              0.0, 'lr': 0.005533935261484844, 'lr_schedule': 'linear',
+              'max_grad_norm': 2, 'n_epochs': 5, 'n_step_pow': 9.0, 'net_arch':
+              'small', 'shared_arch': False, 'vf_coef': 0.5470657324084635}

     kwargs = get_args(params)

diff --git a/sandbox/fractions/train_ppo_operator.py b/sandbox/fractions/train_ppo_operator.py
index 1702ed5..3f9ffd3 100644
--- a/sandbox/fractions/train_ppo_operator.py
+++ b/sandbox/fractions/train_ppo_operator.py
@@ -136,35 +136,13 @@ class TrialEvalCallback(EvalCallback):


 if __name__ == "__main__":
-    # params = {
-    #     'batch_size': 32,
-    #     'n_steps': 16,
-    #     'gamma': 0.0,
-    #     'lr': 0.00017980950834568327,
-    #     'lr_schedule': 'constant',
-    #     'ent_coef': 0.07439893598338435,
-    #     'clip_range': 0.4,
-    #     'n_epochs': 10,
-    #     'gae_lambda': 0.95,
-    #     'max_grad_norm': 0.8,
-    #     'vf_coef': 0.13214811411452415,
-    #     'net_arch': 'medium',
-    #     'shared_arch': False,
-    #     'activation_fn': 'tanh'
-    # }
-
-    # params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
-    #           'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
-    #           0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
-    #           'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
-    #           'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
-
-    params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
-            0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
-            0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
-            'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
-            0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
-            'activation_fn': 'relu'}
+    # Best objective 6.266
+    params = {'activation_fn': 'tanh', 'batches_pow': 5.0, 'clip_range': 0.1,
+              'ent_coef': 0.032794340644757655, 'gae_lambda': 0.99, 'gamma':
+              0.0, 'lr': 4.5573009134737684e-05, 'lr_schedule': 'constant',
+              'max_grad_norm': 0.5, 'n_epochs': 10, 'n_step_pow': 8.0,
+              'net_arch': 'tiny', 'shared_arch': True, 'vf_coef':
+              0.23962206187507926}

     kwargs = get_args(params)

diff --git a/sandbox/multicolumn/train_ppo_operator.py b/sandbox/multicolumn/train_ppo_operator.py
index 969f159..8092015 100644
--- a/sandbox/multicolumn/train_ppo_operator.py
+++ b/sandbox/multicolumn/train_ppo_operator.py
@@ -136,34 +136,11 @@ class TrialEvalCallback(EvalCallback):


 if __name__ == "__main__":
-    # params = {
-    #     'batch_size': 32,
-    #     'n_steps': 16,
-    #     'gamma': 0.0,
-    #     'lr': 0.00017980950834568327,
-    #     'lr_schedule': 'constant',
-    #     'ent_coef': 0.07439893598338435,
-    #     'clip_range': 0.4,
-    #     'n_epochs': 10,
-    #     'gae_lambda': 0.95,
-    #     'max_grad_norm': 0.8,
-    #     'vf_coef': 0.13214811411452415,
-    #     'net_arch': 'medium',
-    #     'shared_arch': False,
-    #     'activation_fn': 'tanh'
-    # }
-
-    # params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
-    #           'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
-    #           0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
-    #           'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
-    #           'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
-
-    params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
-            0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
-            0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
-            'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
-            0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
+    params = {'n_step_pow': 7.0, 'batches_pow': 7.0, 'gamma': 0.0, 'lr':
+              0.0002916406263715553, 'lr_schedule': 'constant', 'ent_coef':
+              0.005743227072532813, 'clip_range': 0.4, 'n_epochs': 10,
+              'gae_lambda': 0.99, 'max_grad_norm': 0.5, 'vf_coef':
+              0.8088573261336596, 'net_arch': 'medium', 'shared_arch': True,
              'activation_fn': 'relu'}

     kwargs = get_args(params)
@@ -174,7 +151,7 @@ if __name__ == "__main__":
         MlpPolicy,
         env,
         verbose=1,
-        tensorboard_log="./tensorboard_ppo_multi/",
+        tensorboard_log="./tensorboard_ppo/",
         **kwargs
     )
     # gamma=0.1,
diff --git a/tutorenvs/fractions.py b/tutorenvs/fractions.py
index 34d64f9..8253811 100644
--- a/tutorenvs/fractions.py
+++ b/tutorenvs/fractions.py
@@ -26,8 +26,8 @@ class FractionArithSymbolic:
         Creates a state and sets a random problem.
""" if logger is None: - # self.logger = DataShopLogger('MulticolumnAdditionTutor', extra_kcs=['field']) - self.logger = StubLogger() + self.logger = DataShopLogger('FractionsTutor', extra_kcs=['field']) + # self.logger = StubLogger() else: self.logger = logger self.logger.set_student() @@ -155,14 +155,15 @@ class FractionArithSymbolic: return state_output def set_random_problem(self): - num1 = str(randint(1, 5)) - num2 = str(randint(1, 5)) - denom1 = str(randint(2, 5)) - denom2 = str(randint(2, 5)) + num1 = str(randint(1, 15)) + num2 = str(randint(1, 15)) + denom1 = str(randint(2, 15)) + denom2 = str(randint(2, 15)) operator = choice(['+', '*']) self.reset(num1, denom1, operator, num2, denom2) - self.logger.set_problem("%s_%s_%s_%s_%s" % (num1, denom1, operator, num2, denom2)) + self.logger.set_problem("%s_%s_%s_%s_%s" % (num1, denom1, operator, + num2, denom2)) if operator == "+" and denom1 == denom2: self.ptype = 'AS' @@ -388,11 +389,11 @@ class FractionArithNumberEnv(gym.Env): def __init__(self): self.tutor = FractionArithSymbolic() n_selections = len(self.tutor.get_possible_selections()) - n_features = 2000 + n_features = 900 self.dv = OnlineDictVectorizer(n_features) self.observation_space = spaces.Box( low=0.0, high=1.0, shape=(1, n_features), dtype=np.float32) - self.action_space = spaces.MultiDiscrete([n_selections, 50]) + self.action_space = spaces.MultiDiscrete([n_selections, 450]) self.n_steps = 0 self.max_steps = 100000 @@ -592,6 +593,7 @@ class FractionArithOppEnv(gym.Env): return state def step(self, action): + self.n_steps += 1 try: s, a, i = self.decode(action) reward = self.tutor.apply_sai(s, a, i) @@ -609,6 +611,9 @@ class FractionArithOppEnv(gym.Env): obs = self.dv.fit_transform([state])[0] info = {} + if self.n_steps > self.max_steps: + done = True + return obs, reward, done, info @@ -644,6 +649,7 @@ class FractionArithOppEnv(gym.Env): return s, a, i def reset(self): + self.n_steps = 0 self.tutor.set_random_problem() state = self.get_rl_state() obs = self.dv.fit_transform([state])[0] diff --git a/tutorenvs/multicolumn.py b/tutorenvs/multicolumn.py index f97a9b0..a8a93a2 100644 --- a/tutorenvs/multicolumn.py +++ b/tutorenvs/multicolumn.py @@ -203,8 +203,8 @@ class MultiColumnAdditionSymbolic: # append correct/incorrect counts if add_counts: d.text((0, 0), "h:{}".format(self.num_hints), fill=(0,0,0)) - d.text((0, 10), "-:{}".format(self.num_incorrect_steps), fill=(0,0,0)) - d.text((0, 20), "+:{}".format(self.num_correct_steps), fill=(0,0,0)) + d.text((0, 80), "-:{}".format(self.num_incorrect_steps), fill=(0,0,0)) + d.text((20, 0), "+:{}".format(self.num_correct_steps), fill=(0,0,0)) if add_dot: d.ellipse((add_dot[0]-3, add_dot[1]-3, add_dot[0]+3, add_dot[1]+3), @@ -564,7 +564,7 @@ class MultiColumnAdditionOppEnv(gym.Env): def __init__(self): self.tutor = MultiColumnAdditionSymbolic() n_selections = len(self.tutor.get_possible_selections()) - n_features = 2000 + n_features = 5000 n_operators = len(self.get_rl_operators()) n_args = len(self.tutor.get_possible_args()) self.dv = OnlineDictVectorizer(n_features) @@ -586,47 +586,37 @@ class MultiColumnAdditionOppEnv(gym.Env): def get_rl_state(self): state = self.tutor.state.copy() for attr in self.tutor.state: - if attr == "operator": + if attr == "operator" or state[attr] == "": continue for attr2 in self.tutor.state: - if attr2 == "operator": + if attr2 == "operator" or state[attr2] == "": continue if attr >= attr2: continue - try: - ones2 = int2_float_add_then_ones(state[attr], state[attr2]) - state['add2-ones(%s,%s)' % 
-                          (attr, attr2)] = ones2
-                except Exception:
-                    pass
-                try:
-                    tens2 = int2_float_add_then_tens(state[attr], state[attr2])
-                    state['add2-tens(%s,%s)' % (attr, attr2)] = tens2
-                except Exception:
-                    pass
+                ones2 = int2_float_add_then_ones(state[attr], state[attr2])
+                state['add2-ones(%s,%s)' % (attr, attr2)] = ones2
+                tens2 = int2_float_add_then_tens(state[attr], state[attr2])
+                state['add2-tens(%s,%s)' % (attr, attr2)] = tens2

                 for attr3 in self.tutor.state:
-                    if attr3 == "operator":
+                    if attr3 == "operator" or state[attr3] == "":
                         continue
                     if attr2 >= attr3:
                         continue
-                    try:
                         ones3 = int3_float_add_then_ones(state[attr], state[attr2],
                                                          state[attr3])
-                        state['add2-ones(%s,%s,%s)' % (attr, attr2, attr3)] = ones3
-                    except Exception:
-                        pass
-                    try:
+                        state['add3-ones(%s,%s,%s)' % (attr, attr2, attr3)] = ones3
                         tens3 = int3_float_add_then_tens(state[attr], state[attr2],
                                                          state[attr3])
-                        state['add2-tens(%s,%s,%s)' % (attr, attr2, attr3)] = tens3
-                    except Exception:
-                        pass
+                        state['add3-tens(%s,%s,%s)' % (attr, attr2, attr3)] = tens3

         return state

     def step(self, action):
+        self.n_steps += 1
+
         try:
             s, a, i = self.decode(action)
             reward = self.tutor.apply_sai(s, a, i)
@@ -635,6 +625,8 @@ class MultiColumnAdditionOppEnv(gym.Env):
             reward = -1
             done = False

+        # self.tutor.render()
+
         # print(s, a, i)
         # print()
         # print(reward)
@@ -644,6 +636,12 @@ class MultiColumnAdditionOppEnv(gym.Env):
         obs = self.dv.fit_transform([state])[0]
         info = {}

+        # have a max steps for a given problem.
+        # When we hit that we're done regardless.
+        if self.n_steps > self.max_steps:
+            done = True
+
+
         return obs, reward, done, info

     def apply_rl_op(self, op, arg1, arg2, arg3):
@@ -689,6 +687,7 @@ class MultiColumnAdditionOppEnv(gym.Env):
         return s, a, i

     def reset(self):
+        self.n_steps = 0
         self.tutor.set_random_problem()
         state = self.get_rl_state()
         obs = self.dv.fit_transform([state])[0]
--
GitLab
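
A note on the tuned hyperparameters above: the dictionaries store 'n_step_pow' and 'batches_pow' rather than raw 'n_steps' and 'batch_size' values, and the training scripts expand them with get_args(params) before handing the result to PPO. get_args itself is not part of this patch, so the sketch below only illustrates one plausible expansion, assuming the *_pow entries are exponents of two and using Stable-Baselines3 argument names. The helper name ppo_kwargs_from_params, the network-size table, and the omission of 'lr_schedule' and 'shared_arch' handling are illustrative guesses, not the repository's actual implementation.

# Illustrative sketch only: the repository's real get_args() is not shown in this patch.
from torch import nn


def ppo_kwargs_from_params(params):
    """Expand a tuned param dict into PPO keyword arguments (assumed mapping)."""
    # Assumption: 'net_arch' names map to hidden-layer sizes roughly like this.
    net_arch = {'tiny': [32, 32], 'small': [64, 64], 'medium': [256, 256]}[params['net_arch']]
    activation = {'tanh': nn.Tanh, 'relu': nn.ReLU}[params['activation_fn']]

    return dict(
        learning_rate=params['lr'],                  # 'lr_schedule' handling omitted here
        n_steps=2 ** int(params['n_step_pow']),      # e.g. 9.0 -> 512 rollout steps
        batch_size=2 ** int(params['batches_pow']),  # assumption: power-of-two minibatch size
        n_epochs=params['n_epochs'],
        gamma=params['gamma'],
        gae_lambda=params['gae_lambda'],
        clip_range=params['clip_range'],
        ent_coef=params['ent_coef'],
        vf_coef=params['vf_coef'],
        max_grad_norm=params['max_grad_norm'],
        policy_kwargs=dict(net_arch=net_arch, activation_fn=activation),
    )

With a mapping like this, the 'small' / n_step_pow 9.0 entry in train_ppo_number.py would correspond to 64-unit layers and 512-step rollouts; the real values depend on how get_args is actually written.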
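In get_rl_state, the multicolumn environment now skips empty fields and attaches precomputed digit features for every pair (and triple) of filled-in cells; judging by their names, int2_float_add_then_ones and int2_float_add_then_tens return the ones and tens digit of the sum of the two field values. Those helpers are defined elsewhere in tutorenvs, so the sketch below uses stand-ins with that assumed behavior and shows only the pairwise case.

def add_then_ones(*vals):
    """Stand-in for int*_float_add_then_ones: ones digit of the summed field values (assumed)."""
    return str(int(sum(float(v) for v in vals)) % 10)


def add_then_tens(*vals):
    """Stand-in for int*_float_add_then_tens: tens digit of the summed field values (assumed)."""
    return str((int(sum(float(v) for v in vals)) // 10) % 10)


def pairwise_sum_features(state):
    """Add add2-ones/add2-tens entries for every pair of non-empty, non-operator fields."""
    feats = dict(state)
    fields = sorted(k for k in state if k != 'operator' and state[k] != '')
    for i, a in enumerate(fields):
        for b in fields[i + 1:]:   # mirrors the attr < attr2 ordering check in the patch
            feats['add2-ones(%s,%s)' % (a, b)] = add_then_ones(state[a], state[b])
            feats['add2-tens(%s,%s)' % (a, b)] = add_then_tens(state[a], state[b])
    return feats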
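The n_steps / max_steps additions to step() and reset() in both environments give each problem a hard episode cap: the counter restarts on every reset() and, once it passes max_steps, done is forced to True even if the tutor problem is unsolved. The wrapper below is a generic sketch of the same pattern against the old Gym API (four-value step() returns); it is not code from tutorenvs.

import gym


class StepCap(gym.Wrapper):
    """Generic sketch of the per-problem step cap added to step()/reset()."""

    def __init__(self, env, max_steps=100000):
        super().__init__(env)
        self.max_steps = max_steps
        self.n_steps = 0

    def reset(self, **kwargs):
        self.n_steps = 0                     # new problem, counter starts over
        return self.env.reset(**kwargs)

    def step(self, action):
        self.n_steps += 1
        obs, reward, done, info = self.env.step(action)
        if self.n_steps > self.max_steps:
            done = True                      # cut the episode off even if unsolved
        return obs, reward, done, info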