Commit 07b125ab authored by Chris MacLellan

set fractions back to normal difficulty

parent 1c5ca9c8
@@ -136,35 +136,12 @@ class TrialEvalCallback(EvalCallback):
if __name__ == "__main__":
# params = {
# 'batch_size': 32,
# 'n_steps': 16,
# 'gamma': 0.0,
# 'lr': 0.00017980950834568327,
# 'lr_schedule': 'constant',
# 'ent_coef': 0.07439893598338435,
# 'clip_range': 0.4,
# 'n_epochs': 10,
# 'gae_lambda': 0.95,
# 'max_grad_norm': 0.8,
# 'vf_coef': 0.13214811411452415,
# 'net_arch': 'medium',
# 'shared_arch': False,
# 'activation_fn': 'tanh'
# }
# params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
# 'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
# 0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
# 'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
# 'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
'activation_fn': 'relu'}
params = {'activation_fn': 'relu', 'batches_pow': 9.0, 'clip_range': 0.2,
'ent_coef': 0.013748019113722395, 'gae_lambda': 0.99, 'gamma':
0.0, 'lr': 0.005533935261484844, 'lr_schedule': 'linear',
'max_grad_norm': 2, 'n_epochs': 5, 'n_step_pow': 9.0, 'net_arch':
'small', 'shared_arch': False, 'vf_coef': 0.5470657324084635}
kwargs = get_args(params)
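The 'n_step_pow' and 'batches_pow' keys suggest that get_args converts sampled exponents into PPO's n_steps and batch_size as powers of two. A minimal sketch of that decoding, assuming the 2**pow convention (the body below is a guess; only the get_args name comes from this file):

def get_args(params):
    # Assumed decoding: the *_pow entries are base-2 exponents,
    # e.g. n_step_pow=9.0 -> n_steps=512, batches_pow=9.0 -> batch_size=512.
    p = dict(params)
    return {
        'n_steps': int(2 ** p['n_step_pow']),
        'batch_size': int(2 ** p['batches_pow']),
        'learning_rate': p['lr'],
        'gamma': p['gamma'],
        # net_arch, activation_fn, etc. would map to policy_kwargs here.
    }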
@@ -136,35 +136,13 @@ class TrialEvalCallback(EvalCallback):
if __name__ == "__main__":
# params = {
# 'batch_size': 32,
# 'n_steps': 16,
# 'gamma': 0.0,
# 'lr': 0.00017980950834568327,
# 'lr_schedule': 'constant',
# 'ent_coef': 0.07439893598338435,
# 'clip_range': 0.4,
# 'n_epochs': 10,
# 'gae_lambda': 0.95,
# 'max_grad_norm': 0.8,
# 'vf_coef': 0.13214811411452415,
# 'net_arch': 'medium',
# 'shared_arch': False,
# 'activation_fn': 'tanh'
# }
# params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
# 'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
# 0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
# 'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
# 'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
'activation_fn': 'relu'}
# Best objective 6.266
params = {'activation_fn': 'tanh', 'batches_pow': 5.0, 'clip_range': 0.1,
'ent_coef': 0.032794340644757655, 'gae_lambda': 0.99, 'gamma':
0.0, 'lr': 4.5573009134737684e-05, 'lr_schedule': 'constant',
'max_grad_norm': 0.5, 'n_epochs': 10, 'n_step_pow': 8.0,
'net_arch': 'tiny', 'shared_arch': True, 'vf_coef':
0.23962206187507926}
kwargs = get_args(params)
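The TrialEvalCallback(EvalCallback) class and the '# Best objective 6.266' note point to an Optuna search producing these configurations. A minimal sketch of such an objective, assuming Optuna and stable-baselines3 and reusing the get_args decoding sketched above (the search ranges, timestep budget, and import path are illustrative, not taken from this repo):

import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from tutorenvs.fractions import FractionArithNumberEnv  # import path assumed

def objective(trial):
    params = {
        'n_step_pow': trial.suggest_float('n_step_pow', 3.0, 9.0, step=1.0),
        'batches_pow': trial.suggest_float('batches_pow', 3.0, 9.0, step=1.0),
        'lr': trial.suggest_float('lr', 1e-5, 1e-2, log=True),
        'gamma': 0.0,  # every tuned config above settled on gamma=0.0
    }
    env = FractionArithNumberEnv()
    model = PPO('MlpPolicy', env, **get_args(params))
    model.learn(total_timesteps=100_000)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)
    return mean_reward

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)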
@@ -136,34 +136,11 @@ class TrialEvalCallback(EvalCallback):
if __name__ == "__main__":
# params = {
# 'batch_size': 32,
# 'n_steps': 16,
# 'gamma': 0.0,
# 'lr': 0.00017980950834568327,
# 'lr_schedule': 'constant',
# 'ent_coef': 0.07439893598338435,
# 'clip_range': 0.4,
# 'n_epochs': 10,
# 'gae_lambda': 0.95,
# 'max_grad_norm': 0.8,
# 'vf_coef': 0.13214811411452415,
# 'net_arch': 'medium',
# 'shared_arch': False,
# 'activation_fn': 'tanh'
# }
# params = {'activation_fn': 'relu', 'batch_size': 32, 'clip_range': 0.1,
# 'ent_coef': 0.008425259906148678, 'gae_lambda': 0.98, 'gamma':
# 0.0, 'lr': 0.0014548935455020253, 'lr_schedule': 'linear',
# 'max_grad_norm': 0.6, 'n_epochs': 5, 'n_steps': 64, 'net_arch':
# 'medium', 'shared_arch': True, 'vf_coef': 0.6725952403531438}
params = {'n_step_pow': 5.0, 'batches_pow': 5.0, 'gamma': 0.0, 'lr':
0.0014291278312354846, 'lr_schedule': 'linear', 'ent_coef':
0.042102094710275415, 'clip_range': 0.2, 'n_epochs': 5,
'gae_lambda': 0.92, 'max_grad_norm': 0.7, 'vf_coef':
0.40158288555773314, 'net_arch': 'medium', 'shared_arch': False,
params = {'n_step_pow': 7.0, 'batches_pow': 7.0, 'gamma': 0.0, 'lr':
0.0002916406263715553, 'lr_schedule': 'constant', 'ent_coef':
0.005743227072532813, 'clip_range': 0.4, 'n_epochs': 10,
'gae_lambda': 0.99, 'max_grad_norm': 0.5, 'vf_coef':
0.8088573261336596, 'net_arch': 'medium', 'shared_arch': True,
'activation_fn': 'relu'}
kwargs = get_args(params)
@@ -174,7 +151,7 @@ if __name__ == "__main__":
MlpPolicy,
env,
verbose=1,
tensorboard_log="./tensorboard_ppo_multi/",
tensorboard_log="./tensorboard_ppo/",
**kwargs
)
# gamma=0.1,
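For context, this hunk sits in the training entry point, which builds the learner roughly as follows (a sketch assuming stable-baselines3; the env variable and timestep budget are placeholders for what the surrounding script defines):

from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

kwargs = get_args(params)
model = PPO(
    MlpPolicy,
    env,  # constructed earlier in the script
    verbose=1,
    tensorboard_log="./tensorboard_ppo/",
    **kwargs
)
model.learn(total_timesteps=1_000_000)  # budget illustrative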
@@ -26,8 +26,8 @@ class FractionArithSymbolic:
Creates a state and sets a random problem.
"""
if logger is None:
# self.logger = DataShopLogger('MulticolumnAdditionTutor', extra_kcs=['field'])
self.logger = StubLogger()
self.logger = DataShopLogger('FractionsTutor', extra_kcs=['field'])
# self.logger = StubLogger()
else:
self.logger = logger
self.logger.set_student()
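This hunk flips the fractions tutor's default from the no-op StubLogger to a real DataShopLogger (with 'field' as an extra knowledge-component column). Judging only from the calls visible in this file, a stand-in logger would need at least this interface (a hypothetical sketch; the real StubLogger lives elsewhere in the repo and presumably also logs individual transactions):

class StubLogger:
    # No-op logger exposing the methods the tutor calls here.
    def set_student(self, student_id=None):
        pass

    def set_problem(self, problem_name):
        pass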
@@ -155,14 +155,15 @@ class FractionArithSymbolic:
return state_output
def set_random_problem(self):
num1 = str(randint(1, 5))
num2 = str(randint(1, 5))
denom1 = str(randint(2, 5))
denom2 = str(randint(2, 5))
num1 = str(randint(1, 15))
num2 = str(randint(1, 15))
denom1 = str(randint(2, 15))
denom2 = str(randint(2, 15))
operator = choice(['+', '*'])
self.reset(num1, denom1, operator, num2, denom2)
self.logger.set_problem("%s_%s_%s_%s_%s" % (num1, denom1, operator, num2, denom2))
self.logger.set_problem("%s_%s_%s_%s_%s" % (num1, denom1, operator,
num2, denom2))
if operator == "+" and denom1 == denom2:
self.ptype = 'AS'
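Widening the randint bounds from 5 to 15 changes difficulty by blowing up the problem space; counting the independent choices directly:

# problems = |num1| * |num2| * |denom1| * |denom2| * |operators|
easy = 5 * 5 * 4 * 4 * 2        # randint(1, 5), randint(2, 5)   -> 800
normal = 15 * 15 * 14 * 14 * 2  # randint(1, 15), randint(2, 15) -> 88200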
@@ -388,11 +389,11 @@ class FractionArithNumberEnv(gym.Env):
def __init__(self):
self.tutor = FractionArithSymbolic()
n_selections = len(self.tutor.get_possible_selections())
n_features = 2000
n_features = 900
self.dv = OnlineDictVectorizer(n_features)
self.observation_space = spaces.Box(
low=0.0, high=1.0, shape=(1, n_features), dtype=np.float32)
self.action_space = spaces.MultiDiscrete([n_selections, 50])
self.action_space = spaces.MultiDiscrete([n_selections, 450])
self.n_steps = 0
self.max_steps = 100000
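The action-space change tracks the harder problems: with denominators up to 15, answers (denominator products, cross-multiplied numerator sums) can exceed 50, so the second action dimension grows to 450. With gym's MultiDiscrete, one action is a vector of independent choices (a quick sketch; how value_idx maps to the typed number is defined by the env's decode):

from gym import spaces

action_space = spaces.MultiDiscrete([30, 450])  # 30 stands in for n_selections
selection_idx, value_idx = action_space.sample()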
@@ -592,6 +593,7 @@ class FractionArithOppEnv(gym.Env):
return state
def step(self, action):
self.n_steps += 1
try:
s, a, i = self.decode(action)
reward = self.tutor.apply_sai(s, a, i)
@@ -609,6 +611,9 @@ class FractionArithOppEnv(gym.Env):
obs = self.dv.fit_transform([state])[0]
info = {}
if self.n_steps > self.max_steps:
done = True
return obs, reward, done, info
@@ -644,6 +649,7 @@ class FractionArithOppEnv(gym.Env):
return s, a, i
def reset(self):
self.n_steps = 0
self.tutor.set_random_problem()
state = self.get_rl_state()
obs = self.dv.fit_transform([state])[0]
@@ -203,8 +203,8 @@ class MultiColumnAdditionSymbolic:
# append correct/incorrect counts
if add_counts:
d.text((0, 0), "h:{}".format(self.num_hints), fill=(0,0,0))
d.text((0, 10), "-:{}".format(self.num_incorrect_steps), fill=(0,0,0))
d.text((0, 20), "+:{}".format(self.num_correct_steps), fill=(0,0,0))
d.text((0, 80), "-:{}".format(self.num_incorrect_steps), fill=(0,0,0))
d.text((20, 0), "+:{}".format(self.num_correct_steps), fill=(0,0,0))
if add_dot:
d.ellipse((add_dot[0]-3, add_dot[1]-3, add_dot[0]+3, add_dot[1]+3),
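This render hunk only moves the incorrect/correct counters; in PIL's ImageDraw, the first argument of d.text is the (x, y) pixel origin, with y growing downward. A self-contained sketch of the overlay pattern using the new coordinates:

from PIL import Image, ImageDraw

img = Image.new('RGB', (200, 100), color=(255, 255, 255))
d = ImageDraw.Draw(img)
d.text((0, 0), "h:{}".format(3), fill=(0, 0, 0))   # hints at the top-left
d.text((20, 0), "+:{}".format(7), fill=(0, 0, 0))  # correct count beside them
d.text((0, 80), "-:{}".format(1), fill=(0, 0, 0))  # incorrect count lower down
img.save('counts.png')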
@@ -564,7 +564,7 @@ class MultiColumnAdditionOppEnv(gym.Env):
def __init__(self):
self.tutor = MultiColumnAdditionSymbolic()
n_selections = len(self.tutor.get_possible_selections())
n_features = 2000
n_features = 5000
n_operators = len(self.get_rl_operators())
n_args = len(self.tutor.get_possible_args())
self.dv = OnlineDictVectorizer(n_features)
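n_features pins the width of the hashed observation vector, so the Box observation space keeps a constant shape no matter which state keys appear; raising it to 5000 presumably reduces hash collisions for the larger multi-column feature set. The call pattern, as used throughout this diff (OnlineDictVectorizer is the repo's own class; the fixed-width output shape is assumed):

dv = OnlineDictVectorizer(5000)
obs = dv.fit_transform([{'cell_a': '7', 'cell_b': '8'}])[0]
# obs: a vector of length 5000, regardless of how many keys the dict has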
@@ -586,47 +586,37 @@ class MultiColumnAdditionOppEnv(gym.Env):
def get_rl_state(self):
state = self.tutor.state.copy()
for attr in self.tutor.state:
if attr == "operator":
if attr == "operator" or state[attr] == "":
continue
for attr2 in self.tutor.state:
if attr2 == "operator":
if attr2 == "operator" or state[attr2] == "":
continue
if attr >= attr2:
continue
try:
ones2 = int2_float_add_then_ones(state[attr], state[attr2])
state['add2-ones(%s,%s)' % (attr, attr2)] = ones2
except Exception:
pass
try:
tens2 = int2_float_add_then_tens(state[attr], state[attr2])
state['add2-tens(%s,%s)' % (attr, attr2)] = tens2
except Exception:
pass
for attr3 in self.tutor.state:
if attr3 == "operator":
if attr3 == "operator" or state[attr3] == "":
continue
if attr2 >= attr3:
continue
try:
ones3 = int3_float_add_then_ones(state[attr], state[attr2],
state[attr3])
state['add2-ones(%s,%s,%s)' % (attr, attr2, attr3)] = ones3
state['add3-ones(%s,%s,%s)' % (attr, attr2, attr3)] = ones3
except Exception:
pass
try:
tens3 = int3_float_add_then_tens(state[attr], state[attr2],
state[attr3])
state['add2-tens(%s,%s,%s)' % (attr, attr2, attr3)] = tens3
state['add3-tens(%s,%s,%s)' % (attr, attr2, attr3)] = tens3
except Exception:
pass
return state
def step(self, action):
self.n_steps += 1
try:
s, a, i = self.decode(action)
reward = self.tutor.apply_sai(s, a, i)
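The derived features in get_rl_state above hand the agent digit-level arithmetic the raw strings hide: for every pair and triple of non-empty fields, the ones and tens digits of their sum are added to the state (the hunk also fixes the triple features' labels from 'add2-' to 'add3-'). A sketch of the assumed helper semantics (names from the diff; the implementations are my guess):

def int2_float_add_then_ones(a, b):
    # Assumed: ones digit of the sum, e.g. '7' + '8' -> 15 -> 5
    return (int(float(a)) + int(float(b))) % 10

def int2_float_add_then_tens(a, b):
    # Assumed: tens digit of the sum, e.g. '7' + '8' -> 15 -> 1
    return ((int(float(a)) + int(float(b))) // 10) % 10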
@@ -635,6 +625,8 @@ class MultiColumnAdditionOppEnv(gym.Env):
reward = -1
done = False
# self.tutor.render()
# print(s, a, i)
# print()
# print(reward)
@@ -644,6 +636,12 @@ class MultiColumnAdditionOppEnv(gym.Env):
obs = self.dv.fit_transform([state])[0]
info = {}
# have a max steps for a given problem.
# When we hit that we're done regardless.
if self.n_steps > self.max_steps:
done = True
return obs, reward, done, info
def apply_rl_op(self, op, arg1, arg2, arg3):
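Together with the self.n_steps = 0 added to reset() below, the max-steps cap guarantees every episode terminates even if the agent never finishes a problem. A minimal rollout loop exercising the API shown in this diff (random policy purely for illustration):

env = MultiColumnAdditionOppEnv()
obs = env.reset()
done = False
episode_return = 0.0
while not done:
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    episode_return += reward
print('episode return:', episode_return)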
@@ -689,6 +687,7 @@ class MultiColumnAdditionOppEnv(gym.Env):
return s, a, i
def reset(self):
self.n_steps = 0
self.tutor.set_random_problem()
state = self.get_rl_state()
obs = self.dv.fit_transform([state])[0]