Commit c8a36fb1 authored by Chris MacLellan

working pixel representation for multicolumn

parent 7f7470b0
This library contains headless versions of a number of commonly used tutoring
system environments for training simulated students. There are currently two
different tutoring systems that can be loaded: a fraction arithmetic tutor and
a multi-column arithmetic tutor.
At the core of each environment is a "Symbolic" variant of the tutor. This
class maintains the logic for the tutor, including methods for getting the
state description, for applying selection, action, input (SAI) triples (these
return feedback about whether the provided SAI is correct or incorrect), and
for requesting a demonstration (a valid SAI for the next step).
The Apprentice Learner Architecture can interface directly with these
symbolic variants of the tutor.
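For illustration, here is a minimal sketch of driving the multi-column
symbolic tutor directly. It only uses calls that appear elsewhere in this
commit (`set_random_problem`, `get_possible_selections`, `apply_sai`); the
selection name below follows the state keys shown later in the commit, and the
demonstration-request method is omitted because its name is not shown here.

```
from tutorenvs.multicolumn import MultiColumnAdditionSymbolic

tutor = MultiColumnAdditionSymbolic()
tutor.set_random_problem()

# Selections are the interactable interface elements (the fillable
# fields plus the done button).
print(tutor.get_possible_selections())

# Apply a selection, action, input (SAI) triple; the return value is
# the correctness feedback, which the Gym wrappers use as the reward.
reward = tutor.apply_sai('answer_ones', 'UpdateField', {'value': '7'})
print(reward)
```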
Next, there are separate classes that wrap these symbolic tutors in OpenAI Gym
environments, so they can be loaded by a reinforcement learning algorithm such
as those in the Stable Baselines library (e.g., the PPO algorithm).
Currently, I am exploring multiple representations for the RL tutor:
- Operators model: this is the closest to what the Apprentice Learner would
use. The agent perceives the same one-hot coded state features that an AL
agent would get. It has 4 discrete action outputs (a multi-discrete space):
the first selects an interface element (from all selectable elements in the
tutor), the second selects an operator (copy, add, subtract, multiply, etc.;
these correspond to prior-knowledge operators/functions), and the last two
select the fields that get passed as input to the operator (e.g., two other
fields in the tutor interface that currently have values).
- Digits model: this has the same input as the Operators model (the one-hot
features that AL would get) but a different action output. It also has 4
discrete action outputs (a multi-discrete space): the first is for selection
(as above), the second is a digit in the ones place (0-9), the third is a
digit in the tens place (0-9), and the fourth is a digit in the hundreds place
(0-9). The number of digit outputs varies with what is necessary to solve the
task (e.g., the multi-column arithmetic tutor needs only a single digit); see
the sketch after this list.
- Pixel model: this has the same output as the Digits model but a different
input. Instead of one-hot features, the model gets a black-and-white pixel
representation of the tutor interface. It is not identical to what a human
sees in the tutor, but it is a semi-reasonable facsimile that includes all the
information present in the human tutor.
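As a concrete sketch of how the Digits model's multi-discrete action decodes
into a tutor input (the field count of 20 here is hypothetical, and the
decoding mirrors the `decode` method that appears later in this commit):

```
from gym import spaces

# Hypothetical digits-model action space: one selection output over
# 20 selectable fields, then ones/tens/hundreds digit outputs.
action_space = spaces.MultiDiscrete([20, 10, 10, 10])

action = action_space.sample()  # e.g., array([4, 7, 0, 2])
selection = action[0]           # index into the selectable elements
# Second output is the ones digit, third is tens, fourth is hundreds.
value = action[1] + 10 * action[2] + 100 * action[3]
print(selection, value)
```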
These different representations are registered as OpenAI Gym environments
under the names "FractionArith" and "MultiColumnArith", with version numbers
"v0" (operators model), "v1" (digits model), and "v2" (pixel model; there is
no pixel fraction tutor yet). As an example of how to create an operators
model for fraction arithmetic and train a PPO model from the Stable Baselines
package, you would use the following code:
```
import gym
import tutorenvs
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy

env = gym.make('FractionArith-v0')
model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=2000000)
```
See the code in the `sandbox/` folder for examples of how to train different
kinds of agents on these different environments. The
`sandbox/run_al_fractions.py` code shows how to train an Apprentice agent on
the headless fractions tutor.
The commit also updates the sandbox PPO training script to train on the new
pixel environment with a CNN policy (the previous MLP setup is left commented
out):

```
import gym
from stable_baselines.common import make_vec_env
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.policies import CnnPolicy
from stable_baselines import PPO2
from stable_baselines import SAC

import tutorenvs
import numpy as np


if __name__ == "__main__":

    # multiprocess environment
    env = make_vec_env('MultiColumnArith-v2', n_envs=8)
    model = PPO2(CnnPolicy, env, verbose=1,
                 gamma=0.5,
                 policy_kwargs={'net_arch': [100, 100, {'vf': [65], 'pi': [65]}]},
                 tensorboard_log="./ppo_MultiColumnArith-v0/")
    # model = PPO2(MlpPolicy, env, verbose=1,
    #              gamma=0.5,
    #              policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
    #              tensorboard_log="./ppo_MultiColumnArith-v0/")

    while True:
        model.learn(total_timesteps=9999999999)
```
In the environment registrations, the pixel environment is added and the
version numbers are realigned with the README (v0 = operators, v1 = digits,
v2 = pixels):

```
from tutorenvs.fractions import FractionArithDigitsEnv
from tutorenvs.fractions import FractionArithOppEnv
from tutorenvs.multicolumn import MultiColumnAdditionOppEnv
from tutorenvs.multicolumn import MultiColumnAdditionDigitsEnv
from tutorenvs.multicolumn import MultiColumnAdditionPixelEnv

register(
    id='FractionArith-v0',
    entry_point='tutorenvs:FractionArithOppEnv',
)

register(
    id='FractionArith-v1',
    entry_point='tutorenvs:FractionArithDigitsEnv',
)

# TODO no pixel fractions yet.
# register(
#     id='FractionArith-v2',
#     entry_point='tutorenvs:FractionArithPixelEnv',
# )

register(
    id='MultiColumnArith-v0',
    entry_point='tutorenvs:MultiColumnAdditionOppEnv',
)

register(
    id='MultiColumnArith-v1',
    entry_point='tutorenvs:MultiColumnAdditionDigitsEnv',
)

register(
    id='MultiColumnArith-v2',
    entry_point='tutorenvs:MultiColumnAdditionPixelEnv',
)
```
In the multi-column tutor module (`tutorenvs.multicolumn`), PIL is now
imported for rendering the interface to an image:

```
from gym.utils import seeding
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from PIL import Image, ImageDraw

from tutorenvs.utils import BaseOppEnv
```
Inside `MultiColumnAdditionDigitsEnv`, the method that returns the tutor state
now also renders the state as ASCII art, draws it onto an image, and saves it
to `test.png` (debugging output that the commit leaves in place):

```
        # ...
        #     'answer_tens': '',
        #     'answer_ones': ''
        # }

        # Substitute spaces for empty fields so the layout stays fixed.
        state = {attr: " " if self.tutor.state[attr] == '' else self.tutor.state[attr]
                 for attr in self.tutor.state}

        output = " %s%s%s \n %s%s%s\n+ %s%s%s\n-----\n %s%s%s%s\n" % (
            state["hundreds_carry"],
            state["tens_carry"],
            state["ones_carry"],
            state["upper_hundreds"],
            state["upper_tens"],
            state["upper_ones"],
            state["lower_hundreds"],
            state["lower_tens"],
            state["lower_ones"],
            state["answer_thousands"],
            state["answer_hundreds"],
            state["answer_tens"],
            state["answer_ones"],
        )

        # Draw the layout in black text on a small white image.
        img = Image.new('RGB', (50, 90), color="white")
        d = ImageDraw.Draw(img)
        d.text((10, 10), output, fill='black')
        img.save('test.png')
        print(np.array(img))

        return self.tutor.state
```
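For example, midway through a problem like 354 + 789 (a carry recorded in the
tens column, the ones answer filled in), the rendered string looks roughly
like this (illustrative; exact spacing follows the format string above):

```
  1
 354
+ 789
-----
    3
```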
Finally, the new pixel environment class is appended at the end of the module.
Its `get_rl_state` renders the same ASCII layout to a grayscale image and
returns it as a (height, width, 1) array scaled into [0, 1]. Note one fix
relative to the commit as written: in `decode`, the `check_convert` branch
must be an `elif`, since a bare `if`/`else` would overwrite the value chosen
for the "done" selection.

```
class MultiColumnAdditionPixelEnv(gym.Env):

    metadata = {'render.modes': ['human']}

    def get_rl_state(self):
        # The underlying tutor state has the form:
        # self.state = {
        #     'hundreds_carry': '',
        #     'tens_carry': '',
        #     'ones_carry': '',
        #     'upper_hundreds': upper_hundreds,
        #     'upper_tens': upper_tens,
        #     'upper_ones': upper_ones,
        #     'lower_hundreds': lower_hundreds,
        #     'lower_tens': lower_tens,
        #     'lower_ones': lower_ones,
        #     'operator': '+',
        #     'answer_thousands': '',
        #     'answer_hundreds': '',
        #     'answer_tens': '',
        #     'answer_ones': ''
        # }

        # Substitute spaces for empty fields so the layout stays fixed.
        state = {attr: " " if self.tutor.state[attr] == '' else self.tutor.state[attr]
                 for attr in self.tutor.state}

        output = " %s%s%s \n %s%s%s\n+ %s%s%s\n-----\n %s%s%s%s\n" % (
            state["hundreds_carry"],
            state["tens_carry"],
            state["ones_carry"],
            state["upper_hundreds"],
            state["upper_tens"],
            state["upper_ones"],
            state["lower_hundreds"],
            state["lower_tens"],
            state["lower_ones"],
            state["answer_thousands"],
            state["answer_hundreds"],
            state["answer_tens"],
            state["answer_ones"],
        )

        # Draw the layout, convert to grayscale, and scale into [0, 1].
        img = Image.new('RGB', (50, 90), color="white")
        d = ImageDraw.Draw(img)
        d.text((10, 10), output, fill='black')
        img = img.convert('L')
        # img.save('test.png')
        return np.expand_dims(np.array(img) / 255, axis=2)

    def __init__(self):
        self.tutor = MultiColumnAdditionSymbolic()
        n_selections = len(self.tutor.get_possible_selections())
        print('shape = ', self.get_rl_state().shape)
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=self.get_rl_state().shape,
            dtype=np.float32)
        # One output for the selection, one for the digit to enter.
        self.action_space = spaces.MultiDiscrete([n_selections, 10])

    def step(self, action):
        s, a, i = self.decode(action)
        reward = self.tutor.apply_sai(s, a, i)
        obs = self.get_rl_state()
        done = (s == 'done' and reward == 1.0)
        info = {}
        return obs, reward, done, info

    def decode(self, action):
        s = self.tutor.get_possible_selections()[action[0]]

        if s == "done":
            a = "ButtonPressed"
        else:
            a = "UpdateField"

        if s == "done":
            v = -1
        elif s == "check_convert":  # was a second `if` in the commit; see note above
            v = "x"
        else:
            v = action[1]

        i = {'value': str(v)}
        return s, a, i

    def reset(self):
        self.tutor.set_random_problem()
        obs = self.get_rl_state()
        return obs

    def render(self, mode='human', close=False):
        self.tutor.render()
```
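Once registered, the pixel environment can be exercised like any other Gym
environment. A quick smoke test (the random action here is only for
illustration):

```
import gym
import tutorenvs  # registers the environments

env = gym.make('MultiColumnArith-v2')
obs = env.reset()
print(obs.shape)  # grayscale image observation: (90, 50, 1)

obs, reward, done, info = env.step(env.action_space.sample())
print(reward, done)
```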