diff --git a/README.md b/README.md
index 1b4da26e708fce0c253cad775c3222251f221cb7..b9775de915f3e2238de40b7cbf2cd2317e603c05 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,64 @@
-This library contains headless versions of a number of commonly used tutoring system environments for training simulated students.
+This library contains headless versions of a number of commonly used tutoring
+system environments for training simulated students. There are currently two
+tutoring systems that can be loaded: a fraction arithmetic tutor and a
+multi-column arithmetic tutor.
 
-To create an AI Gym environment for the Fraction Arithmetic tutor use the following commands:
+The core of each tutor is a "Symbolic" variant. This class maintains the
+logic of the tutor and provides methods for getting the state description,
+applying selection-action-input (SAI) triples (which return feedback about
+whether the provided SAI is correct or incorrect), and requesting a
+demonstration (a valid SAI for the next step). The Apprentice Learner
+Architecture can interface directly with these symbolic variants of the
+tutors.
+
+Next, there are separate classes that wrap these symbolic tutors in OpenAI
+Gym environments that can be loaded by a reinforcement learning algorithm,
+such as those in the Stable Baselines library (e.g., the PPO2 algorithm).
+
+Currently, I am exploring multiple representations for the RL tutors:
+
+- Operators model: this is the closest to what the Apprentice Learner would
+  use. The agent perceives the same one-hot coded state features that the AL
+  agent would get. It has four discrete action outputs (a multi-discrete
+  space): the first is the selection (any selectable interface element in the
+  tutor), the second is an operator (copy, add, subtract, multiply, etc.;
+  these correspond to prior-knowledge operators/functions), and the last two
+  are the fields that get passed as input to the operator (e.g., two other
+  fields in the tutor interface that currently have values).
+
+- Digits model: this has the same input as the operators model (the one-hot
+  features that AL would get) but a different action output. It also has four
+  discrete action outputs (a multi-discrete space): the first is the
+  selection (as above), the second is a digit for the ones place (0-9), the
+  third is a digit for the tens place (0-9), and the fourth is a digit for
+  the hundreds place (0-9). The number of digit outputs varies by tutor,
+  depending on what is necessary to solve the task (e.g., the multi-column
+  arithmetic tutor only needs a single digit).
+
+- Pixel model: this has the same output as the digits model but a different
+  input. The model receives a black-and-white pixel representation of the
+  tutor interface. It is not identical to the interface a human student sees,
+  but it is a reasonable facsimile that includes all of the information that
+  exists in the human-facing tutor.
+
+These different representations are registered as OpenAI Gym environments
+under the names "FractionArith" and "MultiColumnArith", with version numbers
+"v0" (operators model), "v1" (digits model), and "v2" (pixel model).
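+
+As a quick way to see how these representations differ, the following sketch
+(an illustrative example, assuming the package and its dependencies are
+installed so that the environments above are registered) prints the
+observation and action space of each multi-column variant:
+
+```
+import gym
+import tutorenvs  # importing the package registers the environments
+
+for env_id in ['MultiColumnArith-v0', 'MultiColumnArith-v1',
+               'MultiColumnArith-v2']:
+    env = gym.make(env_id)
+    print(env_id, env.observation_space, env.action_space)
+```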
+
+As an example of how to create an operators-model environment for fraction
+arithmetic and train a PPO2 model from the Stable Baselines package, you
+would use the following code:
 
 ```
 import gym
 import tutorenvs
-env = gym.make('FractionArith-v0')
+from stable_baselines.common import make_vec_env
+from stable_baselines.common.policies import MlpPolicy
+from stable_baselines import PPO2
+
+env = make_vec_env('FractionArith-v0', n_envs=8)
+model = PPO2(MlpPolicy, env, verbose=1)
+model.learn(total_timesteps=2000000)
 ```
+
+See the code in the `sandbox/` folder for examples of how to train different
+kinds of agents on these different environments. The
+`sandbox/run_al_fractions.py` code shows how to train an Apprentice Learner
+agent on the headless fractions tutor.
diff --git a/sandbox/run_ppo_multicolumn.py b/sandbox/run_ppo_multicolumn.py
index 31cea306bd386d789965444c8f3181d010e38116..e0809f4a73e743e40b663c016112df1f5fa28561 100644
--- a/sandbox/run_ppo_multicolumn.py
+++ b/sandbox/run_ppo_multicolumn.py
@@ -1,6 +1,7 @@
 import gym
 from stable_baselines.common import make_vec_env
 from stable_baselines.common.policies import MlpPolicy
+from stable_baselines.common.policies import CnnPolicy
 from stable_baselines import PPO2
 from stable_baselines import SAC
 import tutorenvs
@@ -10,11 +11,15 @@ import numpy as np
 
 if __name__ == "__main__":
     # multiprocess environment
-    env = make_vec_env('MultiColumnArith-v0', n_envs=8)
-    model = PPO2(MlpPolicy, env, verbose=1,
+    env = make_vec_env('MultiColumnArith-v2', n_envs=8)
+    model = PPO2(CnnPolicy, env, verbose=1,
             gamma=0.5,
-            policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
+            policy_kwargs={'net_arch': [100, 100, {'vf': [65], 'pi': [65]}]},
             tensorboard_log="./ppo_MultiColumnArith-v0/")
+    # model = PPO2(MlpPolicy, env, verbose=1,
+    #         gamma=0.5,
+    #         # policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
+    #         tensorboard_log="./ppo_MultiColumnArith-v0/")
 
     while True:
         model.learn(total_timesteps=9999999999)
diff --git a/tutorenvs/__init__.py b/tutorenvs/__init__.py
index e89834c773ba3a1e01fd593df8a7900127b537c5..837554f5cec5f100c81c966dfcb9a43831dca397 100644
--- a/tutorenvs/__init__.py
+++ b/tutorenvs/__init__.py
@@ -3,23 +3,35 @@ from tutorenvs.fractions import FractionArithDigitsEnv
 from tutorenvs.fractions import FractionArithOppEnv
 from tutorenvs.multicolumn import MultiColumnAdditionOppEnv
 from tutorenvs.multicolumn import MultiColumnAdditionDigitsEnv
+from tutorenvs.multicolumn import MultiColumnAdditionPixelEnv
 
 register(
     id='FractionArith-v0',
-    entry_point='tutorenvs:FractionArithDigitsEnv',
+    entry_point='tutorenvs:FractionArithOppEnv',
 )
 
 register(
     id='FractionArith-v1',
-    entry_point='tutorenvs:FractionArithOppEnv',
+    entry_point='tutorenvs:FractionArithDigitsEnv',
 )
 
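+# Version convention for both tutors: v0 = operators model, v1 = digits model,
+# v2 = pixel model (see the README for details).
+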
+# TODO no pixel fractions yet.
+# register(
+#     id='FractionArith-v2',
+#     entry_point='tutorenvs:FractionArithPixelEnv',
+# )
+
 register(
     id='MultiColumnArith-v0',
-    entry_point='tutorenvs:MultiColumnAdditionDigitsEnv',
+    entry_point='tutorenvs:MultiColumnAdditionOppEnv',
 )
 
 register(
     id='MultiColumnArith-v1',
-    entry_point='tutorenvs:MultiColumnAdditionOppEnv',
+    entry_point='tutorenvs:MultiColumnAdditionDigitsEnv',
+)
+
+register(
+    id='MultiColumnArith-v2',
+    entry_point='tutorenvs:MultiColumnAdditionPixelEnv',
 )
diff --git a/tutorenvs/multicolumn.py b/tutorenvs/multicolumn.py
index 9649b5a9c969301054b350a3c3973c4be5a96da3..2da302b3ded89baca60a52442494e0fbdcb25b56 100644
--- a/tutorenvs/multicolumn.py
+++ b/tutorenvs/multicolumn.py
@@ -8,6 +8,7 @@ from gym.utils import seeding
 from sklearn.feature_extraction import FeatureHasher
 from sklearn.feature_extraction import DictVectorizer
 import numpy as np
+from PIL import Image, ImageDraw
 
 from tutorenvs.utils import BaseOppEnv
 
@@ -417,6 +418,31 @@ class MultiColumnAdditionDigitsEnv(gym.Env):
         #     'answer_tens': '',
         #     'answer_ones': ''
         # }
+        state = {attr: " " if self.tutor.state[attr] == '' else self.tutor.state[attr] for
+                 attr in self.tutor.state}
+
+        output = " %s%s%s \n %s%s%s\n+ %s%s%s\n-----\n %s%s%s%s\n" % (
+            state["hundreds_carry"],
+            state["tens_carry"],
+            state["ones_carry"],
+            state["upper_hundreds"],
+            state["upper_tens"],
+            state["upper_ones"],
+            state["lower_hundreds"],
+            state["lower_tens"],
+            state["lower_ones"],
+            state["answer_thousands"],
+            state["answer_hundreds"],
+            state["answer_tens"],
+            state["answer_ones"],
+        )
+
+        img = Image.new('RGB', (50, 90), color="white")
+        d = ImageDraw.Draw(img)
+        d.text((10, 10), output, fill='black')
+        img.save('test.png')
+        print(np.array(img))
+
         return self.tutor.state
 
     def __init__(self):
@@ -473,3 +499,101 @@
     def render(self, mode='human', close=False):
         self.tutor.render()
+
+class MultiColumnAdditionPixelEnv(gym.Env):
+    metadata = {'render.modes': ['human']}
+
+    def get_rl_state(self):
+        # self.state = {
+        #     'hundreds_carry': '',
+        #     'tens_carry': '',
+        #     'ones_carry': '',
+        #     'upper_hundreds': upper_hundreds,
+        #     'upper_tens': upper_tens,
+        #     'upper_ones': upper_ones,
+        #     'lower_hundreds': lower_hundreds,
+        #     'lower_tens': lower_tens,
+        #     'lower_ones': lower_ones,
+        #     'operator': '+',
+        #     'answer_thousands': '',
+        #     'answer_hundreds': '',
+        #     'answer_tens': '',
+        #     'answer_ones': ''
+        # }
+        state = {attr: " " if self.tutor.state[attr] == '' else self.tutor.state[attr] for
+                 attr in self.tutor.state}
+
+        output = " %s%s%s \n %s%s%s\n+ %s%s%s\n-----\n %s%s%s%s\n" % (
+            state["hundreds_carry"],
+            state["tens_carry"],
+            state["ones_carry"],
+            state["upper_hundreds"],
+            state["upper_tens"],
+            state["upper_ones"],
+            state["lower_hundreds"],
+            state["lower_tens"],
+            state["lower_ones"],
+            state["answer_thousands"],
+            state["answer_hundreds"],
+            state["answer_tens"],
+            state["answer_ones"],
+        )
+
+        img = Image.new('RGB', (50, 90), color="white")
+        d = ImageDraw.Draw(img)
+        d.text((10, 10), output, fill='black')
+        img = img.convert('L')
+        # img.save('test.png')
+        return np.expand_dims(np.array(img)/255, axis=2)
+
+    def __init__(self):
+        self.tutor = MultiColumnAdditionSymbolic()
+        n_selections = len(self.tutor.get_possible_selections())
+
+        print('shape = ', self.get_rl_state().shape)
+
+        self.observation_space = spaces.Box(low=0.0,
+            high=1.0, shape=self.get_rl_state().shape, dtype=np.float32)
+        self.action_space = spaces.MultiDiscrete([n_selections, 10])
+
+    def step(self, action):
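+        # Action encoding: action[0] indexes into
+        # self.tutor.get_possible_selections(); action[1] is the digit (0-9)
+        # to enter in the selected field, and is ignored when the selection
+        # is 'done'.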
+        s, a, i = self.decode(action)
+        # print(s, a, i)
+        # print()
+        reward = self.tutor.apply_sai(s, a, i)
+        # print(reward)
+
+        obs = self.get_rl_state()
+        # pprint(state)
+        done = (s == 'done' and reward == 1.0)
+        info = {}
+
+        return obs, reward, done, info
+
+    def decode(self, action):
+        # print(action)
+        s = self.tutor.get_possible_selections()[action[0]]
+
+        if s == "done":
+            a = "ButtonPressed"
+        else:
+            a = "UpdateField"
+
+        if s == "done":
+            v = -1
+        elif s == "check_convert":
+            v = "x"
+        else:
+            v = action[1]
+
+        i = {'value': str(v)}
+
+        return s, a, i
+
+    def reset(self):
+        self.tutor.set_random_problem()
+        obs = self.get_rl_state()
+        return obs
+
+    def render(self, mode='human', close=False):
+        self.tutor.render()
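+
+
+if __name__ == "__main__":
+    # Minimal smoke test (an illustrative sketch, not required by the
+    # environments above): build the pixel environment, reset it, and take a
+    # single random step to check the observation/reward plumbing.
+    env = MultiColumnAdditionPixelEnv()
+    obs = env.reset()
+    print('observation shape:', obs.shape)
+    obs, reward, done, info = env.step(env.action_space.sample())
+    print('reward:', reward, 'done:', done)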