diff --git a/sandbox/run_ppo_multi-v0.py b/sandbox/run_ppo_multi-v0.py
new file mode 100644
index 0000000000000000000000000000000000000000..42e856616c6f8b15987e052420a56f15e1f952fa
--- /dev/null
+++ b/sandbox/run_ppo_multi-v0.py
@@ -0,0 +1,35 @@
+import gym
+from stable_baselines.common import make_vec_env
+from stable_baselines.common.policies import MlpPolicy
+from stable_baselines.common.policies import CnnPolicy
+from stable_baselines import PPO2
+import tutorenvs
+import numpy as np
+
+
+if __name__ == "__main__":
+
+ # single vectorized environment (n_envs=1, so no multiprocessing)
+ env = make_vec_env('MultiColumnArith-v0', n_envs=1)
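+ # net_arch below means two shared 65-unit layers followed by separate
+ # 65-unit value (vf) and policy (pi) heads (stable-baselines policy_kwargs format)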
+ model = PPO2(MlpPolicy, env, verbose=1,
+ gamma=0.5,
+ policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
+ tensorboard_log="./tensorboard/v0/")
+
+ while True:
+ model.learn(total_timesteps=100)
+
+ # To demonstrate saving and loading
+ # model.save("ppo2_multicolumn-v0")
+ # del model
+ # model = PPO2.load("ppo2_multicolumn-v0")
+
+ # Enjoy trained agent
+ obs = env.reset()
+ rwd = 0
+ for _ in range(10000):
+ action, _states = model.predict(obs)
+ obs, rewards, dones, info = env.step(action)
+ rwd += np.sum(rewards)
+ env.render()
+ print(rwd)
diff --git a/sandbox/run_ppo_multi-v1.py b/sandbox/run_ppo_multi-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e44d9e277c46f00b8f50f808d389d2717a63e2f
--- /dev/null
+++ b/sandbox/run_ppo_multi-v1.py
@@ -0,0 +1,34 @@
+import gym
+from stable_baselines.common import make_vec_env
+from stable_baselines.common.policies import MlpPolicy
+from stable_baselines import PPO2
+import tutorenvs
+import numpy as np
+
+
+if __name__ == "__main__":
+
+ # multiprocess environment
+ env = make_vec_env('MultiColumnArith-v1', n_envs=9)
+ model = PPO2(MlpPolicy, env, verbose=1,
+ gamma=0.5,
+ policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
+ tensorboard_log="./tensorboard/")
+
+ while True:
+ model.learn(total_timesteps=100)
+
+ # To demonstrate saving and loading
+ # model.save("ppo2_multicolumn-v0")
+ # del model
+ # model = PPO2.load("ppo2_multicolumn-v0")
+
+ # Enjoy trained agent
+ obs = env.reset()
+ rwd = 0
+ for _ in range(10000):
+ action, _states = model.predict(obs)
+ obs, rewards, dones, info = env.step(action)
+ rwd += np.sum(rewards)
+ env.render()
+ print(rwd)
diff --git a/sandbox/run_ppo_multi-v2.py b/sandbox/run_ppo_multi-v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..256558ea8220ecd5fb6f2154a41055c868109f78
--- /dev/null
+++ b/sandbox/run_ppo_multi-v2.py
@@ -0,0 +1,35 @@
+import gym
+from stable_baselines.common import make_vec_env
+from stable_baselines.common.policies import MlpPolicy
+from stable_baselines.common.policies import CnnPolicy
+from stable_baselines import PPO2
+import tutorenvs
+import numpy as np
+
+
+if __name__ == "__main__":
+
+ # multiprocess environment
+ env = make_vec_env('MultiColumnArith-v2', n_envs=9)
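+ # CnnPolicy expects image observations; MultiColumnArith-v2 returns a
+ # grayscale image of the rendered problem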
+ model = PPO2(CnnPolicy, env, verbose=1,
+ gamma=0.5,
+ policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
+ tensorboard_log="./tensorboard/v2/")
+
+ while True:
+ model.learn(total_timesteps=100)
+
+ # To demonstrate saving and loading
+ # model.save("ppo2_multicolumn-v0")
+ # del model
+ # model = PPO2.load("ppo2_multicolumn-v0")
+
+ # Enjoy trained agent
+ obs = env.reset()
+ rwd = 0
+ for _ in range(10000000):
+ action, _states = model.predict(obs)
+ obs, rewards, dones, info = env.step(action)
+ rwd += np.sum(rewards)
+ env.render()
+ print(rwd)
diff --git a/sandbox/run_ppo_multi-v3.py b/sandbox/run_ppo_multi-v3.py
new file mode 100644
index 0000000000000000000000000000000000000000..5270a23d05a6f09e08308bca85e7e7acb14bc588
--- /dev/null
+++ b/sandbox/run_ppo_multi-v3.py
@@ -0,0 +1,35 @@
+import gym
+from stable_baselines.common import make_vec_env
+from stable_baselines.common.policies import MlpPolicy
+from stable_baselines.common.policies import CnnPolicy
+from stable_baselines import PPO2
+import tutorenvs
+import numpy as np
+
+
+if __name__ == "__main__":
+
+ # multiprocess environment
+ env = make_vec_env('MultiColumnArith-v3', n_envs=9)
+ model = PPO2(CnnPolicy, env, verbose=1,
+ gamma=0.95,
+ policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
+ tensorboard_log="./tensorboard/v3/")
+
+ while True:
+ model.learn(total_timesteps=10)
+
+ # To demonstrate saving and loading
+ # model.save("ppo2_multicolumn-v0")
+ # del model
+ # model = PPO2.load("ppo2_multicolumn-v0")
+
+ # Enjoy trained agent
+ obs = env.reset()
+ rwd = 0
+ for _ in range(100000):
+ action, _states = model.predict(obs)
+ obs, rewards, dones, info = env.step(action)
+ rwd += np.sum(rewards)
+ env.render()
+ print(rwd)
diff --git a/sandbox/run_ppo_multicolumn.py b/sandbox/run_ppo_multicolumn.py
deleted file mode 100644
index e0809f4a73e743e40b663c016112df1f5fa28561..0000000000000000000000000000000000000000
--- a/sandbox/run_ppo_multicolumn.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import gym
-from stable_baselines.common import make_vec_env
-from stable_baselines.common.policies import MlpPolicy
-from stable_baselines.common.policies import CnnPolicy
-from stable_baselines import PPO2
-from stable_baselines import SAC
-import tutorenvs
-import numpy as np
-
-
-if __name__ == "__main__":
-
- # multiprocess environment
- env = make_vec_env('MultiColumnArith-v2', n_envs=8)
- model = PPO2(CnnPolicy, env, verbose=1,
- gamma=0.5,
- policy_kwargs={'net_arch': [100, 100, {'vf': [65], 'pi': [65]}]},
- tensorboard_log="./ppo_MultiColumnArith-v0/")
- # model = PPO2(MlpPolicy, env, verbose=1,
- # gamma=0.5,
- # # policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
- # tensorboard_log="./ppo_MultiColumnArith-v0/")
-
- while True:
- model.learn(total_timesteps=9999999999)
- # model.save("ppo2_cartpole")
-
- # del model # remove to demonstrate saving and loading
-
- # model = PPO2.load("ppo2_cartpole")
-
- # Enjoy trained agent
- # obs = env.reset()
- # rwd = 0
- # for _ in range(100):
- # action, _states = model.predict(obs)
- # obs, rewards, dones, info = env.step(action)
- # rwd += np.sum(rewards)
- # env.render()
- # print(rwd)
diff --git a/tutorenvs/__init__.py b/tutorenvs/__init__.py
index 837554f5cec5f100c81c966dfcb9a43831dca397..491b28c8c3cbf36c67c4c9757f28d7e5182c06fd 100644
--- a/tutorenvs/__init__.py
+++ b/tutorenvs/__init__.py
@@ -4,6 +4,7 @@ from tutorenvs.fractions import FractionArithOppEnv
from tutorenvs.multicolumn import MultiColumnAdditionOppEnv
from tutorenvs.multicolumn import MultiColumnAdditionDigitsEnv
from tutorenvs.multicolumn import MultiColumnAdditionPixelEnv
+from tutorenvs.multicolumn import MultiColumnAdditionPerceptEnv
register(
id='FractionArith-v0',
@@ -35,3 +36,8 @@ register(
id='MultiColumnArith-v2',
entry_point='tutorenvs:MultiColumnAdditionPixelEnv',
)
+
+register(
+ id='MultiColumnArith-v3',
+ entry_point='tutorenvs:MultiColumnAdditionPerceptEnv',
+)
diff --git a/tutorenvs/multicolumn.py b/tutorenvs/multicolumn.py
index 2da302b3ded89baca60a52442494e0fbdcb25b56..e5091f50bb94e7231760426af082cbf377ca0b88 100644
--- a/tutorenvs/multicolumn.py
+++ b/tutorenvs/multicolumn.py
@@ -2,6 +2,7 @@ from random import randint
from random import choice
from pprint import pprint
+import cv2 # pytype:disable=import-error
import gym
from gym import error, spaces, utils
from gym.utils import seeding
@@ -84,7 +85,9 @@ class MultiColumnAdditionSymbolic:
if len(lower) == 1:
lower_ones = lower[0]
- self.steps = 0
+ self.num_correct_steps = 0
+ self.num_incorrect_steps = 0
+
self.state = {
'hundreds_carry': '',
'tens_carry': '',
@@ -129,9 +132,14 @@ class MultiColumnAdditionSymbolic:
'answer_ones',
]
- def render(self):
- state = {attr: " " if self.state[attr] == '' else self.state[attr] for
- attr in self.state}
+ def render(self, add_dot=None):
+ img = self.get_image(add_counts=True, add_dot=add_dot)
+ cv2.imshow('vecenv', np.array(img))
+ cv2.waitKey(1)
+
+ def get_image(self, add_counts=False, add_dot=None):
+ state = {attr: " " if self.state[attr] == '' else
+ self.state[attr] for attr in self.state}
output = " %s%s%s \n %s%s%s\n+ %s%s%s\n-----\n %s%s%s%s\n" % (
state["hundreds_carry"],
@@ -149,10 +157,45 @@ class MultiColumnAdditionSymbolic:
state["answer_ones"],
)
- print("------------------------------------------------------")
- print(output)
- print("------------------------------------------------------")
- print()
+ img = Image.new('RGB', (50, 90), color="white")
+ d = ImageDraw.Draw(img)
+ d.text((10, 10), output, fill='black')
+
+ # Draw input fields
+
+ # ones
+ if state['answer_ones'] == " ":
+ d.rectangle(((34, 71), (38, 79)), fill=None, outline='black')
+ # tens
+ if state['answer_tens'] == " ":
+ d.rectangle(((28, 71), (32, 79)), fill=None, outline='black')
+ # hundreds
+ if state['answer_hundreds'] == " ":
+ d.rectangle(((22, 71), (26, 79)), fill=None, outline='black')
+ # thousands
+ if state['answer_thousands'] == " ":
+ d.rectangle(((16, 71), (20, 79)), fill=None, outline='black')
+
+ # ones carry
+ if state['ones_carry'] == " ":
+ d.rectangle(((28, 11), (32, 19)), fill=None, outline='black')
+ # tens carry
+ if state['tens_carry'] == " ":
+ d.rectangle(((22, 11), (26, 19)), fill=None, outline='black')
+ # hundreds carry
+ if state['hundreds_carry'] == " ":
+ d.rectangle(((16, 11), (20, 19)), fill=None, outline='black')
+
+ # append correct/incorrect counts
+ if add_counts:
+ d.text((0, 0), str(self.num_incorrect_steps), fill="red")
+ d.text((0, 10), str(self.num_correct_steps), fill="green")
+
+ if add_dot:
+ d.ellipse((add_dot[0]-3, add_dot[1]-3, add_dot[0]+3, add_dot[1]+3),
+ fill=None, outline='blue')
+
+ return img
def get_state(self):
"""
@@ -190,16 +233,20 @@ class MultiColumnAdditionSymbolic:
"""
Give a SAI, it applies it. This method returns feedback (i.e., -1 or 1).
"""
- self.steps += 1
reward = self.evaluate_sai(selection, action, inputs)
+
+ if reward > 0:
+ self.num_correct_steps += 1
+ else:
+ self.num_incorrect_steps += 1
if reward == -1.0:
return reward
if selection == "done":
- print("DONE! Only took %i steps." % self.steps)
- self.render()
- print()
+ # print("DONE! Only took %i steps." % (self.num_correct_steps + self.num_incorrect_steps))
+ # self.render()
+ # print()
# pprint(self.state)
self.set_random_problem()
@@ -381,10 +428,10 @@ class MultiColumnAdditionOppEnv(BaseOppEnv):
def get_rl_operators(self):
return [
- 'copy',
- 'add',
- 'mod10',
- 'div10',
+ ('copy', 1),
+ ('add', 2),
+ ('mod10', 1),
+ ('div10', 1)
]
class MultiColumnAdditionDigitsEnv(gym.Env):
@@ -402,47 +449,6 @@ class MultiColumnAdditionDigitsEnv(gym.Env):
return training_data
def get_rl_state(self):
- # self.state = {
- # 'hundreds_carry': '',
- # 'tens_carry': '',
- # 'ones_carry': '',
- # 'upper_hundreds': upper_hundreds,
- # 'upper_tens': upper_tens,
- # 'upper_ones': upper_ones,
- # 'lower_hundreds': lower_hundreds,
- # 'lower_tens': lower_tens,
- # 'lower_ones': lower_ones,
- # 'operator': '+',
- # 'answer_thousands': '',
- # 'answer_hundreds': '',
- # 'answer_tens': '',
- # 'answer_ones': ''
- # }
- state = {attr: " " if self.tutor.state[attr] == '' else self.tutor.state[attr] for
- attr in self.tutor.state}
-
- output = " %s%s%s \n %s%s%s\n+ %s%s%s\n-----\n %s%s%s%s\n" % (
- state["hundreds_carry"],
- state["tens_carry"],
- state["ones_carry"],
- state["upper_hundreds"],
- state["upper_tens"],
- state["upper_ones"],
- state["lower_hundreds"],
- state["lower_tens"],
- state["lower_ones"],
- state["answer_thousands"],
- state["answer_hundreds"],
- state["answer_tens"],
- state["answer_ones"],
- )
-
- img = Image.new('RGB', (50, 90), color="white")
- d = ImageDraw.Draw(img)
- d.text((10, 10), output, fill='black')
- img.save('test.png')
- print(np.array(img))
-
return self.tutor.state
def __init__(self):
@@ -504,46 +510,7 @@ class MultiColumnAdditionPixelEnv(gym.Env):
metadata = {'render.modes': ['human']}
def get_rl_state(self):
- # self.state = {
- # 'hundreds_carry': '',
- # 'tens_carry': '',
- # 'ones_carry': '',
- # 'upper_hundreds': upper_hundreds,
- # 'upper_tens': upper_tens,
- # 'upper_ones': upper_ones,
- # 'lower_hundreds': lower_hundreds,
- # 'lower_tens': lower_tens,
- # 'lower_ones': lower_ones,
- # 'operator': '+',
- # 'answer_thousands': '',
- # 'answer_hundreds': '',
- # 'answer_tens': '',
- # 'answer_ones': ''
- # }
- state = {attr: " " if self.tutor.state[attr] == '' else self.tutor.state[attr] for
- attr in self.tutor.state}
-
- output = " %s%s%s \n %s%s%s\n+ %s%s%s\n-----\n %s%s%s%s\n" % (
- state["hundreds_carry"],
- state["tens_carry"],
- state["ones_carry"],
- state["upper_hundreds"],
- state["upper_tens"],
- state["upper_ones"],
- state["lower_hundreds"],
- state["lower_tens"],
- state["lower_ones"],
- state["answer_thousands"],
- state["answer_hundreds"],
- state["answer_tens"],
- state["answer_ones"],
- )
-
- img = Image.new('RGB', (50, 90), color="white")
- d = ImageDraw.Draw(img)
- d.text((10, 10), output, fill='black')
- img = img.convert('L')
- # img.save('test.png')
+ img = self.tutor.get_image().convert('L')
return np.expand_dims(np.array(img)/255, axis=2)
def __init__(self):
@@ -596,4 +563,150 @@ class MultiColumnAdditionPixelEnv(gym.Env):
return obs
def render(self, mode='human', close=False):
- self.tutor.render()
+ if mode == "rgb_array":
+ return np.array(self.tutor.get_image(add_counts=True))
+
+ elif mode == "human":
+ self.tutor.render()
+
+class MultiColumnAdditionPerceptEnv(gym.Env):
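+ """
+ Pixel-based variant where the agent steers a movable focus point over the
+ rendered problem. Actions 0-3 move the point, action 4 presses done, and
+ actions 5-14 type the digits 0-9 into whichever field the point is over.
+ """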
+ metadata = {'render.modes': ['human']}
+
+ def __init__(self):
+ self.targets = ['answer_ones', 'ones_carry', 'answer_tens',
+ 'tens_carry', 'answer_hundreds', 'hundreds_carry',
+ 'answer_thousands']
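+ # Approximate (x, y) pixel positions near each target field above;
+ # current_target indexes this list to set the initial focus position.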
+ self.target_xy = [
+ (36, 83),
+ (30, 15),
+ (30, 83),
+ (24, 15),
+ (24, 83),
+ (18, 15),
+ (18, 83)
+ ]
+
+ self.current_target = 0
+
+ self.set_xy()
+
+ self.tutor = MultiColumnAdditionSymbolic()
+ n_selections = len(self.tutor.get_possible_selections())
+
+ print('shape = ', self.get_rl_state().shape)
+
+ self.observation_space = spaces.Box(low=0.0,
+ high=1.0, shape=self.get_rl_state().shape, dtype=np.float32)
+ # self.action_space = spaces.MultiDiscrete([n_selections, 10])
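+ # 15 discrete actions: 4 cursor moves, 1 done press, 10 digit entries (0-9)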
+ self.action_space = spaces.Discrete(15)
+
+ def set_xy(self):
+ self.x, self.y = self.target_xy[self.current_target]
+
+ def get_rl_state(self):
+ img = self.tutor.get_image().convert('L')
+ x = self.x - 50
+ y = self.y - 90
+
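+ # Translate onto a double-sized canvas so the focus point (self.x, self.y)
+ # lands at the canvas centre, giving an egocentric view of the problem.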
+ translate = img.transform((img.size[0]*2, img.size[1]*2), Image.AFFINE, (1, 0, x, 0, 1, y))
+ # cv2.imshow('translated', np.array(translate))
+ # cv2.waitKey(1)
+ return np.expand_dims(np.array(translate)/255, axis=2)
+
+ def step(self, action):
+ s = None
+ reward = -0.01
+
+ if action == 0:
+ # left
+ self.x -= 5
+ elif action == 1:
+ # right
+ self.x += 5
+ elif action == 2:
+ # up (image y grows downward, so moving up decreases y)
+ self.y -= 5
+ elif action == 3:
+ # down
+ self.y += 5
+ elif action == 4:
+ s = "done"
+ a = "ButtonPressed"
+ i = -1
+ else:
+
+ # answer fields
+ if self.x >= 34 and self.y >= 71 and self.x <= 38 and self.y <=79:
+ s = "answer_ones"
+ elif self.x >= 28 and self.y >= 71 and self.x <= 32 and self.y <=79:
+ s = "answer_tens"
+ elif self.x >= 22 and self.y >= 71 and self.x <= 26 and self.y <=79:
+ s = "answer_hundreds"
+ elif self.x >= 16 and self.y >= 71 and self.x <= 20 and self.y <=79:
+ s = "answer_thousands"
+
+ # carry fields
+ elif self.x >= 28 and self.y >= 11 and self.x <= 32 and self.y <=19:
+ s = "ones_carry"
+ elif self.x >= 22 and self.y >= 11 and self.x <= 26 and self.y <=19:
+ s = "tens_carry"
+ elif self.x >= 16 and self.y >= 11 and self.x <= 20 and self.y <=19:
+ s = "hundreds_carry"
+
+ a = 'UpdateField'
+ i = {'value': str(action - 5)}
+
+ if s is not None:
+ reward = self.tutor.apply_sai(s, a, i)
+
+ self.x = min(max(self.x, 0), 50)
+ self.y = min(max(self.y, 0), 90)
+
+ obs = self.get_rl_state()
+ done = (s == 'done' and reward == 1.0)
+ info = {}
+ return obs, reward, done, info
+
+ def decode(self, action):
+ # print(action)
+ s = self.tutor.get_possible_selections()[action[0]]
+
+ if s == "done":
+ a = "ButtonPressed"
+ else:
+ a = "UpdateField"
+
+ if s == "done":
+ v = -1
+ if s == "check_convert":
+ v = "x"
+ else:
+ v = action[1]
+
+ i = {'value': str(v)}
+
+ return s, a, i
+
+ def reset(self):
+ self.tutor.set_random_problem()
+ obs = self.get_rl_state()
+ return obs
+
+ def render(self, mode='human', close=False):
+ if mode == "rgb_array":
+ return np.array(self.tutor.get_image(add_counts=True, add_dot=(self.x, self.y)))
+
+ elif mode == "human":
+ self.tutor.render(add_dot=(self.x, self.y))
diff --git a/tutorenvs/utils.py b/tutorenvs/utils.py
index 42479be76b9a1c3231c858843e5303b84f196e09..43c100c5a6d1bf1affac01db5839306adfc1b5db 100644
--- a/tutorenvs/utils.py
+++ b/tutorenvs/utils.py
@@ -5,53 +5,114 @@ from gym import error, spaces, utils
from sklearn.feature_extraction import DictVectorizer
import numpy as np
+class OnlineDictVectorizer():
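+ """
+ An incremental, fixed-width alternative to sklearn's DictVectorizer:
+ feature columns are assigned on the fly as new attributes (or new string
+ values) appear, up to a fixed budget of n_features.
+
+ Illustrative usage (names below are just an example):
+
+ dv = OnlineDictVectorizer(n_features=4)
+ dv.fit_transform([{'a': 1, 'b': 'x'}]) # assigns columns for 'a' and 'b=x'
+ dv.transform([{'a': 2, 'b': 'y'}]) # 'b=y' was never fit, so it is ignored
+ """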
+
+ def __init__(self, n_features):
+ self.n_features = n_features
+ self.separator = '='
+ self.dtype = np.float32
+ self.reset()
+
+ def reset(self):
+ self.key = {}
+
+ def fit(self, X):
+ """
+ Given a list of example dicts X, updates the feature key with any previously unseen features.
+ """
+
+ for x in X:
+ for f, v in x.items():
+ if isinstance(v, str):
+ f = "%s%s%s" % (f, self.separator, v)
+ if f not in self.key:
+ if len(self.key) < self.n_features:
+ self.key[f] = len(self.key)
+ else:
+ print("Exceeded available features")
+
+ return self
+
+ def transform(self, X):
+ """
+ Transforms the data using existing key mappings.
+ """
+ new_X = np.zeros((len(X), self.n_features), dtype=self.dtype)
+
+ for i, x in enumerate(X):
+ for f, v in x.items():
+ if isinstance(v, str):
+ f = "%s%s%s" % (f, self.separator, v)
+ v = 1
+ try:
+ new_X[i, self.key[f]] = self.dtype(v)
+ except KeyError:
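+ # features never seen by fit are silently ignored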
+ pass
+
+ return new_X
+
+ def fit_transform(self, X):
+ """
+ Equivalent to calling fit followed by transform, but makes a single
+ pass over the data instead of two.
+ """
+ new_X = np.zeros((len(X), self.n_features), dtype=self.dtype)
+
+ for i, x in enumerate(X):
+ for f, v in x.items():
+ if isinstance(v, str):
+ f = "%s%s%s" % (f, self.separator, v)
+ v = 1
+
+ if f not in self.key:
+ if len(self.key) < self.n_features:
+ self.key[f] = len(self.key)
+ else:
+ print("Exceeded available features")
+
+ try:
+ new_X[i, self.key[f]] = self.dtype(v)
+ except KeyError:
+ pass
+
+ return new_X
+
+
class BaseOppEnv(gym.Env):
metadata = {'render.modes': ['human']}
- def __init__(self, tutor_class, max_depth=1):
+ def __init__(self, tutor_class, max_depth=0):
print('building env')
self.tutor = tutor_class()
self.max_depth = max_depth
self.internal_memory = {}
- self.possible_attr = set(self.tutor.get_possible_args())
- for _ in range(self.max_depth):
- new = set()
- for opp in self.get_rl_operators():
- for a1 in self.possible_attr:
- for a2 in self.possible_attr:
- new.add((opp, a1, a2))
- self.possible_attr = self.possible_attr.union(new)
- print('# features = %i' % len(self.possible_attr))
-
- self.possible_args = list(set([attr[1] if isinstance(attr, tuple) else
- attr for attr in self.possible_attr]))
- print('# args = %i' % len(self.possible_args))
-
- # one additional option to save result internally
n_selections = len(self.tutor.get_possible_selections()) + 1
- print('getting rl state')
- n_features = len(self.get_rl_state())
- print('done getting rl state')
n_operators = len(self.get_rl_operators())
- n_args = len(self.possible_args)
- self.dv = DictVectorizer()
- self.dv.fit([self.get_rl_state()])
+ n_args = len(self.tutor.get_possible_args())
+
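+ # Rough feature budget: base state attributes plus an estimate of how many
+ # operator-derived values can land in internal memory (each operator applied
+ # to every argument combination, compounded per depth level).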
+ branching = 0
+ for opp, arg_count in self.get_rl_operators():
+ branching += n_args**arg_count
+
+ n_features = len(self.tutor.state) + branching**max_depth
+ print('# features = %i' % n_features)
+
+ self.dv = OnlineDictVectorizer(n_features=n_features)
self.observation_space = spaces.Box(low=0.0,
high=1.0, shape=(1, n_features), dtype=np.float32)
self.action_space = spaces.MultiDiscrete([n_selections, n_operators,
n_args, n_args])
- print('done')
def get_rl_operators(self):
return [
- 'copy',
- 'add',
- 'multiply',
- 'mod10',
- 'div10',
+ ('copy', 1),
+ ('add', 2),
+ ('multiply', 2),
+ ('mod10', 1),
+ ('div10', 1)
]
def get_rl_state(self):
@@ -83,11 +144,8 @@ class BaseOppEnv(gym.Env):
state[attr] = self.tutor.state[attr] != ""
# if its in internal memory, then return true, else false.
- for possible_attr in self.possible_attr:
- state[possible_attr] = possible_attr in self.internal_memory
-
- print('done with base attributes in state')
- print('# of base attributes = %i' % len(state))
+ for attr in self.internal_memory:
+ state[attr] = True
# relations (equality, >10)
new_relations = {}
@@ -107,32 +165,21 @@ class BaseOppEnv(gym.Env):
except Exception:
new_relations['greater_than_9(%s)' % str(attr)] = False
- # # equality
- # for attr2 in state:
- # if str(attr) >= str(attr2):
- # continue
-
- # attr2_val = None
- # if attr2 in self.tutor.state:
- # attr2_val = self.tutor.state[attr2]
- # elif attr2 in self.internal_memory:
- # attr2_val = self.internal_memory[attr2]
- # else:
- # attr2_val = ''
- # new_relations['eq(%s,%s)' % (attr, attr2)] = attr_val == attr2_val
-
- print('done with creating new relations')
- print('# of new relations = %i' % len(new_relations))
-
for attr in new_relations:
state[attr] = new_relations[attr]
+
+ return state
# convert all attributes to strings
- return {str(attr): state[attr] for attr in state}
+ # return {str(attr): state[attr] for attr in state}
def step(self, action):
try:
s, a, i = self.decode(action)
+
if s in self.internal_memory or i == '':
@@ -153,7 +200,7 @@ class BaseOppEnv(gym.Env):
state = self.get_rl_state()
# pprint(state)
- obs = self.dv.transform([state])[0].toarray()
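+ # use fit_transform so newly encountered state attributes are assigned
+ # feature columns on the fly during training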
+ obs = self.dv.fit_transform([state])[0]
info = {}
return obs, reward, done, info
@@ -191,12 +238,17 @@ class BaseOppEnv(gym.Env):
def decode(self, action):
# print(action)
- op = self.get_rl_operators()[action[1]]
- arg1 = self.possible_args[action[2]]
- arg2 = self.possible_args[action[3]]
+ op, arg_count = self.get_rl_operators()[action[1]]
+ arg1 = self.tutor.get_possible_args()[action[2]]
+ arg2 = self.tutor.get_possible_args()[action[3]]
if action[0] == len(self.tutor.get_possible_selections()):
- s = (opp, arg1, arg2)
+ if op == "copy":
+ raise ValueError("cannot copy into internal memory")
+ if arg_count == 1:
+ s = (op, arg1)
+ elif arg_count == 2:
+ s = (op, arg1, arg2)
else:
s = self.tutor.get_possible_selections()[action[0]]
@@ -220,7 +272,7 @@ class BaseOppEnv(gym.Env):
self.tutor.set_random_problem()
state = self.get_rl_state()
self.internal_memory = {}
- obs = self.dv.transform([state])[0].toarray()
+ obs = self.dv.transform([state])[0]
return obs
def render(self, mode='human', close=False):