diff --git a/sandbox/run_ppo_multi-v0.py b/sandbox/run_ppo_multi-v0.py
new file mode 100644
index 0000000000000000000000000000000000000000..42e856616c6f8b15987e052420a56f15e1f952fa
--- /dev/null
+++ b/sandbox/run_ppo_multi-v0.py
@@ -0,0 +1,35 @@
+import gym
+from stable_baselines.common import make_vec_env
+from stable_baselines.common.policies import MlpPolicy
+from stable_baselines.common.policies import CnnPolicy
+from stable_baselines import PPO2
+import tutorenvs
+import numpy as np
+
+
+if __name__ == "__main__":
+
+ # single vectorized environment (n_envs=1, so no multiprocessing)
+ env = make_vec_env('MultiColumnArith-v0', n_envs=1)
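+ # net_arch below means two shared 65-unit layers followed by separate
+ # 65-unit value (vf) and policy (pi) heads (stable-baselines policy_kwargs format)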
+ model = PPO2(MlpPolicy, env, verbose=1,
+ gamma=0.5,
+ policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
+ tensorboard_log="./tensorboard/v0/")
+
+ while True:
+ model.learn(total_timesteps=100)
+
+ # To demonstrate saving and loading
+ # model.save("ppo2_multicolumn-v0")
+ # del model
+ # model = PPO2.load("ppo2_multicolumn-v0")
+
+ # Enjoy trained agent
+ obs = env.reset()
+ rwd = 0
+ for _ in range(10000):
+ action, _states = model.predict(obs)
+ obs, rewards, dones, info = env.step(action)
+ rwd += np.sum(rewards)
+ env.render()
+ print(rwd)
diff --git a/sandbox/run_ppo_multi-v1.py b/sandbox/run_ppo_multi-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e44d9e277c46f00b8f50f808d389d2717a63e2f
--- /dev/null
+++ b/sandbox/run_ppo_multi-v1.py
@@ -0,0 +1,34 @@
+import gym
+from stable_baselines.common import make_vec_env
+from stable_baselines.common.policies import MlpPolicy
+from stable_baselines import PPO2
+import tutorenvs
+import numpy as np
+
+
+if __name__ == "__main__":
+
+ # multiprocess environment
+ env = make_vec_env('MultiColumnArith-v1', n_envs=9)
+ model = PPO2(MlpPolicy, env, verbose=1,
+ gamma=0.5,
+ policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
+ tensorboard_log="./tensorboard/")
+
+ while True:
+ model.learn(total_timesteps=100)
+
+ # To demonstrate saving and loading
+ # model.save("ppo2_multicolumn-v0")
+ # del model
+ # model = PPO2.load("ppo2_multicolumn-v0")
+
+ # Enjoy trained agent
+ obs = env.reset()
+ rwd = 0
+ for _ in range(10000):
+ action, _states = model.predict(obs)
+ obs, rewards, dones, info = env.step(action)
+ rwd += np.sum(rewards)
+ env.render()
+ print(rwd)
diff --git a/sandbox/run_ppo_multi-v2.py b/sandbox/run_ppo_multi-v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..256558ea8220ecd5fb6f2154a41055c868109f78
--- /dev/null
+++ b/sandbox/run_ppo_multi-v2.py
@@ -0,0 +1,35 @@
+import gym
+from stable_baselines.common import make_vec_env
+from stable_baselines.common.policies import MlpPolicy
+from stable_baselines.common.policies import CnnPolicy
+from stable_baselines import PPO2
+import tutorenvs
+import numpy as np
+
+
+if __name__ == "__main__":
+
+ # multiprocess environment
+ env = make_vec_env('MultiColumnArith-v2', n_envs=9)
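+ # CnnPolicy expects image observations; MultiColumnArith-v2 returns a
+ # grayscale image of the rendered problem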
+ model = PPO2(CnnPolicy, env, verbose=1,
+ gamma=0.5,
+ policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
+ tensorboard_log="./tensorboard/v2/")
+
+ while True:
+ model.learn(total_timesteps=100)
+
+ # To demonstrate saving and loading
+ # model.save("ppo2_multicolumn-v0")
+ # del model
+ # model = PPO2.load("ppo2_multicolumn-v0")
+
+ # Enjoy trained agent
+ obs = env.reset()
+ rwd = 0
+ for _ in range(10000000):
+ action, _states = model.predict(obs)
+ obs, rewards, dones, info = env.step(action)
+ rwd += np.sum(rewards)
+ env.render()
+ print(rwd)
diff --git a/sandbox/run_ppo_multi-v3.py b/sandbox/run_ppo_multi-v3.py
new file mode 100644
index 0000000000000000000000000000000000000000..5270a23d05a6f09e08308bca85e7e7acb14bc588
--- /dev/null
+++ b/sandbox/run_ppo_multi-v3.py
@@ -0,0 +1,35 @@
+import gym
+from stable_baselines.common import make_vec_env
+from stable_baselines.common.policies import MlpPolicy
+from stable_baselines.common.policies import CnnPolicy
+from stable_baselines import PPO2
+import tutorenvs
+import numpy as np
+
+
+if __name__ == "__main__":
+
+ # multiprocess environment
+ env = make_vec_env('MultiColumnArith-v3', n_envs=9)
+ model = PPO2(CnnPolicy, env, verbose=1,
+ gamma=0.95,
+ policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
+ tensorboard_log="./tensorboard/v3/")
+
+ while True:
+ model.learn(total_timesteps=10)
+
+ # To demonstrate saving and loading
+ # model.save("ppo2_multicolumn-v0")
+ # del model
+ # model = PPO2.load("ppo2_multicolumn-v0")
+
+ # Enjoy trained agent
+ obs = env.reset()
+ rwd = 0
+ for _ in range(100000):
+ action, _states = model.predict(obs)
+ obs, rewards, dones, info = env.step(action)
+ rwd += np.sum(rewards)
+ env.render()
+ print(rwd)
diff --git a/sandbox/run_ppo_multicolumn.py b/sandbox/run_ppo_multicolumn.py
deleted file mode 100644
index e0809f4a73e743e40b663c016112df1f5fa28561..0000000000000000000000000000000000000000
--- a/sandbox/run_ppo_multicolumn.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import gym
-from stable_baselines.common import make_vec_env
-from stable_baselines.common.policies import MlpPolicy
-from stable_baselines.common.policies import CnnPolicy
-from stable_baselines import PPO2
-from stable_baselines import SAC
-import tutorenvs
-import numpy as np
-
-
-if __name__ == "__main__":
-
- # multiprocess environment
- env = make_vec_env('MultiColumnArith-v2', n_envs=8)
- model = PPO2(CnnPolicy, env, verbose=1,
- gamma=0.5,
- policy_kwargs={'net_arch': [100, 100, {'vf': [65], 'pi': [65]}]},
- tensorboard_log="./ppo_MultiColumnArith-v0/")
- # model = PPO2(MlpPolicy, env, verbose=1,
- # gamma=0.5,
- # # policy_kwargs={'net_arch': [65, 65, {'vf': [65], 'pi': [65]}]},
- # tensorboard_log="./ppo_MultiColumnArith-v0/")
-
- while True:
- model.learn(total_timesteps=9999999999)
- # model.save("ppo2_cartpole")
-
- # del model # remove to demonstrate saving and loading
-
- # model = PPO2.load("ppo2_cartpole")
-
- # Enjoy trained agent
- # obs = env.reset()
- # rwd = 0
- # for _ in range(100):
- # action, _states = model.predict(obs)
- # obs, rewards, dones, info = env.step(action)
- # rwd += np.sum(rewards)
- # env.render()
- # print(rwd)
diff --git a/tutorenvs/__init__.py b/tutorenvs/__init__.py
index 837554f5cec5f100c81c966dfcb9a43831dca397..491b28c8c3cbf36c67c4c9757f28d7e5182c06fd 100644
--- a/tutorenvs/__init__.py
+++ b/tutorenvs/__init__.py
@@ -4,6 +4,7 @@ from tutorenvs.fractions import FractionArithOppEnv
from tutorenvs.multicolumn import MultiColumnAdditionOppEnv
from tutorenvs.multicolumn import MultiColumnAdditionDigitsEnv
from tutorenvs.multicolumn import MultiColumnAdditionPixelEnv
+from tutorenvs.multicolumn import MultiColumnAdditionPerceptEnv
register(
id='FractionArith-v0',
@@ -35,3 +36,8 @@ register(
id='MultiColumnArith-v2',
entry_point='tutorenvs:MultiColumnAdditionPixelEnv',
)
+
+register(
+ id='MultiColumnArith-v3',
+ entry_point='tutorenvs:MultiColumnAdditionPerceptEnv',
+)
diff --git a/tutorenvs/multicolumn.py b/tutorenvs/multicolumn.py
index 2da302b3ded89baca60a52442494e0fbdcb25b56..e5091f50bb94e7231760426af082cbf377ca0b88 100644
--- a/tutorenvs/multicolumn.py
+++ b/tutorenvs/multicolumn.py
@@ -2,6 +2,7 @@ from random import randint
from random import choice
from pprint import pprint
+import cv2 # pytype:disable=import-error
import gym
from gym import error, spaces, utils
from gym.utils import seeding
@@ -84,7 +85,9 @@ class MultiColumnAdditionSymbolic:
if len(lower) == 1:
lower_ones = lower[0]
- self.steps = 0
+ self.num_correct_steps = 0
+ self.num_incorrect_steps = 0
+
self.state = {
'hundreds_carry': '',
'tens_carry': '',
@@ -129,9 +132,14 @@ class MultiColumnAdditionSymbolic:
'answer_ones',
]
- def render(self):
- state = {attr: " " if self.state[attr] == '' else self.state[attr] for
- attr in self.state}
+ def render(self, add_dot=None):
+ img = self.get_image(add_counts=True, add_dot=add_dot)
+ cv2.imshow('vecenv', np.array(img))
+ cv2.waitKey(1)
+
+ def get_image(self, add_counts=False, add_dot=None):
+ state = {attr: " " if self.state[attr] == '' else
+ self.state[attr] for attr in self.state}
output = " %s%s%s \n %s%s%s\n+ %s%s%s\n-----\n %s%s%s%s\n" % (
state["hundreds_carry"],
@@ -149,10 +157,45 @@ class MultiColumnAdditionSymbolic:
state["answer_ones"],
)
- print("------------------------------------------------------")
- print(output)
- print("------------------------------------------------------")
- print()
+ img = Image.new('RGB', (50, 90), color="white")
+ d = ImageDraw.Draw(img)
+ d.text((10, 10), output, fill='black')
+
+ # Draw input fields
+
+ # ones
+ if state['answer_ones'] == " ":
+ d.rectangle(((34, 71), (38, 79)), fill=None, outline='black')
+ # tens
+ if state['answer_tens'] == " ":
+ d.rectangle(((28, 71), (32, 79)), fill=None, outline='black')
+ # hundreds
+ if state['answer_hundreds'] == " ":
+ d.rectangle(((22, 71), (26, 79)), fill=None, outline='black')
+ # thousands
+ if state['answer_thousands'] == " ":
+ d.rectangle(((16, 71), (20, 79)), fill=None, outline='black')
+
+ # ones carry
+ if state['ones_carry'] == " ":
+ d.rectangle(((28, 11), (32, 19)), fill=None, outline='black')
+ # tens carry
+ if state['tens_carry'] == " ":
+ d.rectangle(((22, 11), (26, 19)), fill=None, outline='black')
+ # hundreds carry
+ if state['hundreds_carry'] == " ":
+ d.rectangle(((16, 11), (20, 19)), fill=None, outline='black')
+
+ # append correct/incorrect counts
+ if add_counts:
+ d.text((0, 0), str(self.num_incorrect_steps), fill="red")
+ d.text((0, 10), str(self.num_correct_steps), fill="green")
+
+ if add_dot:
+ d.ellipse((add_dot[0]-3, add_dot[1]-3, add_dot[0]+3, add_dot[1]+3),
+ fill=None, outline='blue')
+
+ return img
def get_state(self):
"""
@@ -190,16 +233,20 @@ class MultiColumnAdditionSymbolic:
"""
Give a SAI, it applies it. This method returns feedback (i.e., -1 or 1).
"""
- self.steps += 1
reward = self.evaluate_sai(selection, action, inputs)
+
+ if reward > 0:
+ self.num_correct_steps += 1
+ else:
+ self.num_incorrect_steps += 1
if reward == -1.0:
return reward
if selection == "done":
- print("DONE! Only took %i steps." % self.steps)
- self.render()
- print()
+ # print("DONE! Only took %i steps." % (self.num_correct_steps + self.num_incorrect_steps))
+ # self.render()
+ # print()
# pprint(self.state)
self.set_random_problem()
@@ -381,10 +428,10 @@ class MultiColumnAdditionOppEnv(BaseOppEnv):
def get_rl_operators(self):
return [
- 'copy',
- 'add',
- 'mod10',
- 'div10',
+ ('copy', 1),
+ ('add', 2),
+ ('mod10', 1),
+ ('div10', 1)
]
class MultiColumnAdditionDigitsEnv(gym.Env):
@@ -402,47 +449,6 @@ class MultiColumnAdditionDigitsEnv(gym.Env):
return training_data
def get_rl_state(self):
- # self.state = {
- # 'hundreds_carry': '',
- # 'tens_carry': '',
- # 'ones_carry': '',
- # 'upper_hundreds': upper_hundreds,
- # 'upper_tens': upper_tens,
- # 'upper_ones': upper_ones,
- # 'lower_hundreds': lower_hundreds,
- # 'lower_tens': lower_tens,
- # 'lower_ones': lower_ones,
- # 'operator': '+',
- # 'answer_thousands': '',
- # 'answer_hundreds': '',
- # 'answer_tens': '',
- # 'answer_ones': ''
- # }
- state = {attr: " " if self.tutor.state[attr] == '' else self.tutor.state[attr] for
- attr in self.tutor.state}
-
- output = " %s%s%s \n %s%s%s\n+ %s%s%s\n-----\n %s%s%s%s\n" % (
- state["hundreds_carry"],
- state["tens_carry"],
- state["ones_carry"],
- state["upper_hundreds"],
- state["upper_tens"],
- state["upper_ones"],
- state["lower_hundreds"],
- state["lower_tens"],
- state["lower_ones"],
- state["answer_thousands"],
- state["answer_hundreds"],
- state["answer_tens"],
- state["answer_ones"],
- )
-
- img = Image.new('RGB', (50, 90), color="white")
- d = ImageDraw.Draw(img)
- d.text((10, 10), output, fill='black')
- img.save('test.png')
- print(np.array(img))
-
return self.tutor.state
def __init__(self):
@@ -504,46 +510,7 @@ class MultiColumnAdditionPixelEnv(gym.Env):
metadata = {'render.modes': ['human']}
def get_rl_state(self):
- # self.state = {
- # 'hundreds_carry': '',
- # 'tens_carry': '',
- # 'ones_carry': '',
- # 'upper_hundreds': upper_hundreds,
- # 'upper_tens': upper_tens,
- # 'upper_ones': upper_ones,
- # 'lower_hundreds': lower_hundreds,
- # 'lower_tens': lower_tens,
- # 'lower_ones': lower_ones,
- # 'operator': '+',
- # 'answer_thousands': '',
- # 'answer_hundreds': '',
- # 'answer_tens': '',
- # 'answer_ones': ''
- # }
- state = {attr: " " if self.tutor.state[attr] == '' else self.tutor.state[attr] for
- attr in self.tutor.state}
-
- output = " %s%s%s \n %s%s%s\n+ %s%s%s\n-----\n %s%s%s%s\n" % (
- state["hundreds_carry"],
- state["tens_carry"],
- state["ones_carry"],
- state["upper_hundreds"],
- state["upper_tens"],
- state["upper_ones"],
- state["lower_hundreds"],
- state["lower_tens"],
- state["lower_ones"],
- state["answer_thousands"],
- state["answer_hundreds"],
- state["answer_tens"],
- state["answer_ones"],
- )
-
- img = Image.new('RGB', (50, 90), color="white")
- d = ImageDraw.Draw(img)
- d.text((10, 10), output, fill='black')
- img = img.convert('L')
- # img.save('test.png')
+ img = self.tutor.get_image().convert('L')
return np.expand_dims(np.array(img)/255, axis=2)
def __init__(self):
@@ -596,4 +563,150 @@ class MultiColumnAdditionPixelEnv(gym.Env):
return obs
def render(self, mode='human', close=False):
- self.tutor.render()
+ if mode == "rgb_array":
+ return np.array(self.tutor.get_image(add_counts=True))
+
+ elif mode == "human":
+ self.tutor.render()
+
+class MultiColumnAdditionPerceptEnv(gym.Env):
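+ """
+ Pixel-based variant where the agent steers a movable focus point over the
+ rendered problem. Actions 0-3 move the point, action 4 presses done, and
+ actions 5-14 type the digits 0-9 into whichever field the point is over.
+ """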
+ metadata = {'render.modes': ['human']}
+
+ def __init__(self):
+ self.targets = ['answer_ones', 'ones_carry', 'answer_tens',
+ 'tens_carry', 'answer_hundreds', 'hundreds_carry',
+ 'answer_thousands']
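+ # Approximate (x, y) pixel positions near each target field above;
+ # current_target indexes this list to set the initial focus position.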
+ self.target_xy = [
+ (36, 83),
+ (30, 15),
+ (30, 83),
+ (24, 15),
+ (24, 83),
+ (18, 15),
+ (18, 83)
+ ]
+
+ self.current_target = 0
+
+ self.set_xy()
+
+ self.tutor = MultiColumnAdditionSymbolic()
+ n_selections = len(self.tutor.get_possible_selections())
+
+ print('shape = ', self.get_rl_state().shape)
+
+ self.observation_space = spaces.Box(low=0.0,
+ high=1.0, shape=self.get_rl_state().shape, dtype=np.float32)
+ # self.action_space = spaces.MultiDiscrete([n_selections, 10])
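+ # 15 discrete actions: 4 cursor moves, 1 done press, 10 digit entries (0-9)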
+ self.action_space = spaces.Discrete(15)
+
+ def set_xy(self):
+ self.x, self.y = self.target_xy[self.current_target]
+
+ def get_rl_state(self):
+ img = self.tutor.get_image().convert('L')
+ x = self.x - 50
+ y = self.y - 90
+
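+ # Translate onto a double-sized canvas so the focus point (self.x, self.y)
+ # lands at the canvas centre, giving an egocentric view of the problem.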
+ translate = img.transform((img.size[0]*2, img.size[1]*2), Image.AFFINE, (1, 0, x, 0, 1, y))
+ # cv2.imshow('translated', np.array(translate))
+ # cv2.waitKey(1)
+ return np.expand_dims(np.array(translate)/255, axis=2)
+
+ def step(self, action):
+ s = None
+ reward = -0.01
+
+ if action == 0:
+ # left
+ self.x -= 5
+ elif action == 1:
+ # right
+ self.x += 5
+ elif action == 2:
+ # up (image y grows downward, so moving up decreases y)
+ self.y -= 5
+ elif action == 3:
+ # down
+ self.y += 5
+ elif action == 4:
+ s = "done"
+ a = "ButtonPressed"
+ i = -1
+ else:
+
+ # answer fields
+ if self.x >= 34 and self.y >= 71 and self.x <= 38 and self.y <=79:
+ s = "answer_ones"
+ elif self.x >= 28 and self.y >= 71 and self.x <= 32 and self.y <=79:
+ s = "answer_tens"
+ elif self.x >= 22 and self.y >= 71 and self.x <= 26 and self.y <=79:
+ s = "answer_hundreds"
+ elif self.x >= 16 and self.y >= 71 and self.x <= 20 and self.y <=79:
+ s = "answer_thousands"
+
+ # carry fields
+ elif self.x >= 28 and self.y >= 11 and self.x <= 32 and self.y <=19:
+ s = "ones_carry"
+ elif self.x >= 22 and self.y >= 11 and self.x <= 26 and self.y <=19:
+ s = "tens_carry"
+ elif self.x >= 16 and self.y >= 11 and self.x <= 20 and self.y <=19:
+ s = "hundreds_carry"
+
+ a = 'UpdateField'
+ i = {'value': str(action - 5)}
+
+ if s is not None:
+ reward = self.tutor.apply_sai(s, a, i)
+
+ self.x = min(max(self.x, 0), 50)
+ self.y = min(max(self.y, 0), 90)
+
+ obs = self.get_rl_state()
+ done = (s == 'done' and reward == 1.0)
+ info = {}
+ return obs, reward, done, info
+
+ def decode(self, action):
+ # print(action)
+ s = self.tutor.get_possible_selections()[action[0]]
+
+ if s == "done":
+ a = "ButtonPressed"
+ else:
+ a = "UpdateField"
+
+ if s == "done":
+ v = -1
+ if s == "check_convert":
+ v = "x"
+ else:
+ v = action[1]
+
+ i = {'value': str(v)}
+
+ return s, a, i
+
+ def reset(self):
+ self.tutor.set_random_problem()
+ obs = self.get_rl_state()
+ return obs
+
+ def render(self, mode='human', close=False):
+ if mode == "rgb_array":
+ return np.array(self.tutor.get_image(add_counts=True, add_dot=(self.x, self.y)))
+
+ elif mode == "human":
+ self.tutor.render(add_dot=(self.x, self.y))
diff --git a/tutorenvs/utils.py b/tutorenvs/utils.py
index 42479be76b9a1c3231c858843e5303b84f196e09..43c100c5a6d1bf1affac01db5839306adfc1b5db 100644
--- a/tutorenvs/utils.py
+++ b/tutorenvs/utils.py
@@ -5,53 +5,114 @@ from gym import error, spaces, utils
from sklearn.feature_extraction import DictVectorizer
import numpy as np
+class OnlineDictVectorizer():
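+ """
+ An incremental, fixed-width alternative to sklearn's DictVectorizer:
+ feature columns are assigned on the fly as new attributes (or new string
+ values) appear, up to a fixed budget of n_features.
+
+ Illustrative usage (names below are just an example):
+
+ dv = OnlineDictVectorizer(n_features=4)
+ dv.fit_transform([{'a': 1, 'b': 'x'}]) # assigns columns for 'a' and 'b=x'
+ dv.transform([{'a': 2, 'b': 'y'}]) # 'b=y' was never fit, so it is ignored
+ """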
+
+ def __init__(self, n_features):
+ self.n_features = n_features
+ self.separator = '='
+ self.dtype = np.float32
+ self.reset()
+
+ def reset(self):
+ self.key = {}
+
+ def fit(self, X):
+ """
+ Given a list of example dicts X, updates the feature key with any previously unseen features.
+ """
+
+ for x in X:
+ for f, v in x.items():
+ if isinstance(v, str):
+ f = "%s%s%s" % (f, self.separator, v)
+ if f not in self.key:
+ if len(self.key) < self.n_features:
+ self.key[f] = len(self.key)
+ else:
+ print("Exceeded available features")
+
+ return self
+
+ def transform(self, X):
+ """
+ Transforms the data using existing key mappings.
+ """
+ new_X = np.zeros((len(X), self.n_features), dtype=self.dtype)
+
+ for i, x in enumerate(X):
+ for f, v in x.items():
+ if isinstance(v, str):
+ f = "%s%s%s" % (f, self.separator, v)
+ v = 1
+ try:
+ new_X[i, self.key[f]] = self.dtype(v)
+ except KeyError:
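+ # features never seen by fit are silently ignored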
+ pass
+
+ return new_X
+
+ def fit_transform(self, X):
+ """
+ Equivalent to calling fit followed by transform, but makes a single
+ pass over the data instead of two.
+ """
+ new_X = np.zeros((len(X), self.n_features), dtype=self.dtype)
+
+ for i, x in enumerate(X):
+ for f, v in x.items():
+ if isinstance(v, str):
+ f = "%s%s%s" % (f, self.separator, v)
+ v = 1
+
+ if f not in self.key:
+ if len(self.key) < self.n_features:
+ self.key[f] = len(self.key)
+ else:
+ print("Exceeded available features")
+
+ try:
+ new_X[i, self.key[f]] = self.dtype(v)
+ except KeyError:
+ pass
+
+ return new_X
+
+
class BaseOppEnv(gym.Env):
metadata = {'render.modes': ['human']}
- def __init__(self, tutor_class, max_depth=1):
+ def __init__(self, tutor_class, max_depth=0):
print('building env')
self.tutor = tutor_class()
self.max_depth = max_depth
self.internal_memory = {}
- self.possible_attr = set(self.tutor.get_possible_args())
- for _ in range(self.max_depth):
- new = set()
- for opp in self.get_rl_operators():
- for a1 in self.possible_attr:
- for a2 in self.possible_attr:
- new.add((opp, a1, a2))
- self.possible_attr = self.possible_attr.union(new)
- print('# features = %i' % len(self.possible_attr))
-
- self.possible_args = list(set([attr[1] if isinstance(attr, tuple) else
- attr for attr in self.possible_attr]))
- print('# args = %i' % len(self.possible_args))
-
- # one additional option to save result internally
n_selections = len(self.tutor.get_possible_selections()) + 1
- print('getting rl state')
- n_features = len(self.get_rl_state())
- print('done getting rl state')
n_operators = len(self.get_rl_operators())
- n_args = len(self.possible_args)
- self.dv = DictVectorizer()
- self.dv.fit([self.get_rl_state()])
+ n_args = len(self.tutor.get_possible_args())
+
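+ # Rough feature budget: base state attributes plus an estimate of how many
+ # operator-derived values can land in internal memory (each operator applied
+ # to every argument combination, compounded per depth level).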
+ branching = 0
+ for opp, arg_count in self.get_rl_operators():
+ branching += n_args**arg_count
+
+ n_features = len(self.tutor.state) + branching**max_depth
+ print('# features = %i' % n_features)
+
+ self.dv = OnlineDictVectorizer(n_features=n_features)
self.observation_space = spaces.Box(low=0.0,
high=1.0, shape=(1, n_features), dtype=np.float32)
self.action_space = spaces.MultiDiscrete([n_selections, n_operators,
n_args, n_args])
- print('done')
def get_rl_operators(self):
return [
- 'copy',
- 'add',
- 'multiply',
- 'mod10',
- 'div10',
+ ('copy', 1),
+ ('add', 2),
+ ('multiply', 2),
+ ('mod10', 1),
+ ('div10', 1)
]
def get_rl_state(self):
@@ -83,11 +144,8 @@ class BaseOppEnv(gym.Env):
state[attr] = self.tutor.state[attr] != ""
# if its in internal memory, then return true, else false.
- for possible_attr in self.possible_attr:
- state[possible_attr] = possible_attr in self.internal_memory
-
- print('done with base attributes in state')
- print('# of base attributes = %i' % len(state))
+ for attr in self.internal_memory:
+ state[attr] = True
# relations (equality, >10)
new_relations = {}
@@ -107,32 +165,21 @@ class BaseOppEnv(gym.Env):
except Exception:
new_relations['greater_than_9(%s)' % str(attr)] = False
- # # equality
- # for attr2 in state:
- # if str(attr) >= str(attr2):
- # continue
-
- # attr2_val = None
- # if attr2 in self.tutor.state:
- # attr2_val = self.tutor.state[attr2]
- # elif attr2 in self.internal_memory:
- # attr2_val = self.internal_memory[attr2]
- # else:
- # attr2_val = ''
- # new_relations['eq(%s,%s)' % (attr, attr2)] = attr_val == attr2_val
-
- print('done with creating new relations')
- print('# of new relations = %i' % len(new_relations))
-
for attr in new_relations:
state[attr] = new_relations[attr]
+
+ return state
# convert all attributes to strings
- return {str(attr): state[attr] for attr in state}
+ # return {str(attr): state[attr] for attr in state}
def step(self, action):
try:
s, a, i = self.decode(action)
+
if s in self.internal_memory or i == '':
@@ -153,7 +200,7 @@ class BaseOppEnv(gym.Env):
state = self.get_rl_state()
# pprint(state)
- obs = self.dv.transform([state])[0].toarray()
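+ # use fit_transform so newly encountered state attributes are assigned
+ # feature columns on the fly during training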
+ obs = self.dv.fit_transform([state])[0]
info = {}
return obs, reward, done, info
@@ -191,12 +238,17 @@ class BaseOppEnv(gym.Env):
def decode(self, action):
# print(action)
- op = self.get_rl_operators()[action[1]]
- arg1 = self.possible_args[action[2]]
- arg2 = self.possible_args[action[3]]
+ op, arg_count = self.get_rl_operators()[action[1]]
+ arg1 = self.tutor.get_possible_args()[action[2]]
+ arg2 = self.tutor.get_possible_args()[action[3]]
if action[0] == len(self.tutor.get_possible_selections()):
- s = (opp, arg1, arg2)
+ if op == "copy":
+ raise ValueError("cannot copy into internal memory")
+ if arg_count == 1:
+ s = (op, arg1)
+ elif arg_count == 2:
+ s = (op, arg1, arg2)
else:
s = self.tutor.get_possible_selections()[action[0]]
@@ -220,7 +272,7 @@ class BaseOppEnv(gym.Env):
self.tutor.set_random_problem()
state = self.get_rl_state()
self.internal_memory = {}
- obs = self.dv.transform([state])[0].toarray()
+ obs = self.dv.transform([state])[0]
return obs
def render(self, mode='human', close=False):