diff --git a/requirements.txt b/requirements.txt index 122d0b2..9948384 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ benchmark-environments @ git+https://github.com/HumanCompatibleAI/benchmark-environments.git imitation @ git+https://github.com/HumanCompatibleAI/imitation.git@e99844 stable-baselines @ git+https://github.com/hill-a/stable-baselines.git -gym[mujoco] +gym[box2d,mujoco] matplotlib numpy pandas @@ -9,5 +9,6 @@ pymdptoolbox seaborn setuptools scipy +#TODO(adam): upgrade to 1.15? tensorflow>=1.13,<1.14 xarray diff --git a/runners/sparsify_point_maze.sh b/runners/sparsify_point_maze.sh new file mode 100755 index 0000000..5482de3 --- /dev/null +++ b/runners/sparsify_point_maze.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Copyright 2020 Adam Gleave +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Script to sparsify pretrained reward models generated by `transfer_point_maze.sh` + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +. ${DIR}/common.sh + +ENV_TRAIN="imitation/PointMazeLeftVel-v0" +TRANSITION_P=0.05 + +if [[ ${fast} == "true" ]]; then + # intended for debugging + COMPARISON_TIMESTEPS="fast" + PM_OUTPUT=${OUTPUT_ROOT}/transfer_point_maze_fast + SPARSE_OUTPUT=${OUTPUT_ROOT}/sparse_point_maze_fast +else + COMPARISON_TIMESTEPS="" + EVAL_TIMESTEPS=100000 + PM_OUTPUT=${OUTPUT_ROOT}/transfer_point_maze + SPARSE_OUTPUT=${OUTPUT_ROOT}/sparse_point_maze +fi + +MIXED_POLICY_PATH=${TRANSITION_P}:random:dummy:ppo2:${PM_OUTPUT}/expert/train/policies/final +for name in comparison_expert comparison_mixture comparison_random; do + if [[ ${name} == "comparison_expert" ]]; then + extra_flags="dataset_factory_kwargs.policy_type=ppo2 \ + dataset_factory_kwargs.policy_path=${PM_OUTPUT}/expert/train/policies/final" + elif [[ ${name} == "comparison_mixture" ]]; then + extra_flags="dataset_factory_kwargs.policy_type=mixture \ + dataset_factory_kwargs.policy_path=${MIXED_POLICY_PATH}" + elif [[ ${name} == "comparison_random" ]]; then + extra_flags="" + else + echo "BUG: unknown name ${name}" + exit 1 + fi + parallel --header : --results ${SPARSE_OUTPUT}/parallel/${name} \ + $(call_script "model_comparison" "with") \ + env_name=${ENV_TRAIN} ${extra_flags} \ + ellp_loss no_rescale target_reward_type=evaluating_rewards/Zero-v0 \ + seed={seed} source_reward_type={source_reward_type} \ + source_reward_path=${PM_OUTPUT}/reward/{source_reward_path}/{source_reward_suffix} \ + ${COMPARISON_TIMESTEPS} log_dir=${SPARSE_OUTPUT}/${name}/{source_reward_path}/{seed} \ + ::: source_reward_type evaluating_rewards/PointMazeGroundTruthWithCtrl-v0 \ + evaluating_rewards/PointMazeGroundTruthNoCtrl-v0 \ + evaluating_rewards/RewardModel-v0 evaluating_rewards/RewardModel-v0 \ + imitation/RewardNet_unshaped-v0 imitation/RewardNet_unshaped-v0 \ + :::+ source_reward_path withctrl noctrl preferences regress irl_state_only irl_state_action \ + :::+ source_reward_suffix dummy dummy model model checkpoints/final/discrim/reward_net \ + checkpoints/final/discrim/reward_net \ + ::: 
seed 0 1 2 +done diff --git a/setup.cfg b/setup.cfg index 6844f35..e256b03 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,7 @@ ignore = W503,E203 known_first_party=evaluating_rewards,tests default_section=THIRDPARTY multi_line_output=3 +include_trailing_comma=True force_sort_within_sections=True line_length=100 diff --git a/src/evaluating_rewards/analysis/plot_pm_reward.py b/src/evaluating_rewards/analysis/plot_pm_reward.py index b5def9d..a384cc1 100644 --- a/src/evaluating_rewards/analysis/plot_pm_reward.py +++ b/src/evaluating_rewards/analysis/plot_pm_reward.py @@ -68,7 +68,7 @@ def default_config(): @plot_pm_reward_ex.config def logging_config(log_root, models, reward_type, reward_path): - data_root = os.path.join(log_root, "model_comparison") + data_root = os.path.join(serialize.get_output_dir(), "model_comparison") if models is None: log_dir = os.path.join( log_root, reward_type.replace("/", "_"), reward_path.replace("/", "_") @@ -101,9 +101,9 @@ def dense_no_ctrl_sparsified(): pos_lim = 0.15 # Use lists of tuples rather than OrderedDict as Sacred reorders dictionaries models = [ - ("Dense", "evaluating_rewards/PointMassDenseNoCtrl-v0", "dummy"), + ("Dense\n(Manual)", "evaluating_rewards/PointMassDenseNoCtrl-v0", "dummy"), ( - "Sparsified", + "Sparsified\n(Learned)", "evaluating_rewards/RewardModel-v0", os.path.join( "evaluating_rewards_PointMassLine-v0", @@ -111,7 +111,7 @@ def dense_no_ctrl_sparsified(): "model", ), ), - ("Sparse", "evaluating_rewards/PointMassSparseNoCtrl-v0", "dummy"), + ("Sparse\n(Manual)", "evaluating_rewards/PointMassSparseNoCtrl-v0", "dummy"), ] _ = locals() # quieten flake8 unused variable warning del _ diff --git a/src/evaluating_rewards/envs/__init__.py b/src/evaluating_rewards/envs/__init__.py index 3134c8a..c3561bc 100644 --- a/src/evaluating_rewards/envs/__init__.py +++ b/src/evaluating_rewards/envs/__init__.py @@ -17,7 +17,11 @@ import gym import imitation.envs.examples # noqa: F401 pylint:disable=unused-import -from evaluating_rewards.envs import mujoco, point_mass # noqa: F401 pylint:disable=unused-import +from evaluating_rewards.envs import ( # noqa: F401 pylint:disable=unused-import + lunar_lander, + mujoco, + point_mass, +) PROJECT_ROOT = "evaluating_rewards.envs" PM_ROOT = f"{PROJECT_ROOT}.point_mass" @@ -74,3 +78,15 @@ def register_mujoco(): register_mujoco() + +gym.register( + id="evaluating_rewards/LunarLanderContinuous-v0", + entry_point=f"{PROJECT_ROOT}.lunar_lander:LunarLanderContinuousObservable", + reward_threshold=200, +) +gym.register( + id="evaluating_rewards/LunarLanderContinuousOriginalShaping-v0", + entry_point=f"{PROJECT_ROOT}.lunar_lander:LunarLanderContinuousObservable", + kwargs=dict(fix_shaping=False), + reward_threshold=200, +) diff --git a/src/evaluating_rewards/envs/core.py b/src/evaluating_rewards/envs/core.py new file mode 100644 index 0000000..ec22369 --- /dev/null +++ b/src/evaluating_rewards/envs/core.py @@ -0,0 +1,65 @@ +# Copyright 2019 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Base classes for environment rewards.""" + +import abc + +import gym +from imitation.util import serialize +import tensorflow as tf + +from evaluating_rewards import rewards + + +class HardcodedReward(rewards.BasicRewardModel, serialize.LayersSerializable): + """Hardcoded (non-trainable) reward model for a Gym environment.""" + + def __init__(self, observation_space: gym.Space, action_space: gym.Space, **kwargs): + """Constructs the reward model. + + Args: + observation_space: The observation space of the environment. + action_space: The action space of the environment. + **kwargs: Extra parameters to serialize and store in the instance, + accessible as attributes. + """ + rewards.BasicRewardModel.__init__(self, observation_space, action_space) + serialize.LayersSerializable.__init__( + self, + layers={}, + observation_space=observation_space, + action_space=action_space, + **kwargs, + ) + self._reward = self.build_reward() + + def __getattr__(self, name): + try: + return self._kwargs[name] + except KeyError: + raise AttributeError(f"Attribute '{name}' not present in self._kwargs") + + @abc.abstractmethod + def build_reward(self) -> tf.Tensor: + """Computes reward from observation, action and next observation. + + Returns: + A tensor containing reward, shape (batch_size,). + """ + + @property + def reward(self): + """Reward tensor, shape (batch_size,).""" + return self._reward diff --git a/src/evaluating_rewards/envs/lunar_lander.py b/src/evaluating_rewards/envs/lunar_lander.py new file mode 100644 index 0000000..db582b2 --- /dev/null +++ b/src/evaluating_rewards/envs/lunar_lander.py @@ -0,0 +1,169 @@ +# Copyright 2020 Adam Gleave +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Reward function for Gym LunarLander-v2 environment.""" + +from gym import spaces +from gym.envs.box2d import lunar_lander +from imitation.util import registry +import numpy as np +import tensorflow as tf + +from evaluating_rewards import serialize as reward_serialize +from evaluating_rewards.envs import core + +TERMINAL_POTENTIAL = -174 # chosen to be similar to initial potential value + + +class LunarLanderContinuousObservable(lunar_lander.LunarLanderContinuous): + """LunarLander environment lightly modified from Gym to make reward a function of observation. + + Adds `self.game_over` and `self.lander.awake` flags to state, which are used by Gym + internally to compute the reward. They are computed by the Box2D simulator, and cannot easily + be derived from the rest of the state. + + `game_over` is set based on contact forces on the lunar lander. The `lander.awake` flag is + set when the body is not "asleep": + "When Box2D determines that a body [...] has come to rest, the body enters a sleep state" + (see https://box2d.org/documentation/md__d_1__git_hub_box2d_docs_dynamics.html). 
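+
+    The observation is the standard 8-dimensional LunarLander observation (position, velocity,
+    angle, angular velocity and leg contact indicators) concatenated with three indicator
+    flags: `game_over` (index 8), `lander.awake` (index 9) and a time-up flag (index 10),
+    giving an 11-dimensional observation.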
+    """
+
+    def __init__(self, time_limit: int = 1000, fix_shaping: bool = True):
+        """Builds LunarLanderContinuousObservable.
+
+        Args:
+            time_limit: maximum number of timesteps per episode; a time-up flag is added to
+                the observation and the episode terminates once the limit is reached.
+            fix_shaping: if True, apply potential shaping to the final transition as well,
+                fixing the terminal potential to `TERMINAL_POTENTIAL` so the shaping satisfies
+                the conditions of Ng et al (1999); if False, keep Gym's original reward.
+        """
+        # Need to set self.time_limit before super().__init__() since __init__() calls reset()
+        self.time_limit = time_limit
+        super().__init__()
+        self.observation_space = spaces.Box(-np.inf, np.inf, shape=(11,), dtype=np.float32)
+        self.fix_shaping = fix_shaping
+        self.time_remaining = None
+
+    def step(self, action):
+        prev_shaping = self.prev_shaping
+        self.time_remaining -= 1
+        obs, rew, done, info = super().step(action)
+        time_up = self.time_remaining <= 0
+        extra_obs = [
+            1.0 if self.game_over else 0.0,
+            1.0 if self.lander.awake else 0.0,
+            1.0 if time_up else 0.0,
+        ]
+        obs = np.concatenate((obs, extra_obs))
+
+        done = done | time_up
+        if done and self.fix_shaping:
+            # Gym does not apply shaping or control cost to final reward.
+            # No control cost is weird but harmless. No shaping is problematic, though, so we fix
+            # it to satisfy Ng et al (1999)'s conditions.
+            # Take final state to always have potential TERMINAL_POTENTIAL.
+            # This constant doesn't actually affect RL policy, but makes reward look less odd.
+            rew += TERMINAL_POTENTIAL - prev_shaping
+
+        return obs, rew, done, info
+
+    def reset(self):
+        self.time_remaining = self.time_limit + 1  # step() gets called once during reset
+        # NOTE: do not need to change observations here since super().reset() calls step()
+        return super().reset()
+
+
+def _potential(obs: tf.Tensor) -> tf.Tensor:
+    """Potential function used to compute shaping.
+
+    Based on the `shaping` variable in `LunarLander.step()`.
+    """
+    leg_contact = obs[:, 6] + obs[:, 7]
+    l2 = tf.sqrt(tf.math.square(obs[:, 0]) + tf.math.square(obs[:, 1]))
+    l2 += tf.sqrt(tf.math.square(obs[:, 2]) + tf.math.square(obs[:, 3]))
+    return 10 * leg_contact - 100 * l2 - 100 * tf.abs(obs[:, 4])
+
+
+class LunarLanderContinuousGroundTruthReward(core.HardcodedReward):
+    """Reward for LunarLanderContinuousObservable. Matches ground truth with default settings."""
+
+    def __init__(
+        self,
+        observation_space: spaces.Space,
+        action_space: spaces.Space,
+        ctrl_coef: float = 1.0,
+        shaping_coef: float = 1.0,
+    ):
+        """Constructs the reward model.
+
+        Args:
+            observation_space: The observation space of the environment.
+            action_space: The action space of the environment.
+            ctrl_coef: Multiplier for the control cost. 1.0 equals ground truth; 0.0 disables.
+            shaping_coef: Multiplier for potential shaping. 1.0 equals ground truth; 0.0 disables.
+        """
+        super().__init__(
+            observation_space=observation_space,
+            action_space=action_space,
+            ctrl_coef=ctrl_coef,
+            shaping_coef=shaping_coef,
+        )
+
+    def build_reward(self) -> tf.Tensor:
+        """Intended to match the reward returned by gym.LunarLander.
+
+        Known differences:
+        - Will disagree on states *after* episode termination due to the non-Markovian leg
+          contact condition in Gym.
+
+        Returns:
+            A Tensor containing predicted rewards.
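+            Shape (batch_size,), matching the `HardcodedReward.build_reward` contract.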
+        """
+        # Sparse reward
+        # Indices 8, 9, 10 of the observation are the game_over, lander.awake and time-up flags.
+        game_over = (tf.abs(self._proc_next_obs[:, 0]) >= 1.0) | (self._proc_next_obs[:, 8] > 0)
+        landed_safely = self._proc_next_obs[:, 9] == 0.0
+        time_up = self._proc_next_obs[:, 10] > 0.0
+        done = game_over | landed_safely | time_up
+        # Note time out is neither penalized nor rewarded by sparse_reward
+        sparse_reward = -100.0 * tf.cast(game_over, tf.float32)
+        sparse_reward += 100.0 * tf.cast(landed_safely, tf.float32)
+
+        # Control cost
+        m_thrust = self._proc_act[:, 0] > 0
+        m_power_when_act = 0.5 * (tf.clip_by_value(self._proc_act[:, 0], 0.0, 1.0) + 1.0)
+        m_power = tf.where(m_thrust, m_power_when_act, 0.0 * m_power_when_act)
+        abs_side_act = tf.abs(self._proc_act[:, 1])
+        s_thrust = abs_side_act > 0.5
+        s_power_when_act = tf.clip_by_value(abs_side_act, 0.5, 1.0)
+        s_power = tf.where(s_thrust, s_power_when_act, 0.0 * s_power_when_act)
+        ctrl_cost = -0.3 * m_power - 0.03 * s_power
+        # Gym does not apply control cost to final step. (Seems weird, but OK.)
+        ctrl_cost = tf.where(done, 0 * ctrl_cost, ctrl_cost)
+
+        # Shaping
+        # Note this assumes no discount (matching Gym implementation), which will make it
+        # not *quite* potential shaping for any RL algorithm using discounting.
+        shaping = (1 - tf.cast(done, tf.float32)) * _potential(self._proc_next_obs)
+        shaping += TERMINAL_POTENTIAL * tf.cast(done, tf.float32)
+        shaping -= _potential(self._proc_obs)
+
+        return sparse_reward + self.shaping_coef * shaping + self.ctrl_coef * ctrl_cost
+
+
+def _register_rewards():
+    """Registers LunarLander ground-truth reward variants in the reward registry."""
+    density = {"Dense": {}, "Sparse": {"shaping_coef": 0.0}}
+    control = {"WithCtrl": {}, "NoCtrl": {"ctrl_coef": 0.0}}
+    for k1, cfg1 in density.items():
+        for k2, cfg2 in control.items():
+            fn = registry.build_loader_fn_require_space(
+                LunarLanderContinuousGroundTruthReward, **cfg1, **cfg2,
+            )
+            reward_serialize.reward_registry.register(
+                key=f"evaluating_rewards/LunarLanderContinuous{k1}{k2}-v0", value=fn,
+            )
+
+
+_register_rewards()
diff --git a/src/evaluating_rewards/envs/mujoco.py b/src/evaluating_rewards/envs/mujoco.py
index 637c036..865a755 100644
--- a/src/evaluating_rewards/envs/mujoco.py
+++ b/src/evaluating_rewards/envs/mujoco.py
@@ -12,63 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Reward functions for Gym environments."""
-
-import abc
+"""Reward functions for Gym MuJoCo environments."""
 
 import gym
-from imitation.util import registry, serialize
+from imitation.util import registry
 import numpy as np
 from stable_baselines.common import vec_env
 import tensorflow as tf
 
-from evaluating_rewards import rewards
 from evaluating_rewards import serialize as reward_serialize
+from evaluating_rewards.envs import core
 
 
-class MujocoHardcodedReward(rewards.BasicRewardModel, serialize.LayersSerializable):
-    """Hardcoded (non-trainable) reward model for a MuJoCo environment."""
-
-    def __init__(self, observation_space: gym.Space, action_space: gym.Space, **kwargs):
-        """Constructs the reward model.
-
-        Args:
-            observation_space: The observation space of the environment.
-            action_space: The action space of the environment.
-            **kwargs: Extra parameters to serialize and store in the instance,
-                accessible as attributes.
- """ - rewards.BasicRewardModel.__init__(self, observation_space, action_space) - serialize.LayersSerializable.__init__( - self, - layers={}, - observation_space=observation_space, - action_space=action_space, - **kwargs, - ) - self._reward = self.build_reward() - - def __getattr__(self, name): - try: - return self._kwargs[name] - except KeyError: - raise AttributeError(f"Attribute '{name}' not present in self._kwargs") - - @abc.abstractmethod - def build_reward(self) -> tf.Tensor: - """Computes reward from observation, action and next observation. - - Returns: - A tensor containing reward, shape (batch_size,). - """ - - @property - def reward(self): - """Reward tensor, shape (batch_size,).""" - return self._reward - - -class HalfCheetahGroundTruthReward(MujocoHardcodedReward): +class HalfCheetahGroundTruthReward(core.HardcodedReward): """Reward for HalfCheetah-v2. Matches ground truth with default settings.""" def __init__( @@ -119,7 +75,7 @@ def build_reward(self) -> tf.Tensor: return reward -class HopperGroundTruthReward(MujocoHardcodedReward): +class HopperGroundTruthReward(core.HardcodedReward): """Reward for Hopper-v2. Matches ground truth with default settings.""" def __init__( @@ -190,7 +146,7 @@ def build_reward(self) -> tf.Tensor: return reward -class HopperBackflipReward(MujocoHardcodedReward): +class HopperBackflipReward(core.HardcodedReward): """Reward for Hopper-v2 to make it do a backflip, rather than hop forward. Based on reward function in footnote of: @@ -248,7 +204,7 @@ def build_reward(self) -> tf.Tensor: return reward -class PointMazeReward(MujocoHardcodedReward): +class PointMazeReward(core.HardcodedReward): """Reward for imitation/PointMaze{Left,Right}Vel-v0. This in turn is based on on Fu et al (2018)'s PointMaze environment: diff --git a/src/evaluating_rewards/envs/point_mass.py b/src/evaluating_rewards/envs/point_mass.py index 2e09678..f8c9bb9 100644 --- a/src/evaluating_rewards/envs/point_mass.py +++ b/src/evaluating_rewards/envs/point_mass.py @@ -19,13 +19,14 @@ import gym from imitation.envs import resettable_env from imitation.policies import serialize as policy_serialize -from imitation.util import registry, serialize +from imitation.util import registry import numpy as np from stable_baselines.common import policies import tensorflow as tf from evaluating_rewards import rewards from evaluating_rewards import serialize as reward_serialize +from evaluating_rewards.envs import core class PointMassEnv(resettable_env.ResettableEnv): @@ -140,36 +141,28 @@ def close(self): self.viewer = None -class PointMassGroundTruth(rewards.BasicRewardModel, serialize.LayersSerializable): +class PointMassGroundTruth(core.HardcodedReward): """RewardModel representing the true (dense) reward in PointMass.""" def __init__( self, observation_space: gym.Space, action_space: gym.Space, ctrl_coef: float = 1.0 ): - serialize.LayersSerializable.__init__(**locals(), layers={}) - self.ndim, remainder = divmod(observation_space.shape[0], 3) assert remainder == 0 - self.ctrl_coef = ctrl_coef - - rewards.BasicRewardModel.__init__(self, observation_space, action_space) - self._reward = self.build_reward() + super().__init__( + observation_space=observation_space, action_space=action_space, ctrl_coef=ctrl_coef + ) def build_reward(self): """Computes reward from observation and action in PointMass environment.""" - pos = self._proc_obs[:, 0 : self.ndim] - goal = self._proc_obs[:, 2 * self.ndim : 3 * self.ndim] + pos = self._proc_next_obs[:, 0 : self.ndim] + goal = self._proc_next_obs[:, (2 * 
self.ndim) : (3 * self.ndim)] dist = tf.norm(pos - goal, axis=-1) ctrl_cost = tf.reduce_sum(tf.square(self._proc_act), axis=-1) return -dist - self.ctrl_coef * ctrl_cost - @property - def reward(self): - """Reward tensor.""" - return self._reward - -class PointMassSparseReward(rewards.BasicRewardModel, serialize.LayersSerializable): +class PointMassSparseReward(core.HardcodedReward): """A sparse reward for the point mass being close to the goal. Should produce similar behavior to PointMassGroundTruth. However, it is not @@ -195,16 +188,15 @@ def __init__( The larger this is, the more dissimilar the reward model and resulting policy will be from PointMassGroundTruth. """ - serialize.LayersSerializable.__init__(**locals(), layers={}) - self.ndim, remainder = divmod(observation_space.shape[0], 3) assert remainder == 0 - self.ctrl_coef = ctrl_coef - self.threshold = threshold - self.goal_offset = goal_offset - rewards.BasicRewardModel.__init__(self, observation_space, action_space) - - self._reward = self.build_reward() + super().__init__( + observation_space=observation_space, + action_space=action_space, + ctrl_coef=ctrl_coef, + threshold=threshold, + goal_offset=goal_offset, + ) def build_reward(self): """Computes reward from observation and action in PointMass environment.""" @@ -217,23 +209,14 @@ def build_reward(self): ctrl_cost = tf.reduce_sum(tf.square(self._proc_act), axis=-1) return goal_reward - self.ctrl_coef * ctrl_cost - @property - def reward(self): - """Reward tensor.""" - return self._reward - -class PointMassShaping(rewards.BasicRewardModel, serialize.LayersSerializable): +class PointMassShaping(core.HardcodedReward): """Potential shaping term, based on distance to goal.""" def __init__(self, observation_space: gym.Space, action_space: gym.Space): - serialize.LayersSerializable.__init__(**locals(), layers={}) - self.ndim, remainder = divmod(observation_space.shape[0], 3) assert remainder == 0 - - rewards.BasicRewardModel.__init__(self, observation_space, action_space) - self._reward = self.build_reward() + super().__init__(observation_space=observation_space, action_space=action_space) def build_reward(self): """Computes shaping from current and next observations.""" @@ -248,11 +231,6 @@ def dist(obs): return old_dist - new_dist - @property - def reward(self): - """Reward tensor.""" - return self._reward - class PointMassDenseReward(rewards.LinearCombinationModelWrapper): """Sparse reward plus potential shaping.""" diff --git a/src/evaluating_rewards/experiments/env_rewards.py b/src/evaluating_rewards/experiments/env_rewards.py index 0f6e609..33145c4 100644 --- a/src/evaluating_rewards/experiments/env_rewards.py +++ b/src/evaluating_rewards/experiments/env_rewards.py @@ -22,6 +22,9 @@ REWARDS_BY_ENV = { "evaluating_rewards/HalfCheetah-v3": ["evaluating_rewards/HalfCheetahGroundTruth.*-v0"], "evaluating_rewards/Hopper-v3": ["evaluating_rewards/Hopper.*-v0"], + "evaluating_rewards/LunarLanderContinuous-v0": [ + "evaluating_rewards/LunarLanderContinuous.*-v0" + ], "evaluating_rewards/PointMassLine-v0": ["evaluating_rewards/PointMass.*-v0"], "imitation/PointMazeLeftVel-v0": ["evaluating_rewards/PointMaze.*-v0"], "imitation/PointMazeRightVel-v0": ["evaluating_rewards/PointMaze.*-v0"], diff --git a/src/evaluating_rewards/experiments/point_mass_analysis.py b/src/evaluating_rewards/experiments/point_mass_analysis.py index 945b694..6560fe2 100644 --- a/src/evaluating_rewards/experiments/point_mass_analysis.py +++ b/src/evaluating_rewards/experiments/point_mass_analysis.py @@ -57,7 
+57,16 @@ def plot_reward(rew: xarray.DataArray, cmap: str = "RdBu", **kwargs) -> plt.Figu
     )
     # By default xarray ignores figsize and does its own size calculation. Override.
     figsize = mpl.rcParams.get("figure.figsize")
-    facet = rew.plot(x="Accel.", y="Velocity", col="Pos.", cmap=cmap, figsize=figsize, **kwargs)
+    facet = rew.plot(
+        x="Accel.",
+        y="Velocity",
+        col="Pos.",
+        cmap=cmap,
+        figsize=figsize,
+        linewidth=0.1,
+        edgecolor="k",
+        **kwargs,
+    )
 
     if "row" in kwargs:
         # xarray adds row labels in a hard-to-spot far-right side.
diff --git a/src/evaluating_rewards/interpretability.py b/src/evaluating_rewards/interpretability.py
new file mode 100644
index 0000000..9e93221
--- /dev/null
+++ b/src/evaluating_rewards/interpretability.py
@@ -0,0 +1,140 @@
+# Copyright 2020 Adam Gleave
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Classes and methods to help interpret reward functions."""
+
+import math
+from typing import NamedTuple, Sequence
+
+import gym
+from imitation.util import rollout
+import matplotlib.pyplot as plt
+import numpy as np
+from stable_baselines.common import policies, vec_env
+
+
+# TODO(adam): consider making rollout.Transitions a dataclass and subclassing?
+class RenderedTransitions(NamedTuple):
+    """A batch of obs-act-obs-rew-done transitions, plus rendered images.
+
+    Usually generated by combining and processing several Trajectories via
+    `rollout.flatten_trajectories()`.
+
+    Attributes:
+        obs: Previous observations. Shape: (batch_size, ) + observation_shape.
+            The i'th observation `obs[i]` in this array is the observation seen
+            by the agent when choosing action `acts[i]`.
+        acts: Actions. Shape: (batch_size, ) + action_shape.
+        next_obs: New observations. Shape: (batch_size, ) + observation_shape.
+            The i'th observation `next_obs[i]` in this array is the observation
+            after the agent has taken action `acts[i]`.
+        rews: Rewards. Shape: (batch_size, ).
+            The reward `rews[i]` at the i'th timestep is received after the agent has
+            taken action `acts[i]`.
+        dones: Boolean array indicating episode termination. Shape: (batch_size, ).
+            `dones[i]` is true iff `next_obs[i]` is the last observation of an episode.
+        imgs: Rendered images. Shape: (batch_size, ) + image_shape.
+    """
+
+    obs: np.ndarray
+    acts: np.ndarray
+    next_obs: np.ndarray
+    rews: np.ndarray
+    dones: np.ndarray
+    imgs: np.ndarray
+
+
+class RenderWrapper(gym.Wrapper):
+    """Wraps a Gym environment, adding rendered images to the info dict under key 'img'.
+
+    Note this does not render the initial observation (there is no info dict returned by
+    `reset()`). For interpretability this is of limited importance since the reward of initial
+    state(s) will not change the optimal policy.
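+
+    The `rendered_rollout` helper below reads these frames back out of `info["img"]` when
+    collecting transitions.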
+ """ + + def step(self, action): + ob, rew, done, info = self.env.step(action) + img = self.env.render(mode="rgb_array") + info["img"] = img + return ob, rew, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +def rendered_rollout( + venv: vec_env.VecEnv, + policy: policies.BasePolicy, + n_timesteps: int, + *, + truncate: bool = True, + **kwargs, +) -> RenderedTransitions: + """Rollout `policy` on `venv` for `n_timesteps`. + + Similar to `imitation.util.rollout.generate_transitions`, but returns `RenderedTransitions`, + including a rendered representation of each state in the `imgs` attribute. + + Args: + venv: A vector environment to rollout in. + policy: The policy to rollout. + n_timesteps: The number of timesteps of data to collect. + truncate: if True, return exactly n_timesteps; otherwise, it is a lower bound. + kwargs: Passed through to `rollout.generate_trajectories`. + """ + trajs = rollout.generate_trajectories( + policy, venv, sample_until=rollout.min_timesteps(n_timesteps), **kwargs + ) + transitions = rollout.flatten_trajectories(trajs) + imgs = [] + for traj in trajs: + imgs += [info["img"] for info in traj.infos] + if truncate and n_timesteps is not None: + transitions = rollout.Transitions(*(arr[:n_timesteps] for arr in transitions)) + imgs = imgs[:n_timesteps] + + imgs = np.array(imgs) + assert len(imgs) == len(transitions.obs) + return RenderedTransitions(**transitions._asdict(), imgs=imgs) + + +# TODO(adam): maybe want something more extreme than quantile sampling? +# e.g. top 1%, top 5%, top 10%; or particular S.D.s of reward. +def quantile_sample(preds: np.ndarray, num_samples: int = 10) -> Sequence[int]: + """Sample `num_samples` quantiles from preds.""" + num_points = len(preds) + if num_samples > num_points: + raise ValueError("num_samples must be less than the number of datapoints.") + + sorted_idxs = np.argsort(preds) + subset = np.linspace(0, num_points - 1, num=num_samples) + subset = np.ceil(subset).astype(np.int) + # TODO(adam): is this actually guaranteed not to clash? 
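+    # (Given the check above, the linspace spacing is at least 1, so after `ceil` the
+    # indices are strictly increasing and hence unique.)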
+    assert len(np.unique(subset)) == len(subset)
+
+    return sorted_idxs[subset]
+
+
+def plot_renders(
+    imgs: np.ndarray, reward_preds: np.ndarray, ncols: int = 3, cell_size: float = 5.0
+):
+    """Plots a grid of images, titling each with its predicted reward."""
+    n_imgs = imgs.shape[0]
+    nrows = math.ceil(n_imgs / ncols)
+    figsize = (cell_size * ncols, cell_size * nrows)  # matplotlib figsize is (width, height)
+    _, axs = plt.subplots(nrows, ncols, figsize=figsize, sharex=True, sharey=True)
+    axs = np.asarray(axs).reshape(nrows, ncols)  # subplots squeezes singleton dimensions
+    for i, img in enumerate(imgs):
+        ax = axs[i // ncols][i % ncols]
+        ax.imshow(img)
+        ax.set_title("{:.2f}".format(reward_preds[i]))
diff --git a/src/evaluating_rewards/scripts/expert_demos.py b/src/evaluating_rewards/scripts/expert_demos.py
index 5ea309f..12f8bc4 100644
--- a/src/evaluating_rewards/scripts/expert_demos.py
+++ b/src/evaluating_rewards/scripts/expert_demos.py
@@ -15,9 +15,43 @@
 """Thin wrapper around imitation.scripts.expert_demos."""
 
 from imitation.scripts import expert_demos
+import stable_baselines
 
 from evaluating_rewards.scripts import script_utils
 
+
+@expert_demos.expert_demos_ex.named_config
+def lunar_lander():
+    """PPO on LunarLander."""
+    env_name = "evaluating_rewards/LunarLanderContinuous-v0"
+    # Hyperparams from https://github.com/araffin/rl-baselines-zoo/blob/master/hyperparams/ppo2.yml
+    num_vec = 16
+    init_rl_kwargs = dict(
+        n_steps=1024, nminibatches=32, lam=0.98, gamma=0.999, noptepochs=4, ent_coef=0.01,
+    )
+    _ = locals()
+    del _
+
+
+@expert_demos.expert_demos_ex.named_config
+def lunar_lander_sac():
+    """SAC on LunarLander."""
+    env_name = "evaluating_rewards/LunarLanderContinuous-v0"
+    # Hyperparams from https://github.com/araffin/rl-baselines-zoo/blob/master/hyperparams/sac.yml
+    num_vec = 1
+    total_timesteps = int(5e5)
+    init_rl_kwargs = dict(
+        model_class=stable_baselines.SAC,
+        policy_class=stable_baselines.sac.policies.MlpPolicy,
+        batch_size=256,
+        learning_starts=1000,
+    )
+    log_interval = 10000
+    policy_save_interval = 10000
+    _ = locals()
+    del _
+
+
 if __name__ == "__main__":
     script_utils.add_logging_config(expert_demos.expert_demos_ex, "expert_demos")
     script_utils.experiment_main(expert_demos.expert_demos_ex, "expert_demos", sacred_symlink=False)
diff --git a/src/evaluating_rewards/scripts/model_comparison.py b/src/evaluating_rewards/scripts/model_comparison.py
index f7e6e6f..0801c5e 100644
--- a/src/evaluating_rewards/scripts/model_comparison.py
+++ b/src/evaluating_rewards/scripts/model_comparison.py
@@ -82,6 +82,14 @@ def alternating_maximization():
     del _
 
 
+@model_comparison_ex.named_config
+def undiscounted():
+    """Undiscounted potential shaping."""
+    comparison_kwargs = {  # noqa: F841 pylint:disable=unused-variable
+        "model_wrapper": functools.partial(comparisons.equivalence_model_wrapper, discount=1.0),
+    }
+
+
 @model_comparison_ex.named_config
 def affine_only():
     """Equivalence class consists of just affine transformations."""
diff --git a/tests/test_rewards.py b/tests/test_rewards.py
index ed36f68..768c8c3 100644
--- a/tests/test_rewards.py
+++ b/tests/test_rewards.py
@@ -25,7 +25,7 @@
 import tensorflow as tf
 
 from evaluating_rewards import datasets, rewards, serialize
-from evaluating_rewards.envs import mujoco, point_mass
+from evaluating_rewards.envs import lunar_lander, mujoco, point_mass
 from tests import common
 
 ENVS = ["FrozenLake-v0", "CartPole-v1", "Pendulum-v0"]
@@ -46,6 +46,11 @@
         "model_class": mujoco.HopperBackflipReward,
         "kwargs": {},
     },
+    "lunar_lander_ground_truth": {
+        "env_name": "evaluating_rewards/LunarLanderContinuous-v0",
+        "model_class": lunar_lander.LunarLanderContinuousGroundTruthReward,
+        "kwargs": {},
}, "point_maze_ground_truth": { "env_name": "imitation/PointMazeLeftVel-v0", "model_class": mujoco.PointMazeReward, @@ -82,17 +87,31 @@ ] GROUND_TRUTH = { + # id: (env_name, reward_name, absolute_tolerance) "half_cheetah": ( "evaluating_rewards/HalfCheetah-v3", "evaluating_rewards/HalfCheetahGroundTruthForwardWithCtrl-v0", + 5e-5, ), "hopper": ( "evaluating_rewards/Hopper-v3", "evaluating_rewards/HopperGroundTruthForwardWithCtrl-v0", + 5e-5, + ), + "lunar_lander": ( + "evaluating_rewards/LunarLanderContinuous-v0", + "evaluating_rewards/LunarLanderContinuousDenseWithCtrl-v0", + 2e-4, + ), + "point_mass": ( + "evaluating_rewards/PointMassLine-v0", + "evaluating_rewards/PointMassGroundTruth-v0", + 5e-5, ), "point_maze": ( "imitation/PointMazeLeftVel-v0", "evaluating_rewards/PointMazeGroundTruthWithCtrl-v0", + 5e-5, ), } @@ -181,8 +200,10 @@ def make_model(env): return helper_serialize_identity(make_model) -@pytest.mark.parametrize("env_name,reward_id", GROUND_TRUTH.values(), ids=list(GROUND_TRUTH.keys())) -def test_ground_truth_similar_to_gym(graph, session, venv, reward_id): +@pytest.mark.parametrize( + "env_name,reward_id,atol", GROUND_TRUTH.values(), ids=list(GROUND_TRUTH.keys()) +) +def test_ground_truth_similar_to_gym(graph, session, venv, reward_id, atol): """Checks that reward models predictions match those of Gym reward.""" # Generate rollouts, recording Gym reward policy = base.RandomPolicy(venv.observation_space, venv.action_space) @@ -198,7 +219,7 @@ def test_ground_truth_similar_to_gym(graph, session, venv, reward_id): pred_reward = rewards.evaluate_models({"m": reward_model}, batch)["m"] # Are the predictions close to true Gym reward? - np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5) + np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=atol) REWARD_LEN = 10000