diff --git a/requirements.txt b/requirements.txt index 122d0b2..9948384 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ benchmark-environments @ git+https://github.com/HumanCompatibleAI/benchmark-environments.git imitation @ git+https://github.com/HumanCompatibleAI/imitation.git@e99844 stable-baselines @ git+https://github.com/hill-a/stable-baselines.git -gym[mujoco] +gym[box2d,mujoco] matplotlib numpy pandas @@ -9,5 +9,6 @@ pymdptoolbox seaborn setuptools scipy +#TODO(adam): upgrade to 1.15? tensorflow>=1.13,<1.14 xarray diff --git a/runners/sparsify_point_maze.sh b/runners/sparsify_point_maze.sh new file mode 100755 index 0000000..5482de3 --- /dev/null +++ b/runners/sparsify_point_maze.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Copyright 2020 Adam Gleave +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Script to sparsify pretrained reward models generated by `transfer_point_maze.sh` + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +. ${DIR}/common.sh + +ENV_TRAIN="imitation/PointMazeLeftVel-v0" +TRANSITION_P=0.05 + +if [[ ${fast} == "true" ]]; then + # intended for debugging + COMPARISON_TIMESTEPS="fast" + PM_OUTPUT=${OUTPUT_ROOT}/transfer_point_maze_fast + SPARSE_OUTPUT=${OUTPUT_ROOT}/sparse_point_maze_fast +else + COMPARISON_TIMESTEPS="" + EVAL_TIMESTEPS=100000 + PM_OUTPUT=${OUTPUT_ROOT}/transfer_point_maze + SPARSE_OUTPUT=${OUTPUT_ROOT}/sparse_point_maze +fi + +MIXED_POLICY_PATH=${TRANSITION_P}:random:dummy:ppo2:${PM_OUTPUT}/expert/train/policies/final +for name in comparison_expert comparison_mixture comparison_random; do + if [[ ${name} == "comparison_expert" ]]; then + extra_flags="dataset_factory_kwargs.policy_type=ppo2 \ + dataset_factory_kwargs.policy_path=${PM_OUTPUT}/expert/train/policies/final" + elif [[ ${name} == "comparison_mixture" ]]; then + extra_flags="dataset_factory_kwargs.policy_type=mixture \ + dataset_factory_kwargs.policy_path=${MIXED_POLICY_PATH}" + elif [[ ${name} == "comparison_random" ]]; then + extra_flags="" + else + echo "BUG: unknown name ${name}" + exit 1 + fi + parallel --header : --results ${SPARSE_OUTPUT}/parallel/${name} \ + $(call_script "model_comparison" "with") \ + env_name=${ENV_TRAIN} ${extra_flags} \ + ellp_loss no_rescale target_reward_type=evaluating_rewards/Zero-v0 \ + seed={seed} source_reward_type={source_reward_type} \ + source_reward_path=${PM_OUTPUT}/reward/{source_reward_path}/{source_reward_suffix} \ + ${COMPARISON_TIMESTEPS} log_dir=${SPARSE_OUTPUT}/${name}/{source_reward_path}/{seed} \ + ::: source_reward_type evaluating_rewards/PointMazeGroundTruthWithCtrl-v0 \ + evaluating_rewards/PointMazeGroundTruthNoCtrl-v0 \ + evaluating_rewards/RewardModel-v0 evaluating_rewards/RewardModel-v0 \ + imitation/RewardNet_unshaped-v0 imitation/RewardNet_unshaped-v0 \ + :::+ source_reward_path withctrl noctrl preferences regress irl_state_only irl_state_action \ + :::+ source_reward_suffix dummy dummy model model checkpoints/final/discrim/reward_net \ + checkpoints/final/discrim/reward_net \ + ::: 
seed 0 1 2 +done diff --git a/setup.cfg b/setup.cfg index 6844f35..e256b03 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,7 @@ ignore = W503,E203 known_first_party=evaluating_rewards,tests default_section=THIRDPARTY multi_line_output=3 +include_trailing_comma=True force_sort_within_sections=True line_length=100 diff --git a/src/evaluating_rewards/analysis/plot_pm_reward.py b/src/evaluating_rewards/analysis/plot_pm_reward.py index b5def9d..a384cc1 100644 --- a/src/evaluating_rewards/analysis/plot_pm_reward.py +++ b/src/evaluating_rewards/analysis/plot_pm_reward.py @@ -68,7 +68,7 @@ def default_config(): @plot_pm_reward_ex.config def logging_config(log_root, models, reward_type, reward_path): - data_root = os.path.join(log_root, "model_comparison") + data_root = os.path.join(serialize.get_output_dir(), "model_comparison") if models is None: log_dir = os.path.join( log_root, reward_type.replace("/", "_"), reward_path.replace("/", "_") @@ -101,9 +101,9 @@ def dense_no_ctrl_sparsified(): pos_lim = 0.15 # Use lists of tuples rather than OrderedDict as Sacred reorders dictionaries models = [ - ("Dense", "evaluating_rewards/PointMassDenseNoCtrl-v0", "dummy"), + ("Dense\n(Manual)", "evaluating_rewards/PointMassDenseNoCtrl-v0", "dummy"), ( - "Sparsified", + "Sparsified\n(Learned)", "evaluating_rewards/RewardModel-v0", os.path.join( "evaluating_rewards_PointMassLine-v0", @@ -111,7 +111,7 @@ def dense_no_ctrl_sparsified(): "model", ), ), - ("Sparse", "evaluating_rewards/PointMassSparseNoCtrl-v0", "dummy"), + ("Sparse\n(Manual)", "evaluating_rewards/PointMassSparseNoCtrl-v0", "dummy"), ] _ = locals() # quieten flake8 unused variable warning del _ diff --git a/src/evaluating_rewards/envs/__init__.py b/src/evaluating_rewards/envs/__init__.py index 3134c8a..c3561bc 100644 --- a/src/evaluating_rewards/envs/__init__.py +++ b/src/evaluating_rewards/envs/__init__.py @@ -17,7 +17,11 @@ import gym import imitation.envs.examples # noqa: F401 pylint:disable=unused-import -from evaluating_rewards.envs import mujoco, point_mass # noqa: F401 pylint:disable=unused-import +from evaluating_rewards.envs import ( # noqa: F401 pylint:disable=unused-import + lunar_lander, + mujoco, + point_mass, +) PROJECT_ROOT = "evaluating_rewards.envs" PM_ROOT = f"{PROJECT_ROOT}.point_mass" @@ -74,3 +78,15 @@ def register_mujoco(): register_mujoco() + +gym.register( + id="evaluating_rewards/LunarLanderContinuous-v0", + entry_point=f"{PROJECT_ROOT}.lunar_lander:LunarLanderContinuousObservable", + reward_threshold=200, +) +gym.register( + id="evaluating_rewards/LunarLanderContinuousOriginalShaping-v0", + entry_point=f"{PROJECT_ROOT}.lunar_lander:LunarLanderContinuousObservable", + kwargs=dict(fix_shaping=False), + reward_threshold=200, +) diff --git a/src/evaluating_rewards/envs/core.py b/src/evaluating_rewards/envs/core.py new file mode 100644 index 0000000..ec22369 --- /dev/null +++ b/src/evaluating_rewards/envs/core.py @@ -0,0 +1,65 @@ +# Copyright 2019 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Base classes for environment rewards.""" + +import abc + +import gym +from imitation.util import serialize +import tensorflow as tf + +from evaluating_rewards import rewards + + +class HardcodedReward(rewards.BasicRewardModel, serialize.LayersSerializable): + """Hardcoded (non-trainable) reward model for a Gym environment.""" + + def __init__(self, observation_space: gym.Space, action_space: gym.Space, **kwargs): + """Constructs the reward model. + + Args: + observation_space: The observation space of the environment. + action_space: The action space of the environment. + **kwargs: Extra parameters to serialize and store in the instance, + accessible as attributes. + """ + rewards.BasicRewardModel.__init__(self, observation_space, action_space) + serialize.LayersSerializable.__init__( + self, + layers={}, + observation_space=observation_space, + action_space=action_space, + **kwargs, + ) + self._reward = self.build_reward() + + def __getattr__(self, name): + try: + return self._kwargs[name] + except KeyError: + raise AttributeError(f"Attribute '{name}' not present in self._kwargs") + + @abc.abstractmethod + def build_reward(self) -> tf.Tensor: + """Computes reward from observation, action and next observation. + + Returns: + A tensor containing reward, shape (batch_size,). + """ + + @property + def reward(self): + """Reward tensor, shape (batch_size,).""" + return self._reward diff --git a/src/evaluating_rewards/envs/lunar_lander.py b/src/evaluating_rewards/envs/lunar_lander.py new file mode 100644 index 0000000..db582b2 --- /dev/null +++ b/src/evaluating_rewards/envs/lunar_lander.py @@ -0,0 +1,169 @@ +# Copyright 2020 Adam Gleave +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Reward function for Gym LunarLander-v2 environment.""" + +from gym import spaces +from gym.envs.box2d import lunar_lander +from imitation.util import registry +import numpy as np +import tensorflow as tf + +from evaluating_rewards import serialize as reward_serialize +from evaluating_rewards.envs import core + +TERMINAL_POTENTIAL = -174 # chosen to be similar to initial potential value + + +class LunarLanderContinuousObservable(lunar_lander.LunarLanderContinuous): + """LunarLander environment lightly modified from Gym to make reward a function of observation. + + Adds `self.game_over` and `self.lander.awake` flags to state, which are used by Gym + internally to compute the reward. They are computed by the Box2D simulator, and cannot easily + be derived from the rest of the state. + + `game_over` is set based on contact forces on the lunar lander. The `lander.awake` flag is + set when the body is not "asleep": + "When Box2D determines that a body [...] has come to rest, the body enters a sleep state" + (see https://box2d.org/documentation/md__d_1__git_hub_box2d_docs_dynamics.html). 
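+
+    The observation is the standard 8-dimensional LunarLander observation (position, velocity,
+    angle, angular velocity and leg contact indicators) concatenated with three indicator
+    flags: `game_over` (index 8), `lander.awake` (index 9) and a time-up flag (index 10),
+    giving an 11-dimensional observation.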
+    """
+
+    def __init__(self, time_limit: int = 1000, fix_shaping: bool = True):
+        """Builds LunarLanderContinuousObservable.
+
+        Args:
+            time_limit: maximum number of timesteps per episode; a time-up flag is added to
+                the observation and the episode terminates once the limit is reached.
+            fix_shaping: if True, apply potential shaping to the final transition as well,
+                fixing the terminal potential to `TERMINAL_POTENTIAL` so the shaping satisfies
+                the conditions of Ng et al (1999); if False, keep Gym's original reward.
+        """
+        # Need to set self.time_limit before super().__init__() since __init__() calls reset()
+        self.time_limit = time_limit
+        super().__init__()
+        self.observation_space = spaces.Box(-np.inf, np.inf, shape=(11,), dtype=np.float32)
+        self.fix_shaping = fix_shaping
+        self.time_remaining = None
+
+    def step(self, action):
+        prev_shaping = self.prev_shaping
+        self.time_remaining -= 1
+        obs, rew, done, info = super().step(action)
+        time_up = self.time_remaining <= 0
+        extra_obs = [
+            1.0 if self.game_over else 0.0,
+            1.0 if self.lander.awake else 0.0,
+            1.0 if time_up else 0.0,
+        ]
+        obs = np.concatenate((obs, extra_obs))
+
+        done = done | time_up
+        if done and self.fix_shaping:
+            # Gym does not apply shaping or control cost to final reward.
+            # No control cost is weird but harmless. No shaping is problematic, though, so we fix
+            # it to satisfy Ng et al (1999)'s conditions.
+            # Take final state to always have potential TERMINAL_POTENTIAL.
+            # This constant doesn't actually affect RL policy, but makes reward look less odd.
+            rew += TERMINAL_POTENTIAL - prev_shaping
+
+        return obs, rew, done, info
+
+    def reset(self):
+        self.time_remaining = self.time_limit + 1  # step() gets called once during reset
+        # NOTE: do not need to change observations here since super().reset() calls step()
+        return super().reset()
+
+
+def _potential(obs: tf.Tensor) -> tf.Tensor:
+    """Potential function used to compute shaping.
+
+    Based on the `shaping` variable in `LunarLander.step()`.
+    """
+    leg_contact = obs[:, 6] + obs[:, 7]
+    l2 = tf.sqrt(tf.math.square(obs[:, 0]) + tf.math.square(obs[:, 1]))
+    l2 += tf.sqrt(tf.math.square(obs[:, 2]) + tf.math.square(obs[:, 3]))
+    return 10 * leg_contact - 100 * l2 - 100 * tf.abs(obs[:, 4])
+
+
+class LunarLanderContinuousGroundTruthReward(core.HardcodedReward):
+    """Reward for LunarLanderContinuousObservable. Matches ground truth with default settings."""
+
+    def __init__(
+        self,
+        observation_space: spaces.Space,
+        action_space: spaces.Space,
+        ctrl_coef: float = 1.0,
+        shaping_coef: float = 1.0,
+    ):
+        """Constructs the reward model.
+
+        Args:
+            observation_space: The observation space of the environment.
+            action_space: The action space of the environment.
+            ctrl_coef: Multiplier for the control cost. 1.0 equals ground truth; 0.0 disables.
+            shaping_coef: Multiplier for potential shaping. 1.0 equals ground truth; 0.0 disables.
+        """
+        super().__init__(
+            observation_space=observation_space,
+            action_space=action_space,
+            ctrl_coef=ctrl_coef,
+            shaping_coef=shaping_coef,
+        )
+
+    def build_reward(self) -> tf.Tensor:
+        """Intended to match the reward returned by gym.LunarLander.
+
+        Known differences:
+        - Will disagree on states *after* episode termination due to the non-Markovian leg
+          contact condition in Gym.
+
+        Returns:
+            A Tensor containing predicted rewards.
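+            Shape (batch_size,), matching the `HardcodedReward.build_reward` contract.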
+        """
+        # Sparse reward
+        # Indices 8, 9, 10 of the observation are the game_over, lander.awake and time-up flags.
+        game_over = (tf.abs(self._proc_next_obs[:, 0]) >= 1.0) | (self._proc_next_obs[:, 8] > 0)
+        landed_safely = self._proc_next_obs[:, 9] == 0.0
+        time_up = self._proc_next_obs[:, 10] > 0.0
+        done = game_over | landed_safely | time_up
+        # Note time out is neither penalized nor rewarded by sparse_reward
+        sparse_reward = -100.0 * tf.cast(game_over, tf.float32)
+        sparse_reward += 100.0 * tf.cast(landed_safely, tf.float32)
+
+        # Control cost
+        m_thrust = self._proc_act[:, 0] > 0
+        m_power_when_act = 0.5 * (tf.clip_by_value(self._proc_act[:, 0], 0.0, 1.0) + 1.0)
+        m_power = tf.where(m_thrust, m_power_when_act, 0.0 * m_power_when_act)
+        abs_side_act = tf.abs(self._proc_act[:, 1])
+        s_thrust = abs_side_act > 0.5
+        s_power_when_act = tf.clip_by_value(abs_side_act, 0.5, 1.0)
+        s_power = tf.where(s_thrust, s_power_when_act, 0.0 * s_power_when_act)
+        ctrl_cost = -0.3 * m_power - 0.03 * s_power
+        # Gym does not apply control cost to final step. (Seems weird, but OK.)
+        ctrl_cost = tf.where(done, 0 * ctrl_cost, ctrl_cost)
+
+        # Shaping
+        # Note this assumes no discount (matching Gym implementation), which will make it
+        # not *quite* potential shaping for any RL algorithm using discounting.
+        shaping = (1 - tf.cast(done, tf.float32)) * _potential(self._proc_next_obs)
+        shaping += TERMINAL_POTENTIAL * tf.cast(done, tf.float32)
+        shaping -= _potential(self._proc_obs)
+
+        return sparse_reward + self.shaping_coef * shaping + self.ctrl_coef * ctrl_cost
+
+
+def _register_rewards():
+    """Registers LunarLander ground-truth reward variants in the reward registry."""
+    density = {"Dense": {}, "Sparse": {"shaping_coef": 0.0}}
+    control = {"WithCtrl": {}, "NoCtrl": {"ctrl_coef": 0.0}}
+    for k1, cfg1 in density.items():
+        for k2, cfg2 in control.items():
+            fn = registry.build_loader_fn_require_space(
+                LunarLanderContinuousGroundTruthReward, **cfg1, **cfg2,
+            )
+            reward_serialize.reward_registry.register(
+                key=f"evaluating_rewards/LunarLanderContinuous{k1}{k2}-v0", value=fn,
+            )
+
+
+_register_rewards()
diff --git a/src/evaluating_rewards/envs/mujoco.py b/src/evaluating_rewards/envs/mujoco.py
index 637c036..865a755 100644
--- a/src/evaluating_rewards/envs/mujoco.py
+++ b/src/evaluating_rewards/envs/mujoco.py
@@ -12,63 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Reward functions for Gym environments."""
-
-import abc
+"""Reward functions for Gym MuJoCo environments."""
 
 import gym
-from imitation.util import registry, serialize
+from imitation.util import registry
 import numpy as np
 from stable_baselines.common import vec_env
 import tensorflow as tf
 
-from evaluating_rewards import rewards
 from evaluating_rewards import serialize as reward_serialize
+from evaluating_rewards.envs import core
 
 
-class MujocoHardcodedReward(rewards.BasicRewardModel, serialize.LayersSerializable):
-    """Hardcoded (non-trainable) reward model for a MuJoCo environment."""
-
-    def __init__(self, observation_space: gym.Space, action_space: gym.Space, **kwargs):
-        """Constructs the reward model.
-
-        Args:
-            observation_space: The observation space of the environment.
-            action_space: The action space of the environment.
-            **kwargs: Extra parameters to serialize and store in the instance,
-                accessible as attributes.
- """ - rewards.BasicRewardModel.__init__(self, observation_space, action_space) - serialize.LayersSerializable.__init__( - self, - layers={}, - observation_space=observation_space, - action_space=action_space, - **kwargs, - ) - self._reward = self.build_reward() - - def __getattr__(self, name): - try: - return self._kwargs[name] - except KeyError: - raise AttributeError(f"Attribute '{name}' not present in self._kwargs") - - @abc.abstractmethod - def build_reward(self) -> tf.Tensor: - """Computes reward from observation, action and next observation. - - Returns: - A tensor containing reward, shape (batch_size,). - """ - - @property - def reward(self): - """Reward tensor, shape (batch_size,).""" - return self._reward - - -class HalfCheetahGroundTruthReward(MujocoHardcodedReward): +class HalfCheetahGroundTruthReward(core.HardcodedReward): """Reward for HalfCheetah-v2. Matches ground truth with default settings.""" def __init__( @@ -119,7 +75,7 @@ def build_reward(self) -> tf.Tensor: return reward -class HopperGroundTruthReward(MujocoHardcodedReward): +class HopperGroundTruthReward(core.HardcodedReward): """Reward for Hopper-v2. Matches ground truth with default settings.""" def __init__( @@ -190,7 +146,7 @@ def build_reward(self) -> tf.Tensor: return reward -class HopperBackflipReward(MujocoHardcodedReward): +class HopperBackflipReward(core.HardcodedReward): """Reward for Hopper-v2 to make it do a backflip, rather than hop forward. Based on reward function in footnote of: @@ -248,7 +204,7 @@ def build_reward(self) -> tf.Tensor: return reward -class PointMazeReward(MujocoHardcodedReward): +class PointMazeReward(core.HardcodedReward): """Reward for imitation/PointMaze{Left,Right}Vel-v0. This in turn is based on on Fu et al (2018)'s PointMaze environment: diff --git a/src/evaluating_rewards/envs/point_mass.py b/src/evaluating_rewards/envs/point_mass.py index 2e09678..f8c9bb9 100644 --- a/src/evaluating_rewards/envs/point_mass.py +++ b/src/evaluating_rewards/envs/point_mass.py @@ -19,13 +19,14 @@ import gym from imitation.envs import resettable_env from imitation.policies import serialize as policy_serialize -from imitation.util import registry, serialize +from imitation.util import registry import numpy as np from stable_baselines.common import policies import tensorflow as tf from evaluating_rewards import rewards from evaluating_rewards import serialize as reward_serialize +from evaluating_rewards.envs import core class PointMassEnv(resettable_env.ResettableEnv): @@ -140,36 +141,28 @@ def close(self): self.viewer = None -class PointMassGroundTruth(rewards.BasicRewardModel, serialize.LayersSerializable): +class PointMassGroundTruth(core.HardcodedReward): """RewardModel representing the true (dense) reward in PointMass.""" def __init__( self, observation_space: gym.Space, action_space: gym.Space, ctrl_coef: float = 1.0 ): - serialize.LayersSerializable.__init__(**locals(), layers={}) - self.ndim, remainder = divmod(observation_space.shape[0], 3) assert remainder == 0 - self.ctrl_coef = ctrl_coef - - rewards.BasicRewardModel.__init__(self, observation_space, action_space) - self._reward = self.build_reward() + super().__init__( + observation_space=observation_space, action_space=action_space, ctrl_coef=ctrl_coef + ) def build_reward(self): """Computes reward from observation and action in PointMass environment.""" - pos = self._proc_obs[:, 0 : self.ndim] - goal = self._proc_obs[:, 2 * self.ndim : 3 * self.ndim] + pos = self._proc_next_obs[:, 0 : self.ndim] + goal = self._proc_next_obs[:, (2 * 
self.ndim) : (3 * self.ndim)] dist = tf.norm(pos - goal, axis=-1) ctrl_cost = tf.reduce_sum(tf.square(self._proc_act), axis=-1) return -dist - self.ctrl_coef * ctrl_cost - @property - def reward(self): - """Reward tensor.""" - return self._reward - -class PointMassSparseReward(rewards.BasicRewardModel, serialize.LayersSerializable): +class PointMassSparseReward(core.HardcodedReward): """A sparse reward for the point mass being close to the goal. Should produce similar behavior to PointMassGroundTruth. However, it is not @@ -195,16 +188,15 @@ def __init__( The larger this is, the more dissimilar the reward model and resulting policy will be from PointMassGroundTruth. """ - serialize.LayersSerializable.__init__(**locals(), layers={}) - self.ndim, remainder = divmod(observation_space.shape[0], 3) assert remainder == 0 - self.ctrl_coef = ctrl_coef - self.threshold = threshold - self.goal_offset = goal_offset - rewards.BasicRewardModel.__init__(self, observation_space, action_space) - - self._reward = self.build_reward() + super().__init__( + observation_space=observation_space, + action_space=action_space, + ctrl_coef=ctrl_coef, + threshold=threshold, + goal_offset=goal_offset, + ) def build_reward(self): """Computes reward from observation and action in PointMass environment.""" @@ -217,23 +209,14 @@ def build_reward(self): ctrl_cost = tf.reduce_sum(tf.square(self._proc_act), axis=-1) return goal_reward - self.ctrl_coef * ctrl_cost - @property - def reward(self): - """Reward tensor.""" - return self._reward - -class PointMassShaping(rewards.BasicRewardModel, serialize.LayersSerializable): +class PointMassShaping(core.HardcodedReward): """Potential shaping term, based on distance to goal.""" def __init__(self, observation_space: gym.Space, action_space: gym.Space): - serialize.LayersSerializable.__init__(**locals(), layers={}) - self.ndim, remainder = divmod(observation_space.shape[0], 3) assert remainder == 0 - - rewards.BasicRewardModel.__init__(self, observation_space, action_space) - self._reward = self.build_reward() + super().__init__(observation_space=observation_space, action_space=action_space) def build_reward(self): """Computes shaping from current and next observations.""" @@ -248,11 +231,6 @@ def dist(obs): return old_dist - new_dist - @property - def reward(self): - """Reward tensor.""" - return self._reward - class PointMassDenseReward(rewards.LinearCombinationModelWrapper): """Sparse reward plus potential shaping.""" diff --git a/src/evaluating_rewards/experiments/env_rewards.py b/src/evaluating_rewards/experiments/env_rewards.py index 0f6e609..33145c4 100644 --- a/src/evaluating_rewards/experiments/env_rewards.py +++ b/src/evaluating_rewards/experiments/env_rewards.py @@ -22,6 +22,9 @@ REWARDS_BY_ENV = { "evaluating_rewards/HalfCheetah-v3": ["evaluating_rewards/HalfCheetahGroundTruth.*-v0"], "evaluating_rewards/Hopper-v3": ["evaluating_rewards/Hopper.*-v0"], + "evaluating_rewards/LunarLanderContinuous-v0": [ + "evaluating_rewards/LunarLanderContinuous.*-v0" + ], "evaluating_rewards/PointMassLine-v0": ["evaluating_rewards/PointMass.*-v0"], "imitation/PointMazeLeftVel-v0": ["evaluating_rewards/PointMaze.*-v0"], "imitation/PointMazeRightVel-v0": ["evaluating_rewards/PointMaze.*-v0"], diff --git a/src/evaluating_rewards/experiments/point_mass_analysis.py b/src/evaluating_rewards/experiments/point_mass_analysis.py index 945b694..6560fe2 100644 --- a/src/evaluating_rewards/experiments/point_mass_analysis.py +++ b/src/evaluating_rewards/experiments/point_mass_analysis.py @@ -57,7 
+57,16 @@ def plot_reward(rew: xarray.DataArray, cmap: str = "RdBu", **kwargs) -> plt.Figu
     )
     # By default xarray ignores figsize and does its own size calculation. Override.
     figsize = mpl.rcParams.get("figure.figsize")
-    facet = rew.plot(x="Accel.", y="Velocity", col="Pos.", cmap=cmap, figsize=figsize, **kwargs)
+    facet = rew.plot(
+        x="Accel.",
+        y="Velocity",
+        col="Pos.",
+        cmap=cmap,
+        figsize=figsize,
+        linewidth=0.1,
+        edgecolor="k",
+        **kwargs,
+    )
 
     if "row" in kwargs:
         # xarray adds row labels in a hard-to-spot far-right side.
diff --git a/src/evaluating_rewards/interpretability.py b/src/evaluating_rewards/interpretability.py
new file mode 100644
index 0000000..9e93221
--- /dev/null
+++ b/src/evaluating_rewards/interpretability.py
@@ -0,0 +1,140 @@
+# Copyright 2020 Adam Gleave
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Classes and methods to help interpret reward functions."""
+
+import math
+from typing import NamedTuple, Sequence
+
+import gym
+from imitation.util import rollout
+import matplotlib.pyplot as plt
+import numpy as np
+from stable_baselines.common import policies, vec_env
+
+
+# TODO(adam): consider making rollout.Transitions a dataclass and subclassing?
+class RenderedTransitions(NamedTuple):
+    """A batch of obs-act-obs-rew-done transitions, plus rendered images.
+
+    Usually generated by combining and processing several Trajectories via
+    `rollout.flatten_trajectories()`.
+
+    Attributes:
+        obs: Previous observations. Shape: (batch_size, ) + observation_shape.
+            The i'th observation `obs[i]` in this array is the observation seen
+            by the agent when choosing action `acts[i]`.
+        acts: Actions. Shape: (batch_size, ) + action_shape.
+        next_obs: New observations. Shape: (batch_size, ) + observation_shape.
+            The i'th observation `next_obs[i]` in this array is the observation
+            after the agent has taken action `acts[i]`.
+        rews: Rewards. Shape: (batch_size, ).
+            The reward `rews[i]` at the i'th timestep is received after the agent has
+            taken action `acts[i]`.
+        dones: Boolean array indicating episode termination. Shape: (batch_size, ).
+            `dones[i]` is true iff `next_obs[i]` is the last observation of an episode.
+        imgs: Rendered images. Shape: (batch_size, ) + image_shape.
+    """
+
+    obs: np.ndarray
+    acts: np.ndarray
+    next_obs: np.ndarray
+    rews: np.ndarray
+    dones: np.ndarray
+    imgs: np.ndarray
+
+
+class RenderWrapper(gym.Wrapper):
+    """Wraps a Gym environment, adding rendered images to the info dict under key 'img'.
+
+    Note this does not render the initial observation (there is no info dict returned by
+    `reset()`). For interpretability this is of limited importance since the reward of initial
+    state(s) will not change the optimal policy.
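+
+    The `rendered_rollout` helper below reads these frames back out of `info["img"]` when
+    collecting transitions.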
+ """ + + def step(self, action): + ob, rew, done, info = self.env.step(action) + img = self.env.render(mode="rgb_array") + info["img"] = img + return ob, rew, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +def rendered_rollout( + venv: vec_env.VecEnv, + policy: policies.BasePolicy, + n_timesteps: int, + *, + truncate: bool = True, + **kwargs, +) -> RenderedTransitions: + """Rollout `policy` on `venv` for `n_timesteps`. + + Similar to `imitation.util.rollout.generate_transitions`, but returns `RenderedTransitions`, + including a rendered representation of each state in the `imgs` attribute. + + Args: + venv: A vector environment to rollout in. + policy: The policy to rollout. + n_timesteps: The number of timesteps of data to collect. + truncate: if True, return exactly n_timesteps; otherwise, it is a lower bound. + kwargs: Passed through to `rollout.generate_trajectories`. + """ + trajs = rollout.generate_trajectories( + policy, venv, sample_until=rollout.min_timesteps(n_timesteps), **kwargs + ) + transitions = rollout.flatten_trajectories(trajs) + imgs = [] + for traj in trajs: + imgs += [info["img"] for info in traj.infos] + if truncate and n_timesteps is not None: + transitions = rollout.Transitions(*(arr[:n_timesteps] for arr in transitions)) + imgs = imgs[:n_timesteps] + + imgs = np.array(imgs) + assert len(imgs) == len(transitions.obs) + return RenderedTransitions(**transitions._asdict(), imgs=imgs) + + +# TODO(adam): maybe want something more extreme than quantile sampling? +# e.g. top 1%, top 5%, top 10%; or particular S.D.s of reward. +def quantile_sample(preds: np.ndarray, num_samples: int = 10) -> Sequence[int]: + """Sample `num_samples` quantiles from preds.""" + num_points = len(preds) + if num_samples > num_points: + raise ValueError("num_samples must be less than the number of datapoints.") + + sorted_idxs = np.argsort(preds) + subset = np.linspace(0, num_points - 1, num=num_samples) + subset = np.ceil(subset).astype(np.int) + # TODO(adam): is this actually guaranteed not to clash? 
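+    # (Given the check above, the linspace spacing is at least 1, so after `ceil` the
+    # indices are strictly increasing and hence unique.)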
+    assert len(np.unique(subset)) == len(subset)
+
+    return sorted_idxs[subset]
+
+
+def plot_renders(
+    imgs: np.ndarray, reward_preds: np.ndarray, ncols: int = 3, cell_size: float = 5.0
+):
+    """Plots a grid of images, titling each with its predicted reward."""
+    n_imgs = imgs.shape[0]
+    nrows = math.ceil(n_imgs / ncols)
+    figsize = (cell_size * ncols, cell_size * nrows)  # matplotlib figsize is (width, height)
+    _, axs = plt.subplots(nrows, ncols, figsize=figsize, sharex=True, sharey=True)
+    axs = np.asarray(axs).reshape(nrows, ncols)  # subplots squeezes singleton dimensions
+    for i, img in enumerate(imgs):
+        ax = axs[i // ncols][i % ncols]
+        ax.imshow(img)
+        ax.set_title("{:.2f}".format(reward_preds[i]))
diff --git a/src/evaluating_rewards/scripts/expert_demos.py b/src/evaluating_rewards/scripts/expert_demos.py
index 5ea309f..12f8bc4 100644
--- a/src/evaluating_rewards/scripts/expert_demos.py
+++ b/src/evaluating_rewards/scripts/expert_demos.py
@@ -15,9 +15,43 @@
 """Thin wrapper around imitation.scripts.expert_demos."""
 
 from imitation.scripts import expert_demos
+import stable_baselines
 
 from evaluating_rewards.scripts import script_utils
 
+
+@expert_demos.expert_demos_ex.named_config
+def lunar_lander():
+    """PPO on LunarLander."""
+    env_name = "evaluating_rewards/LunarLanderContinuous-v0"
+    # Hyperparams from https://github.com/araffin/rl-baselines-zoo/blob/master/hyperparams/ppo2.yml
+    num_vec = 16
+    init_rl_kwargs = dict(
+        n_steps=1024, nminibatches=32, lam=0.98, gamma=0.999, noptepochs=4, ent_coef=0.01,
+    )
+    _ = locals()
+    del _
+
+
+@expert_demos.expert_demos_ex.named_config
+def lunar_lander_sac():
+    """SAC on LunarLander."""
+    env_name = "evaluating_rewards/LunarLanderContinuous-v0"
+    # Hyperparams from https://github.com/araffin/rl-baselines-zoo/blob/master/hyperparams/sac.yml
+    num_vec = 1
+    total_timesteps = int(5e5)
+    init_rl_kwargs = dict(
+        model_class=stable_baselines.SAC,
+        policy_class=stable_baselines.sac.policies.MlpPolicy,
+        batch_size=256,
+        learning_starts=1000,
+    )
+    log_interval = 10000
+    policy_save_interval = 10000
+    _ = locals()
+    del _
+
+
 if __name__ == "__main__":
     script_utils.add_logging_config(expert_demos.expert_demos_ex, "expert_demos")
     script_utils.experiment_main(expert_demos.expert_demos_ex, "expert_demos", sacred_symlink=False)
diff --git a/src/evaluating_rewards/scripts/model_comparison.py b/src/evaluating_rewards/scripts/model_comparison.py
index f7e6e6f..0801c5e 100644
--- a/src/evaluating_rewards/scripts/model_comparison.py
+++ b/src/evaluating_rewards/scripts/model_comparison.py
@@ -82,6 +82,14 @@ def alternating_maximization():
     del _
 
 
+@model_comparison_ex.named_config
+def undiscounted():
+    """Undiscounted potential shaping."""
+    comparison_kwargs = {  # noqa: F841 pylint:disable=unused-variable
+        "model_wrapper": functools.partial(comparisons.equivalence_model_wrapper, discount=1.0),
+    }
+
+
 @model_comparison_ex.named_config
 def affine_only():
     """Equivalence class consists of just affine transformations."""
diff --git a/tests/test_rewards.py b/tests/test_rewards.py
index ed36f68..768c8c3 100644
--- a/tests/test_rewards.py
+++ b/tests/test_rewards.py
@@ -25,7 +25,7 @@
 import tensorflow as tf
 
 from evaluating_rewards import datasets, rewards, serialize
-from evaluating_rewards.envs import mujoco, point_mass
+from evaluating_rewards.envs import lunar_lander, mujoco, point_mass
 from tests import common
 
 ENVS = ["FrozenLake-v0", "CartPole-v1", "Pendulum-v0"]
@@ -46,6 +46,11 @@
         "model_class": mujoco.HopperBackflipReward,
         "kwargs": {},
     },
+    "lunar_lander_ground_truth": {
+        "env_name": "evaluating_rewards/LunarLanderContinuous-v0",
+        "model_class": lunar_lander.LunarLanderContinuousGroundTruthReward,
+        "kwargs": {},
}, "point_maze_ground_truth": { "env_name": "imitation/PointMazeLeftVel-v0", "model_class": mujoco.PointMazeReward, @@ -82,17 +87,31 @@ ] GROUND_TRUTH = { + # id: (env_name, reward_name, absolute_tolerance) "half_cheetah": ( "evaluating_rewards/HalfCheetah-v3", "evaluating_rewards/HalfCheetahGroundTruthForwardWithCtrl-v0", + 5e-5, ), "hopper": ( "evaluating_rewards/Hopper-v3", "evaluating_rewards/HopperGroundTruthForwardWithCtrl-v0", + 5e-5, + ), + "lunar_lander": ( + "evaluating_rewards/LunarLanderContinuous-v0", + "evaluating_rewards/LunarLanderContinuousDenseWithCtrl-v0", + 2e-4, + ), + "point_mass": ( + "evaluating_rewards/PointMassLine-v0", + "evaluating_rewards/PointMassGroundTruth-v0", + 5e-5, ), "point_maze": ( "imitation/PointMazeLeftVel-v0", "evaluating_rewards/PointMazeGroundTruthWithCtrl-v0", + 5e-5, ), } @@ -181,8 +200,10 @@ def make_model(env): return helper_serialize_identity(make_model) -@pytest.mark.parametrize("env_name,reward_id", GROUND_TRUTH.values(), ids=list(GROUND_TRUTH.keys())) -def test_ground_truth_similar_to_gym(graph, session, venv, reward_id): +@pytest.mark.parametrize( + "env_name,reward_id,atol", GROUND_TRUTH.values(), ids=list(GROUND_TRUTH.keys()) +) +def test_ground_truth_similar_to_gym(graph, session, venv, reward_id, atol): """Checks that reward models predictions match those of Gym reward.""" # Generate rollouts, recording Gym reward policy = base.RandomPolicy(venv.observation_space, venv.action_space) @@ -198,7 +219,7 @@ def test_ground_truth_similar_to_gym(graph, session, venv, reward_id): pred_reward = rewards.evaluate_models({"m": reward_model}, batch)["m"] # Are the predictions close to true Gym reward? - np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5) + np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=atol) REWARD_LEN = 10000