Replace LARK with a hand-written parser. #34

Open · wants to merge 2 commits into develop
16 changes: 8 additions & 8 deletions .github/workflows/benchmark.yml
@@ -8,7 +8,7 @@ on:

env:
penv: './poetryenv poetryenvs'
run: 'run python benchmark'
run: 'run python -m benchmark'
bex: 'benchmark/examples'
res: 'gh-pages/benchmark_results'
plots: 'gh-pages/docs/benchmark_plots/'
@@ -35,15 +35,15 @@ jobs:
${penv}/torch_max install
- name: Run benchmarks
run: |
${penv}/np_max ${run}/run_benchmark.py ${bex}/np_example.py ${res}
${penv}/tf_max ${run}/run_benchmark.py ${bex}/tf_example.py ${res}
${penv}/tf_max ${run}/run_benchmark.py ${bex}/tf_example.py --modifiers=no_compile ${res}
${penv}/jax_max ${run}/run_benchmark.py ${bex}/jax_example.py ${res}
${penv}/jax_max ${run}/run_benchmark.py ${bex}/jax_example.py --modifiers=no_jit ${res}
${penv}/torch_max ${run}/run_benchmark.py ${bex}/torch_example.py ${res}
${penv}/np_max ${run}.run_benchmark ${bex}/np_example.py ${res}
${penv}/tf_max ${run}.run_benchmark ${bex}/tf_example.py ${res}
${penv}/tf_max ${run}.run_benchmark ${bex}/tf_example.py --modifiers=no_compile ${res}
${penv}/jax_max ${run}.run_benchmark ${bex}/jax_example.py ${res}
${penv}/jax_max ${run}.run_benchmark ${bex}/jax_example.py --modifiers=no_jit ${res}
${penv}/torch_max ${run}.run_benchmark ${bex}/torch_example.py ${res}
- name: Plot benchmarks
run: |
${penv}/np_max ${run}/plot_benchmarks.py ${res}
${penv}/np_max ${run}.plot_benchmarks ${res}
mkdir -p ${plots}
mv ${res}/overhead.png ${plots}
- name: Commit new benchmark results
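For clarity, this is how the updated `env` variables compose in the run steps above (an illustrative expansion only, using the values from the `env` block):

```bash
# penv='./poetryenv poetryenvs', run='run python -m benchmark',
# bex='benchmark/examples', res='gh-pages/benchmark_results'
# A step such as:
#   ${penv}/np_max ${run}.run_benchmark ${bex}/np_example.py ${res}
# therefore expands to:
./poetryenv poetryenvs/np_max run python -m benchmark.run_benchmark \
    benchmark/examples/np_example.py gh-pages/benchmark_results
```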
18 changes: 9 additions & 9 deletions benchmark/README.md
@@ -9,7 +9,7 @@ Most recent results are shown in our
To run a benchmark, use:

```bash
python benchmark/run_benchmark.py \
python -m benchmark.run_benchmark \
<path to example script> \
[--modifiers=<other modification to the script>] \
<output_directory>
@@ -18,7 +18,7 @@ python benchmark/run_benchmark.py \
Then plot the results with:

```bash
python benchmark/plot_benchmarks.py <output_directory>
python -m benchmark.plot_benchmarks <output_directory>
```

The plotter will plot all results found in the output directory, so you can run `run_benchmark.py`
@@ -30,13 +30,13 @@ poetry install
./poetryenv -r poetryenvs install

# Run all benchmarks:
./poetryenv poetryenvs/np_max run python benchmark/run_benchmark.py benchmark/examples/np_example.py benchmark_results
./poetryenv poetryenvs/tf_max run python benchmark/run_benchmark.py benchmark/examples/tf_example.py benchmark_results
./poetryenv poetryenvs/tf_max run python benchmark/run_benchmark.py benchmark/examples/tf_example.py --modifiers=no_compile benchmark_results
./poetryenv poetryenvs/jax_max run python benchmark/run_benchmark.py benchmark/examples/jax_example.py benchmark_results
./poetryenv poetryenvs/jax_max run python benchmark/run_benchmark.py benchmark/examples/jax_example.py --modifiers=no_jit benchmark_results
./poetryenv poetryenvs/torch_max run python benchmark/run_benchmark.py benchmark/examples/torch_example.py benchmark_results
./poetryenv poetryenvs/np_max run python -m benchmark.run_benchmark benchmark/examples/np_example.py benchmark_results
./poetryenv poetryenvs/tf_max run python -m benchmark.run_benchmark benchmark/examples/tf_example.py benchmark_results
./poetryenv poetryenvs/tf_max run python -m benchmark.run_benchmark benchmark/examples/tf_example.py --modifiers=no_compile benchmark_results
./poetryenv poetryenvs/jax_max run python -m benchmark.run_benchmark benchmark/examples/jax_example.py benchmark_results
./poetryenv poetryenvs/jax_max run python -m benchmark.run_benchmark benchmark/examples/jax_example.py --modifiers=no_jit benchmark_results
./poetryenv poetryenvs/torch_max run python -m benchmark.run_benchmark benchmark/examples/torch_example.py benchmark_results

# Plot results:
poetry run python benchmark/plot_benchmarks.py benchmark_results
poetry run python -m benchmark.plot_benchmarks benchmark_results
```
96 changes: 37 additions & 59 deletions benchmark/plot_benchmarks.py
@@ -13,17 +13,23 @@
# limitations under the License.
import argparse
from pathlib import Path
from typing import Any, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes

from .stats import Stats

NDArray = Any


def plot(output_dir: Path) -> None:
result_dfs = [pd.read_csv(f) for f in output_dir.glob("results_*.csv")]
results_df = pd.concat(result_dfs, axis="index", ignore_index=True)

n_columns = 2
n_columns = 3
n_rows = len(results_df.name.unique())
width = 6 * n_columns
height = 4 * n_rows
@@ -37,74 +37,46 @@ def plot(output_dir: Path) -> None:

for i, (ax_name, ax_df) in enumerate(results_df.groupby("name")):
line_xs = []
line_y_with_means = []
line_y_with_uppers = []
line_y_with_lowers = []
line_y_without_means = []
line_y_without_uppers = []
line_y_without_lowers = []
line_y_overhead_means = []
line_y_overhead_uppers = []
line_y_overhead_lowers = []
line_ys = []

for timestamp, timestamp_df in ax_df.groupby("timestamp"):
by_cs = timestamp_df.groupby("check_shapes")
mean_by_cs = by_cs.time_s.mean()
std_by_cs = by_cs.time_s.std().fillna(0.0)
var_by_cs = by_cs.time_s.var().fillna(0.0)

with_mean = mean_by_cs[True]
with_mean_sq = with_mean ** 2
with_std = std_by_cs[True]
with_var = var_by_cs[True]
without_mean = mean_by_cs[False]
without_mean_sq = without_mean ** 2
without_std = std_by_cs[False]
without_var = var_by_cs[False]

overhead_mean = (with_mean / without_mean) - 1
# https://en.wikipedia.org/wiki/Ratio_distribution#Uncorrelated_noncentral_normal_ratio
overhead_var = (with_mean_sq / without_mean_sq) * (
(with_var / with_mean_sq) + (without_var / without_mean_sq)
)
overhead_std = np.sqrt(overhead_var)

line_xs.append(timestamp)
line_y_with_means.append(with_mean)
line_y_with_uppers.append(with_mean + 1.96 * with_std)
line_y_with_lowers.append(with_mean - 1.96 * with_std)
line_y_without_means.append(without_mean)
line_y_without_uppers.append(without_mean + 1.96 * without_std)
line_y_without_lowers.append(without_mean - 1.96 * without_std)
line_y_overhead_means.append(100 * overhead_mean)
line_y_overhead_uppers.append(100 * (overhead_mean + 1.96 * overhead_std))
line_y_overhead_lowers.append(100 * (overhead_mean - 1.96 * overhead_std))
line_ys.append(Stats.new(timestamp_df))

def plot_mean_and_std(
ax: Axes, prefix: str, *, label: Optional[str] = None, scale: float = 1.0
) -> None:
mean_name = f"{prefix}_mean"
std_name = f"{prefix}_std"

# pylint: disable=cell-var-from-loop
mean: NDArray = np.array([getattr(y, mean_name) for y in line_ys]) * scale
std: NDArray = np.array([getattr(y, std_name) for y in line_ys]) * scale
lower: NDArray = mean - 1.96 * std
upper: NDArray = mean + 1.96 * std

(mean_line,) = ax.plot(line_xs, mean, label=label)
color = mean_line.get_color()
ax.fill_between(line_xs, lower, upper, color=color, alpha=0.3)

ax.set_title(ax_name)
ax.tick_params(axis="x", labelrotation=30)
if np.min(lower) > 0:
ax.set_ylim(bottom=0.0)

ax = axes[i][0]
(mean_line,) = ax.plot(line_xs, line_y_with_means, label="with check_shapes")
color = mean_line.get_color()
ax.fill_between(line_xs, line_y_with_lowers, line_y_with_uppers, color=color, alpha=0.3)
(mean_line,) = ax.plot(line_xs, line_y_without_means, label="without check_shapes")
color = mean_line.get_color()
ax.fill_between(
line_xs, line_y_without_lowers, line_y_without_uppers, color=color, alpha=0.3
)
ax.set_title(ax_name)
plot_mean_and_std(ax, "with", label="with check_shapes")
plot_mean_and_std(ax, "without", label="without check_shapes")
ax.set_ylabel("time / s")
ax.tick_params(axis="x", labelrotation=30)
ax.legend()

ax = axes[i][1]
(mean_line,) = ax.plot(line_xs, line_y_overhead_means)
color = mean_line.get_color()
ax.fill_between(
line_xs, line_y_overhead_lowers, line_y_overhead_uppers, color=color, alpha=0.3
)
ax.set_title(ax_name)
plot_mean_and_std(ax, "abs_overhead")
ax.set_ylabel("overhead / s")

ax = axes[i][2]
plot_mean_and_std(ax, "rel_overhead", scale=100.0)
ax.set_ylabel("% overhead")
if np.min(line_y_overhead_lowers) >= 0:
ax.set_ylim(bottom=0.0)
ax.tick_params(axis="x", labelrotation=30)

fig.tight_layout()
fig.savefig(output_dir / "overhead.png")
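A note on the net effect of this refactor: the repeated mean/confidence-interval plotting logic is factored into `plot_mean_and_std`, the per-timestamp statistics move into the new `Stats` class, and each benchmark now gets three panels instead of two: raw times with and without `check_shapes` ("time / s"), absolute overhead ("overhead / s"), and relative overhead ("% overhead").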
18 changes: 13 additions & 5 deletions benchmark/run_benchmark.py
@@ -21,6 +21,8 @@

import pandas as pd

from .stats import Stats

TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S.%f"


@@ -42,6 +44,8 @@ class Modifier(NamedTuple):
Modifier(r"@inherit_check_shapes", ""),
Modifier(r"@check_shapes\(.*?\)", ""),
Modifier(r"cs\((.*?), \".*?\"\)", r"\1"),
Modifier(r"from check_shapes import \(.*?\)", ""),
Modifier(r"from check_shapes[^(]*?^", ""),
)
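To make the rewriting concrete, here is a minimal sketch (toy input script, for illustration only) of how these substitutions strip `check_shapes` usage from a benchmarked script; it mirrors the `re.sub` call in `run_modified_script` below, including the `MULTILINE` and `DOTALL` flags:

```python
import re

# A toy script that uses check_shapes (hypothetical example):
src = '''from check_shapes import check_shapes

@check_shapes(
    "x: [n, d]",
    "return: [n]",
)
def f(x):
    return x.sum(axis=-1)
'''

# The same substitutions as the modifier tuple above:
strip_modifiers = [
    (r"@inherit_check_shapes", ""),
    (r"@check_shapes\(.*?\)", ""),
    (r"cs\((.*?), \".*?\"\)", r"\1"),
    (r"from check_shapes import \(.*?\)", ""),
    (r"from check_shapes[^(]*?^", ""),
]
for pattern, repl in strip_modifiers:
    src = re.sub(pattern, repl, src, flags=re.MULTILINE | re.DOTALL)

print(src)  # the import and the multi-line decorator are gone; a plain `def f(x)` remains
```

Note the two import patterns: the first handles parenthesised multi-line imports (`from check_shapes import (...)`), while the second handles single-line imports, which is why it excludes `(` and anchors on the next line start.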


@@ -52,12 +56,12 @@ class Modifier(NamedTuple):


def run_modified_script(
script: Path, modifiers: Modifiers, reps: int, keep: bool, output_dir: Path
script: Path, tmp_name: str, modifiers: Modifiers, reps: int, keep: bool, output_dir: Path
) -> Sequence[float]:
modified = output_dir / "tmp.py"
modified = output_dir / f"{tmp_name}.py"
src = script.read_text()
for modifier in modifiers:
src = re.sub(modifier.pattern, modifier.repl, src)
src = re.sub(modifier.pattern, modifier.repl, src, flags=re.MULTILINE | re.DOTALL)
modified.write_text(src)

timings = []
@@ -94,7 +98,7 @@ def run_benchmark(
}

modifiers = tuple(m for ms in modifier_strs for m in _MODIFIERS[ms])
with_timings = run_modified_script(script, modifiers, reps, keep, output_dir)
with_timings = run_modified_script(script, "with", modifiers, reps, keep, output_dir)
with_df = pd.DataFrame(
{
**shared_data,
@@ -104,7 +108,7 @@
)

modifiers = _CHECK_SHAPES_MODIFIER + modifiers
without_timings = run_modified_script(script, modifiers, reps, keep, output_dir)
without_timings = run_modified_script(script, "without", modifiers, reps, keep, output_dir)
without_df = pd.DataFrame(
{
**shared_data,
@@ -117,6 +121,10 @@
csv_path = output_dir / f"results_{name}_{timestamp_str}.csv"
df.to_csv(csv_path, index=False)

stats = Stats.new(df)
print(f"Relative overhead: {stats.rel_overhead_mean:.2%} +/- {stats.rel_overhead_std:.2%}")
print(f"Absolute overhead: {stats.abs_overhead_mean:.2}s +/- {stats.abs_overhead_std:.2}s")


def main() -> None:
parser = argparse.ArgumentParser(description="Modifies a script, then times its execution.")
69 changes: 69 additions & 0 deletions benchmark/stats.py
@@ -0,0 +1,69 @@
# Copyright 2022 The GPflow Contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass

import numpy as np
import pandas as pd


@dataclass
class Stats:

with_mean: float
with_std: float
without_mean: float
without_std: float
rel_overhead_mean: float
rel_overhead_std: float
abs_overhead_mean: float
abs_overhead_std: float

@staticmethod
def new(df: pd.DataFrame) -> "Stats":
by_cs = df.groupby("check_shapes")
mean_by_cs = by_cs.time_s.mean()
std_by_cs = by_cs.time_s.std().fillna(0.0)
var_by_cs = by_cs.time_s.var().fillna(0.0)

with_mean = mean_by_cs[True]
with_mean_sq = with_mean ** 2
with_std = std_by_cs[True]
with_var = var_by_cs[True]

without_mean = mean_by_cs[False]
without_mean_sq = without_mean ** 2
without_std = std_by_cs[False]
without_var = var_by_cs[False]

rel_overhead_mean = (with_mean / without_mean) - 1
# https://en.wikipedia.org/wiki/Ratio_distribution#Uncorrelated_noncentral_normal_ratio
rel_overhead_var = (with_mean_sq / without_mean_sq) * (
(with_var / with_mean_sq) + (without_var / without_mean_sq)
)
rel_overhead_std = np.sqrt(rel_overhead_var)

abs_overhead_mean = with_mean - without_mean
abs_overhead_var = with_var + without_var
abs_overhead_std = np.sqrt(abs_overhead_var)

return Stats(
with_mean=with_mean,
with_std=with_std,
without_mean=without_mean,
without_std=without_std,
rel_overhead_mean=rel_overhead_mean,
rel_overhead_std=rel_overhead_std,
abs_overhead_mean=abs_overhead_mean,
abs_overhead_std=abs_overhead_std,
)
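For reference, `rel_overhead_var` is the standard first-order approximation for the variance of a ratio of uncorrelated normal variables (the Wikipedia section linked in the code). Writing $\mu_w, \sigma_w^2$ for the mean and variance of the timings with `check_shapes` and $\mu_o, \sigma_o^2$ for those without:

$$
\operatorname{Var}\!\left[\frac{w}{o}\right] \approx \frac{\mu_w^2}{\mu_o^2}\left(\frac{\sigma_w^2}{\mu_w^2} + \frac{\sigma_o^2}{\mu_o^2}\right),
\qquad
\operatorname{Var}[w - o] = \sigma_w^2 + \sigma_o^2 .
$$

Subtracting the constant 1 for the relative overhead leaves the variance unchanged, so `rel_overhead_std` and `abs_overhead_std` are the square roots of these quantities.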