From c6e5cf2e7472ab872a537327a03ef9eb9fcef2a1 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Wed, 17 Mar 2021 13:34:17 -0700
Subject: [PATCH] Benchmark improvements.

* Build `+RTS -A256m -RTS` into default ghc-options for benchmark,
  so we don't have to specify this separately on the command line.
  This is necessary to get accurate benchmark results; otherwise
  we are largely measuring garbage collection, some of it unrelated
  to the current benchmark.
* Switch back from gauge to tasty-bench.
* Allow specifying a BASELINE file in 'make bench' for comparison
  (otherwise the most recent bench_*.csv file is used by default).
* Remove obsolete reference to weigh-pandoc from CONTRIBUTING.md.
* Remove `-Rghc-timing` from 'make bench'.
---
 CONTRIBUTING.md               |  3 ---
 Makefile                      | 18 +++++++-----------
 benchmark/benchmark-pandoc.hs |  4 ++--
 pandoc.cabal                  |  6 ++++--
 4 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 090cc0a4f..cf8d3aa03 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -318,9 +318,6 @@ With stack:
 
     stack bench
 
-You can also build `weigh-pandoc` (`stack build pandoc:weigh-pandoc`)
-to get some statistics on memory usage.  (Eventually this should
-be incorporated into the benchmark suite.)
 
 Using the REPL
 --------------
diff --git a/Makefile b/Makefile
index 0ef0406f8..5e5bc7c51 100644
--- a/Makefile
+++ b/Makefile
@@ -7,21 +7,19 @@ DOCKERIMAGE=registry.gitlab.b-data.ch/ghc/ghc4pandoc:8.10.4
 COMMIT=$(shell git rev-parse --short HEAD)
 TIMESTAMP=$(shell date "+%Y%m%d_%H%M")
 LATESTBENCH=$(word 1,$(shell ls -t bench_*.csv 2>/dev/null))
-ifeq ($(LATESTBENCH),)
-BASELINE=
+BASELINE?=$(LATESTBENCH)
+ifeq ($(BASELINE),)
+BASELINECMD=
 else
-BASELINE=--baseline $(LATESTBENCH)
+BASELINECMD=--baseline $(BASELINE)
 endif
 GHCOPTS=-fdiagnostics-color=always -j4 +RTS -A256m -RTS
 WEBSITE=../../web/pandoc.org
 REVISION?=1
-# Note: for benchmarks we use +RTS -A256m -I0 -RTS ; otherwise the benchmarks
-# are measuring garbage collecting, and this can vary depending on which
-# other benchmarks are run.
 # For gauge:
-BENCHARGS?=--small --ci=0.90 --match=pattern $(PATTERN) +RTS -T -A256m -I0 -RTS
+# BENCHARGS?=--small --ci=0.90 --match=pattern $(PATTERN)
 # For tasty-bench:
-# BENCHARGS?=--csv bench_$(TIMESTAMP).csv --timeout=6 +RTS -T -A256m -I0 -RTS $(if $(PATTERN),--pattern "$(PATTERN)",)
+BENCHARGS?=--csv bench_$(TIMESTAMP).csv $(BASELINECMD) --timeout=6 +RTS -T -RTS $(if $(PATTERN),--pattern "$(PATTERN)",)
 
 quick:
 	stack install --ghc-options='$(GHCOPTS)' --install-ghc --flag 'pandoc:embed_data_files' --fast --test --ghc-options='$(GHCOPTS)' --test-arguments='-j4 --hide-successes $(TESTARGS)'
@@ -58,11 +56,9 @@ ghcid-test:
 
 bench:
 	stack bench \
-	  --ghc-options '-Rghc-timing $(GHCOPTS)' \
+	  --ghc-options '$(GHCOPTS)' \
 	  --benchmark-arguments='$(BENCHARGS)' 2>&1 | \
 	  tee "bench_latest.txt"
-	perl -ne 'if (/\r/) { s/\x1b\[[0-9;]*[mGK]//g;s/^.*\r//;print; }' \
-	  bench_latest.txt > "bench_$(TIMESTAMP).txt"
 
 reformat:
 	for f in $(SOURCEFILES); do echo $$f; stylish-haskell -i $$f ; done
diff --git a/benchmark/benchmark-pandoc.hs b/benchmark/benchmark-pandoc.hs
index 111e63274..1890a998f 100644
--- a/benchmark/benchmark-pandoc.hs
+++ b/benchmark/benchmark-pandoc.hs
@@ -23,8 +23,8 @@ import Control.Monad.Except (throwError)
 import qualified Text.Pandoc.UTF8 as UTF8
 import qualified Data.ByteString as B
 import qualified Data.Text as T
--- import Test.Tasty.Bench
-import Gauge
+import Test.Tasty.Bench
+-- import Gauge
 import qualified Data.ByteString.Lazy as BL
 import Data.Maybe (mapMaybe)
 import Data.List (sortOn)
diff --git a/pandoc.cabal b/pandoc.cabal
index 2f49f0ac3..3a7436370 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -826,9 +826,11 @@ benchmark benchmark-pandoc
   hs-source-dirs:  benchmark
   build-depends:   bytestring,
                    containers,
-                   gauge       >= 0.2     && < 0.3,
-                   -- tasty-bench >= 0.2     && <= 0.3,
+                   -- gauge       >= 0.2     && < 0.3,
+                   tasty-bench >= 0.2     && <= 0.3,
                    mtl         >= 2.2     && < 2.3,
                    text        >= 1.1.1.0 && < 1.3,
                    time,
                    deepseq
+  -- we increase the allocation area size (-A) to avoid benchmarking garbage collection:
+  ghc-options:     -rtsopts -with-rtsopts=-A256m -threaded