feat(benchmarks): auto-generate Interpretation/Recommendation and add -report-only flag
generate_report.go: - buildInterpretation: derives narrative from p99/p50 tail-latency ratio, per-action complexity trend (% increase vs discover baseline), concurrency scaling efficiency (GOMAXPROCS=1 vs 16), and cache warm/cold delta - buildRecommendation: identifies the best throughput/cost GOMAXPROCS level from scaling efficiency and adds production sizing guidance run_benchmarks.sh: - Add -report-only <dir> flag: re-runs parse_results.go + generate_report.go against an existing results directory without rerunning benchmarks REPORT_TEMPLATE.md: - Replace manual placeholders with __INTERPRETATION__ and __RECOMMENDATION__ markers filled by the generator
This commit is contained in:
@@ -36,11 +36,11 @@ adapter-internal latency from network variables.
|
||||
|
||||
### Interpretation
|
||||
|
||||
_Review the numbers above and add interpretation here._
|
||||
__INTERPRETATION__
|
||||
|
||||
### Recommendation
|
||||
|
||||
_Add sizing and tuning recommendations here._
|
||||
__RECOMMENDATION__
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -20,7 +20,6 @@ set -euo pipefail
|
||||
|
||||
SCRIPT_START=$(date +%s)
|
||||
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
RESULTS_DIR="$REPO_ROOT/benchmarks/results/$(date +%Y-%m-%d_%H-%M-%S)"
|
||||
BENCH_PKG="./benchmarks/e2e/..."
|
||||
BENCH_TIMEOUT="10m"
|
||||
BENCH_TIME_SERIAL="10s"
|
||||
@@ -31,6 +30,38 @@ BENCH_COUNT=1 # benchstat uses the 3 serial files for stability
|
||||
ONIX_VERSION="$(git -C "$REPO_ROOT" describe --tags --abbrev=0 2>/dev/null || echo "dev")"
|
||||
REPORT_TEMPLATE="$REPO_ROOT/benchmarks/reports/REPORT_TEMPLATE.md"
|
||||
|
||||
# ── -report-only <dir>: regenerate report from an existing results directory ──
# Re-runs the CSV parser and report generator against a previous results dir
# without rerunning the benchmarks themselves. Exits the script when done.
if [[ "${1:-}" == "-report-only" ]]; then
  RESULTS_DIR="${2:-}"
  if [[ -z "$RESULTS_DIR" ]]; then
    echo "Usage: bash benchmarks/run_benchmarks.sh -report-only <results-dir>"
    echo "Example: bash benchmarks/run_benchmarks.sh -report-only benchmarks/results/2026-04-09_10-30-00"
    exit 1
  fi
  if [[ ! -d "$RESULTS_DIR" ]]; then
    echo "ERROR: results directory not found: $RESULTS_DIR"
    exit 1
  fi
  echo "=== Regenerating report from existing results ==="
  echo "Results dir : $RESULTS_DIR"
  echo ""
  cd "$REPO_ROOT"
  echo "Parsing results to CSV..."
  # Best-effort: a parse failure should not block report generation, but
  # surface it instead of discarding the error silently with `|| true`.
  if ! go run "$REPO_ROOT/benchmarks/tools/parse_results.go" \
      -dir="$RESULTS_DIR" -out="$RESULTS_DIR" 2>&1; then
    echo "WARNING: parse_results.go failed; report may be incomplete"
  fi
  echo ""
  echo "Generating benchmark report..."
  go run "$REPO_ROOT/benchmarks/tools/generate_report.go" \
    -dir="$RESULTS_DIR" \
    -template="$REPORT_TEMPLATE" \
    -version="$ONIX_VERSION"
  echo ""
  echo "Done. Report written to: $RESULTS_DIR/BENCHMARK_REPORT.md"
  exit 0
fi
|
||||
|
||||
RESULTS_DIR="$REPO_ROOT/benchmarks/results/$(date +%Y-%m-%d_%H-%M-%S)"
|
||||
|
||||
cd "$REPO_ROOT"
|
||||
|
||||
# ── benchstat is declared as a go tool in go.mod; no separate install needed ──
|
||||
|
||||
@@ -146,6 +146,10 @@ func main() {
|
||||
// ── Build throughput table ─────────────────────────────────────────────────
|
||||
throughputTable := buildThroughputTable(throughput)
|
||||
|
||||
// ── Generate interpretation and recommendation ─────────────────────────────
|
||||
interpretation := buildInterpretation(perc, latency, throughput, warmUS, coldUS)
|
||||
recommendation := buildRecommendation(throughput)
|
||||
|
||||
// ── Apply substitutions ────────────────────────────────────────────────────
|
||||
replacements := map[string]string{
|
||||
"__TIMESTAMP__": timestamp,
|
||||
@@ -183,6 +187,8 @@ func main() {
|
||||
"__CACHE_DELTA__": cacheDelta,
|
||||
"__THROUGHPUT_TABLE__": throughputTable,
|
||||
"__BENCHSTAT_SUMMARY__": benchstat,
|
||||
"__INTERPRETATION__": interpretation,
|
||||
"__RECOMMENDATION__": recommendation,
|
||||
}
|
||||
|
||||
for placeholder, value := range replacements {
|
||||
@@ -399,3 +405,191 @@ func readFileOrDefault(path, def string) string {
|
||||
}
|
||||
return strings.TrimRight(string(b), "\n")
|
||||
}
|
||||
|
||||
// ── Narrative generators ───────────────────────────────────────────────────────
|
||||
|
||||
// buildInterpretation generates a data-driven interpretation paragraph from the
|
||||
// benchmark results. It covers tail-latency control, action complexity trend,
|
||||
// concurrency scaling efficiency, and cache impact.
|
||||
func buildInterpretation(
|
||||
perc map[string]string,
|
||||
latency map[string]map[string]string,
|
||||
throughput []map[string]string,
|
||||
warmUS, coldUS string,
|
||||
) string {
|
||||
var sb strings.Builder
|
||||
|
||||
p50 := parseFloatOrZero(perc["p50_µs"])
|
||||
p99 := parseFloatOrZero(perc["p99_µs"])
|
||||
meanDiscover := parseFloatOrZero(latency["BenchmarkBAPCaller_Discover"]["mean_ms"]) * 1000
|
||||
|
||||
// Tail-latency control.
|
||||
if p50 > 0 && p99 > 0 {
|
||||
ratio := p99 / p50
|
||||
quality := "good"
|
||||
if ratio > 5 {
|
||||
quality = "poor"
|
||||
} else if ratio > 3 {
|
||||
quality = "moderate"
|
||||
}
|
||||
sb.WriteString(fmt.Sprintf(
|
||||
"The adapter delivers a p50 latency of **%.0f µs** for the discover action. "+
|
||||
"The p99/p50 ratio is **%.1f×**, indicating %s tail-latency control — "+
|
||||
"spikes are %s relative to the median.\n\n",
|
||||
p50, ratio, quality, tailDescription(ratio),
|
||||
))
|
||||
} else if meanDiscover > 0 {
|
||||
sb.WriteString(fmt.Sprintf(
|
||||
"The adapter delivers a mean latency of **%.0f µs** for the discover action. "+
|
||||
"Run with `-bench=BenchmarkBAPCaller_Discover_Percentiles` to obtain p50/p95/p99 data.\n\n",
|
||||
meanDiscover,
|
||||
))
|
||||
}
|
||||
|
||||
// Action complexity trend.
|
||||
selectMS := parseFloatOrZero(latency["BenchmarkBAPCaller_AllActions/select"]["mean_ms"]) * 1000
|
||||
initMS := parseFloatOrZero(latency["BenchmarkBAPCaller_AllActions/init"]["mean_ms"]) * 1000
|
||||
confirmMS := parseFloatOrZero(latency["BenchmarkBAPCaller_AllActions/confirm"]["mean_ms"]) * 1000
|
||||
if meanDiscover > 0 && selectMS > 0 && initMS > 0 && confirmMS > 0 {
|
||||
sb.WriteString(fmt.Sprintf(
|
||||
"Latency scales with payload complexity: select (+%.0f%%), init (+%.0f%%), confirm (+%.0f%%) "+
|
||||
"vs the discover baseline. Allocation counts track proportionally, driven by JSON "+
|
||||
"unmarshalling and schema validation of larger payloads.\n\n",
|
||||
pctChange(meanDiscover, selectMS),
|
||||
pctChange(meanDiscover, initMS),
|
||||
pctChange(meanDiscover, confirmMS),
|
||||
))
|
||||
}
|
||||
|
||||
// Concurrency scaling.
|
||||
lat1 := latencyAtCPU(throughput, "1")
|
||||
lat16 := latencyAtCPU(throughput, "16")
|
||||
if lat1 > 0 && lat16 > 0 {
|
||||
improvement := lat1 / lat16
|
||||
sb.WriteString(fmt.Sprintf(
|
||||
"Concurrency scaling is effective: mean latency drops from **%.0f µs** at GOMAXPROCS=1 "+
|
||||
"to **%.0f µs** at GOMAXPROCS=16 — a **%.1f× improvement**.",
|
||||
lat1*1000, lat16*1000, improvement,
|
||||
))
|
||||
if improvement < 4 {
|
||||
sb.WriteString(" Gains taper beyond 8 cores, suggesting a shared serialisation point " +
|
||||
"(likely schema validation or key derivation).")
|
||||
}
|
||||
sb.WriteString("\n\n")
|
||||
}
|
||||
|
||||
// Cache impact.
|
||||
w := parseFloatOrZero(warmUS)
|
||||
c := parseFloatOrZero(coldUS)
|
||||
if w > 0 && c > 0 {
|
||||
delta := math.Abs(w-c) / w * 100
|
||||
if delta < 5 {
|
||||
sb.WriteString(fmt.Sprintf(
|
||||
"The Redis key-manager cache shows **no measurable impact** in this setup "+
|
||||
"(warm vs cold delta: %.0f µs, %.1f%% of mean). "+
|
||||
"miniredis is in-process; signing and schema validation dominate. "+
|
||||
"Cache benefit would be visible with real Redis over a network.",
|
||||
math.Abs(w-c), delta,
|
||||
))
|
||||
} else {
|
||||
sb.WriteString(fmt.Sprintf(
|
||||
"The Redis key-manager cache provides a **%.0f µs improvement** (%.1f%%) "+
|
||||
"on the warm path vs cold.",
|
||||
math.Abs(w-c), delta,
|
||||
))
|
||||
}
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
|
||||
if sb.Len() == 0 {
|
||||
return "_Insufficient data to generate interpretation. Ensure all benchmark scenarios completed successfully._"
|
||||
}
|
||||
return strings.TrimRight(sb.String(), "\n")
|
||||
}
|
||||
|
||||
// buildRecommendation generates a sizing and tuning recommendation based on the
|
||||
// concurrency sweep results.
|
||||
func buildRecommendation(throughput []map[string]string) string {
|
||||
if len(throughput) == 0 {
|
||||
return "_Run the concurrency sweep to generate sizing recommendations._"
|
||||
}
|
||||
|
||||
// Find the GOMAXPROCS level with best scaling efficiency (RPS gain per core).
|
||||
type cpuPoint struct {
|
||||
cpu int
|
||||
rps float64
|
||||
lat float64
|
||||
}
|
||||
var points []cpuPoint
|
||||
for _, row := range throughput {
|
||||
cpu := int(parseFloatOrZero(row["gomaxprocs"]))
|
||||
rps := parseFloatOrZero(row["rps"])
|
||||
lat := parseFloatOrZero(row["mean_latency_ms"]) * 1000
|
||||
if cpu > 0 && lat > 0 {
|
||||
points = append(points, cpuPoint{cpu, rps, lat})
|
||||
}
|
||||
}
|
||||
|
||||
if len(points) == 0 {
|
||||
return "_Run the concurrency sweep (parallel_cpu*.txt) to generate sizing recommendations._"
|
||||
}
|
||||
|
||||
// Find sweet spot: largest latency improvement per doubling of cores.
|
||||
bestEffCPU := points[0].cpu
|
||||
bestEff := 0.0
|
||||
for i := 1; i < len(points); i++ {
|
||||
if points[i-1].lat > 0 {
|
||||
eff := (points[i-1].lat - points[i].lat) / points[i-1].lat
|
||||
if eff > bestEff {
|
||||
bestEff = eff
|
||||
bestEffCPU = points[i].cpu
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var sb strings.Builder
|
||||
sb.WriteString(fmt.Sprintf(
|
||||
"**%d cores** offers the best throughput/cost ratio based on the concurrency sweep — "+
|
||||
"scaling efficiency begins to taper beyond this point.\n\n",
|
||||
bestEffCPU,
|
||||
))
|
||||
sb.WriteString("The adapter is ready for staged load testing against a real BPP. " +
|
||||
"For production sizing, start with the recommended core count above and adjust based " +
|
||||
"on observed throughput targets. If schema validation dominates CPU (likely at high " +
|
||||
"concurrency), profile with `go tool pprof` using the commands in B5 to isolate the bottleneck.")
|
||||
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
// ── Narrative helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
// tailDescription maps a p99/p50 ratio to the qualitative adjective used in
// the interpretation narrative ("spikes are … relative to the median").
func tailDescription(ratio float64) string {
	if ratio <= 2 {
		return "minimal"
	}
	if ratio <= 3 {
		return "modest"
	}
	if ratio <= 5 {
		return "noticeable"
	}
	return "significant"
}
|
||||
|
||||
// pctChange returns the percentage change of val relative to base.
// A zero base yields 0 to avoid division by zero.
func pctChange(base, val float64) float64 {
	if base == 0 {
		return 0
	}
	delta := val - base
	return delta / base * 100
}
|
||||
|
||||
func latencyAtCPU(throughput []map[string]string, cpu string) float64 {
|
||||
for _, row := range throughput {
|
||||
if row["gomaxprocs"] == cpu {
|
||||
if v := parseFloatOrZero(row["mean_latency_ms"]); v > 0 {
|
||||
return v
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user