From 7f9823258a73e700d71dd5d46a211703a7d60d6b Mon Sep 17 00:00:00 2001 From: Oneric Date: Tue, 18 Feb 2025 14:18:39 +0100 Subject: [PATCH] metrics: adjust router and db buckets Most HTTPS requests actually fall into the single-digit millisecond range or below on average. Even the more costly endpoints almost always average around the lower third of the millisecond magnitude. Only endpoints doing synchronous remote HTTP fetches (e.g. for signing keys) occasionally spike into the order of seconds. As is, the bucket resolution is completely unfit for reasoning about anything, and even plain averages are better indicators. Most database queries take less than a millisecond, and even in total almost all take less than 50ms for me. Decode time is but a tiny fraction of that, and queue time usually only takes a small part of the total time too (but may spike under high load). Shift the buckets down to be able to give insight into all relevant cases. In particular, this makes it possible to determine whether high averages are the result of generally high processing times or just a few outliers lifting the whole average up (e.g. slow network fetches). Exact numbers are biased towards my setup for lack of other comparison data, but at least the order of magnitude should be OK everywhere. 
--- lib/pleroma/web/telemetry.ex | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/pleroma/web/telemetry.ex b/lib/pleroma/web/telemetry.ex index 3686dc852..053e0c479 100644 --- a/lib/pleroma/web/telemetry.ex +++ b/lib/pleroma/web/telemetry.ex @@ -130,7 +130,7 @@ defp distribution_metrics do unit: {:native, :second}, tags: [:route], reporter_options: [ - buckets: [0.1, 0.2, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500, 1000] + buckets: [0.0005, 0.001, 0.005, 0.01, 0.025, 0.05, 0.10, 0.25, 0.5, 0.75, 1, 2, 5, 15] ] ), @@ -232,8 +232,7 @@ defp summary_fallback_metrics(byte_unit \\ :byte) do # and we can use sum + counter to get the average between polls from their change # But for repo query times we need to use a full distribution - simple_buckets = [0, 1, 2, 4, 8, 16] - simple_buckets_quick = for t <- simple_buckets, do: t / 100.0 + simple_buckets = [1, 2, 4, 8, 16, 32] # Already included in distribution metrics anyway: # phoenix.router_dispatch.stop.duration @@ -253,7 +252,7 @@ defp summary_fallback_metrics(byte_unit \\ :byte) do measurement: :decode_time, unit: {:native, :millisecond}, reporter_options: [ - buckets: simple_buckets_quick + buckets: [0.001, 0.0025, 0.005, 0.01, 0.02, 0.05, 0.1, 0.5] ] ), distribution("pleroma.repo.query.query_time.fdist", @@ -261,7 +260,7 @@ defp summary_fallback_metrics(byte_unit \\ :byte) do measurement: :query_time, unit: {:native, :millisecond}, reporter_options: [ - buckets: simple_buckets + buckets: [0.1, 0.2, 0.5, 1, 1.5, 3, 5, 10, 25, 50] ] ), distribution("pleroma.repo.query.idle_time.fdist",