From 995830e3d28f5f63fd4f5a84753be8663cc34002 Mon Sep 17 00:00:00 2001
From: Matteo Benedetto <matteo.benedetto@local>
Date: Tue, 24 Mar 2026 10:54:02 +0100
Subject: [PATCH] player,bench: drop queue from vscale-bin (leaky=2 caused
 massive drops), keep nearest-neighbour

---
 .../player/gstreamer_backend.py               | 45 +++++++------------
 tests/benchmark_nv12_decode.py                | 22 ++++-----
 2 files changed, 25 insertions(+), 42 deletions(-)

diff --git a/src/r36s_dlna_browser/player/gstreamer_backend.py b/src/r36s_dlna_browser/player/gstreamer_backend.py
index 3531616..94963ea 100644
--- a/src/r36s_dlna_browser/player/gstreamer_backend.py
+++ b/src/r36s_dlna_browser/player/gstreamer_backend.py
@@ -428,42 +428,33 @@ class GStreamerBackend(PlayerBackend):
             sink.set_property("caps", self._gst.Caps.from_string("video/x-raw,format=BGRA"))
             return sink
 
-        # Hardware decode (NV12): insert a queue → videoscale → capsfilter chain
-        # inside a GstBin before the appsink so playbin accepts it as a single
-        # video-sink element.
+        # Hardware decode (NV12): insert a videoscale → capsfilter chain inside a
+        # GstBin before the appsink so playbin accepts it as a single video-sink.
         #
-        # queue   — decouples mppvideodec from the scale thread so the HW
-        #           decoder is never stalled waiting for SW scaling to finish.
-        #           leaky=2 (downstream) drops the oldest queued frame when
-        #           full, ensuring Python always receives the latest frame.
+        # videoscale(method=nearest-neighbour) — downscales the decoded frame to
+        #   the viewport size (e.g. 1920×1080 → 640×480).  For that case nearest
+        #   skips ~56% of source rows, so only ~44% of source cache lines are
+        #   fetched — cheaper than a memmove of the full 3.1 MB frame (100% of
+        #   cache lines).  The Python-side memmove drops from ~32 ms to ~1 ms at
+        #   the cost of SW scale time (~14 ms estimated, vs ~32 ms for bilinear).
         #
-        # videoscale(method=nearest) — scales 1920×1080 → 640×480 using
-        #           nearest-neighbour interpolation (fastest SW method).
-        #           Python receives 460 KB per frame instead of 3.1 MB,
-        #           cutting memmove cost from ~32 ms to ~1 ms (30× reduction).
-        #
-        # capsfilter — enforces the target resolution and NV12 format so
-        #           GStreamer's autoplugging can insert any needed conversion.
+        # capsfilter — enforces the output NV12 dimensions.
         app_w, app_h = self._viewport[0], self._viewport[1]
         scale_w, scale_h = (app_w or 640), (app_h or 480)
-        log.info("NV12 appsink: queue → videoscale(nearest) → %dx%d before appsink", scale_w, scale_h)
+        log.info("NV12 appsink: videoscale(nearest) → %dx%d before appsink", scale_w, scale_h)
 
-        queue      = self._gst.ElementFactory.make("queue",       "vqueue")
         scale      = self._gst.ElementFactory.make("videoscale",  "vscale")
         capsfilter = self._gst.ElementFactory.make("capsfilter",  "vcaps")
-        if queue is None or scale is None or capsfilter is None:
+        if scale is None or capsfilter is None:
             # Core elements unavailable — fall back to unscaled NV12.
-            log.warning("queue/videoscale/capsfilter unavailable; using unscaled NV12 appsink")
+            log.warning("videoscale/capsfilter unavailable; using unscaled NV12 appsink")
             sink.set_property("caps", self._gst.Caps.from_string(
                 "video/x-raw,format=NV12;video/x-raw,format=BGRA"))
             return sink
 
-        # nearest-neighbour: reads only the needed source pixels (strided),
-        # much cheaper than bilinear which reads all adjacent pixels.
+        # nearest-neighbour: reads only the source pixels needed for each output
+        # sample (strided reads); a 1080 → 480 downscale skips ~56% of source rows.
         scale.set_property("method", 0)
-        # drop oldest buffered frame when queue is full — keep the latest.
-        queue.set_property("max-size-buffers", 4)
-        queue.set_property("leaky", 2)
         capsfilter.set_property(
             "caps",
             self._gst.Caps.from_string(
@@ -471,19 +462,17 @@ class GStreamerBackend(PlayerBackend):
             ),
         )
 
-        # Wire queue → scale → capsfilter → appsink inside a bin.
+        # Wire scale → capsfilter → appsink inside a bin.
         bin_ = self._gst.Bin.new("vscale-bin")
-        bin_.add(queue)
         bin_.add(scale)
         bin_.add(capsfilter)
         bin_.add(sink)
-        queue.link(scale)
         scale.link(capsfilter)
         capsfilter.link(sink)
 
-        # Expose the queue element's sink pad as the bin's ghost sink pad
+        # Expose the scale element's sink pad as the bin's ghost sink pad
         # so playbin can push decoded frames into the bin.
-        sink_pad = queue.get_static_pad("sink")
+        sink_pad = scale.get_static_pad("sink")
         ghost = self._gst.GhostPad.new("sink", sink_pad)
         ghost.set_active(True)
         bin_.add_pad(ghost)
diff --git a/tests/benchmark_nv12_decode.py b/tests/benchmark_nv12_decode.py
index 0d6d6e5..245ca39 100644
--- a/tests/benchmark_nv12_decode.py
+++ b/tests/benchmark_nv12_decode.py
@@ -114,17 +114,13 @@ appsink.set_property("drop", True)
 video_sink = appsink
 
 if hw_decoders and "--noscale" not in sys.argv:
-    queue_el = Gst.ElementFactory.make("queue", "vqueue")
     scale_el = Gst.ElementFactory.make("videoscale", "vscale")
     cfilt_el = Gst.ElementFactory.make("capsfilter", "vcaps")
-    if scale_el is not None and cfilt_el is not None and queue_el is not None:
-        # nearest-neighbour: reads ~1/7 of source pixels (strided), far cheaper
-        # than bilinear which reads all adjacent pixels for each output sample.
+    if scale_el is not None and cfilt_el is not None:
+        # nearest-neighbour: skips ~56% of source rows so only ~44% of the
+        # source cache lines are loaded vs 100% for memmove of the full frame.
+        # Bilinear must read adjacent rows too, making it slower than nearest.
         scale_el.set_property("method", 0)
-        # queue decouples mppvideodec from the scale thread so the decoder is
-        # never stalled waiting for software scaling to finish.
-        queue_el.set_property("max-size-buffers", 4)
-        queue_el.set_property("leaky", 2)   # drop oldest when full → keep latest
         cfilt_el.set_property(
             "caps",
             Gst.Caps.from_string(
@@ -132,21 +128,19 @@ if hw_decoders and "--noscale" not in sys.argv:
             ),
         )
         bin_ = Gst.Bin.new("vscale-bin")
-        bin_.add(queue_el)
         bin_.add(scale_el)
         bin_.add(cfilt_el)
         bin_.add(appsink)
-        queue_el.link(scale_el)
         scale_el.link(cfilt_el)
         cfilt_el.link(appsink)
-        sink_pad = queue_el.get_static_pad("sink")
+        sink_pad = scale_el.get_static_pad("sink")
         ghost = Gst.GhostPad.new("sink", sink_pad)
         ghost.set_active(True)
         bin_.add_pad(ghost)
         video_sink = bin_
-        print(f"[scale] queue → videoscale(nearest) → {SCALE_W}×{SCALE_H} NV12")
+        print(f"[scale] videoscale(nearest) → {SCALE_W}×{SCALE_H} NV12")
     else:
-        print("[scale] queue/videoscale element unavailable — falling back to unscaled NV12")
+        print("[scale] videoscale element unavailable — falling back to unscaled NV12")
         appsink.set_property(
             "caps", Gst.Caps.from_string("video/x-raw,format=NV12;video/x-raw,format=BGRA")
         )
@@ -158,7 +152,7 @@ elif hw_decoders:
 else:
     appsink.set_property("caps", Gst.Caps.from_string("video/x-raw,format=BGRA"))
 
-print(f"[caps] video-sink: {'GstBin(videoscale+capsfilter+appsink)' if video_sink is not appsink else 'appsink'}")
+print(f"[caps] video-sink: {'GstBin(videoscale-nearest+capsfilter+appsink)' if video_sink is not appsink else 'appsink'}")
 
 pipeline.set_property("video-sink", video_sink)
 pipeline.set_property("uri", url)