From 995830e3d28f5f63fd4f5a84753be8663cc34002 Mon Sep 17 00:00:00 2001 From: Matteo Benedetto Date: Tue, 24 Mar 2026 10:54:02 +0100 Subject: [PATCH] player,bench: drop queue from vscale-bin (leaky=2 caused massive drops), keep nearest-neighbour --- .../player/gstreamer_backend.py | 45 +++++++------------ tests/benchmark_nv12_decode.py | 22 ++++----- 2 files changed, 25 insertions(+), 42 deletions(-) diff --git a/src/r36s_dlna_browser/player/gstreamer_backend.py b/src/r36s_dlna_browser/player/gstreamer_backend.py index 3531616..94963ea 100644 --- a/src/r36s_dlna_browser/player/gstreamer_backend.py +++ b/src/r36s_dlna_browser/player/gstreamer_backend.py @@ -428,42 +428,33 @@ class GStreamerBackend(PlayerBackend): sink.set_property("caps", self._gst.Caps.from_string("video/x-raw,format=BGRA")) return sink - # Hardware decode (NV12): insert a queue → videoscale → capsfilter chain - # inside a GstBin before the appsink so playbin accepts it as a single - # video-sink element. + # Hardware decode (NV12): insert a videoscale → capsfilter chain inside a + # GstBin before the appsink so playbin accepts it as a single video-sink. # - # queue — decouples mppvideodec from the scale thread so the HW - # decoder is never stalled waiting for SW scaling to finish. - # leaky=2 (downstream) drops the oldest queued frame when - # full, ensuring Python always receives the latest frame. + # videoscale(method=nearest-neighbour) — scales 1920×1080 → 640×480. + # Nearest-neighbour skips ~56% of source rows so only ~44% of the + # source cache lines are fetched. This is cheaper than the full + # 3.1 MB memmove (which loads 100% of cache lines), cutting the + # Python memmove from ~32 ms to ~1 ms at the cost of some SW scale + # CPU time (~14 ms estimated for nearest vs 32 ms for bilinear). # - # videoscale(method=nearest) — scales 1920×1080 → 640×480 using - # nearest-neighbour interpolation (fastest SW method). - # Python receives 460 KB per frame instead of 3.1 MB, - # cutting memmove cost from ~32 ms to ~1 ms (30× reduction). - # - # capsfilter — enforces the target resolution and NV12 format so - # GStreamer's autoplugging can insert any needed conversion. + # capsfilter — enforces the output NV12 dimensions. app_w, app_h = self._viewport[0], self._viewport[1] scale_w, scale_h = (app_w or 640), (app_h or 480) - log.info("NV12 appsink: queue → videoscale(nearest) → %dx%d before appsink", scale_w, scale_h) + log.info("NV12 appsink: videoscale(nearest) → %dx%d before appsink", scale_w, scale_h) - queue = self._gst.ElementFactory.make("queue", "vqueue") scale = self._gst.ElementFactory.make("videoscale", "vscale") capsfilter = self._gst.ElementFactory.make("capsfilter", "vcaps") - if queue is None or scale is None or capsfilter is None: + if scale is None or capsfilter is None: # Core elements unavailable — fall back to unscaled NV12. - log.warning("queue/videoscale/capsfilter unavailable; using unscaled NV12 appsink") + log.warning("videoscale/capsfilter unavailable; using unscaled NV12 appsink") sink.set_property("caps", self._gst.Caps.from_string( "video/x-raw,format=NV12;video/x-raw,format=BGRA")) return sink - # nearest-neighbour: reads only the needed source pixels (strided), - # much cheaper than bilinear which reads all adjacent pixels. + # nearest-neighbour: accesses only the source pixels needed for each + # output sample (strided reads), skipping ~56% of source rows entirely. scale.set_property("method", 0) - # drop oldest buffered frame when queue is full — keep the latest. - queue.set_property("max-size-buffers", 4) - queue.set_property("leaky", 2) capsfilter.set_property( "caps", self._gst.Caps.from_string( @@ -471,19 +462,17 @@ class GStreamerBackend(PlayerBackend): ), ) - # Wire queue → scale → capsfilter → appsink inside a bin. + # Wire scale → capsfilter → appsink inside a bin. bin_ = self._gst.Bin.new("vscale-bin") - bin_.add(queue) bin_.add(scale) bin_.add(capsfilter) bin_.add(sink) - queue.link(scale) scale.link(capsfilter) capsfilter.link(sink) - # Expose the queue element's sink pad as the bin's ghost sink pad + # Expose the scale element's sink pad as the bin's ghost sink pad # so playbin can push decoded frames into the bin. - sink_pad = queue.get_static_pad("sink") + sink_pad = scale.get_static_pad("sink") ghost = self._gst.GhostPad.new("sink", sink_pad) ghost.set_active(True) bin_.add_pad(ghost) diff --git a/tests/benchmark_nv12_decode.py b/tests/benchmark_nv12_decode.py index 0d6d6e5..245ca39 100644 --- a/tests/benchmark_nv12_decode.py +++ b/tests/benchmark_nv12_decode.py @@ -114,17 +114,13 @@ appsink.set_property("drop", True) video_sink = appsink if hw_decoders and "--noscale" not in sys.argv: - queue_el = Gst.ElementFactory.make("queue", "vqueue") scale_el = Gst.ElementFactory.make("videoscale", "vscale") cfilt_el = Gst.ElementFactory.make("capsfilter", "vcaps") - if scale_el is not None and cfilt_el is not None and queue_el is not None: - # nearest-neighbour: reads ~1/7 of source pixels (strided), far cheaper - # than bilinear which reads all adjacent pixels for each output sample. + if scale_el is not None and cfilt_el is not None: + # nearest-neighbour: skips ~56% of source rows so only ~44% of the + # source cache lines are loaded vs 100% for memmove of the full frame. + # Bilinear must read adjacent rows too, making it slower than nearest. scale_el.set_property("method", 0) - # queue decouples mppvideodec from the scale thread so the decoder is - # never stalled waiting for software scaling to finish. - queue_el.set_property("max-size-buffers", 4) - queue_el.set_property("leaky", 2) # drop oldest when full → keep latest cfilt_el.set_property( "caps", Gst.Caps.from_string( @@ -132,21 +128,19 @@ if hw_decoders and "--noscale" not in sys.argv: ), ) bin_ = Gst.Bin.new("vscale-bin") - bin_.add(queue_el) bin_.add(scale_el) bin_.add(cfilt_el) bin_.add(appsink) - queue_el.link(scale_el) scale_el.link(cfilt_el) cfilt_el.link(appsink) - sink_pad = queue_el.get_static_pad("sink") + sink_pad = scale_el.get_static_pad("sink") ghost = Gst.GhostPad.new("sink", sink_pad) ghost.set_active(True) bin_.add_pad(ghost) video_sink = bin_ - print(f"[scale] queue → videoscale(nearest) → {SCALE_W}×{SCALE_H} NV12") + print(f"[scale] videoscale(nearest) → {SCALE_W}×{SCALE_H} NV12") else: - print("[scale] queue/videoscale element unavailable — falling back to unscaled NV12") + print("[scale] videoscale element unavailable — falling back to unscaled NV12") appsink.set_property( "caps", Gst.Caps.from_string("video/x-raw,format=NV12;video/x-raw,format=BGRA") ) @@ -158,7 +152,7 @@ elif hw_decoders: else: appsink.set_property("caps", Gst.Caps.from_string("video/x-raw,format=BGRA")) -print(f"[caps] video-sink: {'GstBin(videoscale+capsfilter+appsink)' if video_sink is not appsink else 'appsink'}") +print(f"[caps] video-sink: {'GstBin(videoscale-nearest+capsfilter+appsink)' if video_sink is not appsink else 'appsink'}") pipeline.set_property("video-sink", video_sink) pipeline.set_property("uri", url)