Browse Source

player,bench: drop queue from vscale-bin (leaky=2 caused massive drops), keep nearest-neighbour

main
Matteo Benedetto 1 week ago
parent
commit
995830e3d2
  1. 45
      src/r36s_dlna_browser/player/gstreamer_backend.py
  2. 22
      tests/benchmark_nv12_decode.py

45
src/r36s_dlna_browser/player/gstreamer_backend.py

@@ -428,42 +428,33 @@ class GStreamerBackend(PlayerBackend):
sink.set_property("caps", self._gst.Caps.from_string("video/x-raw,format=BGRA")) sink.set_property("caps", self._gst.Caps.from_string("video/x-raw,format=BGRA"))
return sink return sink
# Hardware decode (NV12): insert a queue → videoscale → capsfilter chain # Hardware decode (NV12): insert a videoscale → capsfilter chain inside a
# inside a GstBin before the appsink so playbin accepts it as a single # GstBin before the appsink so playbin accepts it as a single video-sink.
# video-sink element.
# #
# queue — decouples mppvideodec from the scale thread so the HW # videoscale(method=nearest-neighbour) — scales 1920×1080 → 640×480.
# decoder is never stalled waiting for SW scaling to finish. # Nearest-neighbour skips ~56% of source rows so only ~44% of the
# leaky=2 (downstream) drops the oldest queued frame when # source cache lines are fetched. This is cheaper than the full
# full, ensuring Python always receives the latest frame. # 3.1 MB memmove (which loads 100% of cache lines), cutting the
# Python memmove from ~32 ms to ~1 ms at the cost of some SW scale
# CPU time (~14 ms estimated for nearest vs 32 ms for bilinear).
# #
# videoscale(method=nearest) — scales 1920×1080 → 640×480 using # capsfilter — enforces the output NV12 dimensions.
# nearest-neighbour interpolation (fastest SW method).
# Python receives 460 KB per frame instead of 3.1 MB,
# cutting memmove cost from ~32 ms to ~1 ms (30× reduction).
#
# capsfilter — enforces the target resolution and NV12 format so
# GStreamer's autoplugging can insert any needed conversion.
app_w, app_h = self._viewport[0], self._viewport[1] app_w, app_h = self._viewport[0], self._viewport[1]
scale_w, scale_h = (app_w or 640), (app_h or 480) scale_w, scale_h = (app_w or 640), (app_h or 480)
log.info("NV12 appsink: queue → videoscale(nearest) → %dx%d before appsink", scale_w, scale_h) log.info("NV12 appsink: videoscale(nearest) → %dx%d before appsink", scale_w, scale_h)
queue = self._gst.ElementFactory.make("queue", "vqueue")
scale = self._gst.ElementFactory.make("videoscale", "vscale") scale = self._gst.ElementFactory.make("videoscale", "vscale")
capsfilter = self._gst.ElementFactory.make("capsfilter", "vcaps") capsfilter = self._gst.ElementFactory.make("capsfilter", "vcaps")
if queue is None or scale is None or capsfilter is None: if scale is None or capsfilter is None:
# Core elements unavailable — fall back to unscaled NV12. # Core elements unavailable — fall back to unscaled NV12.
log.warning("queue/videoscale/capsfilter unavailable; using unscaled NV12 appsink") log.warning("videoscale/capsfilter unavailable; using unscaled NV12 appsink")
sink.set_property("caps", self._gst.Caps.from_string( sink.set_property("caps", self._gst.Caps.from_string(
"video/x-raw,format=NV12;video/x-raw,format=BGRA")) "video/x-raw,format=NV12;video/x-raw,format=BGRA"))
return sink return sink
# nearest-neighbour: reads only the needed source pixels (strided), # nearest-neighbour: accesses only the source pixels needed for each
# much cheaper than bilinear which reads all adjacent pixels. # output sample (strided reads), skipping ~56% of source rows entirely.
scale.set_property("method", 0) scale.set_property("method", 0)
# drop oldest buffered frame when queue is full — keep the latest.
queue.set_property("max-size-buffers", 4)
queue.set_property("leaky", 2)
capsfilter.set_property( capsfilter.set_property(
"caps", "caps",
self._gst.Caps.from_string( self._gst.Caps.from_string(
@@ -471,19 +462,17 @@ class GStreamerBackend(PlayerBackend):
), ),
) )
# Wire queue → scale → capsfilter → appsink inside a bin. # Wire scale → capsfilter → appsink inside a bin.
bin_ = self._gst.Bin.new("vscale-bin") bin_ = self._gst.Bin.new("vscale-bin")
bin_.add(queue)
bin_.add(scale) bin_.add(scale)
bin_.add(capsfilter) bin_.add(capsfilter)
bin_.add(sink) bin_.add(sink)
queue.link(scale)
scale.link(capsfilter) scale.link(capsfilter)
capsfilter.link(sink) capsfilter.link(sink)
# Expose the queue element's sink pad as the bin's ghost sink pad # Expose the scale element's sink pad as the bin's ghost sink pad
# so playbin can push decoded frames into the bin. # so playbin can push decoded frames into the bin.
sink_pad = queue.get_static_pad("sink") sink_pad = scale.get_static_pad("sink")
ghost = self._gst.GhostPad.new("sink", sink_pad) ghost = self._gst.GhostPad.new("sink", sink_pad)
ghost.set_active(True) ghost.set_active(True)
bin_.add_pad(ghost) bin_.add_pad(ghost)

22
tests/benchmark_nv12_decode.py

@@ -114,17 +114,13 @@ appsink.set_property("drop", True)
video_sink = appsink video_sink = appsink
if hw_decoders and "--noscale" not in sys.argv: if hw_decoders and "--noscale" not in sys.argv:
queue_el = Gst.ElementFactory.make("queue", "vqueue")
scale_el = Gst.ElementFactory.make("videoscale", "vscale") scale_el = Gst.ElementFactory.make("videoscale", "vscale")
cfilt_el = Gst.ElementFactory.make("capsfilter", "vcaps") cfilt_el = Gst.ElementFactory.make("capsfilter", "vcaps")
if scale_el is not None and cfilt_el is not None and queue_el is not None: if scale_el is not None and cfilt_el is not None:
# nearest-neighbour: reads ~1/7 of source pixels (strided), far cheaper # nearest-neighbour: skips ~56% of source rows so only ~44% of the
# than bilinear which reads all adjacent pixels for each output sample. # source cache lines are loaded vs 100% for memmove of the full frame.
# Bilinear must read adjacent rows too, making it slower than nearest.
scale_el.set_property("method", 0) scale_el.set_property("method", 0)
# queue decouples mppvideodec from the scale thread so the decoder is
# never stalled waiting for software scaling to finish.
queue_el.set_property("max-size-buffers", 4)
queue_el.set_property("leaky", 2) # drop oldest when full → keep latest
cfilt_el.set_property( cfilt_el.set_property(
"caps", "caps",
Gst.Caps.from_string( Gst.Caps.from_string(
@@ -132,21 +128,19 @@ if hw_decoders and "--noscale" not in sys.argv:
), ),
) )
bin_ = Gst.Bin.new("vscale-bin") bin_ = Gst.Bin.new("vscale-bin")
bin_.add(queue_el)
bin_.add(scale_el) bin_.add(scale_el)
bin_.add(cfilt_el) bin_.add(cfilt_el)
bin_.add(appsink) bin_.add(appsink)
queue_el.link(scale_el)
scale_el.link(cfilt_el) scale_el.link(cfilt_el)
cfilt_el.link(appsink) cfilt_el.link(appsink)
sink_pad = queue_el.get_static_pad("sink") sink_pad = scale_el.get_static_pad("sink")
ghost = Gst.GhostPad.new("sink", sink_pad) ghost = Gst.GhostPad.new("sink", sink_pad)
ghost.set_active(True) ghost.set_active(True)
bin_.add_pad(ghost) bin_.add_pad(ghost)
video_sink = bin_ video_sink = bin_
print(f"[scale] queue → videoscale(nearest) → {SCALE_W}×{SCALE_H} NV12") print(f"[scale] videoscale(nearest) → {SCALE_W}×{SCALE_H} NV12")
else: else:
print("[scale] queue/videoscale element unavailable — falling back to unscaled NV12") print("[scale] videoscale element unavailable — falling back to unscaled NV12")
appsink.set_property( appsink.set_property(
"caps", Gst.Caps.from_string("video/x-raw,format=NV12;video/x-raw,format=BGRA") "caps", Gst.Caps.from_string("video/x-raw,format=NV12;video/x-raw,format=BGRA")
) )
@@ -158,7 +152,7 @@ elif hw_decoders:
else: else:
appsink.set_property("caps", Gst.Caps.from_string("video/x-raw,format=BGRA")) appsink.set_property("caps", Gst.Caps.from_string("video/x-raw,format=BGRA"))
print(f"[caps] video-sink: {'GstBin(videoscale+capsfilter+appsink)' if video_sink is not appsink else 'appsink'}") print(f"[caps] video-sink: {'GstBin(videoscale-nearest+capsfilter+appsink)' if video_sink is not appsink else 'appsink'}")
pipeline.set_property("video-sink", video_sink) pipeline.set_property("video-sink", video_sink)
pipeline.set_property("uri", url) pipeline.set_property("uri", url)

Loading…
Cancel
Save