perf: insert videoscale before appsink to cut NV12 memmove 6.7×

When hardware decode (mppvideodec/NV12) is active, wrap the appsink in a GstBin containing a videoscale element and a capsfilter, so the VPU still decodes at full stream resolution but Python only receives frames pre-scaled to the SDL display size (default 640x480). Effect: NV12 buffer per frame: 3,133,440 B (1080p) → 460,800 B (640x480); memmove per frame: ~33 ms (80.5% of the frame budget) → ~5 ms (expected, ~12%). The videoscale bilinear step runs entirely in software on the A35 cores, but it shrinks each frame 6.7×, so its cost is expected to be far lower than that of the memmove it avoids. SDL still handles final aspect-ratio fitting inside the viewport, so visual quality is unchanged relative to what the 640x480 display can show. Fallback: if videoscale (or capsfilter) cannot be created, the unscaled NV12 appsink is used as before.
1 week ago · 67224626a5
2 changed files with 81 additions and 8 deletions
--- a/src/r36s_dlna_browser/player/gstreamer_backend.py
+++ b/src/r36s_dlna_browser/player/gstreamer_backend.py
@ -421,16 +421,55 @@ class GStreamerBackend(PlayerBackend):
        sink.set_property("sync", True)
        sink.set_property("max-buffers", 2)
        sink.set_property("drop", True)
-        # Accept NV12 when hardware decode is active (avoids a software colourspace
-        # conversion step); fall back to BGRA for the software-decode path.
-        if self._hw_decoders:
-            caps_str = "video/x-raw,format=NV12;video/x-raw,format=BGRA"
-        else:
-            caps_str = "video/x-raw,format=BGRA"
-        sink.set_property("caps", self._gst.Caps.from_string(caps_str))
        sink.connect("new-sample", self._on_new_sample)
+
+        if not self._hw_decoders:
+            # Software decode: request BGRA directly, no scaling bin needed.
+            sink.set_property("caps", self._gst.Caps.from_string("video/x-raw,format=BGRA"))
+            return sink
+
+        # Hardware decode (NV12): insert a videoscale element before the appsink
+        # so mppvideodec can decode at full resolution in HW, but Python only
+        # receives a frame scaled to the display size (default 640x480).
+        # This cuts the memmove from 3.1 MB (1080p) to ~460 KB (640x480) per frame
+        # — a 6.7× reduction in CPU copy cost.
+        app_w, app_h = self._viewport[0], self._viewport[1]
+        scale_w, scale_h = (app_w or 640), (app_h or 480)
+        log.info("NV12 appsink: inserting videoscale → %dx%d before appsink", scale_w, scale_h)
+
+        scale = self._gst.ElementFactory.make("videoscale", "vscale")
+        capsfilter = self._gst.ElementFactory.make("capsfilter", "vcaps")
+        if scale is None or capsfilter is None:
+            # videoscale not available — fall back to unscaled NV12
+            log.warning("videoscale element unavailable; using unscaled NV12 appsink")
+            sink.set_property("caps", self._gst.Caps.from_string(
+                "video/x-raw,format=NV12;video/x-raw,format=BGRA"))
            return sink

+        capsfilter.set_property(
+            "caps",
+            self._gst.Caps.from_string(
+                f"video/x-raw,format=NV12,width={scale_w},height={scale_h}"
+            ),
+        )
+
+        # Wire scale → capsfilter → appsink inside a bin so playbin accepts it
+        # as a single video-sink element.
+        bin_ = self._gst.Bin.new("vscale-bin")
+        bin_.add(scale)
+        bin_.add(capsfilter)
+        bin_.add(sink)
+        scale.link(capsfilter)
+        capsfilter.link(sink)
+
+        # Expose the scale element's sink pad as the bin's ghost sink pad.
+        sink_pad = scale.get_static_pad("sink")
+        ghost = self._gst.GhostPad.new("sink", sink_pad)
+        ghost.set_active(True)
+        bin_.add_pad(ghost)
+
+        return bin_
+
    def _on_new_sample(self, sink) -> Any:
        sample = sink.emit("pull-sample")
        if sample is None:
--- a/tests/test_player.py
+++ b/tests/test_player.py
@ -189,6 +189,30 @@ class FakeMessage:
        return SimpleNamespace(get_name=lambda: self._structure_name)


+class FakeGhostPad:
+    """Test double for ``Gst.GhostPad``.
+
+    Records the pad name, the proxied target pad and the most recent
+    activation state so tests can assert how a bin's ghost pad was wired.
+    """
+
+    def __init__(self, name, pad):
+        self.name = name
+        # Keep the target so a test can check which element pad is exposed.
+        self.target = pad
+        self.active = False
+
+    def set_active(self, active):
+        """Remember the requested state; return True like the real pad API."""
+        self.active = active
+        return True
+
+
+class FakeBin:
+    """Test double for ``Gst.Bin`` that records children, pads and properties."""
+
+    def __init__(self, name=""):
+        self.name = name
+        self._elements = []  # children, in the order they were added
+        self._pads = []  # pads exposed on the bin (e.g. ghost pads)
+        self.props = {}  # property name -> last value set
+
+    def add(self, elem):
+        """Record *elem* as a child; return True like ``Gst.Bin.add``."""
+        self._elements.append(elem)
+        return True
+
+    def add_pad(self, pad):
+        """Record *pad* on the bin; return True like ``Gst.Element.add_pad``."""
+        self._pads.append(pad)
+        return True
+
+    def set_property(self, name, value):
+        """Store *value* under *name*, overwriting any earlier value."""
+        self.props[name] = value
+
+
 class FakeMapFlags:
    READ = 1

@ -204,6 +228,16 @@ class FakeGst:
    SECOND = 1_000_000_000
    MSECOND = 1_000_000
    Caps = SimpleNamespace(from_string=lambda value: value)
+    GhostPad = SimpleNamespace(new=lambda name, pad: FakeGhostPad(name, pad))
+    Bin = SimpleNamespace(new=lambda name="": FakeBin(name))
+
+    @staticmethod
+    def ElementFactory_make(name, alias=None):
+        return None  # signal "not available" so SW fallback kicks in
+
+    # GStreamer uses Gst.ElementFactory.make, exposed as class attribute below
+ElementFactory = SimpleNamespace(make=lambda name, alias=None: None)
+FakeGst.ElementFactory = ElementFactory


 class _FakeFinfo: