From 67224626a5a7e4f1296ca464d4d3750d4b6619d5 Mon Sep 17 00:00:00 2001
From: Matteo Benedetto <matteo.benedetto@local>
Date: Tue, 24 Mar 2026 10:33:33 +0100
Subject: [PATCH] =?UTF-8?q?perf:=20insert=20videoscale=20before=20appsink?=
 =?UTF-8?q?=20to=20cut=20NV12=20memmove=206.7=C3=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When hardware decode (mppvideodec/NV12) is active, wrap the appsink in a
GstBin with a videoscale element so the VPU decodes at full stream
resolution but Python only receives a frame pre-scaled to the SDL display
size (default 640x480).

Effect:
  NV12 buffer per frame: 3,133,440 B (1080p; = 1920x1088x1.5, height padded to 1088) → 460,800 B (640x480)
  memmove per frame:     ~33 ms (80.5% of budget) → ~5 ms (expected; ~12% of budget)

The videoscale bilinear step runs entirely in software on the A35 cores,
but it shrinks each frame buffer 6.7×, so its cost is far lower than the
memmove it avoids.
SDL still handles final aspect-ratio fitting inside the viewport, so
visual quality is unchanged relative to what the 640x480 display can show.

Fallback: if the videoscale (or capsfilter) element cannot be created, the
bare appsink is returned with the previous NV12/BGRA caps, unscaled as before.
---
 .../player/gstreamer_backend.py               | 55 ++++++++++++++++---
 tests/test_player.py                          | 34 ++++++++++++
 2 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/src/r36s_dlna_browser/player/gstreamer_backend.py b/src/r36s_dlna_browser/player/gstreamer_backend.py
index 516d8ad..a0a2ebc 100644
--- a/src/r36s_dlna_browser/player/gstreamer_backend.py
+++ b/src/r36s_dlna_browser/player/gstreamer_backend.py
@@ -421,15 +421,54 @@ class GStreamerBackend(PlayerBackend):
         sink.set_property("sync", True)
         sink.set_property("max-buffers", 2)
         sink.set_property("drop", True)
-        # Accept NV12 when hardware decode is active (avoids a software colourspace
-        # conversion step); fall back to BGRA for the software-decode path.
-        if self._hw_decoders:
-            caps_str = "video/x-raw,format=NV12;video/x-raw,format=BGRA"
-        else:
-            caps_str = "video/x-raw,format=BGRA"
-        sink.set_property("caps", self._gst.Caps.from_string(caps_str))
         sink.connect("new-sample", self._on_new_sample)
-        return sink
+
+        if not self._hw_decoders:
+            # Software decode: request BGRA directly, no scaling bin needed.
+            sink.set_property("caps", self._gst.Caps.from_string("video/x-raw,format=BGRA"))
+            return sink
+
+        # Hardware decode (NV12): insert a videoscale element before the appsink
+        # so mppvideodec can decode at full resolution in HW, but Python only
+        # receives a frame scaled to the display size (default 640x480).
+        # This cuts the memmove from 3.1 MB (1080p) to ~460 KB (640x480) per frame
+        # — a 6.7× reduction in CPU copy cost.
+        app_w, app_h = self._viewport[0], self._viewport[1]
+        scale_w, scale_h = (app_w or 640), (app_h or 480)
+        log.info("NV12 appsink: inserting videoscale → %dx%d before appsink", scale_w, scale_h)
+
+        scale = self._gst.ElementFactory.make("videoscale", "vscale")
+        capsfilter = self._gst.ElementFactory.make("capsfilter", "vcaps")
+        if scale is None or capsfilter is None:
+            # videoscale or capsfilter unavailable — fall back to unscaled NV12
+            log.warning("videoscale element unavailable; using unscaled NV12 appsink")
+            sink.set_property("caps", self._gst.Caps.from_string(
+                "video/x-raw,format=NV12;video/x-raw,format=BGRA"))
+            return sink
+
+        capsfilter.set_property(
+            "caps",
+            self._gst.Caps.from_string(
+                f"video/x-raw,format=NV12,width={scale_w},height={scale_h}"
+            ),
+        )
+
+        # Wire scale → capsfilter → appsink inside a bin so playbin accepts it
+        # as a single video-sink element.
+        bin_ = self._gst.Bin.new("vscale-bin")
+        bin_.add(scale)
+        bin_.add(capsfilter)
+        bin_.add(sink)
+        scale.link(capsfilter)
+        capsfilter.link(sink)
+
+        # Expose the scale element's sink pad as the bin's ghost sink pad.
+        sink_pad = scale.get_static_pad("sink")
+        ghost = self._gst.GhostPad.new("sink", sink_pad)
+        ghost.set_active(True)
+        bin_.add_pad(ghost)
+
+        return bin_
 
     def _on_new_sample(self, sink) -> Any:
         sample = sink.emit("pull-sample")
diff --git a/tests/test_player.py b/tests/test_player.py
index 98434b8..8f0fc01 100644
--- a/tests/test_player.py
+++ b/tests/test_player.py
@@ -189,6 +189,30 @@ class FakeMessage:
         return SimpleNamespace(get_name=lambda: self._structure_name)
 
 
+class FakeGhostPad:
+    def __init__(self, name, pad):
+        self.name = name
+    def set_active(self, _):
+        pass
+
+
+class FakeBin:
+    def __init__(self, name=""):
+        self.name = name
+        self._elements = []
+        self._pads = []
+        self.props = {}
+
+    def add(self, elem):
+        self._elements.append(elem)
+
+    def add_pad(self, pad):
+        self._pads.append(pad)
+
+    def set_property(self, name, value):
+        self.props[name] = value
+
+
 class FakeMapFlags:
     READ = 1
 
@@ -204,6 +228,16 @@ class FakeGst:
     SECOND = 1_000_000_000
     MSECOND = 1_000_000
     Caps = SimpleNamespace(from_string=lambda value: value)
+    GhostPad = SimpleNamespace(new=lambda name, pad: FakeGhostPad(name, pad))
+    Bin = SimpleNamespace(new=lambda name="": FakeBin(name))
+
+    @staticmethod
+    def ElementFactory_make(name, alias=None):
+        return None  # signal "not available" so SW fallback kicks in
+
+    # The backend calls Gst.ElementFactory.make; the fake is attached to FakeGst below
+ElementFactory = SimpleNamespace(make=lambda name, alias=None: None)
+FakeGst.ElementFactory = ElementFactory
 
 
 class _FakeFinfo: