perf: insert videoscale before appsink to cut NV12 memmove 6.7×

When hardware decode (mppvideodec/NV12) is active, wrap the appsink in a GstBin with a videoscale element so the VPU decodes at full stream resolution but Python only receives a frame pre-scaled to the SDL display size (default 640x480). Effect: NV12 buffer per frame: 3,133,440 B (1080p) → 460,800 B (640x480); memmove per frame: ~33 ms (80.5% budget) → ~5 ms (expected ~12%). The videoscale bilinear step runs entirely in software on the A35 cores but scales down 6.7×, so its cost is far lower than the avoided memmove. SDL still handles final aspect-ratio fitting inside the viewport, so visual quality is unchanged relative to what the 640x480 display can show. Fallback: if videoscale is not available, unscaled NV12 is used as before.
1 week ago · 67224626a5
2 changed files with 81 additions and 8 deletions
--- a/src/r36s_dlna_browser/player/gstreamer_backend.py
+++ b/src/r36s_dlna_browser/player/gstreamer_backend.py
@ -421,15 +421,54 @@ class GStreamerBackend(PlayerBackend):
        sink.set_property("sync", True)
        sink.set_property("max-buffers", 2)
        sink.set_property("drop", True)
        # Accept NV12 when hardware decode is active (avoids a software colourspace
        # conversion step); fall back to BGRA for the software-decode path.
        if self._hw_decoders:
            caps_str = "video/x-raw,format=NV12;video/x-raw,format=BGRA"
        else:
            caps_str = "video/x-raw,format=BGRA"
        sink.set_property("caps", self._gst.Caps.from_string(caps_str))
        sink.connect("new-sample", self._on_new_sample)
-        return sink
+
        if not self._hw_decoders:
            # Software decode: request BGRA directly, no scaling bin needed.
            sink.set_property("caps", self._gst.Caps.from_string("video/x-raw,format=BGRA"))
            return sink
        # Hardware decode (NV12): insert a videoscale element before the appsink
        # so mppvideodec can decode at full resolution in HW, but Python only
        # receives a frame scaled to the display size (default 640x480).
        # This cuts the memmove from 3.1 MB (1080p) to ~460 KB (640x480) per frame
        # — a 6.7× reduction in CPU copy cost.
        app_w, app_h = self._viewport[0], self._viewport[1]
        scale_w, scale_h = (app_w or 640), (app_h or 480)
        log.info("NV12 appsink: inserting videoscale → %dx%d before appsink", scale_w, scale_h)
        scale = self._gst.ElementFactory.make("videoscale", "vscale")
        capsfilter = self._gst.ElementFactory.make("capsfilter", "vcaps")
        if scale is None or capsfilter is None:
            # videoscale not available — fall back to unscaled NV12
            log.warning("videoscale element unavailable; using unscaled NV12 appsink")
            sink.set_property("caps", self._gst.Caps.from_string(
                "video/x-raw,format=NV12;video/x-raw,format=BGRA"))
            return sink
        capsfilter.set_property(
            "caps",
            self._gst.Caps.from_string(
                f"video/x-raw,format=NV12,width={scale_w},height={scale_h}"
            ),
        )
        # Wire scale → capsfilter → appsink inside a bin so playbin accepts it
        # as a single video-sink element.
        bin_ = self._gst.Bin.new("vscale-bin")
        bin_.add(scale)
        bin_.add(capsfilter)
        bin_.add(sink)
        scale.link(capsfilter)
        capsfilter.link(sink)
        # Expose the scale element's sink pad as the bin's ghost sink pad.
        sink_pad = scale.get_static_pad("sink")
        ghost = self._gst.GhostPad.new("sink", sink_pad)
        ghost.set_active(True)
        bin_.add_pad(ghost)
        return bin_
    def _on_new_sample(self, sink) -> Any:
        sample = sink.emit("pull-sample")
--- a/tests/test_player.py
+++ b/tests/test_player.py
@ -189,6 +189,30 @@ class FakeMessage:
        return SimpleNamespace(get_name=lambda: self._structure_name)
 class FakeGhostPad:
    """Test double for ``Gst.GhostPad``.

    Records the constructor arguments and the most recent activation request
    so tests can assert how the code under test wired and activated the pad.
    """

    def __init__(self, name, pad):
        self.name = name
        # Keep the proxied pad instead of discarding it, so a test can check
        # which pad the ghost pad was created for.
        self.target = pad
        # No activation has been requested until set_active() is called.
        self.active = False

    def set_active(self, active):
        """Record the requested activation state; no real pad is touched."""
        self.active = active
 class FakeBin:
    """Stand-in for ``Gst.Bin`` that only records what is done to it.

    Added elements, added pads and property assignments are kept, in arrival
    order, in ``_elements``, ``_pads`` and ``props`` respectively.
    """

    def __init__(self, name=""):
        self.name = name
        self._elements = []
        self._pads = []
        self.props = {}

    def set_property(self, name, value):
        """Store *value* under *name*, replacing any earlier value."""
        self.props[name] = value

    def add_pad(self, pad):
        """Remember *pad* as one of the bin's pads."""
        self._pads.append(pad)

    def add(self, elem):
        """Remember *elem* as a child of the bin."""
        self._elements.append(elem)
 class FakeMapFlags:
    READ = 1
@ -204,6 +228,16 @@ class FakeGst:
    SECOND = 1_000_000_000
    MSECOND = 1_000_000
    Caps = SimpleNamespace(from_string=lambda value: value)
    GhostPad = SimpleNamespace(new=lambda name, pad: FakeGhostPad(name, pad))
    Bin = SimpleNamespace(new=lambda name="": FakeBin(name))
    @staticmethod
    def ElementFactory_make(name, alias=None):
        return None  # signal "not available" so SW fallback kicks in
    # GStreamer uses Gst.ElementFactory.make, exposed as class attribute below
 # Every factory lookup returns None ("element not available"); the same
 # namespace is published at module level and as FakeGst.ElementFactory.
 FakeGst.ElementFactory = ElementFactory = SimpleNamespace(
     make=lambda name, alias=None: None,
 )
 class _FakeFinfo: