Initial import of Vosk Pi voice extension

2 months ago · 896530a0e8
5 changed files with 780 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
+__pycache__/
+*.pyc
+node_modules/
--- a/README.md
+++ b/README.md
@ -0,0 +1,60 @@
+# pi-vosk-voice
+
+Push-to-talk voice input for Pi using Vosk offline speech recognition.
+
+## Features
+
+- Hold-to-talk recording
+- Live transcription while speaking
+- Partial stabilization on pauses
+- Writes into the Pi editor in `paste` mode
+- Optional `send` mode
+- Start/stop audio cues via SoX
+- Global or project-local Pi configuration
+
+## Requirements
+
+- `sox` (`rec` and `play`)
+- Python 3
+- `vosk` Python package
+- A downloaded Vosk model, e.g. `vosk-model-it-0.22`
+
+## Configuration
+
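+Put this in `~/.pi/vosk-voice.json` (global) or in `.pi/vosk-voice.json` inside the project (project-local). When the project-local file exists it is the one that is read, and `/voice set` always writes to the project-local file: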
+
+```json
+{
+  "shortcut": "f12",
+  "enabled": true,
+  "mode": "paste",
+  "modelPath": "/home/you/.cache/vosk/vosk-model-it-0.22",
+  "soundsEnabled": true
+}
+```
+
+## Install in Pi
+
+```bash
+pi install /path/to/pi-vosk-voice
+```
+
+Then reload Pi:
+
+```text
+/reload
+```
+
+## Commands
+
+- `/voice`
+- `/voice config`
+- `/voice stop`
+- `/voice download-model`
+- `/voice set shortcut ctrl+t`
+- `/voice set mode paste`
+- `/voice set sounds false`
+
+## Notes
+
+The extension uses Vosk streaming partial/final results and updates the editor progressively.
--- a/extensions/index.ts
+++ b/extensions/index.ts
@ -0,0 +1,641 @@
+import { CustomEditor, type ExtensionAPI, type ExtensionContext } from "@mariozechner/pi-coding-agent";
+import { Key, isKeyRelease, isKittyProtocolActive, matchesKey, type KeyId } from "@mariozechner/pi-tui";
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import { spawn } from "node:child_process";
+import { fileURLToPath } from "node:url";
+
+/** Delivery mode for a finished transcript: "paste" writes it into the editor, "send" submits it as a user message. */
+type VoiceMode = "send" | "paste";
+
+/** Settings persisted in `vosk-voice.json` (see `loadConfig` / `updateConfig`). */
+type VoskVoiceConfig = {
+  /** Push-to-talk key id; lower-cased when loaded. */
+  shortcut: string;
+  /** When false the shortcut key is swallowed without starting or stopping a recording. */
+  enabled: boolean;
+  mode: VoiceMode;
+  /** Filesystem path of the Vosk model, passed to the Python streamer as its first argument. */
+  modelPath: string;
+  /** Whether start/stop cues are played through SoX `play`. */
+  soundsEnabled: boolean;
+};
+
+/** Mutable state of one push-to-talk session, from `startRecording` until `stopRecording` finishes. */
+type ActiveRecording = {
+  /** Editor content the transcript is appended to (captured at start in paste mode, replaced on re-anchor). */
+  baseText: string;
+  /** All "final" recognizer results so far, joined with single spaces. */
+  committedText: string;
+  /** Latest partial result that has already passed the debounce. */
+  partialText: string;
+  /** Latest partial result received, still waiting for the debounce timer. */
+  pendingPartialText: string;
+  partialDebounceTimer: ReturnType<typeof setTimeout> | null;
+  /** Text this extension last wrote to (or initially read from) the editor. */
+  displayedText: string;
+  /** Text the typing animation is converging towards. */
+  targetText: string;
+  typingTimer: ReturnType<typeof setTimeout> | null;
+  /** Set when the session is cancelled; recognizer messages are dropped from then on. */
+  ignoreIncoming: boolean;
+  /** Editor content captured at cancellation time, restored once the processes have closed. */
+  interruptedEditorText: string | null;
+  /** SoX `rec` process producing raw PCM on stdout. */
+  recProc: ReturnType<typeof spawn>;
+  /** Python streamer process consuming PCM on stdin and emitting JSON lines on stdout. */
+  sttProc: ReturnType<typeof spawn>;
+  /** Resolves when `recProc` emits "close". */
+  recDone: Promise<void>;
+  /** Resolves when `sttProc` emits "close". */
+  sttDone: Promise<void>;
+};
+
+/** One JSON line emitted by the Python streamer; `text` accompanies partial/final, `error` accompanies fatal. */
+type StreamMessage = {
+  type: "ready" | "partial" | "final" | "fatal";
+  text?: string;
+  error?: string;
+};
+
+// Key under which this extension publishes its status text via ctx.ui.setStatus.
+const STATUS_KEY = "vosk-voice";
+// Frames cycled by spin() while a recording is active.
+const SPINNER = ["⠁", "⠂", "⠄", "⠂"];
+// Global-registry symbol under which the key handler is stored on globalThis;
+// the patched CustomEditor.handleInput looks it up on every input event.
+const VOICE_HANDLER_SYMBOL = Symbol.for("vosk-voice:handleInput");
+// Minimum time after a stop before startRecording accepts a new session.
+const START_COOLDOWN_MS = 400;
+// Quiet period a partial result must survive before it is shown.
+const PARTIAL_DEBOUNCE_MS = 120;
+// Values used when no config file exists or a field is absent from it.
+const DEFAULT_CONFIG: VoskVoiceConfig = {
+  shortcut: "f12",
+  enabled: true,
+  mode: "paste",
+  modelPath: path.join(os.homedir(), ".cache", "vosk", "vosk-model-it-0.22"),
+  soundsEnabled: true,
+};
+// Download location shown by `/voice config` and `/voice download-model`.
+const MODEL_URL = "https://alphacephei.com/vosk/models/vosk-model-it-0.22.zip";
+
+// ES modules have no __filename/__dirname; derive them to locate the Python streamer next to this file.
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const STREAMER_PATH = path.join(__dirname, "vosk_daemon.py");
+
+// Module-level mutable state shared by all functions below.
+let protoPatchApplied = false; // CustomEditor.prototype.handleInput has been wrapped
+let config: VoskVoiceConfig = DEFAULT_CONFIG; // replaced (never mutated) by loadConfig/updateConfig
+let spinnerFrame = 0;
+let spinnerTimer: ReturnType<typeof setInterval> | null = null;
+let keyPressed = false; // push-to-talk key is considered held down
+let lastStopTime = 0; // Date.now() of the most recent stopRecording call
+let recording: ActiveRecording | null = null; // the active session, if any
+
+/** Builds the project-scoped config location, `<cwd>/.pi/vosk-voice.json`. */
+function getProjectConfigPath(cwd: string) {
+  const segments = [cwd, ".pi", "vosk-voice.json"];
+  return path.join(...segments);
+}
+
+/** Builds the user-wide config location, `.pi/vosk-voice.json` under the home directory. */
+function getGlobalConfigPath() {
+  const home = os.homedir();
+  return path.join(home, ".pi", "vosk-voice.json");
+}
+
+/**
+ * Picks the config file to read: the project file when it exists on disk,
+ * otherwise the global one (which may itself be missing).
+ */
+function getReadableConfigPath(cwd: string) {
+  const local = getProjectConfigPath(cwd);
+  return fs.existsSync(local) ? local : getGlobalConfigPath();
+}
+
+/** Returns the config file that updates are written to, which is always the project-local one. */
+function getEditableConfigPath(cwd: string) {
+  const writable = getProjectConfigPath(cwd);
+  return writable;
+}
+
+/** Creates the directory that holds the editable config file, including any missing parents. */
+function ensureConfigDir(cwd: string) {
+  const configDir = path.dirname(getEditableConfigPath(cwd));
+  fs.mkdirSync(configDir, { recursive: true });
+}
+
+/**
+ * Reads the effective configuration for `cwd`.
+ *
+ * The project-local file is preferred over the global one. A missing,
+ * unreadable or malformed file yields a copy of `DEFAULT_CONFIG`. Each field
+ * is validated on its own: a value of the wrong JSON type falls back to that
+ * field's default instead of being used as-is or discarding the whole file.
+ *
+ * @param cwd Directory whose `.pi/vosk-voice.json` is tried first.
+ * @returns A fresh, fully populated configuration object.
+ */
+function loadConfig(cwd: string): VoskVoiceConfig {
+  let raw: unknown;
+  try {
+    raw = JSON.parse(fs.readFileSync(getReadableConfigPath(cwd), "utf8"));
+  } catch {
+    // No file, no permission, or invalid JSON: behave as if nothing was configured.
+    return { ...DEFAULT_CONFIG };
+  }
+  if (typeof raw !== "object" || raw === null) return { ...DEFAULT_CONFIG };
+
+  const parsed = raw as Record<string, unknown>;
+  const text = (value: unknown, fallback: string): string =>
+    typeof value === "string" ? value : fallback;
+  // Strings such as "false" must not be accepted here: they are truthy.
+  const flag = (value: unknown, fallback: boolean): boolean =>
+    typeof value === "boolean" ? value : fallback;
+
+  return {
+    shortcut: text(parsed.shortcut, DEFAULT_CONFIG.shortcut).toLowerCase(),
+    enabled: flag(parsed.enabled, DEFAULT_CONFIG.enabled),
+    // Anything other than the exact string "send" means paste mode.
+    mode: parsed.mode === "send" ? "send" : "paste",
+    modelPath: text(parsed.modelPath, DEFAULT_CONFIG.modelPath),
+    soundsEnabled: flag(parsed.soundsEnabled, DEFAULT_CONFIG.soundsEnabled),
+  };
+}
+
+/**
+ * Merges `patch` over the in-memory configuration, writes the result to the
+ * project-local config file as pretty-printed JSON and makes it the active
+ * configuration.
+ *
+ * @returns The merged configuration.
+ */
+function updateConfig(cwd: string, patch: Partial<VoskVoiceConfig>): VoskVoiceConfig {
+  const merged = { ...config, ...patch };
+  ensureConfigDir(cwd);
+  const serialized = JSON.stringify(merged, null, 2) + "\n";
+  const destination = getEditableConfigPath(cwd);
+  fs.writeFileSync(destination, serialized);
+  config = merged;
+  return merged;
+}
+
+/** Publishes `text` under this extension's status key; `undefined` is passed through unchanged. */
+function setStatus(ctx: ExtensionContext, text?: string) {
+  const { ui } = ctx;
+  ui.setStatus(STATUS_KEY, text);
+}
+
+/** Shows the next spinner frame followed by " recording" and advances the frame counter. */
+function spin(ctx: ExtensionContext) {
+  const index = spinnerFrame % SPINNER.length;
+  spinnerFrame += 1;
+  setStatus(ctx, `${SPINNER[index]!} recording`);
+}
+
+/** Restarts the recording indicator: stops any running one, draws a frame now, then one every 250 ms. */
+function startSpinner(ctx: ExtensionContext) {
+  stopSpinner(ctx);
+  spin(ctx);
+  const advance = () => spin(ctx);
+  spinnerTimer = setInterval(advance, 250);
+}
+
+/** Cancels the spinner interval (if one is running) and clears the status text. */
+function stopSpinner(ctx: ExtensionContext) {
+  const timer = spinnerTimer;
+  if (timer !== null) {
+    spinnerTimer = null;
+    clearInterval(timer);
+  }
+  setStatus(ctx, undefined);
+}
+
+/**
+ * Interprets a user-typed switch value, ignoring case and surrounding whitespace.
+ *
+ * @returns `true` for 1/true/on/yes/enabled, `false` for 0/false/off/no/disabled,
+ *          `undefined` for anything else.
+ */
+function parseBooleanish(input: string): boolean | undefined {
+  switch (input.trim().toLowerCase()) {
+    case "1":
+    case "true":
+    case "on":
+    case "yes":
+    case "enabled":
+      return true;
+    case "0":
+    case "false":
+    case "off":
+    case "no":
+    case "disabled":
+      return false;
+    default:
+      return undefined;
+  }
+}
+
+/**
+ * Plays a short synthesized tone through SoX `play` when sounds are enabled.
+ * The player is detached and unreferenced, and spawn errors are ignored, so a
+ * missing `play` binary never disturbs the caller.
+ */
+function playCue(kind: "start" | "stop") {
+  if (!config.soundsEnabled) return;
+
+  // Tone-specific tail of the SoX command line (duration, waveform, fade).
+  let tone: string[];
+  if (kind === "start") {
+    tone = ["0.05", "sine", "880", "fade", "q", "0.01", "0.05", "0.02"];
+  } else {
+    tone = ["0.04", "sine", "660:440", "fade", "q", "0.005", "0.04", "0.02"];
+  }
+
+  const player = spawn("play", ["-q", "-n", "synth", ...tone], { stdio: "ignore", detached: true });
+  player.on("error", () => {});
+  player.unref();
+}
+
+/**
+ * Wraps `CustomEditor.prototype.handleInput` once per process. The wrapper
+ * offers every input to the handler stored on `globalThis` under
+ * `VOICE_HANDLER_SYMBOL`; the original method runs only when no handler is
+ * installed or the handler does not report the input as consumed.
+ */
+function ensureProtoPatch() {
+  if (protoPatchApplied) return;
+  protoPatchApplied = true;
+
+  const proto = CustomEditor.prototype;
+  const forward = proto.handleInput as (data: string) => void;
+  proto.handleInput = function patchedHandleInput(data: string): void {
+    const intercept = Reflect.get(globalThis, VOICE_HANDLER_SYMBOL) as ((data: string) => boolean) | undefined;
+    const consumed = intercept ? intercept(data) : false;
+    if (!consumed) forward.call(this, data);
+  };
+}
+
+/**
+ * Composes the text to show: `baseText` followed by the spoken text
+ * (committed and partial parts joined by one space, then trimmed).
+ * A single space separates base and speech unless the base already ends in a
+ * space or newline. With nothing spoken the base is returned unchanged.
+ */
+function joinTranscript(baseText: string, committedText: string, partialText: string) {
+  const spoken = [committedText, partialText]
+    .filter((part) => part !== "")
+    .join(" ")
+    .trim();
+
+  if (spoken === "") return baseText;
+  if (baseText === "") return spoken;
+
+  const needsSeparator = !/[ \n]$/.test(baseText);
+  return baseText + (needsSeparator ? " " : "") + spoken;
+}
+
+/**
+ * Records the newest partial result and (re)starts the debounce timer; the
+ * partial only becomes visible once no newer one arrives for
+ * `PARTIAL_DEBOUNCE_MS`.
+ */
+function schedulePartialUpdate(ctx: ExtensionContext, current: ActiveRecording, text: string) {
+  current.pendingPartialText = text.trim();
+
+  const pendingTimer = current.partialDebounceTimer;
+  if (pendingTimer) clearTimeout(pendingTimer);
+
+  const applyPending = () => {
+    current.partialDebounceTimer = null;
+    current.partialText = current.pendingPartialText;
+    scheduleTyping(ctx, current);
+  };
+  current.partialDebounceTimer = setTimeout(applyPending, PARTIAL_DEBOUNCE_MS);
+}
+
+/**
+ * Paste mode only: recomputes the target text and makes sure the typing
+ * animation is running. Each 12 ms step appends one character while the
+ * target still extends what is displayed; when it does not (the recognizer
+ * revised earlier words) the editor jumps straight to the target.
+ */
+function scheduleTyping(ctx: ExtensionContext, current: ActiveRecording) {
+  if (config.mode !== "paste") return;
+  current.targetText = joinTranscript(current.baseText, current.committedText, current.partialText);
+
+  // An animation step is already queued; it will pick up the new target.
+  if (current.typingTimer) return;
+
+  function step() {
+    current.typingTimer = null;
+
+    const shown = current.displayedText;
+    const goal = current.targetText;
+    if (shown === goal) return;
+
+    const extendsShown = goal.startsWith(shown);
+    current.displayedText = extendsShown ? shown + (goal[shown.length] ?? "") : goal;
+    ctx.ui.setEditorText(current.displayedText);
+    if (extendsShown) current.typingTimer = setTimeout(step, 12);
+  }
+
+  current.typingTimer = setTimeout(step, 12);
+}
+
+/**
+ * Ends all animation immediately: a still-debounced partial is promoted, the
+ * typing timer is cancelled, and displayed/target text are set to the full
+ * transcript. The editor is updated only in paste mode.
+ */
+function flushTypedText(ctx: ExtensionContext, current: ActiveRecording) {
+  const { partialDebounceTimer, typingTimer } = current;
+
+  if (partialDebounceTimer) {
+    clearTimeout(partialDebounceTimer);
+    current.partialDebounceTimer = null;
+    current.partialText = current.pendingPartialText;
+  }
+  if (typingTimer) {
+    clearTimeout(typingTimer);
+    current.typingTimer = null;
+  }
+
+  const settled = joinTranscript(current.baseText, current.committedText, current.partialText);
+  current.targetText = settled;
+  current.displayedText = settled;
+  if (config.mode === "paste") ctx.ui.setEditorText(settled);
+}
+
+/**
+ * Appends a final recognizer result to the committed text (space separated)
+ * and discards any partial state, including a pending debounce timer.
+ * Whitespace-only input is ignored entirely.
+ */
+function appendCommitted(current: ActiveRecording, text: string) {
+  const clean = text.trim();
+  if (clean.length === 0) return;
+
+  const previous = current.committedText;
+  current.committedText = previous ? previous + " " + clean : clean;
+
+  current.partialText = "";
+  current.pendingPartialText = "";
+
+  const pendingTimer = current.partialDebounceTimer;
+  if (pendingTimer) {
+    clearTimeout(pendingTimer);
+    current.partialDebounceTimer = null;
+  }
+}
+
+/**
+ * Restarts the transcript from `editorText`: both timers are cancelled, all
+ * committed and partial speech is dropped, and the given text becomes the new
+ * base, displayed and target text.
+ */
+function reanchorToEditorText(current: ActiveRecording, editorText: string) {
+  const { partialDebounceTimer, typingTimer } = current;
+  if (partialDebounceTimer) clearTimeout(partialDebounceTimer);
+  if (typingTimer) clearTimeout(typingTimer);
+
+  Object.assign(current, {
+    partialDebounceTimer: null,
+    typingTimer: null,
+    baseText: editorText,
+    committedText: "",
+    partialText: "",
+    pendingPartialText: "",
+    displayedText: editorText,
+    targetText: editorText,
+  });
+}
+
+/**
+ * Starts a push-to-talk session: spawns SoX `rec` and the Python Vosk
+ * streamer and pipes raw 16 kHz / 16-bit / mono signed PCM from the former
+ * into the latter.
+ *
+ * Does nothing while the post-stop cooldown is running, while another session
+ * is active, or when the configured model path does not exist (an error
+ * notification is shown in that case). The streamer's stdout is parsed as
+ * newline-delimited JSON; partial and final results feed the typing pipeline
+ * of the new session, which becomes the module-level `recording`.
+ *
+ * @param ctx Extension context used for editor access, status and notifications.
+ */
+function startRecording(ctx: ExtensionContext) {
+  if (Date.now() - lastStopTime < START_COOLDOWN_MS) return;
+  if (recording) return;
+  if (!fs.existsSync(config.modelPath)) {
+    ctx.ui.notify(`Modello Vosk non trovato: ${config.modelPath}`, "error");
+    return;
+  }
+
+  const recProc = spawn("rec", [
+    "-q",
+    "-t", "raw",
+    "-r", "16000",
+    "-b", "16",
+    "-c", "1",
+    "-e", "signed-integer",
+    "-",
+  ], { stdio: ["ignore", "pipe", "pipe"] });
+
+  const sttProc = spawn("python3", [STREAMER_PATH, config.modelPath], {
+    stdio: ["pipe", "pipe", "pipe"],
+  });
+
+  if (!recProc.stdout || !recProc.stderr || !sttProc.stdin || !sttProc.stdout || !sttProc.stderr) {
+    ctx.ui.notify("Impossibile avviare la pipeline audio", "error");
+    try { recProc.kill("SIGKILL"); } catch {}
+    try { sttProc.kill("SIGKILL"); } catch {}
+    return;
+  }
+
+  // If the recognizer exits while `rec` is still streaming, the write side of
+  // the pipe fails with EPIPE. Without a listener of our own, pipe() re-emits
+  // that as an unhandled 'error' event, which would crash the host process.
+  sttProc.stdin.on("error", () => {});
+
+  // Read the editor once so base, displayed and target text start out identical.
+  const initialText = config.mode === "paste" ? ctx.ui.getEditorText() : "";
+
+  const current: ActiveRecording = {
+    baseText: initialText,
+    committedText: "",
+    partialText: "",
+    pendingPartialText: "",
+    partialDebounceTimer: null,
+    displayedText: initialText,
+    targetText: initialText,
+    typingTimer: null,
+    ignoreIncoming: false,
+    interruptedEditorText: null,
+    recProc,
+    sttProc,
+    recDone: new Promise((resolve) => recProc.once("close", () => resolve())),
+    sttDone: new Promise((resolve) => sttProc.once("close", () => resolve())),
+  };
+
+  // stderr of both processes is only collected; it is reported on abnormal exit.
+  let recErr = "";
+  recProc.stderr.setEncoding("utf8");
+  recProc.stderr.on("data", (chunk: string) => {
+    recErr += chunk;
+  });
+
+  let sttErr = "";
+  sttProc.stderr.setEncoding("utf8");
+  sttProc.stderr.on("data", (chunk: string) => {
+    sttErr += chunk;
+  });
+
+  // Chunks may split lines arbitrarily, so keep the unterminated tail in `buffer`.
+  let buffer = "";
+  sttProc.stdout.setEncoding("utf8");
+  sttProc.stdout.on("data", (chunk: string) => {
+    buffer += chunk;
+    while (true) {
+      const idx = buffer.indexOf("\n");
+      if (idx === -1) break;
+      const line = buffer.slice(0, idx).trim();
+      buffer = buffer.slice(idx + 1);
+      if (!line) continue;
+      try {
+        const msg = JSON.parse(line) as StreamMessage;
+        // A cancelled session drops everything the recognizer still sends.
+        if (current.ignoreIncoming) {
+          continue;
+        }
+        if (msg.type === "fatal") {
+          ctx.ui.notify(msg.error || "Errore Vosk", "error");
+          continue;
+        }
+        if (msg.type === "partial") {
+          schedulePartialUpdate(ctx, current, msg.text ?? "");
+          continue;
+        }
+        if (msg.type === "final") {
+          appendCommitted(current, msg.text ?? "");
+          scheduleTyping(ctx, current);
+        }
+      } catch {
+        // ignore malformed lines
+      }
+    }
+  });
+
+  recProc.stdout.pipe(sttProc.stdin);
+  recProc.on("error", (error) => {
+    ctx.ui.notify(`Errore registrazione: ${error.message}`, "error");
+  });
+  sttProc.on("error", (error) => {
+    ctx.ui.notify(`Errore Vosk: ${error.message}`, "error");
+  });
+  recProc.on("exit", (code, signal) => {
+    // SIGINT/SIGKILL are how this extension stops the recorder itself.
+    const expected = signal === "SIGINT" || signal === "SIGKILL" || code === 0 || code === null;
+    if (!expected && recErr.trim()) {
+      ctx.ui.notify(`Errore registrazione: ${recErr.trim()}`, "error");
+    }
+  });
+  sttProc.on("exit", (code, signal) => {
+    const expected = signal === null && (code === 0 || code === null);
+    if (!expected && sttErr.trim()) {
+      ctx.ui.notify(`Errore Vosk: ${sttErr.trim()}`, "error");
+    }
+    // Nobody consumes the audio any more: stop the recorder so it does not
+    // keep capturing into a dead pipe. In the normal stop flow the recorder
+    // has already exited and this is skipped.
+    if (recProc.exitCode === null && recProc.signalCode === null) {
+      try { recProc.kill("SIGINT"); } catch {}
+    }
+  });
+
+  recording = current;
+  playCue("start");
+  startSpinner(ctx);
+}
+
+/**
+ * Ends the active session, waits for both child processes to close and then
+ * either delivers or discards the transcript.
+ *
+ * @param ctx Extension context used for editor access and status.
+ * @param pi Extension API used to send the transcript in "send" mode.
+ * @param runTranscription `true` delivers the transcript (flushes it into the
+ *        editor in paste mode, sends it as a user message in send mode);
+ *        `false` cancels the session.
+ * @param preserveEditorTextOnInterrupt Only relevant when cancelling in paste
+ *        mode: restore the editor content seen at the moment of cancellation
+ *        instead of the text the session started from.
+ */
+async function stopRecording(
+  ctx: ExtensionContext,
+  pi: ExtensionAPI,
+  runTranscription: boolean,
+  preserveEditorTextOnInterrupt = false,
+) {
+  if (!recording) return;
+  lastStopTime = Date.now();
+  const current = recording;
+  recording = null;
+  if (!runTranscription) {
+    current.ignoreIncoming = true;
+    // Cancel the animation right away. Waiting until the processes have
+    // closed would let queued timers keep writing into the editor after the
+    // user has already cancelled.
+    if (current.partialDebounceTimer) {
+      clearTimeout(current.partialDebounceTimer);
+      current.partialDebounceTimer = null;
+    }
+    if (current.typingTimer) {
+      clearTimeout(current.typingTimer);
+      current.typingTimer = null;
+    }
+    if (preserveEditorTextOnInterrupt && config.mode === "paste") {
+      current.interruptedEditorText = ctx.ui.getEditorText();
+    }
+  }
+  playCue("stop");
+  stopSpinner(ctx);
+
+  // Stopping the recorder closes its stdout, which ends the recognizer's
+  // stdin through the pipe; the recognizer then emits its last result and exits.
+  try { current.recProc.kill("SIGINT"); } catch {}
+  await current.recDone;
+  await current.sttDone;
+
+  if (!runTranscription) {
+    if (preserveEditorTextOnInterrupt && config.mode === "paste" && current.interruptedEditorText !== null) {
+      ctx.ui.setEditorText(current.interruptedEditorText);
+    } else if (config.mode === "paste") {
+      ctx.ui.setEditorText(current.baseText);
+    }
+    return;
+  }
+
+  // A partial that is still inside its debounce window has not reached
+  // `partialText` yet; promote it so it counts as speech in both modes.
+  if (current.partialDebounceTimer) {
+    clearTimeout(current.partialDebounceTimer);
+    current.partialDebounceTimer = null;
+    current.partialText = current.pendingPartialText;
+  }
+
+  // With an empty base this yields exactly the spoken text.
+  const finalText = joinTranscript("", current.committedText, current.partialText);
+
+  if (!finalText) {
+    setStatus(ctx, "no speech detected");
+    setTimeout(() => setStatus(ctx, undefined), 2000);
+    return;
+  }
+
+  if (config.mode === "paste") {
+    flushTypedText(ctx, current);
+    setStatus(ctx, undefined);
+    return;
+  }
+
+  setStatus(ctx, "sending…");
+  // While the agent is busy the message is queued as a follow-up.
+  if (ctx.isIdle()) pi.sendUserMessage(finalText);
+  else pi.sendUserMessage(finalText, { deliverAs: "followUp" });
+  setStatus(ctx, undefined);
+}
+
+/**
+ * Extension entry point. Loads the configuration, installs the push-to-talk
+ * key handler on session start, tears everything down on session shutdown,
+ * and registers the shortcut description and the `/voice` command.
+ */
+export default function (pi: ExtensionAPI) {
+  config = loadConfig(process.cwd());
+
+  pi.on("session_start", (_event, ctx) => {
+    config = loadConfig(process.cwd());
+    ensureProtoPatch();
+    const keyId = config.shortcut.toLowerCase() as KeyId;
+
+    // The patched editor calls this for every input; returning true consumes the input.
+    Reflect.set(globalThis, VOICE_HANDLER_SYMBOL, (data: string): boolean => {
+      // Enter while recording cancels the session (keeping the current editor
+      // text) and is still passed on to the editor.
+      if (recording && matchesKey(data, Key.enter)) {
+        keyPressed = false;
+        void stopRecording(ctx, pi, false, true);
+        return false;
+      }
+
+      // Any other key press during a paste-mode recording may edit the text.
+      // Check after the editor has handled it and, if the content no longer
+      // matches what this extension displayed, restart the transcript from there.
+      if (recording && config.mode === "paste" && !isKeyRelease(data) && !matchesKey(data, keyId)) {
+        const current = recording;
+        setTimeout(() => {
+          if (!recording || recording !== current) return;
+          const editorText = ctx.ui.getEditorText();
+          if (editorText !== current.displayedText) {
+            reanchorToEditorText(current, editorText);
+          }
+        }, 0);
+      }
+
+      if (!matchesKey(data, keyId)) return false;
+      // Disabled: swallow the shortcut without acting on it.
+      if (!config.enabled) return true;
+
+      // Without the Kitty protocol the shortcut acts as a start/stop toggle.
+      if (!isKittyProtocolActive()) {
+        if (recording) {
+          keyPressed = false;
+          void stopRecording(ctx, pi, true);
+        } else {
+          keyPressed = true;
+          startRecording(ctx);
+        }
+        return true;
+      }
+
+      // Key release ends a session that was started by a press.
+      const released = isKeyRelease(data);
+      if (released) {
+        if (keyPressed) {
+          keyPressed = false;
+          void stopRecording(ctx, pi, true);
+        }
+        return true;
+      }
+
+      // A press while a session is active stops and delivers it.
+      if (recording) {
+        keyPressed = false;
+        void stopRecording(ctx, pi, true);
+        return true;
+      }
+
+      // Further presses while the key is already considered down are ignored.
+      if (keyPressed) return true;
+      keyPressed = true;
+      startRecording(ctx);
+      return true;
+    });
+
+    if (!fs.existsSync(config.modelPath)) {
+      ctx.ui.notify(
+        `Vosk model non trovato: ${config.modelPath}\nUsa /voice download-model per istruzioni.`,
+        "warning",
+      );
+    }
+  });
+
+  pi.on("session_shutdown", (_event, ctx) => {
+    // Remove the handler so the patched editor falls through to the original method.
+    Reflect.deleteProperty(globalThis, VOICE_HANDLER_SYMBOL);
+    keyPressed = false;
+    stopSpinner(ctx);
+    if (recording) {
+      if (recording.partialDebounceTimer) {
+        clearTimeout(recording.partialDebounceTimer);
+        recording.partialDebounceTimer = null;
+      }
+      if (recording.typingTimer) {
+        clearTimeout(recording.typingTimer);
+        recording.typingTimer = null;
+      }
+      // Hard-kill both processes without waiting for them.
+      try { recording.recProc.kill("SIGKILL"); } catch {}
+      try { recording.sttProc.kill("SIGKILL"); } catch {}
+      recording = null;
+    }
+  });
+
+  // Registered with the shortcut read at load time; this handler only shows a usage hint.
+  pi.registerShortcut(config.shortcut.toLowerCase() as KeyId, {
+    description: "Hold-to-talk con Vosk live (riavvia pi se cambi shortcut)",
+    handler: async (ctx) => {
+      ctx.ui.notify("Tieni premuto il tasto per registrare, rilascia per inviare", "info");
+    },
+  });
+
+  pi.registerCommand("voice", {
+    description: "Controlla Vosk voice (status/config/set/stop/download-model)",
+    handler: async (args, ctx) => {
+      // Re-read the file so the command reflects the configuration currently on disk.
+      config = loadConfig(process.cwd());
+      const trimmed = args.trim();
+      // No arguments means "status".
+      const [actionRaw, ...rest] = trimmed ? trimmed.split(/\s+/) : ["status"];
+      const action = (actionRaw ?? "status").toLowerCase();
+
+      // /voice stop: cancel the active session, keeping the current editor text.
+      if (action === "stop") {
+        if (!recording) {
+          ctx.ui.notify("Nessuna registrazione attiva", "info");
+          return;
+        }
+        keyPressed = false;
+        await stopRecording(ctx, pi, false, true);
+        ctx.ui.notify("Registrazione annullata", "info");
+        return;
+      }
+
+      // /voice config: show the effective settings and the file they are read from.
+      if (action === "config") {
+        ctx.ui.notify(
+          [
+            `configPath: ${getReadableConfigPath(process.cwd())}`,
+            `shortcut:  ${config.shortcut}`,
+            `enabled:   ${config.enabled}`,
+            `mode:      ${config.mode}`,
+            `modelPath: ${config.modelPath}`,
+            `sounds:    ${config.soundsEnabled}`,
+            `live:      true`,
+            `modelUrl:  ${MODEL_URL}`,
+          ].join("\n"),
+          "info",
+        );
+        return;
+      }
+
+      // /voice download-model: only prints shell commands; nothing is downloaded here.
+      if (action === "download-model") {
+        const targetDir = path.dirname(config.modelPath);
+        const zipPath = `${config.modelPath}.zip`;
+        ctx.ui.notify(
+          [
+            "Modello consigliato dal web:",
+            "- vosk-model-it-0.22",
+            "- modello italiano large/server (~1.2G)",
+            `URL: ${MODEL_URL}`,
+            "",
+            "Comandi:",
+            `mkdir -p ${targetDir}`,
+            `curl -L ${MODEL_URL} -o ${zipPath}`,
+            `unzip -q ${zipPath} -d ${targetDir}`,
+            `rm -f ${zipPath}`,
+          ].join("\n"),
+          "info",
+        );
+        return;
+      }
+
+      // /voice set <field> <value>: persist one setting to the project-local config.
+      if (action === "set") {
+        if (rest.length < 2) {
+          ctx.ui.notify("Uso: /voice set <shortcut|enabled|mode|modelPath|sounds> <valore>", "warning");
+          return;
+        }
+        const field = rest[0]!.toLowerCase();
+        // Everything after the field name is the value, so it may contain spaces.
+        const value = rest.slice(1).join(" ");
+
+        if (field === "shortcut") {
+          updateConfig(process.cwd(), { shortcut: value.toLowerCase() });
+          ctx.ui.notify("Shortcut aggiornata. Riavvia pi per applicarla.", "info");
+          return;
+        }
+        if (field === "enabled") {
+          const parsed = parseBooleanish(value);
+          if (parsed === undefined) {
+            ctx.ui.notify("enabled deve essere true/false", "warning");
+            return;
+          }
+          updateConfig(process.cwd(), { enabled: parsed });
+          ctx.ui.notify(`Voice enabled=${parsed}`, "info");
+          return;
+        }
+        if (field === "mode") {
+          if (value !== "send" && value !== "paste") {
+            ctx.ui.notify("mode deve essere send o paste", "warning");
+            return;
+          }
+          updateConfig(process.cwd(), { mode: value as VoiceMode });
+          ctx.ui.notify(`Mode impostata a ${value}`, "info");
+          return;
+        }
+        // The field name was lower-cased above, hence "modelpath".
+        if (field === "modelpath") {
+          updateConfig(process.cwd(), { modelPath: value });
+          ctx.ui.notify(`Model path aggiornato: ${value}`, "info");
+          return;
+        }
+        if (field === "sounds") {
+          const parsed = parseBooleanish(value);
+          if (parsed === undefined) {
+            ctx.ui.notify("sounds deve essere true/false", "warning");
+            return;
+          }
+          updateConfig(process.cwd(), { soundsEnabled: parsed });
+          ctx.ui.notify(`Sounds=${parsed}`, "info");
+          return;
+        }
+
+        ctx.ui.notify("Campo sconosciuto. Usa shortcut, enabled, mode, modelPath, sounds", "warning");
+        return;
+      }
+
+      // Any other action (including the default "status") prints a one-line summary.
+      const state = recording ? "🔴 recording" : "idle";
+      ctx.ui.notify(
+        `vosk-voice: ${state} (${config.mode}, enabled=${config.enabled}, shortcut=${config.shortcut}, live=true)`,
+        "info",
+      );
+    },
+  });
+}
--- a/extensions/vosk_daemon.py
+++ b/extensions/vosk_daemon.py
@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+import json
+import sys
+from pathlib import Path
+
+# Import guard: if the `vosk` package cannot be imported, report it as a JSON
+# "fatal" line on stdout and exit with status 1. `emit` is not defined yet at
+# this point, so the line is printed directly.
+try:
+    from vosk import Model, KaldiRecognizer, SetLogLevel
+except Exception as e:
+    print(json.dumps({"type": "fatal", "error": f"Python package 'vosk' non disponibile: {e}"}), flush=True)
+    raise SystemExit(1)
+
+
+def emit(obj: dict):
+    print(json.dumps(obj, ensure_ascii=False), flush=True)
+
+
+def main() -> int:
+    if len(sys.argv) < 2:
+        emit({"type": "fatal", "error": "Uso: vosk_daemon.py <model_path>"})
+        return 1
+
+    model_path = Path(sys.argv[1]).expanduser()
+    if not model_path.exists():
+        emit({"type": "fatal", "error": f"Modello Vosk non trovato: {model_path}"})
+        return 1
+
+    SetLogLevel(-1)
+    model = Model(str(model_path))
+    rec = KaldiRecognizer(model, 16000)
+    last_partial = ""
+
+    emit({"type": "ready"})
+
+    while True:
+        data = sys.stdin.buffer.read(4000)
+        if not data:
+            break
+
+        if rec.AcceptWaveform(data):
+            result = json.loads(rec.Result())
+            text = (result.get("text") or "").strip()
+            if text:
+                emit({"type": "final", "text": text})
+            last_partial = ""
+        else:
+            result = json.loads(rec.PartialResult())
+            partial = (result.get("partial") or "").strip()
+            if partial != last_partial:
+                last_partial = partial
+                emit({"type": "partial", "text": partial})
+
+    final = json.loads(rec.FinalResult())
+    text = (final.get("text") or "").strip()
+    if text:
+        emit({"type": "final", "text": text})
+
+    return 0
+
+
+# Run only when executed as a script; main()'s return value becomes the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/package.json
+++ b/package.json
@ -0,0 +1,15 @@
+{
+  "name": "pi-vosk-voice",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "keywords": ["pi-package", "vosk", "voice", "speech-to-text"],
+  "description": "Global Vosk push-to-talk extension for Pi coding agent with live transcription in the editor.",
+  "pi": {
+    "extensions": ["./extensions"]
+  },
+  "peerDependencies": {
+    "@mariozechner/pi-coding-agent": "*",
+    "@mariozechner/pi-tui": "*"
+  }
+}