Browse Source

Initial import of Vosk Pi voice extension

master
Matteo Benedetto 2 months ago
commit
896530a0e8
  1. 3
      .gitignore
  2. 60
      README.md
  3. 641
      extensions/index.ts
  4. 61
      extensions/vosk_daemon.py
  5. 15
      package.json

3
.gitignore vendored

@ -0,0 +1,3 @@
__pycache__/
*.pyc
node_modules/

60
README.md

@ -0,0 +1,60 @@
# pi-vosk-voice
Push-to-talk voice input for Pi using Vosk offline speech recognition.
## Features
- Hold-to-talk recording
- Live transcription while speaking
- Partial stabilization on pauses
- Writes into the Pi editor in `paste` mode
- Optional `send` mode
- Start/stop audio cues via SoX
- Global or project-local Pi configuration
## Requirements
- `sox` (`rec` and `play`)
- Python 3
- `vosk` Python package
- A downloaded Vosk model, e.g. `vosk-model-it-0.22`
## Configuration
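Put this in `~/.pi/vosk-voice.json` (global) or in `.pi/vosk-voice.json` inside your project directory (project-local; when this file exists it is used instead of the global one):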
```json
{
"shortcut": "f12",
"enabled": true,
"mode": "paste",
"modelPath": "/home/you/.cache/vosk/vosk-model-it-0.22",
"soundsEnabled": true
}
```
## Install in Pi
```bash
pi install /path/to/pi-vosk-voice
```
Then reload Pi:
```text
/reload
```
## Commands
- `/voice`
- `/voice config`
- `/voice stop`
- `/voice download-model`
- `/voice set shortcut ctrl+t`
- `/voice set mode paste`
- `/voice set sounds false`
## Notes
The extension uses Vosk streaming partial/final results and updates the editor progressively.

641
extensions/index.ts

@ -0,0 +1,641 @@
import { CustomEditor, type ExtensionAPI, type ExtensionContext } from "@mariozechner/pi-coding-agent";
import { Key, isKeyRelease, isKittyProtocolActive, matchesKey, type KeyId } from "@mariozechner/pi-tui";
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { spawn } from "node:child_process";
import { fileURLToPath } from "node:url";
/** How a transcript is delivered: "send" posts it as a user message, "paste" writes it into the editor. */
type VoiceMode = "send" | "paste";
/** Settings persisted as JSON in `vosk-voice.json`. */
type VoskVoiceConfig = {
// Key identifier of the push-to-talk shortcut (normalized to lowercase when loaded or set).
shortcut: string;
// When false the shortcut is still intercepted but does not start or stop a recording.
enabled: boolean;
// Delivery mode for the recognized text.
mode: VoiceMode;
// Filesystem path of the Vosk model directory handed to the Python streamer.
modelPath: string;
// Whether start/stop audio cues are played.
soundsEnabled: boolean;
};
/** Mutable state of the single in-flight recording session. */
type ActiveRecording = {
// Editor text captured when recording started (empty outside paste mode); the transcript is appended to it.
baseText: string;
// Accumulated text from "final" recognizer messages.
committedText: string;
// Partial hypothesis currently applied to the displayed transcript.
partialText: string;
// Most recent partial hypothesis, waiting for the debounce timer before it becomes `partialText`.
pendingPartialText: string;
// Debounce timer for partial updates, or null when none is pending.
partialDebounceTimer: ReturnType<typeof setTimeout> | null;
// Text most recently written to the editor by the typing animation.
displayedText: string;
// Text the typing animation is converging towards.
targetText: string;
// Timer driving the typing animation, or null when idle.
typingTimer: ReturnType<typeof setTimeout> | null;
// Set when the session is cancelled so that later recognizer messages are dropped.
ignoreIncoming: boolean;
// Editor snapshot taken at cancellation time, restored once the processes have exited.
interruptedEditorText: string | null;
// The `rec` audio capture process.
recProc: ReturnType<typeof spawn>;
// The Python recognizer process.
sttProc: ReturnType<typeof spawn>;
// Resolves when `recProc` emits "close".
recDone: Promise<void>;
// Resolves when `sttProc` emits "close".
sttDone: Promise<void>;
};
/** One JSON line emitted by the Python streamer on stdout. */
type StreamMessage = {
type: "ready" | "partial" | "final" | "fatal";
// Recognized text for "partial" and "final" messages.
text?: string;
// Human-readable description for "fatal" messages.
error?: string;
};
// Key under which this extension publishes its status text via ctx.ui.setStatus.
const STATUS_KEY = "vosk-voice";
// Frames cycled by the "recording" status indicator.
const SPINNER = ["⠁", "⠂", "⠄", "⠂"];
// Global symbol under which the active input handler is stored on globalThis,
// so the patched CustomEditor.prototype.handleInput can look it up.
const VOICE_HANDLER_SYMBOL = Symbol.for("vosk-voice:handleInput");
// Minimum time after a stop before a new recording may start.
const START_COOLDOWN_MS = 400;
// Delay before a partial hypothesis is applied to the displayed transcript.
const PARTIAL_DEBOUNCE_MS = 120;
// Values used when no config file exists or it cannot be parsed.
const DEFAULT_CONFIG: VoskVoiceConfig = {
shortcut: "f12",
enabled: true,
mode: "paste",
modelPath: path.join(os.homedir(), ".cache", "vosk", "vosk-model-it-0.22"),
soundsEnabled: true,
};
// Download location shown by `/voice config` and `/voice download-model`.
const MODEL_URL = "https://alphacephei.com/vosk/models/vosk-model-it-0.22.zip";
// ES modules have no built-in __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// The Python streamer script lives next to this module.
const STREAMER_PATH = path.join(__dirname, "vosk_daemon.py");
// Guards against patching CustomEditor.prototype more than once.
let protoPatchApplied = false;
// Currently active configuration (replaced, never mutated in place, by loadConfig/updateConfig).
let config: VoskVoiceConfig = DEFAULT_CONFIG;
// Spinner animation state.
let spinnerFrame = 0;
let spinnerTimer: ReturnType<typeof setInterval> | null = null;
// True while the shortcut is considered held down.
let keyPressed = false;
// Timestamp (Date.now()) of the last stopRecording call, used for the start cooldown.
let lastStopTime = 0;
// The in-flight recording session, or null when idle.
let recording: ActiveRecording | null = null;
/** Returns the project-local config file location, `<cwd>/.pi/vosk-voice.json`. */
function getProjectConfigPath(cwd: string) {
  const segments = [cwd, ".pi", "vosk-voice.json"];
  return path.join(...segments);
}
/** Returns the per-user config file location, `<home>/.pi/vosk-voice.json`. */
function getGlobalConfigPath() {
  const homeDir = os.homedir();
  const segments = [homeDir, ".pi", "vosk-voice.json"];
  return path.join(...segments);
}
/**
 * Picks the config file to read: the project-local file when it exists,
 * otherwise the global one (which may itself be missing).
 */
function getReadableConfigPath(cwd: string) {
  const localCandidate = getProjectConfigPath(cwd);
  return fs.existsSync(localCandidate) ? localCandidate : getGlobalConfigPath();
}
/** Returns the file that config changes are written to: always the project-local one. */
function getEditableConfigPath(cwd: string) {
  const writableTarget = getProjectConfigPath(cwd);
  return writableTarget;
}
/** Creates the directory that holds the editable config file, including missing parents. */
function ensureConfigDir(cwd: string) {
  const configFile = getEditableConfigPath(cwd);
  const parentDir = path.dirname(configFile);
  fs.mkdirSync(parentDir, { recursive: true });
}
/**
 * Loads the configuration from the readable config file for `cwd`.
 *
 * Every field is validated on its own: a field that is missing or has the wrong
 * JSON type falls back to its default without discarding the other fields.
 * A missing, unreadable or non-object file yields a copy of the defaults.
 *
 * @param cwd Directory used to locate the project-local config file.
 * @returns A fresh config object (never the shared DEFAULT_CONFIG instance).
 */
function loadConfig(cwd: string): VoskVoiceConfig {
  const candidate = getReadableConfigPath(cwd);
  if (!fs.existsSync(candidate)) return { ...DEFAULT_CONFIG };

  let raw: unknown;
  try {
    raw = JSON.parse(fs.readFileSync(candidate, "utf8"));
  } catch {
    // Unreadable or malformed JSON: behave as if no config file existed.
    return { ...DEFAULT_CONFIG };
  }
  // JSON may legally be null, a number, a string, ...; only objects carry settings.
  if (typeof raw !== "object" || raw === null) return { ...DEFAULT_CONFIG };
  const parsed = raw as Record<string, unknown>;

  const shortcut =
    typeof parsed.shortcut === "string" && parsed.shortcut !== ""
      ? parsed.shortcut
      : DEFAULT_CONFIG.shortcut;
  const modelPath =
    typeof parsed.modelPath === "string" && parsed.modelPath !== ""
      ? parsed.modelPath
      : DEFAULT_CONFIG.modelPath;

  return {
    shortcut: shortcut.toLowerCase(),
    enabled: typeof parsed.enabled === "boolean" ? parsed.enabled : DEFAULT_CONFIG.enabled,
    // Anything other than the literal "send" selects paste mode.
    mode: parsed.mode === "send" ? "send" : "paste",
    modelPath,
    soundsEnabled:
      typeof parsed.soundsEnabled === "boolean" ? parsed.soundsEnabled : DEFAULT_CONFIG.soundsEnabled,
  };
}
/**
 * Merges `patch` into the active configuration, writes the result to the
 * project-local config file as pretty-printed JSON, and makes it the active config.
 *
 * @returns The merged configuration.
 */
function updateConfig(cwd: string, patch: Partial<VoskVoiceConfig>): VoskVoiceConfig {
  const merged = { ...config, ...patch };
  ensureConfigDir(cwd);
  const serialized = `${JSON.stringify(merged, null, 2)}\n`;
  const destination = getEditableConfigPath(cwd);
  fs.writeFileSync(destination, serialized);
  config = merged;
  return merged;
}
/** Publishes (or, when `text` is omitted, clears) this extension's status text. */
function setStatus(ctx: ExtensionContext, text?: string) {
  const { ui } = ctx;
  ui.setStatus(STATUS_KEY, text);
}
/** Shows the next spinner frame followed by "recording" and advances the frame counter. */
function spin(ctx: ExtensionContext) {
  const frameIndex = spinnerFrame % SPINNER.length;
  spinnerFrame += 1;
  const glyph = SPINNER[frameIndex]!;
  setStatus(ctx, `${glyph} recording`);
}
/** Restarts the recording indicator: draws a frame immediately, then one every 250 ms. */
function startSpinner(ctx: ExtensionContext) {
  // Make sure no earlier interval keeps running.
  stopSpinner(ctx);
  spin(ctx);
  const redraw = () => {
    spin(ctx);
  };
  spinnerTimer = setInterval(redraw, 250);
}
/** Stops the spinner interval, if one is running, and clears the status text. */
function stopSpinner(ctx: ExtensionContext) {
  const activeTimer = spinnerTimer;
  if (activeTimer) {
    clearInterval(activeTimer);
    spinnerTimer = null;
  }
  setStatus(ctx, undefined);
}
/**
 * Interprets a user-typed flag, ignoring case and surrounding whitespace.
 *
 * @returns true for 1/true/on/yes/enabled, false for 0/false/off/no/disabled,
 *          and undefined for anything else.
 */
function parseBooleanish(input: string): boolean | undefined {
  switch (input.trim().toLowerCase()) {
    case "1":
    case "true":
    case "on":
    case "yes":
    case "enabled":
      return true;
    case "0":
    case "false":
    case "off":
    case "no":
    case "disabled":
      return false;
    default:
      return undefined;
  }
}
/**
 * Plays a short synthesized tone through the `play` command to mark the start
 * or the end of a recording. Does nothing when sounds are disabled.
 * The player runs detached and failures to launch it are deliberately ignored.
 */
function playCue(kind: "start" | "stop") {
  if (!config.soundsEnabled) return;
  let synthArgs: string[];
  if (kind === "start") {
    synthArgs = ["-q", "-n", "synth", "0.05", "sine", "880", "fade", "q", "0.01", "0.05", "0.02"];
  } else {
    synthArgs = ["-q", "-n", "synth", "0.04", "sine", "660:440", "fade", "q", "0.005", "0.04", "0.02"];
  }
  const player = spawn("play", synthArgs, { stdio: "ignore", detached: true });
  // Best effort: a missing `play` binary must not surface as an unhandled error.
  player.on("error", () => {});
  // Do not let the cue keep the host process alive.
  player.unref();
}
/**
 * Wraps CustomEditor.prototype.handleInput once so that the handler stored on
 * globalThis under VOICE_HANDLER_SYMBOL sees every input first. When that
 * handler returns a truthy value the input is consumed; otherwise (or when no
 * handler is installed) the original implementation runs.
 */
function ensureProtoPatch() {
  if (!protoPatchApplied) {
    protoPatchApplied = true;
    const editorProto = CustomEditor.prototype;
    const originalHandleInput = editorProto.handleInput as (data: string) => void;
    editorProto.handleInput = function patchedHandleInput(data: string): void {
      const voiceHandler = Reflect.get(globalThis, VOICE_HANDLER_SYMBOL) as
        | ((data: string) => boolean)
        | undefined;
      const consumed = voiceHandler == null ? undefined : voiceHandler(data);
      if (!consumed) {
        originalHandleInput.call(this, data);
      }
    };
  }
}
/**
 * Builds the full editor text from the pre-existing text and the spoken text.
 *
 * The spoken part is `committedText` and `partialText` joined by one space and
 * trimmed. It is appended to `baseText` with a single space, unless `baseText`
 * is empty or already ends in a space or newline.
 */
function joinTranscript(baseText: string, committedText: string, partialText: string) {
  let spoken = committedText;
  if (partialText) {
    spoken += committedText ? ` ${partialText}` : partialText;
  }
  spoken = spoken.trim();

  if (spoken.length === 0) return baseText;
  if (baseText.length === 0) return spoken;

  const alreadySeparated = baseText.endsWith(" ") || baseText.endsWith("\n");
  return alreadySeparated ? baseText + spoken : `${baseText} ${spoken}`;
}
/**
 * Records the newest partial hypothesis and (re)starts the debounce timer.
 * Only when no newer partial arrives within PARTIAL_DEBOUNCE_MS is the pending
 * text promoted to `partialText` and the typing animation scheduled.
 */
function schedulePartialUpdate(ctx: ExtensionContext, current: ActiveRecording, text: string) {
  current.pendingPartialText = text.trim();

  const previousTimer = current.partialDebounceTimer;
  if (previousTimer) clearTimeout(previousTimer);

  const applyPending = () => {
    current.partialDebounceTimer = null;
    current.partialText = current.pendingPartialText;
    scheduleTyping(ctx, current);
  };
  current.partialDebounceTimer = setTimeout(applyPending, PARTIAL_DEBOUNCE_MS);
}
/**
 * Recomputes the target editor text and makes sure the typing animation runs.
 * Only active in paste mode.
 *
 * Each animation step (every 12 ms) appends one character while the target
 * still starts with what is displayed; if the target diverged from the
 * displayed text, the editor jumps straight to the target and the animation stops.
 */
function scheduleTyping(ctx: ExtensionContext, current: ActiveRecording) {
  if (config.mode !== "paste") return;
  current.targetText = joinTranscript(current.baseText, current.committedText, current.partialText);

  // An animation already in flight will pick up the new target by itself.
  if (current.typingTimer) return;

  const stepDelayMs = 12;
  function advance(): void {
    current.typingTimer = null;
    const shown = current.displayedText;
    const goal = current.targetText;
    if (shown === goal) return;

    if (!goal.startsWith(shown)) {
      // The text was revised rather than extended: replace it in one go.
      current.displayedText = goal;
      ctx.ui.setEditorText(current.displayedText);
      return;
    }

    current.displayedText = shown + (goal[shown.length] ?? "");
    ctx.ui.setEditorText(current.displayedText);
    current.typingTimer = setTimeout(advance, stepDelayMs);
  }
  current.typingTimer = setTimeout(advance, stepDelayMs);
}
/**
 * Finishes all pending display work at once: applies a debounced partial that
 * has not fired yet, cancels the typing animation, and (in paste mode) writes
 * the complete transcript into the editor.
 */
function flushTypedText(ctx: ExtensionContext, current: ActiveRecording) {
  const debounce = current.partialDebounceTimer;
  if (debounce) {
    clearTimeout(debounce);
    current.partialDebounceTimer = null;
    // The debounced partial never got applied; apply it now.
    current.partialText = current.pendingPartialText;
  }

  const typing = current.typingTimer;
  if (typing) {
    clearTimeout(typing);
    current.typingTimer = null;
  }

  const settled = joinTranscript(current.baseText, current.committedText, current.partialText);
  current.targetText = settled;
  current.displayedText = settled;
  if (config.mode === "paste") {
    ctx.ui.setEditorText(current.displayedText);
  }
}
/**
 * Appends a finalized utterance to the committed transcript (space separated)
 * and discards any partial hypothesis, including one still waiting on the
 * debounce timer. Whitespace-only text is ignored entirely.
 */
function appendCommitted(current: ActiveRecording, text: string) {
  const addition = text.trim();
  if (addition.length === 0) return;

  const pieces = current.committedText ? [current.committedText, addition] : [addition];
  current.committedText = pieces.join(" ");

  current.partialText = "";
  current.pendingPartialText = "";

  const debounce = current.partialDebounceTimer;
  if (debounce) {
    clearTimeout(debounce);
    current.partialDebounceTimer = null;
  }
}
/**
 * Restarts transcript bookkeeping from `editorText`: cancels pending timers,
 * drops the committed and partial text gathered so far, and treats
 * `editorText` as the new base, displayed and target text.
 */
function reanchorToEditorText(current: ActiveRecording, editorText: string) {
  const debounce = current.partialDebounceTimer;
  if (debounce) {
    clearTimeout(debounce);
    current.partialDebounceTimer = null;
  }
  const typing = current.typingTimer;
  if (typing) {
    clearTimeout(typing);
    current.typingTimer = null;
  }

  // Everything shown so far becomes the fixed prefix of the transcript.
  current.baseText = editorText;
  current.displayedText = editorText;
  current.targetText = editorText;

  // Forget what was recognized before the re-anchor.
  current.committedText = "";
  current.partialText = "";
  current.pendingPartialText = "";
}
/**
 * Starts a recording session.
 *
 * Spawns `rec` to capture raw 16-bit mono audio at 16000 Hz and pipes it into
 * the Python streamer, whose stdout is parsed as newline-delimited JSON
 * messages ("partial", "final", "fatal"). Does nothing while the start
 * cooldown is running, while a session is already active, or when the model
 * path does not exist (an error is shown in that last case).
 *
 * If the recognizer process ends while this session is still the active one,
 * audio capture is stopped and the session state is released, so the UI does
 * not stay stuck in "recording" with a dead pipeline.
 */
function startRecording(ctx: ExtensionContext) {
  if (Date.now() - lastStopTime < START_COOLDOWN_MS) return;
  if (recording) return;
  if (!fs.existsSync(config.modelPath)) {
    ctx.ui.notify(`Modello Vosk non trovato: ${config.modelPath}`, "error");
    return;
  }

  const recProc = spawn("rec", [
    "-q",
    "-t", "raw",
    "-r", "16000",
    "-b", "16",
    "-c", "1",
    "-e", "signed-integer",
    "-",
  ], { stdio: ["ignore", "pipe", "pipe"] });
  const sttProc = spawn("python3", [STREAMER_PATH, config.modelPath], {
    stdio: ["pipe", "pipe", "pipe"],
  });

  // Spawn failures (e.g. a missing binary) are reported asynchronously through
  // "error"; listen right away so they can never surface as unhandled events.
  recProc.on("error", (error) => {
    ctx.ui.notify(`Errore registrazione: ${error.message}`, "error");
  });
  sttProc.on("error", (error) => {
    ctx.ui.notify(`Errore Vosk: ${error.message}`, "error");
  });

  if (!recProc.stdout || !recProc.stderr || !sttProc.stdin || !sttProc.stdout || !sttProc.stderr) {
    ctx.ui.notify("Impossibile avviare la pipeline audio", "error");
    try { recProc.kill("SIGKILL"); } catch {}
    try { sttProc.kill("SIGKILL"); } catch {}
    return;
  }

  // In paste mode the transcript is appended to whatever is already in the editor.
  const initialEditorText = config.mode === "paste" ? ctx.ui.getEditorText() : "";
  const current: ActiveRecording = {
    baseText: initialEditorText,
    committedText: "",
    partialText: "",
    pendingPartialText: "",
    partialDebounceTimer: null,
    displayedText: initialEditorText,
    targetText: initialEditorText,
    typingTimer: null,
    ignoreIncoming: false,
    interruptedEditorText: null,
    recProc,
    sttProc,
    recDone: new Promise((resolve) => recProc.once("close", () => resolve())),
    sttDone: new Promise((resolve) => sttProc.once("close", () => resolve())),
  };

  // Collect stderr of both processes so it can be reported on abnormal exit.
  let recErr = "";
  recProc.stderr.setEncoding("utf8");
  recProc.stderr.on("data", (chunk: string) => {
    recErr += chunk;
  });
  let sttErr = "";
  sttProc.stderr.setEncoding("utf8");
  sttProc.stderr.on("data", (chunk: string) => {
    sttErr += chunk;
  });

  // The streamer emits one JSON object per line; chunks may split lines arbitrarily.
  let buffer = "";
  sttProc.stdout.setEncoding("utf8");
  sttProc.stdout.on("data", (chunk: string) => {
    buffer += chunk;
    while (true) {
      const idx = buffer.indexOf("\n");
      if (idx === -1) break;
      const line = buffer.slice(0, idx).trim();
      buffer = buffer.slice(idx + 1);
      if (!line) continue;
      try {
        const msg = JSON.parse(line) as StreamMessage;
        if (current.ignoreIncoming) {
          continue;
        }
        if (msg.type === "fatal") {
          ctx.ui.notify(msg.error || "Errore Vosk", "error");
          continue;
        }
        if (msg.type === "partial") {
          schedulePartialUpdate(ctx, current, msg.text ?? "");
          continue;
        }
        if (msg.type === "final") {
          appendCommitted(current, msg.text ?? "");
          scheduleTyping(ctx, current);
        }
      } catch {
        // ignore malformed lines
      }
    }
  });

  // If the recognizer exits early, writes into its stdin fail (EPIPE). Without
  // a listener that stream error would be thrown as an uncaught exception; the
  // failure itself is reported by the recognizer's "exit" handler below.
  sttProc.stdin.on("error", () => {});
  recProc.stdout.pipe(sttProc.stdin);

  recProc.on("exit", (code, signal) => {
    const expected = signal === "SIGINT" || signal === "SIGKILL" || code === 0 || code === null;
    if (!expected && recErr.trim()) {
      ctx.ui.notify(`Errore registrazione: ${recErr.trim()}`, "error");
    }
  });
  sttProc.on("exit", (code, signal) => {
    const expected = signal === null && (code === 0 || code === null);
    if (!expected && sttErr.trim()) {
      ctx.ui.notify(`Errore Vosk: ${sttErr.trim()}`, "error");
    }
    // Nothing consumes the audio any more: stop capturing if `rec` is still alive.
    if (recProc.exitCode === null && recProc.signalCode === null) {
      try { recProc.kill("SIGINT"); } catch {}
    }
    // A regular stop clears `recording` before the processes exit, so still
    // being the active session here means the pipeline ended on its own.
    if (recording === current) {
      recording = null;
      stopSpinner(ctx);
    }
  });

  recording = current;
  playCue("start");
  startSpinner(ctx);
}
/**
 * Stops the active recording session, if any, and delivers or discards its transcript.
 *
 * @param ctx Extension context used for editor, status and idle queries.
 * @param pi Extension API used to send the transcript in "send" mode.
 * @param runTranscription When true the recognized text is delivered (written
 *   to the editor in paste mode, sent as a user message otherwise). When false
 *   the session is cancelled and further recognizer output is ignored.
 * @param preserveEditorTextOnInterrupt On cancellation in paste mode: restore
 *   the editor text as it was when the cancel was requested instead of the
 *   text from before the recording started.
 */
async function stopRecording(
  ctx: ExtensionContext,
  pi: ExtensionAPI,
  runTranscription: boolean,
  preserveEditorTextOnInterrupt = false,
) {
  if (!recording) return;
  lastStopTime = Date.now();
  const current = recording;
  recording = null;

  if (!runTranscription) {
    current.ignoreIncoming = true;
    // Cancel pending display work immediately. Otherwise the typing animation
    // keeps writing into the editor while the processes shut down, even though
    // the user has already cancelled. With ignoreIncoming set, no new timers
    // get scheduled by recognizer output.
    if (current.partialDebounceTimer) {
      clearTimeout(current.partialDebounceTimer);
      current.partialDebounceTimer = null;
    }
    if (current.typingTimer) {
      clearTimeout(current.typingTimer);
      current.typingTimer = null;
    }
    if (preserveEditorTextOnInterrupt && config.mode === "paste") {
      current.interruptedEditorText = ctx.ui.getEditorText();
    }
  }

  playCue("stop");
  stopSpinner(ctx);

  // Ending `rec` closes its stdout, which ends the recognizer's stdin and lets
  // it exit on its own; wait for both before reading the final transcript.
  try { current.recProc.kill("SIGINT"); } catch {}
  await current.recDone;
  await current.sttDone;

  if (!runTranscription) {
    if (preserveEditorTextOnInterrupt && config.mode === "paste" && current.interruptedEditorText !== null) {
      ctx.ui.setEditorText(current.interruptedEditorText);
    } else if (config.mode === "paste") {
      ctx.ui.setEditorText(current.baseText);
    }
    return;
  }

  const finalText = `${current.committedText}${current.partialText ? `${current.committedText ? " " : ""}${current.partialText}` : ""}`.trim();
  if (!finalText) {
    setStatus(ctx, "no speech detected");
    setTimeout(() => setStatus(ctx, undefined), 2000);
    return;
  }

  if (config.mode === "paste") {
    flushTypedText(ctx, current);
    setStatus(ctx, undefined);
    return;
  }

  setStatus(ctx, "sending…");
  // While the agent is busy, the message is delivered with deliverAs "followUp".
  if (ctx.isIdle()) pi.sendUserMessage(finalText);
  else pi.sendUserMessage(finalText, { deliverAs: "followUp" });
  setStatus(ctx, undefined);
}
/**
 * Extension entry point. Loads the configuration, installs the input handler
 * on session start, cleans up on session shutdown, and registers the shortcut
 * and the `/voice` command.
 */
export default function (pi: ExtensionAPI) {
config = loadConfig(process.cwd());
pi.on("session_start", (_event, ctx) => {
config = loadConfig(process.cwd());
ensureProtoPatch();
const keyId = config.shortcut.toLowerCase() as KeyId;
// The handler's return value tells the patched editor whether the input was
// consumed (true) or should still reach the original handleInput (false).
Reflect.set(globalThis, VOICE_HANDLER_SYMBOL, (data: string): boolean => {
// Enter while recording cancels the session; returning false lets the editor
// process the Enter key as usual.
if (recording && matchesKey(data, Key.enter)) {
keyPressed = false;
void stopRecording(ctx, pi, false, true);
return false;
}
// Any other key typed while recording in paste mode: after the editor has
// handled it (next tick), adopt the edited text as the new base if it differs
// from what the typing animation last displayed.
if (recording && config.mode === "paste" && !isKeyRelease(data) && !matchesKey(data, keyId)) {
const current = recording;
setTimeout(() => {
if (!recording || recording !== current) return;
const editorText = ctx.ui.getEditorText();
if (editorText !== current.displayedText) {
reanchorToEditorText(current, editorText);
}
}, 0);
}
if (!matchesKey(data, keyId)) return false;
// The shortcut is consumed even when the extension is disabled.
if (!config.enabled) return true;
// NOTE(review): presumably key-release events are only reported while the
// Kitty keyboard protocol is active, so without it the shortcut toggles
// recording instead of working as hold-to-talk; confirm against pi-tui.
if (!isKittyProtocolActive()) {
if (recording) {
keyPressed = false;
void stopRecording(ctx, pi, true);
} else {
keyPressed = true;
startRecording(ctx);
}
return true;
}
const released = isKeyRelease(data);
if (released) {
// Releasing the held shortcut ends the recording and delivers the transcript.
if (keyPressed) {
keyPressed = false;
void stopRecording(ctx, pi, true);
}
return true;
}
// A press while a session is active stops it.
if (recording) {
keyPressed = false;
void stopRecording(ctx, pi, true);
return true;
}
// Further press events while the key is already marked as held are ignored.
if (keyPressed) return true;
keyPressed = true;
startRecording(ctx);
return true;
});
if (!fs.existsSync(config.modelPath)) {
ctx.ui.notify(
`Vosk model non trovato: ${config.modelPath}\nUsa /voice download-model per istruzioni.`,
"warning",
);
}
});
pi.on("session_shutdown", (_event, ctx) => {
// Remove the global handler and tear down any in-flight session without
// waiting for the processes to finish.
Reflect.deleteProperty(globalThis, VOICE_HANDLER_SYMBOL);
keyPressed = false;
stopSpinner(ctx);
if (recording) {
if (recording.partialDebounceTimer) {
clearTimeout(recording.partialDebounceTimer);
recording.partialDebounceTimer = null;
}
if (recording.typingTimer) {
clearTimeout(recording.typingTimer);
recording.typingTimer = null;
}
try { recording.recProc.kill("SIGKILL"); } catch {}
try { recording.sttProc.kill("SIGKILL"); } catch {}
recording = null;
}
});
// The shortcut is registered with the value loaded at extension load time;
// its handler only shows a usage hint.
pi.registerShortcut(config.shortcut.toLowerCase() as KeyId, {
description: "Hold-to-talk con Vosk live (riavvia pi se cambi shortcut)",
handler: async (ctx) => {
ctx.ui.notify("Tieni premuto il tasto per registrare, rilascia per inviare", "info");
},
});
pi.registerCommand("voice", {
description: "Controlla Vosk voice (status/config/set/stop/download-model)",
handler: async (args, ctx) => {
// Re-read the config file so the command reflects edits made on disk.
config = loadConfig(process.cwd());
const trimmed = args.trim();
// No arguments means "status".
const [actionRaw, ...rest] = trimmed ? trimmed.split(/\s+/) : ["status"];
const action = (actionRaw ?? "status").toLowerCase();
// `/voice stop`: cancel the active recording without delivering the transcript.
if (action === "stop") {
if (!recording) {
ctx.ui.notify("Nessuna registrazione attiva", "info");
return;
}
keyPressed = false;
await stopRecording(ctx, pi, false, true);
ctx.ui.notify("Registrazione annullata", "info");
return;
}
// `/voice config`: show the effective configuration and where it was read from.
if (action === "config") {
ctx.ui.notify(
[
`configPath: ${getReadableConfigPath(process.cwd())}`,
`shortcut: ${config.shortcut}`,
`enabled: ${config.enabled}`,
`mode: ${config.mode}`,
`modelPath: ${config.modelPath}`,
`sounds: ${config.soundsEnabled}`,
`live: true`,
`modelUrl: ${MODEL_URL}`,
].join("\n"),
"info",
);
return;
}
// `/voice download-model`: only prints the shell commands; nothing is downloaded here.
if (action === "download-model") {
const targetDir = path.dirname(config.modelPath);
const zipPath = `${config.modelPath}.zip`;
ctx.ui.notify(
[
"Modello consigliato dal web:",
"- vosk-model-it-0.22",
"- modello italiano large/server (~1.2G)",
`URL: ${MODEL_URL}`,
"",
"Comandi:",
`mkdir -p ${targetDir}`,
`curl -L ${MODEL_URL} -o ${zipPath}`,
`unzip -q ${zipPath} -d ${targetDir}`,
`rm -f ${zipPath}`,
].join("\n"),
"info",
);
return;
}
// `/voice set <field> <value>`: persist one setting to the project-local config file.
if (action === "set") {
if (rest.length < 2) {
ctx.ui.notify("Uso: /voice set <shortcut|enabled|mode|modelPath|sounds> <valore>", "warning");
return;
}
// The field name is matched case-insensitively; the value keeps its inner spacing collapsed to single spaces.
const field = rest[0]!.toLowerCase();
const value = rest.slice(1).join(" ");
if (field === "shortcut") {
updateConfig(process.cwd(), { shortcut: value.toLowerCase() });
ctx.ui.notify("Shortcut aggiornata. Riavvia pi per applicarla.", "info");
return;
}
if (field === "enabled") {
const parsed = parseBooleanish(value);
if (parsed === undefined) {
ctx.ui.notify("enabled deve essere true/false", "warning");
return;
}
updateConfig(process.cwd(), { enabled: parsed });
ctx.ui.notify(`Voice enabled=${parsed}`, "info");
return;
}
if (field === "mode") {
// The mode value is compared case-sensitively.
if (value !== "send" && value !== "paste") {
ctx.ui.notify("mode deve essere send o paste", "warning");
return;
}
updateConfig(process.cwd(), { mode: value as VoiceMode });
ctx.ui.notify(`Mode impostata a ${value}`, "info");
return;
}
if (field === "modelpath") {
updateConfig(process.cwd(), { modelPath: value });
ctx.ui.notify(`Model path aggiornato: ${value}`, "info");
return;
}
if (field === "sounds") {
const parsed = parseBooleanish(value);
if (parsed === undefined) {
ctx.ui.notify("sounds deve essere true/false", "warning");
return;
}
updateConfig(process.cwd(), { soundsEnabled: parsed });
ctx.ui.notify(`Sounds=${parsed}`, "info");
return;
}
ctx.ui.notify("Campo sconosciuto. Usa shortcut, enabled, mode, modelPath, sounds", "warning");
return;
}
// Any other action (including "status") reports the current state.
const state = recording ? "🔴 recording" : "idle";
ctx.ui.notify(
`vosk-voice: ${state} (${config.mode}, enabled=${config.enabled}, shortcut=${config.shortcut}, live=true)`,
"info",
);
},
});
}

61
extensions/vosk_daemon.py

@ -0,0 +1,61 @@
#!/usr/bin/env python3
import json
import sys
from pathlib import Path
try:
from vosk import Model, KaldiRecognizer, SetLogLevel
except Exception as e:
print(json.dumps({"type": "fatal", "error": f"Python package 'vosk' non disponibile: {e}"}), flush=True)
raise SystemExit(1)
def emit(obj: dict):
    """Write `obj` to stdout as one JSON line (non-ASCII kept as-is) and flush immediately."""
    line = json.dumps(obj, ensure_ascii=False)
    sys.stdout.write(line + "\n")
    sys.stdout.flush()
# Streams audio from stdin through a Vosk recognizer and reports results on
# stdout as JSON lines: {"type": "ready"}, {"type": "partial", "text": ...},
# {"type": "final", "text": ...} and {"type": "fatal", "error": ...}.
# Expects the model path as the first command-line argument.
# Returns the process exit status: 0 on normal end of input, 1 on a usage or
# missing-model error.
def main() -> int:
if len(sys.argv) < 2:
emit({"type": "fatal", "error": "Uso: vosk_daemon.py <model_path>"})
return 1
model_path = Path(sys.argv[1]).expanduser()
if not model_path.exists():
emit({"type": "fatal", "error": f"Modello Vosk non trovato: {model_path}"})
return 1
# NOTE(review): -1 presumably silences Vosk's own logging - confirm against the Vosk API.
SetLogLevel(-1)
model = Model(str(model_path))
# NOTE(review): 16000 is presumably the sample rate in Hz and must match the audio fed on stdin - confirm.
rec = KaldiRecognizer(model, 16000)
# Last partial text that was emitted, used to avoid repeating identical partials.
last_partial = ""
emit({"type": "ready"})
while True:
# Read raw bytes in chunks of up to 4000 bytes; an empty read means end of input.
data = sys.stdin.buffer.read(4000)
if not data:
break
# AcceptWaveform returning true is treated as "a final result is available";
# otherwise the current partial hypothesis is queried.
if rec.AcceptWaveform(data):
result = json.loads(rec.Result())
text = (result.get("text") or "").strip()
if text:
emit({"type": "final", "text": text})
last_partial = ""
else:
result = json.loads(rec.PartialResult())
partial = (result.get("partial") or "").strip()
# Only report a partial when it differs from the previous one.
if partial != last_partial:
last_partial = partial
emit({"type": "partial", "text": partial})
# After end of input, fetch whatever the recognizer still holds and report it if non-empty.
final = json.loads(rec.FinalResult())
text = (final.get("text") or "").strip()
if text:
emit({"type": "final", "text": text})
return 0
# When executed as a script, run main() and use its return value as the exit status.
if __name__ == "__main__":
raise SystemExit(main())

15
package.json

@ -0,0 +1,15 @@
{
"name": "pi-vosk-voice",
"version": "0.1.0",
"private": true,
"type": "module",
"keywords": ["pi-package", "vosk", "voice", "speech-to-text"],
"description": "Global Vosk push-to-talk extension for Pi coding agent with live transcription in the editor.",
"pi": {
"extensions": ["./extensions"]
},
"peerDependencies": {
"@mariozechner/pi-coding-agent": "*",
"@mariozechner/pi-tui": "*"
}
}
Loading…
Cancel
Save