commit
896530a0e8
5 changed files with 780 additions and 0 deletions
@ -0,0 +1,60 @@
|
||||
# pi-vosk-voice |
||||
|
||||
Push-to-talk voice input for Pi using Vosk offline speech recognition. |
||||
|
||||
## Features |
||||
|
||||
- Hold-to-talk recording |
||||
- Live transcription while speaking |
||||
- Partial stabilization on pauses |
||||
- Writes into the Pi editor in `paste` mode |
||||
- Optional `send` mode |
||||
- Start/stop audio cues via SoX |
||||
- Global or project-local Pi configuration |
||||
|
||||
## Requirements |
||||
|
||||
- `sox` (`rec` and `play`) |
||||
- Python 3 |
||||
- `vosk` Python package |
||||
- A downloaded Vosk model, e.g. `vosk-model-it-0.22` |
||||
|
||||
## Configuration |
||||
|
||||
Put this in `~/.pi/vosk-voice.json` (global) or `.pi/vosk-voice.json` (project-local; when this file exists it is used instead of the global one): |
||||
|
||||
```json |
||||
{ |
||||
"shortcut": "f12", |
||||
"enabled": true, |
||||
"mode": "paste", |
||||
"modelPath": "/home/you/.cache/vosk/vosk-model-it-0.22", |
||||
"soundsEnabled": true |
||||
} |
||||
``` |
||||
|
||||
## Install in Pi |
||||
|
||||
```bash |
||||
pi install /path/to/pi-vosk-voice |
||||
``` |
||||
|
||||
Then reload Pi: |
||||
|
||||
```text |
||||
/reload |
||||
``` |
||||
|
||||
## Commands |
||||
|
||||
- `/voice` |
||||
- `/voice config` |
||||
- `/voice stop` |
||||
- `/voice download-model` |
||||
- `/voice set shortcut ctrl+t` |
||||
- `/voice set mode paste` |
||||
- `/voice set sounds false` |
||||
|
||||
## Notes |
||||
|
||||
The extension streams microphone audio from `rec` into a Vosk recognizer and uses its partial and final results to update the editor progressively while you speak (in `paste` mode). |
||||
@ -0,0 +1,641 @@
|
||||
import { CustomEditor, type ExtensionAPI, type ExtensionContext } from "@mariozechner/pi-coding-agent"; |
||||
import { Key, isKeyRelease, isKittyProtocolActive, matchesKey, type KeyId } from "@mariozechner/pi-tui"; |
||||
import fs from "node:fs"; |
||||
import os from "node:os"; |
||||
import path from "node:path"; |
||||
import { spawn } from "node:child_process"; |
||||
import { fileURLToPath } from "node:url"; |
||||
|
||||
/** How a finished transcript is delivered: sent as a user message or written into the editor. */
type VoiceMode = "send" | "paste";

/** Settings read from and written to `vosk-voice.json`. */
interface VoskVoiceConfig {
  /** Key identifier that starts/stops recording (stored lower-cased). */
  shortcut: string;
  /** When false the shortcut is swallowed but no recording is started. */
  enabled: boolean;
  mode: VoiceMode;
  /** Directory of the Vosk model passed to the Python streamer. */
  modelPath: string;
  /** Whether start/stop cues are played through `play`. */
  soundsEnabled: boolean;
}

/** Child process handle as returned by `spawn`. */
type SpawnedProcess = ReturnType<typeof spawn>;

/** Timer handle as returned by `setTimeout`. */
type TimeoutHandle = ReturnType<typeof setTimeout>;

/** Everything that belongs to one in-flight recording. */
interface ActiveRecording {
  /** Editor text the transcript is appended to. */
  baseText: string;
  /** Recognizer "final" results received so far, joined with single spaces. */
  committedText: string;
  /** Partial result currently reflected in the target text. */
  partialText: string;
  /** Most recent partial result, waiting for the debounce timer to apply it. */
  pendingPartialText: string;
  partialDebounceTimer: TimeoutHandle | null;
  /** Text most recently written to the editor by the typing animation. */
  displayedText: string;
  /** Text the typing animation is converging on. */
  targetText: string;
  typingTimer: TimeoutHandle | null;
  /** When true, recognizer messages are parsed but discarded. */
  ignoreIncoming: boolean;
  /** Editor snapshot taken when a recording is cancelled and the text must be kept. */
  interruptedEditorText: string | null;
  recProc: SpawnedProcess;
  sttProc: SpawnedProcess;
  /** Resolves when `recProc` emits "close". */
  recDone: Promise<void>;
  /** Resolves when `sttProc` emits "close". */
  sttDone: Promise<void>;
}

/** One JSON line emitted by the Python streamer on stdout. */
interface StreamMessage {
  type: "ready" | "partial" | "final" | "fatal";
  text?: string;
  error?: string;
}
||||
|
||||
/** Key under which this extension publishes its status-line text. */
const STATUS_KEY = "vosk-voice";

/** Animation frames shown in the status line while recording. */
const SPINNER = ["⠁", "⠂", "⠄", "⠂"];

/** Global-registry symbol under which the active input handler is stored on `globalThis`. */
const VOICE_HANDLER_SYMBOL = Symbol.for("vosk-voice:handleInput");

/** Minimum time after a stop before a new recording may start. */
const START_COOLDOWN_MS = 400;

/** Delay before a received partial result is applied to the editor text. */
const PARTIAL_DEBOUNCE_MS = 120;

/** Values used when no config file exists or it cannot be parsed. */
const DEFAULT_CONFIG: VoskVoiceConfig = {
  shortcut: "f12",
  enabled: true,
  mode: "paste",
  modelPath: path.join(os.homedir(), ".cache", "vosk", "vosk-model-it-0.22"),
  soundsEnabled: true,
};

/** Download location shown by `/voice config` and `/voice download-model`. */
const MODEL_URL = "https://alphacephei.com/vosk/models/vosk-model-it-0.22.zip";

// ES modules have no __filename/__dirname; derive them to locate the Python streamer
// that sits next to this file.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const STREAMER_PATH = path.join(__dirname, "vosk_daemon.py");

// ---- Module-level mutable state ----

/** True once CustomEditor.prototype.handleInput has been wrapped. */
let protoPatchApplied = false;

/** Currently effective configuration. */
let config: VoskVoiceConfig = DEFAULT_CONFIG;

/** Running frame counter for the spinner. */
let spinnerFrame = 0;
let spinnerTimer: ReturnType<typeof setInterval> | null = null;

/** Tracks whether the shortcut is considered held down. */
let keyPressed = false;

/** Timestamp (ms) of the most recent stop, used for the start cooldown. */
let lastStopTime = 0;

/** The in-flight recording, or null when idle. */
let recording: ActiveRecording | null = null;
||||
|
||||
/** Path of the project-local config file below `cwd`. */
function getProjectConfigPath(cwd: string): string {
  const projectDir = path.join(cwd, ".pi");
  return path.join(projectDir, "vosk-voice.json");
}

/** Path of the per-user config file in the home directory. */
function getGlobalConfigPath(): string {
  const globalDir = path.join(os.homedir(), ".pi");
  return path.join(globalDir, "vosk-voice.json");
}

/**
 * Path the configuration is read from: the project-local file when it exists,
 * otherwise the global one (which may itself be missing).
 */
function getReadableConfigPath(cwd: string): string {
  const localPath = getProjectConfigPath(cwd);
  return fs.existsSync(localPath) ? localPath : getGlobalConfigPath();
}

/** Path configuration changes are written to; always the project-local file. */
function getEditableConfigPath(cwd: string): string {
  return getProjectConfigPath(cwd);
}

/** Creates the directory that holds the editable config file if it is missing. */
function ensureConfigDir(cwd: string): void {
  const configDir = path.dirname(getEditableConfigPath(cwd));
  fs.mkdirSync(configDir, { recursive: true });
}
||||
|
||||
/**
 * Loads the effective configuration for `cwd`.
 *
 * The project-local file wins over the global one. A missing, unreadable or
 * malformed file yields a copy of the defaults. Each field is validated on its
 * own, so one bad value (for example `"enabled": "false"`) falls back to its
 * default instead of leaking a wrongly typed value into the rest of the extension.
 *
 * @param cwd Directory whose `.pi/vosk-voice.json` is consulted first.
 * @returns A fresh config object; never the shared `DEFAULT_CONFIG` instance.
 */
function loadConfig(cwd: string): VoskVoiceConfig {
  const candidate = getReadableConfigPath(cwd);

  let parsed: unknown;
  try {
    // Read directly instead of checking existence first: avoids a check-then-read race.
    parsed = JSON.parse(fs.readFileSync(candidate, "utf8"));
  } catch {
    return { ...DEFAULT_CONFIG };
  }

  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    return { ...DEFAULT_CONFIG };
  }
  const fields = parsed as Record<string, unknown>;

  const nonEmptyString = (value: unknown): string | undefined =>
    typeof value === "string" && value.length > 0 ? value : undefined;
  const bool = (value: unknown): boolean | undefined =>
    typeof value === "boolean" ? value : undefined;

  return {
    shortcut: (nonEmptyString(fields.shortcut) ?? DEFAULT_CONFIG.shortcut).toLowerCase(),
    enabled: bool(fields.enabled) ?? DEFAULT_CONFIG.enabled,
    // Anything other than the literal "send" means paste, as before.
    mode: fields.mode === "send" ? "send" : "paste",
    modelPath: nonEmptyString(fields.modelPath) ?? DEFAULT_CONFIG.modelPath,
    soundsEnabled: bool(fields.soundsEnabled) ?? DEFAULT_CONFIG.soundsEnabled,
  };
}
||||
|
||||
/**
 * Merges `patch` into the current configuration, writes the result to the
 * project-local config file and makes it the active configuration.
 *
 * @returns The merged configuration that was written.
 */
function updateConfig(cwd: string, patch: Partial<VoskVoiceConfig>): VoskVoiceConfig {
  const merged = { ...config, ...patch };

  ensureConfigDir(cwd);
  const serialized = `${JSON.stringify(merged, null, 2)}\n`;
  fs.writeFileSync(getEditableConfigPath(cwd), serialized);

  // Only swap the active config once the file has been written.
  config = merged;
  return merged;
}
||||
|
||||
/** Sets (or, with no text, clears) this extension's status-line entry. */
function setStatus(ctx: ExtensionContext, text?: string): void {
  ctx.ui.setStatus(STATUS_KEY, text);
}

/** Renders the next spinner frame into the status line. */
function spin(ctx: ExtensionContext): void {
  const glyph = SPINNER[spinnerFrame % SPINNER.length]!;
  spinnerFrame += 1;
  setStatus(ctx, `${glyph} recording`);
}

/** Restarts the recording spinner: shows a frame immediately, then one every 250 ms. */
function startSpinner(ctx: ExtensionContext): void {
  stopSpinner(ctx);
  spin(ctx);
  spinnerTimer = setInterval(() => {
    spin(ctx);
  }, 250);
}

/** Stops the spinner if it is running and clears the status-line entry. */
function stopSpinner(ctx: ExtensionContext): void {
  if (spinnerTimer !== null) {
    clearInterval(spinnerTimer);
    spinnerTimer = null;
  }
  setStatus(ctx, undefined);
}
||||
|
||||
/**
 * Interprets a user-typed on/off value.
 *
 * Matching ignores case and surrounding whitespace.
 *
 * @returns `true` or `false` for a recognized word, `undefined` otherwise.
 */
function parseBooleanish(input: string): boolean | undefined {
  switch (input.trim().toLowerCase()) {
    case "1":
    case "true":
    case "on":
    case "yes":
    case "enabled":
      return true;
    case "0":
    case "false":
    case "off":
    case "no":
    case "disabled":
      return false;
    default:
      return undefined;
  }
}
||||
|
||||
/**
 * Plays a short synthesized tone through `play` to mark the start or end of a
 * recording. Does nothing when sounds are disabled; failures to launch `play`
 * are ignored so a missing binary never disturbs recording.
 */
function playCue(kind: "start" | "stop") {
  if (!config.soundsEnabled) return;

  let synthArgs: string[];
  if (kind === "start") {
    synthArgs = ["-q", "-n", "synth", "0.05", "sine", "880", "fade", "q", "0.01", "0.05", "0.02"];
  } else {
    synthArgs = ["-q", "-n", "synth", "0.04", "sine", "660:440", "fade", "q", "0.005", "0.04", "0.02"];
  }

  // Detached and unref'd: the tone is fire-and-forget and must not hold the host process open.
  const player = spawn("play", synthArgs, { stdio: "ignore", detached: true });
  player.on("error", () => {});
  player.unref();
}
||||
|
||||
/**
 * Wraps `CustomEditor.prototype.handleInput` once so that the handler stored on
 * `globalThis` under `VOICE_HANDLER_SYMBOL` sees every input first. When that
 * handler returns true the input is consumed; otherwise (or when no handler is
 * installed) the original method runs.
 */
function ensureProtoPatch() {
  if (protoPatchApplied) return;
  protoPatchApplied = true;

  const editorProto = CustomEditor.prototype;
  const inheritedHandleInput = editorProto.handleInput as (data: string) => void;

  editorProto.handleInput = function patchedHandleInput(data: string): void {
    // Looked up on every call so the handler can be swapped or removed per session.
    const voiceHandler = Reflect.get(globalThis, VOICE_HANDLER_SYMBOL) as
      | ((data: string) => boolean)
      | undefined;
    const consumed = voiceHandler ? voiceHandler(data) : false;
    if (consumed) return;
    inheritedHandleInput.call(this, data);
  };
}
||||
|
||||
/**
 * Builds the full editor text from the text that was already there and what has
 * been spoken.
 *
 * Committed and partial text are joined with one space and trimmed. The result
 * is appended to `baseText`, adding a separating space unless `baseText`
 * already ends in a space or newline.
 */
function joinTranscript(baseText: string, committedText: string, partialText: string) {
  const spokenParts = [committedText, partialText].filter((part) => part.length > 0);
  const spoken = spokenParts.join(" ").trim();

  if (spoken.length === 0) return baseText;
  if (baseText.length === 0) return spoken;

  const lastChar = baseText.charAt(baseText.length - 1);
  const needsSeparator = lastChar !== " " && lastChar !== "\n";
  return needsSeparator ? `${baseText} ${spoken}` : `${baseText}${spoken}`;
}
||||
|
||||
/**
 * Records the latest partial result and (re)starts the debounce timer; only
 * when no newer partial arrives within `PARTIAL_DEBOUNCE_MS` is it applied and
 * typed into the editor.
 */
function schedulePartialUpdate(ctx: ExtensionContext, current: ActiveRecording, text: string) {
  current.pendingPartialText = text.trim();

  if (current.partialDebounceTimer !== null) {
    clearTimeout(current.partialDebounceTimer);
  }

  const applyPending = () => {
    current.partialDebounceTimer = null;
    current.partialText = current.pendingPartialText;
    scheduleTyping(ctx, current);
  };
  current.partialDebounceTimer = setTimeout(applyPending, PARTIAL_DEBOUNCE_MS);
}
||||
|
||||
/**
 * Recomputes the target text and makes sure the typing animation is running
 * (paste mode only).
 *
 * Every 12 ms the animation appends one character while the target merely
 * extends what is displayed. If the target diverges from the displayed text it
 * is written in one step. The animation stops once both are equal.
 */
function scheduleTyping(ctx: ExtensionContext, current: ActiveRecording) {
  if (config.mode !== "paste") return;

  const stepDelayMs = 12;
  current.targetText = joinTranscript(current.baseText, current.committedText, current.partialText);

  function advance(): void {
    current.typingTimer = null;

    const shown = current.displayedText;
    const goal = current.targetText;
    if (shown === goal) return;

    if (!goal.startsWith(shown)) {
      // The transcript was revised: replace rather than animate.
      current.displayedText = goal;
      ctx.ui.setEditorText(current.displayedText);
      return;
    }

    current.displayedText = shown + goal.charAt(shown.length);
    ctx.ui.setEditorText(current.displayedText);
    current.typingTimer = setTimeout(advance, stepDelayMs);
  }

  // A running animation picks up the new target on its next step.
  if (!current.typingTimer) {
    current.typingTimer = setTimeout(advance, stepDelayMs);
  }
}
||||
|
||||
/**
 * Brings the editor to its final state immediately: applies a partial that is
 * still waiting in the debounce window, cancels the typing animation, and (in
 * paste mode) writes the complete target text to the editor.
 */
function flushTypedText(ctx: ExtensionContext, current: ActiveRecording) {
  const debounce = current.partialDebounceTimer;
  if (debounce) {
    clearTimeout(debounce);
    current.partialDebounceTimer = null;
    // The pending partial would have been applied when the timer fired.
    current.partialText = current.pendingPartialText;
  }

  const typing = current.typingTimer;
  if (typing) {
    clearTimeout(typing);
    current.typingTimer = null;
  }

  const finalTarget = joinTranscript(current.baseText, current.committedText, current.partialText);
  current.targetText = finalTarget;
  current.displayedText = finalTarget;

  if (config.mode === "paste") {
    ctx.ui.setEditorText(current.displayedText);
  }
}
||||
|
||||
/**
 * Appends a final recognizer result to the committed transcript and discards
 * any partial state it supersedes. Blank results are ignored entirely.
 */
function appendCommitted(current: ActiveRecording, text: string) {
  const finalized = text.trim();
  if (finalized.length === 0) return;

  current.committedText = [current.committedText, finalized]
    .filter((segment) => segment.length > 0)
    .join(" ");

  // The final result replaces whatever partial was shown or still pending.
  current.partialText = "";
  current.pendingPartialText = "";

  const debounce = current.partialDebounceTimer;
  if (debounce) {
    clearTimeout(debounce);
    current.partialDebounceTimer = null;
  }
}
||||
|
||||
/**
 * Restarts the transcript from the given editor text: cancels pending timers,
 * makes `editorText` the new base, and drops everything committed or partial so
 * far so that later results are appended after it.
 */
function reanchorToEditorText(current: ActiveRecording, editorText: string) {
  if (current.partialDebounceTimer !== null) {
    clearTimeout(current.partialDebounceTimer);
    current.partialDebounceTimer = null;
  }
  if (current.typingTimer !== null) {
    clearTimeout(current.typingTimer);
    current.typingTimer = null;
  }

  Object.assign(current, {
    baseText: editorText,
    displayedText: editorText,
    targetText: editorText,
    committedText: "",
    partialText: "",
    pendingPartialText: "",
  });
}
||||
|
||||
/**
 * Starts a recording: launches `rec` (16 kHz, 16-bit, mono, signed raw PCM on
 * stdout) and the Python Vosk streamer, pipes the audio into the streamer, and
 * feeds the streamer's JSON lines into the transcript state.
 *
 * Does nothing during the post-stop cooldown, while a recording is already
 * active, or when the configured model path does not exist.
 *
 * If the recognizer goes away while the recording is still active (the package
 * or `python3` is missing, the model fails to load, it crashes), the microphone
 * capture is stopped and the recording state is reset, so the UI does not stay
 * in "recording" with nothing listening.
 */
function startRecording(ctx: ExtensionContext) {
  if (Date.now() - lastStopTime < START_COOLDOWN_MS) return;
  if (recording) return;
  if (!fs.existsSync(config.modelPath)) {
    ctx.ui.notify(`Modello Vosk non trovato: ${config.modelPath}`, "error");
    return;
  }

  const recProc = spawn(
    "rec",
    ["-q", "-t", "raw", "-r", "16000", "-b", "16", "-c", "1", "-e", "signed-integer", "-"],
    { stdio: ["ignore", "pipe", "pipe"] },
  );

  const sttProc = spawn("python3", [STREAMER_PATH, config.modelPath], {
    stdio: ["pipe", "pipe", "pipe"],
  });

  const micOut = recProc.stdout;
  const micErr = recProc.stderr;
  const sttIn = sttProc.stdin;
  const sttOut = sttProc.stdout;
  const sttErrOut = sttProc.stderr;

  if (!micOut || !micErr || !sttIn || !sttOut || !sttErrOut) {
    ctx.ui.notify("Impossibile avviare la pipeline audio", "error");
    try { recProc.kill("SIGKILL"); } catch {}
    try { sttProc.kill("SIGKILL"); } catch {}
    return;
  }

  // Read the editor once so base, displayed and target text cannot disagree.
  const initialText = config.mode === "paste" ? ctx.ui.getEditorText() : "";

  const current: ActiveRecording = {
    baseText: initialText,
    committedText: "",
    partialText: "",
    pendingPartialText: "",
    partialDebounceTimer: null,
    displayedText: initialText,
    targetText: initialText,
    typingTimer: null,
    ignoreIncoming: false,
    interruptedEditorText: null,
    recProc,
    sttProc,
    recDone: new Promise((resolve) => recProc.once("close", () => resolve())),
    sttDone: new Promise((resolve) => sttProc.once("close", () => resolve())),
  };

  // Called when the recognizer process is gone (failed to start or exited).
  const onRecognizerGone = () => {
    // Nothing consumes the audio any more; stop capturing. Harmless if `rec` already exited.
    try { recProc.kill("SIGINT"); } catch {}

    // A regular stop or shutdown has already taken over this recording.
    if (recording !== current) return;

    recording = null;
    lastStopTime = Date.now();
    if (current.partialDebounceTimer) {
      clearTimeout(current.partialDebounceTimer);
      current.partialDebounceTimer = null;
    }
    if (current.typingTimer) {
      clearTimeout(current.typingTimer);
      current.typingTimer = null;
    }
    stopSpinner(ctx);
  };

  let recErr = "";
  micErr.setEncoding("utf8");
  micErr.on("data", (chunk: string) => {
    recErr += chunk;
  });

  let sttErr = "";
  sttErrOut.setEncoding("utf8");
  sttErrOut.on("data", (chunk: string) => {
    sttErr += chunk;
  });

  // The streamer writes one JSON object per line; chunks may split lines anywhere.
  let buffer = "";
  sttOut.setEncoding("utf8");
  sttOut.on("data", (chunk: string) => {
    buffer += chunk;
    while (true) {
      const idx = buffer.indexOf("\n");
      if (idx === -1) break;
      const line = buffer.slice(0, idx).trim();
      buffer = buffer.slice(idx + 1);
      if (!line) continue;
      try {
        const msg = JSON.parse(line) as StreamMessage;
        if (current.ignoreIncoming) {
          continue;
        }
        if (msg.type === "fatal") {
          ctx.ui.notify(msg.error || "Errore Vosk", "error");
          continue;
        }
        if (msg.type === "partial") {
          schedulePartialUpdate(ctx, current, msg.text ?? "");
          continue;
        }
        if (msg.type === "final") {
          appendCommitted(current, msg.text ?? "");
          scheduleTyping(ctx, current);
        }
      } catch {
        // ignore malformed lines
      }
    }
  });

  // Without a listener, a write to a recognizer that already exited would
  // surface as an uncaught stream error and take the host process down.
  // After such an error, keep the microphone stream flowing so `rec` can
  // finish and emit "close".
  sttIn.on("error", () => {
    micOut.resume();
  });

  micOut.pipe(sttIn);

  recProc.on("error", (error) => {
    ctx.ui.notify(`Errore registrazione: ${error.message}`, "error");
  });
  sttProc.on("error", (error) => {
    ctx.ui.notify(`Errore Vosk: ${error.message}`, "error");
    onRecognizerGone();
  });
  recProc.on("exit", (code, signal) => {
    const expected = signal === "SIGINT" || signal === "SIGKILL" || code === 0 || code === null;
    if (!expected && recErr.trim()) {
      ctx.ui.notify(`Errore registrazione: ${recErr.trim()}`, "error");
    }
  });
  sttProc.on("exit", (code, signal) => {
    const expected = signal === null && (code === 0 || code === null);
    if (!expected && sttErr.trim()) {
      ctx.ui.notify(`Errore Vosk: ${sttErr.trim()}`, "error");
    }
    onRecognizerGone();
  });

  recording = current;
  playCue("start");
  startSpinner(ctx);
}
||||
|
||||
/**
 * Stops the active recording and delivers or discards its transcript.
 *
 * The recorder is interrupted with SIGINT and both child processes are awaited,
 * so every recognizer result has been processed before the transcript is read.
 *
 * @param ctx UI context used for status, notifications and editor access.
 * @param pi Extension API used to send the transcript in `send` mode.
 * @param runTranscription When true the transcript is delivered (written to the
 *   editor in `paste` mode, sent as a user message in `send` mode). When false
 *   the recording is cancelled and further recognizer output is ignored.
 * @param preserveEditorTextOnInterrupt Only relevant when cancelling in `paste`
 *   mode: restore the editor to the text it held at the moment of cancellation
 *   instead of the text it held before dictation was appended.
 */
async function stopRecording(
  ctx: ExtensionContext,
  pi: ExtensionAPI,
  runTranscription: boolean,
  preserveEditorTextOnInterrupt = false,
) {
  if (!recording) return;
  lastStopTime = Date.now();
  const current = recording;
  // Cleared before the first await so concurrent callers see the recording as gone.
  recording = null;
  if (!runTranscription) {
    current.ignoreIncoming = true;
    if (preserveEditorTextOnInterrupt && config.mode === "paste") {
      // Snapshot now: the editor may change while the processes shut down.
      current.interruptedEditorText = ctx.ui.getEditorText();
    }
  }
  playCue("stop");
  stopSpinner(ctx);

  try { current.recProc.kill("SIGINT"); } catch {}
  await current.recDone;
  await current.sttDone;

  if (!runTranscription) {
    if (current.partialDebounceTimer) {
      clearTimeout(current.partialDebounceTimer);
      current.partialDebounceTimer = null;
    }
    if (current.typingTimer) {
      clearTimeout(current.typingTimer);
      current.typingTimer = null;
    }
    if (preserveEditorTextOnInterrupt && config.mode === "paste" && current.interruptedEditorText !== null) {
      ctx.ui.setEditorText(current.interruptedEditorText);
    } else if (config.mode === "paste") {
      ctx.ui.setEditorText(current.baseText);
    }
    return;
  }

  // A partial still inside its debounce window has been received but not yet
  // applied. Promote it now; otherwise it would be missing from the transcript
  // (never sent in `send` mode, or reported as "no speech detected").
  if (current.partialDebounceTimer) {
    clearTimeout(current.partialDebounceTimer);
    current.partialDebounceTimer = null;
    current.partialText = current.pendingPartialText;
  }

  // With an empty base this is exactly the spoken text (committed + partial).
  const finalText = joinTranscript("", current.committedText, current.partialText);

  if (!finalText) {
    setStatus(ctx, "no speech detected");
    setTimeout(() => setStatus(ctx, undefined), 2000);
    return;
  }

  if (config.mode === "paste") {
    flushTypedText(ctx, current);
    setStatus(ctx, undefined);
    return;
  }

  setStatus(ctx, "sending…");
  if (ctx.isIdle()) pi.sendUserMessage(finalText);
  else pi.sendUserMessage(finalText, { deliverAs: "followUp" });
  setStatus(ctx, undefined);
}
||||
|
||||
/** The part of a context the command helpers need. */
type UiContext = Pick<ExtensionContext, "ui">;

/**
 * Builds the raw-input handler installed for a session.
 *
 * The handler returns true when it consumed the input (the shortcut itself) and
 * false when the editor should process it as usual.
 */
function createVoiceInputHandler(
  pi: ExtensionAPI,
  ctx: ExtensionContext,
  keyId: KeyId,
): (data: string) => boolean {
  const finishRecording = () => {
    keyPressed = false;
    void stopRecording(ctx, pi, true);
  };
  const beginRecording = () => {
    keyPressed = true;
    startRecording(ctx);
  };

  return (data: string): boolean => {
    // Enter while recording cancels dictation but is still handed to the editor.
    if (recording && matchesKey(data, Key.enter)) {
      keyPressed = false;
      void stopRecording(ctx, pi, false, true);
      return false;
    }

    const isShortcut = matchesKey(data, keyId);

    // Any other key press in paste mode may edit the text. Once the editor has
    // processed it (next tick), re-base the transcript on what is there now.
    if (recording && config.mode === "paste" && !isKeyRelease(data) && !isShortcut) {
      const current = recording;
      setTimeout(() => {
        if (recording !== current) return;
        const editorText = ctx.ui.getEditorText();
        if (editorText !== current.displayedText) {
          reanchorToEditorText(current, editorText);
        }
      }, 0);
    }

    if (!isShortcut) return false;
    if (!config.enabled) return true;

    // Without the kitty protocol the shortcut acts as a start/stop toggle.
    if (!isKittyProtocolActive()) {
      if (recording) finishRecording();
      else beginRecording();
      return true;
    }

    if (isKeyRelease(data)) {
      if (keyPressed) finishRecording();
      return true;
    }

    // A press while already recording also stops.
    if (recording) {
      finishRecording();
      return true;
    }

    // Ignore further press events while the key is already marked as pressed.
    if (keyPressed) return true;
    beginRecording();
    return true;
  };
}

/** Drops the active recording at session shutdown without delivering anything. */
function discardRecordingForShutdown(): void {
  if (!recording) return;
  if (recording.partialDebounceTimer) {
    clearTimeout(recording.partialDebounceTimer);
    recording.partialDebounceTimer = null;
  }
  if (recording.typingTimer) {
    clearTimeout(recording.typingTimer);
    recording.typingTimer = null;
  }
  try { recording.recProc.kill("SIGKILL"); } catch {}
  try { recording.sttProc.kill("SIGKILL"); } catch {}
  recording = null;
}

/** `/voice config`: shows the effective configuration and where it was read from. */
function showConfig(ctx: UiContext): void {
  ctx.ui.notify(
    [
      `configPath: ${getReadableConfigPath(process.cwd())}`,
      `shortcut: ${config.shortcut}`,
      `enabled: ${config.enabled}`,
      `mode: ${config.mode}`,
      `modelPath: ${config.modelPath}`,
      `sounds: ${config.soundsEnabled}`,
      `live: true`,
      `modelUrl: ${MODEL_URL}`,
    ].join("\n"),
    "info",
  );
}

/** `/voice download-model`: prints the shell commands that fetch the model. */
function showDownloadInstructions(ctx: UiContext): void {
  const targetDir = path.dirname(config.modelPath);
  const zipPath = `${config.modelPath}.zip`;
  ctx.ui.notify(
    [
      "Modello consigliato dal web:",
      "- vosk-model-it-0.22",
      "- modello italiano large/server (~1.2G)",
      `URL: ${MODEL_URL}`,
      "",
      "Comandi:",
      `mkdir -p ${targetDir}`,
      `curl -L ${MODEL_URL} -o ${zipPath}`,
      `unzip -q ${zipPath} -d ${targetDir}`,
      `rm -f ${zipPath}`,
    ].join("\n"),
    "info",
  );
}

/**
 * `/voice set <field> <value>`: validates the value and persists it to the
 * project-local config file. `rest` holds the words after `set`.
 */
function applySetCommand(rest: string[], ctx: UiContext): void {
  if (rest.length < 2) {
    ctx.ui.notify("Uso: /voice set <shortcut|enabled|mode|modelPath|sounds> <valore>", "warning");
    return;
  }
  const field = rest[0]!.toLowerCase();
  const value = rest.slice(1).join(" ");
  const cwd = process.cwd();

  switch (field) {
    case "shortcut": {
      updateConfig(cwd, { shortcut: value.toLowerCase() });
      ctx.ui.notify("Shortcut aggiornata. Riavvia pi per applicarla.", "info");
      return;
    }
    case "enabled": {
      const parsed = parseBooleanish(value);
      if (parsed === undefined) {
        ctx.ui.notify("enabled deve essere true/false", "warning");
        return;
      }
      updateConfig(cwd, { enabled: parsed });
      ctx.ui.notify(`Voice enabled=${parsed}`, "info");
      return;
    }
    case "mode": {
      // Case-insensitive, like the field name and the shortcut.
      const mode = value.toLowerCase();
      if (mode !== "send" && mode !== "paste") {
        ctx.ui.notify("mode deve essere send o paste", "warning");
        return;
      }
      updateConfig(cwd, { mode });
      ctx.ui.notify(`Mode impostata a ${mode}`, "info");
      return;
    }
    case "modelpath": {
      updateConfig(cwd, { modelPath: value });
      ctx.ui.notify(`Model path aggiornato: ${value}`, "info");
      return;
    }
    case "sounds": {
      const parsed = parseBooleanish(value);
      if (parsed === undefined) {
        ctx.ui.notify("sounds deve essere true/false", "warning");
        return;
      }
      updateConfig(cwd, { soundsEnabled: parsed });
      ctx.ui.notify(`Sounds=${parsed}`, "info");
      return;
    }
    default:
      ctx.ui.notify("Campo sconosciuto. Usa shortcut, enabled, mode, modelPath, sounds", "warning");
  }
}

/**
 * Extension entry point: loads the configuration, installs the input handler
 * on session start, tears everything down on session shutdown, and registers
 * the shortcut and the `/voice` command.
 */
export default function (pi: ExtensionAPI) {
  config = loadConfig(process.cwd());

  pi.on("session_start", (_event, ctx) => {
    config = loadConfig(process.cwd());
    ensureProtoPatch();
    const keyId = config.shortcut.toLowerCase() as KeyId;

    Reflect.set(globalThis, VOICE_HANDLER_SYMBOL, createVoiceInputHandler(pi, ctx, keyId));

    if (!fs.existsSync(config.modelPath)) {
      ctx.ui.notify(
        `Vosk model non trovato: ${config.modelPath}\nUsa /voice download-model per istruzioni.`,
        "warning",
      );
    }
  });

  pi.on("session_shutdown", (_event, ctx) => {
    Reflect.deleteProperty(globalThis, VOICE_HANDLER_SYMBOL);
    keyPressed = false;
    stopSpinner(ctx);
    discardRecordingForShutdown();
  });

  // Registered with the shortcut from the config loaded above; a later change
  // to the shortcut only takes effect after a restart.
  pi.registerShortcut(config.shortcut.toLowerCase() as KeyId, {
    description: "Hold-to-talk con Vosk live (riavvia pi se cambi shortcut)",
    handler: async (ctx) => {
      ctx.ui.notify("Tieni premuto il tasto per registrare, rilascia per inviare", "info");
    },
  });

  pi.registerCommand("voice", {
    description: "Controlla Vosk voice (status/config/set/stop/download-model)",
    handler: async (args, ctx) => {
      config = loadConfig(process.cwd());
      const trimmed = args.trim();
      const [actionRaw, ...rest] = trimmed ? trimmed.split(/\s+/) : ["status"];
      const action = (actionRaw ?? "status").toLowerCase();

      if (action === "stop") {
        if (!recording) {
          ctx.ui.notify("Nessuna registrazione attiva", "info");
          return;
        }
        keyPressed = false;
        await stopRecording(ctx, pi, false, true);
        ctx.ui.notify("Registrazione annullata", "info");
        return;
      }

      if (action === "config") {
        showConfig(ctx);
        return;
      }

      if (action === "download-model") {
        showDownloadInstructions(ctx);
        return;
      }

      if (action === "set") {
        applySetCommand(rest, ctx);
        return;
      }

      // "status" and any unrecognized action show the one-line status.
      const state = recording ? "🔴 recording" : "idle";
      ctx.ui.notify(
        `vosk-voice: ${state} (${config.mode}, enabled=${config.enabled}, shortcut=${config.shortcut}, live=true)`,
        "info",
      );
    },
  });
}
||||
@ -0,0 +1,61 @@
|
||||
#!/usr/bin/env python3 |
||||
import json |
||||
import sys |
||||
from pathlib import Path |
||||
|
||||
try: |
||||
from vosk import Model, KaldiRecognizer, SetLogLevel |
||||
except Exception as e: |
||||
print(json.dumps({"type": "fatal", "error": f"Python package 'vosk' non disponibile: {e}"}), flush=True) |
||||
raise SystemExit(1) |
||||
|
||||
|
||||
def emit(obj: dict):
    """Write ``obj`` to stdout as a single JSON line and flush immediately.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    line = json.dumps(obj, ensure_ascii=False)
    sys.stdout.write(f"{line}\n")
    sys.stdout.flush()
||||
|
||||
|
||||
def main() -> int:
    """Stream raw PCM audio from stdin through Vosk, emitting JSON lines on stdout.

    Expects the model directory as the first command-line argument and audio at
    the 16000 Hz rate the recognizer is created with. Emits ``ready`` once the
    model is loaded, ``partial`` whenever the partial hypothesis changes,
    ``final`` for each completed utterance, and ``fatal`` before exiting on a
    startup error.

    Returns:
        Process exit status: 0 after stdin reaches EOF, 1 on a startup error.
    """
    if len(sys.argv) < 2:
        emit({"type": "fatal", "error": "Uso: vosk_daemon.py <model_path>"})
        return 1

    model_path = Path(sys.argv[1]).expanduser()
    if not model_path.exists():
        emit({"type": "fatal", "error": f"Modello Vosk non trovato: {model_path}"})
        return 1

    SetLogLevel(-1)
    try:
        model = Model(str(model_path))
        rec = KaldiRecognizer(model, 16000)
    except Exception as e:
        # Report load failures through the same structured channel as the other
        # startup errors instead of dying with a bare traceback.
        emit({"type": "fatal", "error": f"Impossibile caricare il modello Vosk: {e}"})
        return 1

    last_partial = ""

    emit({"type": "ready"})

    while True:
        data = sys.stdin.buffer.read(4000)
        if not data:
            # EOF: the producer closed the pipe.
            break

        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            text = (result.get("text") or "").strip()
            if text:
                emit({"type": "final", "text": text})
            last_partial = ""
        else:
            result = json.loads(rec.PartialResult())
            partial = (result.get("partial") or "").strip()
            # Only report partials that actually changed.
            if partial != last_partial:
                last_partial = partial
                emit({"type": "partial", "text": partial})

    # Flush whatever the recognizer still holds after the audio ended.
    final = json.loads(rec.FinalResult())
    text = (final.get("text") or "").strip()
    if text:
        emit({"type": "final", "text": text})

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
||||
@ -0,0 +1,15 @@
|
||||
{ |
||||
"name": "pi-vosk-voice", |
||||
"version": "0.1.0", |
||||
"private": true, |
||||
"type": "module", |
||||
"keywords": ["pi-package", "vosk", "voice", "speech-to-text"], |
||||
"description": "Global Vosk push-to-talk extension for Pi coding agent with live transcription in the editor.", |
||||
"pi": { |
||||
"extensions": ["./extensions"] |
||||
}, |
||||
"peerDependencies": { |
||||
"@mariozechner/pi-coding-agent": "*", |
||||
"@mariozechner/pi-tui": "*" |
||||
} |
||||
} |
||||
Loading…
Reference in new issue