voicebot demo

2025-10-05 06:34:07 +00:00
parent c2be654e2f
commit 33cab69410
6 changed files with 930 additions and 4 deletions


@@ -1,7 +1,10 @@
<script lang="ts">
import { onDestroy } from "svelte"
import { mdiBookAccountOutline, mdiCreation, mdiFaceAgent, mdiHours24 } from "@mdi/js"
import ProductCategoryFrame from "../widgets/ProductCategoryFrame.svelte"
import CrinkledSection from "../CrinkledSection.svelte"
import { base64ToUint8, createPlayer, createRecorder, SAMPLE_RATE, uint8ToBase64 } from "../voicebotDemo/helper"
import { RealtimeServerEvent as RSE } from "../voicebotDemo/events"
const voiceProperties: Array<{ title: string; icon: string; color: string }> = [
{
@@ -25,6 +28,302 @@
color: "#EB5757",
},
]
const browser = typeof window !== "undefined"
const VOICE_WS_URL =
browser && window.location.protocol === "http:"
? "ws://2svoice-server.kontextwerk.info/api/v1/voicebot/ws"
: "wss://2svoice-server.kontextwerk.info/api/v1/voicebot/ws"
const CHUNK_DURATION_MS = 200
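// 16-bit mono PCM: (SAMPLE_RATE * 200 ms / 1000) samples per chunk, times 2 bytes per sample (9600 bytes at the 24 kHz SAMPLE_RATE)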
const CHUNK_SIZE_BYTES = Math.round((SAMPLE_RATE * CHUNK_DURATION_MS) / 1000) * 2
type VoiceStatus = "idle" | "connecting" | "connected" | "error"
let status: VoiceStatus = "idle"
let errorMessage = ""
let ws: WebSocket | null = null
let recorder: ReturnType<typeof createRecorder> | null = null
let player: ReturnType<typeof createPlayer> | null = null
let outboundBuffer = new Uint8Array(0)
let closing = false
let cleanupPromise: Promise<void> | null = null
let startPromise: Promise<void> | null = null
$: statusHint =
status === "idle"
? "Tippen, um die Voice-Demo zu starten"
: status === "connecting"
? "Verbindung wird aufgebaut …"
: status === "connected"
? "Live sprechen Sie jetzt"
: errorMessage || "Verbindung fehlgeschlagen"
const toggleVoiceDemo = async () => {
if (status === "connecting") return
if (status === "connected") {
await stopVoiceDemo()
return
}
await startVoiceDemo()
}
const handleKeydown = (event: KeyboardEvent) => {
if (event.key !== "Enter" && event.key !== " ") return
event.preventDefault()
void toggleVoiceDemo()
}
const startVoiceDemo = async () => {
if (!browser) {
status = "error"
errorMessage = "Die Sprach-Demo steht nur im Browser zur Verfügung."
return
}
if (startPromise || status === "connecting" || status === "connected") return
startPromise = (async () => {
await stopVoiceDemo({ resetStatus: false })
status = "connecting"
errorMessage = ""
outboundBuffer = new Uint8Array(0)
closing = false
try {
const newPlayer = createPlayer()
await newPlayer.init()
player = newPlayer
const handleChunk = (pcm: Int16Array) => {
if (pcm.length === 0) return
const bytes = new Uint8Array(pcm.byteLength)
bytes.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength))
appendToOutboundBuffer(bytes)
}
const newRecorder = createRecorder(handleChunk)
await newRecorder.start()
recorder = newRecorder
} catch (err) {
const message = extractErrorMessage(err, "Mikrofon konnte nicht gestartet werden.")
handleConnectionError(message, err)
return
}
try {
ws = new WebSocket(VOICE_WS_URL)
} catch (err) {
const message = extractErrorMessage(err, "WebSocket-Verbindung konnte nicht aufgebaut werden.")
handleConnectionError(message, err)
return
}
if (!ws) return
ws.onopen = () => {
status = "connected"
flushOutboundBuffer(true)
}
ws.onmessage = (event) => handleServerMessage(event)
ws.onerror = (event) => {
handleConnectionError("WebSocket-Fehler bitte später erneut versuchen.", event)
}
ws.onclose = () => {
if (!closing && status === "connected") {
status = "idle"
errorMessage = ""
}
}
})()
try {
await startPromise
} finally {
startPromise = null
}
}
const stopVoiceDemo = async ({ resetStatus = true }: { resetStatus?: boolean } = {}) => {
if (cleanupPromise) {
await cleanupPromise
if (resetStatus && status !== "error") {
status = "idle"
errorMessage = ""
}
return
}
closing = true
cleanupPromise = (async () => {
try {
flushOutboundBuffer(true)
} catch {
/* ignore */
}
if (recorder) {
try {
await recorder.stop()
} catch {
/* ignore */
}
}
recorder = null
if (player) {
try {
player.stop()
await player.destroy()
} catch {
/* ignore */
}
}
player = null
if (ws && ws.readyState === WebSocket.OPEN) {
try {
ws.close(1000, "client-stop")
} catch {
/* ignore */
}
}
ws = null
outboundBuffer = new Uint8Array(0)
})()
try {
await cleanupPromise
} finally {
cleanupPromise = null
closing = false
if (resetStatus && status !== "error") {
status = "idle"
errorMessage = ""
}
}
}
const handleServerMessage = (event: MessageEvent) => {
if (!player) return
let payload: unknown = event.data
if (typeof payload !== "string") return
try {
payload = JSON.parse(payload)
} catch (err) {
console.warn("VoiceBot Preview: Konnte Nachricht nicht parsen.", err)
return
}
const message = payload as Record<string, unknown>
const type = typeof message.type === "string" ? message.type : "<unbekannt>"
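// Barge-in: when the server detects the user speaking, stop local playback and report how much of the current item was already played so the server can truncate the rest.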
if (type === RSE.INPUT_AUDIO_BUFFER_SPEECH_STARTED) {
const { item_id, played_ms } = player.getNowPlaying()
if (item_id) {
player.stop()
ws?.send(
JSON.stringify({
type: "last_item_played_ms.truncate",
details: { item_id, played_ms: played_ms || 0 },
})
)
}
return
}
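// Audio deltas arrive as base64-encoded 16-bit PCM; decode them and hand them to the playback worklet.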
if (type === RSE.RESPONSE_AUDIO_DELTA) {
const audio = message as { response_id?: string; item_id?: string; delta: string }
const bytes = base64ToUint8(audio.delta)
const pcm = new Int16Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / 2)
player.play({
response_id: audio.response_id,
item_id: audio.item_id,
delta: audio.delta,
pcmInt16: pcm,
})
return
}
if (type === "error") {
console.log("VoiceBot Preview: Server hat einen Fehler gemeldet.", message)
}
}
const appendToOutboundBuffer = (chunk: Uint8Array) => {
if (!chunk.length) return
const combined = new Uint8Array(outboundBuffer.length + chunk.length)
combined.set(outboundBuffer)
combined.set(chunk, outboundBuffer.length)
outboundBuffer = combined
flushOutboundBuffer()
}
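// Drains the outbound buffer in CHUNK_SIZE_BYTES slices; a trailing partial chunk is kept for the next flush unless force is set (on socket open and on stop).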
const flushOutboundBuffer = (force = false) => {
if (!ws || ws.readyState !== WebSocket.OPEN || outboundBuffer.length === 0) {
return
}
const chunkSize = CHUNK_SIZE_BYTES > 0 ? CHUNK_SIZE_BYTES : outboundBuffer.length
let buffer = outboundBuffer
outboundBuffer = new Uint8Array(0)
while (buffer.length >= chunkSize && chunkSize > 0) {
const part = buffer.slice(0, chunkSize)
buffer = buffer.slice(chunkSize)
sendChunk(part)
}
if (force && buffer.length > 0) {
sendChunk(buffer)
} else if (buffer.length > 0) {
outboundBuffer = buffer
}
}
const sendChunk = (chunk: Uint8Array) => {
if (!ws || ws.readyState !== WebSocket.OPEN) return
try {
ws.send(JSON.stringify({ type: "input_audio_buffer.append", audio: uint8ToBase64(chunk) }))
} catch (err) {
if (!closing) {
handleConnectionError("Senden des Audiostreams fehlgeschlagen.", err)
}
}
}
const handleConnectionError = (message: string, err?: unknown) => {
console.error("VoiceBot Preview Fehler:", err ?? message)
errorMessage = message
status = "error"
void stopVoiceDemo({ resetStatus: false })
}
const extractErrorMessage = (err: unknown, fallback: string) => {
if (err instanceof DOMException) {
if (err.name === "NotAllowedError") {
return "Zugriff auf das Mikrofon wurde verweigert."
}
if (err.name === "NotFoundError") {
return "Kein Mikrofon gefunden oder verfügbar."
}
if (err.name === "NotReadableError") {
return "Auf das Mikrofon konnte nicht zugegriffen werden (ggf. bereits in Verwendung)."
}
if (err.name === "SecurityError") {
return "Der Browser blockiert den Zugriff bitte die Seite über HTTPS öffnen."
}
}
if (err instanceof Error && err.message) return err.message
return fallback
}
onDestroy(() => {
void stopVoiceDemo({ resetStatus: false })
})
</script>
<CrinkledSection
@@ -40,12 +339,29 @@
lowerDescription="Durch den Einsatz modernster KI-Technologien gewährleisten wir eine intelligente und effiziente Kommunikation, die den höchsten Datenschutzstandards entspricht."
>
{#snippet primaryContent()}
<div class="img">
<div
class="img"
class:connected={status === "connected"}
class:errored={status === "error"}
role="button"
tabindex="0"
aria-pressed={status === "connected"}
aria-busy={status === "connecting"}
aria-label="Voicebot Demo starten"
on:click={() => void toggleVoiceDemo()}
on:keydown={handleKeydown}
>
<img
src="/media/iphone.png"
alt="Kontextwerk is calling"
/>
<div class="shadow"></div>
<div
class="voice-overlay"
data-status={status}
aria-live="polite"
>
<span>{statusHint}</span>
</div>
</div>
{/snippet}
</ProductCategoryFrame>
@@ -54,19 +370,78 @@
<style lang="less">
.img {
position: relative;
width: 400px;
min-width: 400px;
max-height: 100%;
height: 100%;
display: flex;
justify-content: center;
align-items: center;
cursor: pointer;
border-radius: 1.6rem;
transition:
transform 0.2s ease,
border-color 0.2s ease;
outline: none;
img {
width: 60% !important;
height: 100%;
object-fit: contain;
pointer-events: none;
user-select: none;
}
.shadow {
display: none;
}
.voice-overlay {
position: absolute;
bottom: 1.2rem;
left: 50%;
transform: translateX(-50%);
padding: 0.45rem 1.1rem;
border-radius: 999px;
background: rgba(13, 12, 12, 0.8);
color: white;
font-size: 0.8rem;
font-weight: 500;
letter-spacing: 0.01em;
display: flex;
align-items: center;
gap: 0.4rem;
pointer-events: none;
white-space: nowrap;
transition:
background 0.2s ease,
color 0.2s ease;
}
.voice-overlay[data-status="connected"] {
background: rgba(76, 175, 80, 0.85);
}
.voice-overlay[data-status="connecting"] {
background: rgba(255, 152, 0, 0.85);
}
.voice-overlay[data-status="error"] {
background: rgba(235, 87, 87, 0.9);
}
&:hover {
transform: translateY(-4px);
}
&.connected {
border-color: rgba(76, 175, 80, 0.4);
}
&.errored {
border-color: rgba(235, 87, 87, 0.45);
}
&:focus-visible {
outline: 2px solid var(--primary-200);
outline-offset: 4px;
}
}
</style>


@@ -0,0 +1,90 @@
/** -----------------------------
* Client-Events (Client → Server)
* ----------------------------- */
export enum RealtimeClientEvent {
// Session / Config
SESSION_UPDATE = "session.update",
// Input-Audio-Buffer (Client → Server)
INPUT_AUDIO_BUFFER_APPEND = "input_audio_buffer.append",
INPUT_AUDIO_BUFFER_COMMIT = "input_audio_buffer.commit",
INPUT_AUDIO_BUFFER_CLEAR = "input_audio_buffer.clear",
// WebRTC-specific output buffer (Client → Server)
OUTPUT_AUDIO_BUFFER_CLEAR = "output_audio_buffer.clear",
// Conversation-Items (Client → Server)
CONVERSATION_ITEM_CREATE = "conversation.item.create",
CONVERSATION_ITEM_DELETE = "conversation.item.delete",
CONVERSATION_ITEM_RETRIEVE = "conversation.item.retrieve",
CONVERSATION_ITEM_TRUNCATE = "conversation.item.truncate",
// Inference control
RESPONSE_CREATE = "response.create",
RESPONSE_CANCEL = "response.cancel",
}
/** ----------------------------
* Server-Events (Server → Client)
* ---------------------------- */
export enum RealtimeServerEvent {
// Session / Conversation
SESSION_CREATED = "session.created",
SESSION_UPDATED = "session.updated",
CONVERSATION_CREATED = "conversation.created",
// Conversation-Items (Server → Client)
CONVERSATION_ITEM_DONE = "conversation.item.done",
CONVERSATION_ITEM_RETRIEVED = "conversation.item.retrieved",
CONVERSATION_ITEM_DELETED = "conversation.item.deleted",
CONVERSATION_ITEM_TRUNCATED = "conversation.item.truncated",
CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED = "conversation.item.input_audio_transcription.completed",
CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED = "conversation.item.input_audio_transcription.failed",
// Input-Audio-Buffer (Server → Client)
INPUT_AUDIO_BUFFER_CLEARED = "input_audio_buffer.cleared",
INPUT_AUDIO_BUFFER_COMMITTED = "input_audio_buffer.committed",
INPUT_AUDIO_BUFFER_SPEECH_STARTED = "input_audio_buffer.speech_started",
INPUT_AUDIO_BUFFER_SPEECH_STOPPED = "input_audio_buffer.speech_stopped",
// WebRTC-specific output buffer (Server → Client)
OUTPUT_AUDIO_BUFFER_CLEARED = "output_audio_buffer.cleared",
OUTPUT_AUDIO_BUFFER_STARTED = "output_audio_buffer.started",
OUTPUT_AUDIO_BUFFER_STOPPED = "output_audio_buffer.stopped",
// Rate Limits
RATE_LIMITS_UPDATED = "rate_limits.updated",
// Response-Lifecycle (Server → Client)
RESPONSE_CREATED = "response.created",
RESPONSE_OUTPUT_ITEM_ADDED = "response.output_item.added",
RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done",
RESPONSE_CONTENT_PART_ADDED = "response.content_part.added",
RESPONSE_CONTENT_PART_DONE = "response.content_part.done",
// Streaming-Deltas
RESPONSE_TEXT_DELTA = "response.output_text.delta",
RESPONSE_TEXT_DONE = "response.output_text.done",
RESPONSE_AUDIO_DELTA = "response.output_audio.delta",
RESPONSE_AUDIO_DONE = "response.output_audio.done",
RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.output_audio_transcript.delta",
RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.output_audio_transcript.done",
RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA = "response.function_call_arguments.delta",
RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE = "response.function_call_arguments.done",
// Completion / cancellation / errors
RESPONSE_DONE = "response.done",
RESPONSE_CANCELLED = "response.cancelled",
ERROR = "error",
// Graph synchronization
GRAPH_CURSOR = "graph_cursor",
}
/** Optional: shared types */
export type AnyRealtimeEvent = RealtimeClientEvent | RealtimeServerEvent
export function extractSessionId(path: string): string | null {
const pathRegex = /^\/develop\/chats\/([^\/]+)(?:\/|$)/
const match = path.match(pathRegex)
return match && match[1] !== "new" ? match[1] : null
}


@@ -0,0 +1,148 @@
const SAMPLE_RATE = 24_000
const BUFFER_SIZE = 4_800
const AUDIO_PLAYBACK_WORKLET_URL = "/assets/audio-playback-worklet.js"
const AUDIO_PROCESSOR_WORKLET_URL = "/assets/audio-processor-worklet.js"
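// The worklet scripts above are expected to register processors named "audio-playback-worklet" and "audio-processor-worklet" (see the AudioWorkletNode constructors below).
// The base64 helpers convert between raw PCM byte buffers and the base64 strings carried in the WebSocket JSON payloads.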
const uint8ToBase64 = (u8: Uint8Array): string => {
let bin = ""
for (let i = 0; i < u8.length; i++) bin += String.fromCharCode(u8[i])
return btoa(bin)
}
const base64ToUint8 = (b64: string): Uint8Array => {
const bin = atob(b64)
const out = new Uint8Array(bin.length)
for (let i = 0; i < bin.length; i++) out[i] = bin.charCodeAt(i)
return out
}
interface NowPlayingMessage {
type: "nowPlaying"
item_id: string
played_ms: number
}
interface NowPlayingState {
item_id: string | null
played_ms: number
}
// Audio payload handed to the playback worklet; fields mirror the call site in the demo component.
interface ResponseAudioDelta {
response_id?: string
item_id?: string
delta?: string
pcmInt16: Int16Array
}
interface Player {
init: (sampleRate?: number) => Promise<void>
play: (delta: ResponseAudioDelta) => void
deleteItem: (item_id: string) => void
stop: () => void
setSourceRate: (hz: number) => void
getNowPlaying: () => NowPlayingState
destroy: () => Promise<void>
mute: () => void
unmute: () => void
node?: AudioWorkletNode | null
}
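// Wraps an AudioWorklet-based playback pipeline: decoded PCM deltas are posted to the worklet, which reports the currently playing item back via port messages.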
const createPlayer = (defaultSampleRate = 48000): Player => {
let ctx: AudioContext | null = null
let node: AudioWorkletNode | null = null
let nowItemId: string | null = null
let playedMs = 0
const isNowPlayingMessage = (m: unknown): m is NowPlayingMessage => {
if (!m || typeof m !== "object") return false
const x = m as Record<string, unknown>
return x["type"] === "nowPlaying" && "played_ms" in x
}
const init = async (sampleRate = defaultSampleRate): Promise<void> => {
ctx = new AudioContext({ sampleRate })
await ctx.audioWorklet.addModule(AUDIO_PLAYBACK_WORKLET_URL)
node = new AudioWorkletNode(ctx, "audio-playback-worklet")
node.port.onmessage = (e: MessageEvent) => {
const m = e.data
if (isNowPlayingMessage(m)) {
nowItemId = m.item_id
playedMs = m.played_ms | 0
}
}
node.connect(ctx.destination)
}
const play = (delta: ResponseAudioDelta): void => {
if (!node) return
const buf = delta.pcmInt16.buffer
node.port.postMessage({ type: "appendDelta", delta }, [buf])
}
const deleteItem = (item_id: string): void => {
node?.port.postMessage({ type: "deleteItem", item_id })
}
const stop = (): void => {
node?.port.postMessage({ type: "clear" })
}
const setSourceRate = (hz: number): void => {
node?.port.postMessage({ type: "setSourceRate", hz })
}
const getNowPlaying = (): NowPlayingState => {
return { item_id: nowItemId, played_ms: playedMs }
}
const mute = (): void => {
node?.port.postMessage({ type: "mute" })
}
const unmute = (): void => {
node?.port.postMessage({ type: "unmute" })
}
const destroy = async (): Promise<void> => {
if (!ctx) return
try {
await ctx.close()
} finally {
ctx = null
node = null
nowItemId = null
playedMs = 0
}
}
return { init, play, deleteItem, stop, setSourceRate, getNowPlaying, destroy, mute, unmute }
}
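// Captures microphone input at SAMPLE_RATE through an AudioWorklet processor and forwards each Int16Array PCM chunk to onChunk.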
const createRecorder = (onChunk: (pcm: Int16Array) => void) => {
let ctx: AudioContext | null = null
let stream: MediaStream | null = null
let source: MediaStreamAudioSourceNode | null = null
let worklet: AudioWorkletNode | null = null
const start = async () => {
stream = await navigator.mediaDevices.getUserMedia({ audio: true })
if (ctx) await ctx.close()
ctx = new (window.AudioContext || (window as any).webkitAudioContext)({ sampleRate: SAMPLE_RATE })
await ctx.audioWorklet.addModule(AUDIO_PROCESSOR_WORKLET_URL)
source = ctx.createMediaStreamSource(stream)
worklet = new AudioWorkletNode(ctx, "audio-processor-worklet")
worklet.port.onmessage = (ev: MessageEvent<Int16Array>) => onChunk(ev.data)
source.connect(worklet)
worklet.connect(ctx.destination)
}
const stop = async () => {
if (stream) {
stream.getTracks().forEach((t) => t.stop())
stream = null
}
if (ctx) {
try {
await ctx.close()
} finally {
ctx = null
}
}
source = null
worklet = null
}
return { start, stop }
}
export { uint8ToBase64, base64ToUint8, createPlayer, createRecorder, SAMPLE_RATE }