-
Notifications
You must be signed in to change notification settings - Fork 129
Expand file tree
/
Copy path07-transcription-live-websocket.py
More file actions
97 lines (75 loc) · 3.45 KB
/
07-transcription-live-websocket.py
File metadata and controls
97 lines (75 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Example: Live Transcription with WebSocket (Listen V1)
This example shows how to stream audio for real-time transcription using WebSocket.
It reads an audio file, chunks it, and sends it as if it were microphone audio.
"""
import os
import threading
import time
import wave
from typing import Union
from dotenv import load_dotenv
load_dotenv()
from deepgram import DeepgramClient
from deepgram.core.events import EventType
from deepgram.listen.v1.types import (
ListenV1Metadata,
ListenV1Results,
ListenV1SpeechStarted,
ListenV1UtteranceEnd,
)
# Union of every message shape the Listen V1 socket may deliver to handlers.
ListenV1SocketClientResponse = Union[ListenV1Results, ListenV1Metadata, ListenV1UtteranceEnd, ListenV1SpeechStarted]

# Audio file properties (from ffprobe: sample_rate=44100 Hz, mono, PCM s16le)
SAMPLE_RATE = 44100  # Hz
CHANNELS = 1  # mono
SAMPLE_WIDTH = 2  # 16-bit = 2 bytes per sample

# Slice the PCM into 100 ms pieces so the upload pace mimics a live microphone.
CHUNK_DURATION_MS = 100  # milliseconds
CHUNK_SIZE = int(SAMPLE_RATE * SAMPLE_WIDTH * CHANNELS * (CHUNK_DURATION_MS / 1000.0))
CHUNK_DELAY = CHUNK_DURATION_MS / 1000.0  # seconds to pause between sends

client = DeepgramClient()

try:
    with client.listen.v1.connect(model="nova-3") as connection:

        def on_message(message: ListenV1SocketClientResponse) -> None:
            """Log every socket event and print transcript text from Results events."""
            event_name = getattr(message, "type", "Unknown")
            print(f"Received {event_name} event")
            # Only Results events carry transcription alternatives.
            if not isinstance(message, ListenV1Results):
                return
            channel = message.channel
            if channel and channel.alternatives:
                text = channel.alternatives[0].transcript
                if text:
                    print(f"Transcript: {text}")

        # Register lifecycle and message handlers before the listener starts.
        connection.on(EventType.OPEN, lambda _: print("Connection opened"))
        connection.on(EventType.MESSAGE, on_message)
        connection.on(EventType.CLOSE, lambda _: print("Connection closed"))
        connection.on(EventType.ERROR, lambda error: print(f"Error: {error}"))

        # start_listening blocks until the connection closes, so run it on a
        # daemon thread and keep the main thread free to push audio.
        listener = threading.Thread(target=connection.start_listening, daemon=True)
        listener.start()

        # Give the socket a moment to finish the handshake.
        time.sleep(0.5)

        # Pull raw PCM frames out of the fixture WAV file.
        audio_path = os.path.join(os.path.dirname(__file__), "fixtures", "audio.wav")
        print(f"Loading audio file: {audio_path}")
        with wave.open(audio_path, "rb") as wav_file:
            audio_data = wav_file.readframes(wav_file.getnframes())
        print(f"Audio loaded: {len(audio_data)} bytes")
        print(f"Sending audio in {CHUNK_DURATION_MS}ms chunks...")

        # Walk the buffer one real-time-sized slice at a time, pausing between
        # sends so the server receives audio at playback speed.
        chunk_count = 0
        offset = 0
        total = len(audio_data)
        while offset < total:
            piece = audio_data[offset : offset + CHUNK_SIZE]
            if piece:
                connection.send_listen_v_1_media(piece)
                chunk_count += 1
                time.sleep(CHUNK_DELAY)
            offset += CHUNK_SIZE

        print(f"Finished sending {chunk_count} chunks")
        print("Waiting for final transcription...")
        # Allow trailing Results/UtteranceEnd events to arrive before closing.
        time.sleep(2)

    # For async version:
    # from deepgram import AsyncDeepgramClient
    # async with client.listen.v1.connect(model="nova-3") as connection:
    #     # ... same event handlers ...
    #     await connection.start_listening()
except Exception as e:
    # Top-level boundary for the example: report any failure and exit cleanly.
    print(f"Error: {e}")