From e05e8ab6db33115a5328efdc128b56980251fe9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 28 Dec 2025 15:47:23 +0700 Subject: [PATCH 01/14] Implement speaker voice sample extraction for person profiles --- backend/database/users.py | 46 +++++++++ backend/routers/pusher.py | 180 ++++++++++++++++++++++++++++++++- backend/routers/transcribe.py | 63 ++++++++++++ backend/utils/other/storage.py | 25 +++++ 4 files changed, 312 insertions(+), 2 deletions(-) diff --git a/backend/database/users.py b/backend/database/users.py index e1c1986cd5..32239fc148 100644 --- a/backend/database/users.py +++ b/backend/database/users.py @@ -100,6 +100,52 @@ def delete_person(uid: str, person_id: str): person_ref.delete() +def add_person_speech_sample(uid: str, person_id: str, sample_path: str, max_samples: int = 5) -> bool: + """ + Append speech sample path to person's speech_samples list. + Limits to max_samples to prevent unlimited growth. + + Args: + uid: User ID + person_id: Person ID + sample_path: GCS path to the speech sample + max_samples: Maximum number of samples to keep (default 5) + + Returns: + True if sample was added, False if limit reached + """ + person_ref = db.collection('users').document(uid).collection('people').document(person_id) + person_doc = person_ref.get() + + if not person_doc.exists: + return False + + person_data = person_doc.to_dict() + current_samples = person_data.get('speech_samples', []) + + # Check if we've hit the limit + if len(current_samples) >= max_samples: + return False + + person_ref.update({ + 'speech_samples': firestore.ArrayUnion([sample_path]), + 'updated_at': datetime.now(timezone.utc), + }) + return True + + +def get_person_speech_samples_count(uid: str, person_id: str) -> int: + """Get the count of speech samples for a person.""" + person_ref = db.collection('users').document(uid).collection('people').document(person_id) + person_doc = person_ref.get() + + if not person_doc.exists: + return 0 + + person_data = person_doc.to_dict() + return len(person_data.get('speech_samples', [])) + + def delete_user_data(uid: str): user_ref = db.collection('users').document(uid) if not user_ref.get().exists: diff --git a/backend/routers/pusher.py b/backend/routers/pusher.py index 4f8c0781b1..99695ac9ab 100644 --- a/backend/routers/pusher.py +++ b/backend/routers/pusher.py @@ -3,6 +3,7 @@ import json import time from datetime import datetime, timezone +from typing import List from fastapi import APIRouter from fastapi.websockets import WebSocketDisconnect, WebSocket @@ -21,10 +22,112 @@ realtime_transcript_webhook, get_audio_bytes_webhook_seconds, ) -from utils.other.storage import upload_audio_chunk +from utils.other.storage import ( + upload_audio_chunk, + list_audio_chunks, + download_audio_chunks_and_merge, + upload_person_speech_sample_from_bytes, +) router = APIRouter() +# Constants for speaker sample extraction +SPEAKER_SAMPLE_MIN_SEGMENT_DURATION = 2.0 # Minimum segment duration in seconds +SPEAKER_SAMPLE_PROCESS_INTERVAL = 5.0 # seconds between queue checks +SPEAKER_SAMPLE_MIN_AGE = 10.0 # seconds to wait before processing a request +PRIVATE_CLOUD_CHUNK_DURATION = 5.0 # Duration of each audio chunk in seconds + + +async def _extract_speaker_samples( + uid: str, + person_id: str, + conversation_id: str, + started_at_ts: float, + segments: List[dict], + chunks: List[dict], + sample_rate: int = 16000, +): + """ + Extract speech samples from segments and store as speaker profiles. + Processes each segment one by one, stops when sample limit reached. 
+ Chunks are passed in from the caller (already verified to exist). + """ + try: + # Check current sample count once + sample_count = await asyncio.to_thread( + users_db.get_person_speech_samples_count, uid, person_id + ) + if sample_count >= 5: + print(f"Person {person_id} already has {sample_count} samples, skipping", uid, conversation_id) + return + + samples_added = 0 + max_samples_to_add = 5 - sample_count + + for seg in segments: + if samples_added >= max_samples_to_add: + break + + segment_start = seg.get('start') + segment_end = seg.get('end') + if segment_start is None or segment_end is None: + continue + + seg_duration = segment_end - segment_start + if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: + print(f"Segment too short ({seg_duration:.1f}s), skipping", uid, conversation_id) + continue + + # Calculate absolute timestamps + abs_start = started_at_ts + segment_start + abs_end = started_at_ts + segment_end + + # Find overlapping chunks + relevant_timestamps = [ + c['timestamp'] for c in chunks + if (c['timestamp'] + PRIVATE_CLOUD_CHUNK_DURATION) >= abs_start + and c['timestamp'] <= abs_end + ] + + if not relevant_timestamps: + print(f"No relevant chunks for segment {segment_start:.1f}-{segment_end:.1f}s", uid, conversation_id) + continue + + # Download, merge, and extract + merged = await asyncio.to_thread( + download_audio_chunks_and_merge, uid, conversation_id, relevant_timestamps + ) + buffer_start = min(relevant_timestamps) + bytes_per_second = sample_rate * 2 # 16-bit mono + + start_byte = max(0, int((abs_start - buffer_start) * bytes_per_second)) + end_byte = min(len(merged), int((abs_end - buffer_start) * bytes_per_second)) + sample_audio = merged[start_byte:end_byte] + + # Ensure minimum sample length (0.5 seconds) + min_sample_bytes = int(sample_rate * 0.5 * 2) + if len(sample_audio) < min_sample_bytes: + print(f"Sample too short ({len(sample_audio)} bytes), skipping", uid, conversation_id) + continue + + # Upload and store + path = await asyncio.to_thread( + upload_person_speech_sample_from_bytes, sample_audio, uid, person_id, sample_rate + ) + + success = await asyncio.to_thread( + users_db.add_person_speech_sample, uid, person_id, path + ) + if success: + samples_added += 1 + print(f"Stored speech sample {samples_added} for person {person_id}: {path}", uid, conversation_id) + else: + print(f"Failed to add speech sample for person {person_id}", uid, conversation_id) + break # Likely hit limit + + except Exception as e: + print(f"Error extracting speaker samples: {e}", uid, conversation_id) + async def _process_conversation_task(uid: str, conversation_id: str, language: str, websocket: WebSocket): """Process a conversation and send result back to _listen via websocket.""" @@ -123,9 +226,63 @@ async def save_audio_chunk(chunk_data: bytes, uid: str, conversation_id: str, ti upload_audio_chunk(chunk_data, uid, conversation_id, timestamp) # task + # Queue for pending speaker sample extraction requests + speaker_sample_queue: List[dict] = [] + + async def process_speaker_sample_queue(): + """Background task that processes speaker sample extraction requests.""" + nonlocal websocket_active, speaker_sample_queue + + while websocket_active or len(speaker_sample_queue) > 0: + await asyncio.sleep(SPEAKER_SAMPLE_PROCESS_INTERVAL) + + if not speaker_sample_queue: + continue + + current_time = time.time() + + # Separate ready and pending requests + ready_requests = [] + pending_requests = [] + + for request in speaker_sample_queue: + if current_time - request['queued_at'] >= 
SPEAKER_SAMPLE_MIN_AGE: + ready_requests.append(request) + else: + pending_requests.append(request) + + # Keep pending requests in queue + speaker_sample_queue = pending_requests + + # Process ready requests (fire and forget) + for request in ready_requests: + person_id = request['person_id'] + conv_id = request['conversation_id'] + started_at_ts = request['started_at'] + segments = request['segments'] + + try: + chunks = await asyncio.to_thread(list_audio_chunks, uid, conv_id) + if not chunks: + print(f"No chunks found for {conv_id}, skipping speaker sample extraction", uid) + continue + + await _extract_speaker_samples( + uid=uid, + person_id=person_id, + conversation_id=conv_id, + started_at_ts=started_at_ts, + segments=segments, + chunks=chunks, + sample_rate=sample_rate, + ) + except Exception as e: + print(f"Error extracting speaker samples: {e}", uid, conv_id) + async def receive_tasks(): nonlocal websocket_active nonlocal websocket_close_code + nonlocal speaker_sample_queue audiobuffer = bytearray() trigger_audiobuffer = bytearray() @@ -168,6 +325,24 @@ async def receive_tasks(): ) continue + # Speaker sample extraction request - queue for background processing + if header_type == 105: + res = json.loads(bytes(data[4:]).decode("utf-8")) + person_id = res.get('person_id') + conv_id = res.get('conversation_id') + started_at_ts = res.get('started_at') + segments = res.get('segments', []) + if person_id and conv_id and started_at_ts is not None and segments: + print(f"Queued speaker sample request: person={person_id}, {len(segments)} segments", uid) + speaker_sample_queue.append({ + 'person_id': person_id, + 'conversation_id': conv_id, + 'started_at': started_at_ts, + 'segments': segments, + 'queued_at': time.time(), + }) + continue + # Audio bytes if header_type == 101: audiobuffer.extend(data[4:]) @@ -218,7 +393,8 @@ async def receive_tasks(): try: receive_task = asyncio.create_task(receive_tasks()) - await asyncio.gather(receive_task) + speaker_sample_task = asyncio.create_task(process_speaker_sample_queue()) + await asyncio.gather(receive_task, speaker_sample_task) except Exception as e: print(f"Error during WebSocket operation: {e}") diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index 6c44820488..7c81a1b331 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -1081,6 +1081,30 @@ async def close(code: int = 1000): if pusher_ws: await pusher_ws.close(code) + async def send_speaker_sample_request( + person_id: str, + conv_id: str, + started_at_ts: float, + segments: List[dict], + ): + """Send speaker sample extraction request to pusher with list of segments.""" + nonlocal pusher_ws, pusher_connected + if not pusher_connected or not pusher_ws: + return + try: + data = bytearray() + data.extend(struct.pack("I", 105)) + data.extend(bytes(json.dumps({ + "person_id": person_id, + "conversation_id": conv_id, + "started_at": started_at_ts, + "segments": segments, + }), "utf-8")) + await pusher_ws.send(data) + print(f"Sent speaker sample request to pusher: person={person_id}, {len(segments)} segments", uid, session_id) + except Exception as e: + print(f"Failed to send speaker sample request: {e}", uid, session_id) + def is_connected(): return pusher_connected @@ -1094,6 +1118,7 @@ def is_connected(): request_conversation_processing, pusher_receive, is_connected, + send_speaker_sample_request, ) transcript_send = None @@ -1105,6 +1130,7 @@ def is_connected(): request_conversation_processing = None pusher_receive = None 
pusher_is_connected = None + send_speaker_sample_request = None # Transcripts # @@ -1562,6 +1588,42 @@ async def close_soniox_profile(): print( f"Speaker {speaker_id} assigned to {person_name} ({person_id})", uid, session_id ) + + # Forward to pusher for speech sample extraction (non-blocking) + # Only for real people (not 'user') and when private cloud sync is enabled + if ( + person_id + and person_id != 'user' + and private_cloud_sync_enabled + and send_speaker_sample_request is not None + and current_conversation_id + ): + # Get conversation for started_at and segment info + conv_data = conversations_db.get_conversation(uid, current_conversation_id) + if conv_data and conv_data.get('started_at'): + started_at = conv_data['started_at'] + started_at_ts = started_at.timestamp() if hasattr(started_at, 'timestamp') else started_at + conv_segments = conv_data.get('transcript_segments', []) + + # Collect segments with valid start/end + segments_to_extract = [] + for sid in segment_ids: + seg = next((s for s in conv_segments if s.get('id') == sid), None) + if seg and seg.get('start') is not None and seg.get('end') is not None: + segments_to_extract.append({ + 'start': seg['start'], + 'end': seg['end'], + }) + + if segments_to_extract: + asyncio.create_task( + send_speaker_sample_request( + person_id=person_id, + conv_id=current_conversation_id, + started_at_ts=started_at_ts, + segments=segments_to_extract, + ) + ) else: print( "Speaker assignment ignored: no segment_ids or no speech-profile-processed segments.", @@ -1602,6 +1664,7 @@ async def close_soniox_profile(): request_conversation_processing, pusher_receive, pusher_is_connected, + send_speaker_sample_request, ) = create_pusher_task_handler() # Pusher connection diff --git a/backend/utils/other/storage.py b/backend/utils/other/storage.py index 70c27c342d..5833783295 100644 --- a/backend/utils/other/storage.py +++ b/backend/utils/other/storage.py @@ -141,6 +141,31 @@ def delete_user_person_speech_samples(uid: str, person_id: str) -> None: blob.delete() +def upload_person_speech_sample_from_bytes( + audio_bytes: bytes, + uid: str, + person_id: str, + sample_rate: int = 16000, +) -> str: + """Upload PCM audio bytes as WAV speech sample. 
Returns GCS path.""" + import uuid as uuid_module + + wav_buffer = io.BytesIO() + with wave.open(wav_buffer, 'wb') as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) # 16-bit audio + wav_file.setframerate(sample_rate) + wav_file.writeframes(audio_bytes) + + bucket = storage_client.bucket(speech_profiles_bucket) + filename = f"{uuid_module.uuid4()}.wav" + path = f'{uid}/people_profiles/{person_id}/{filename}' + blob = bucket.blob(path) + blob.upload_from_string(wav_buffer.getvalue(), content_type='audio/wav') + + return path + + def get_user_people_ids(uid: str) -> List[str]: bucket = storage_client.bucket(speech_profiles_bucket) blobs = bucket.list_blobs(prefix=f'{uid}/people_profiles/') From ed668aebdf007a00321396809c2f14a32db8f5f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 28 Dec 2025 17:22:18 +0700 Subject: [PATCH 02/14] Improve audio timestamp accuracy in transcribe and pusher services --- backend/routers/pusher.py | 13 +++++++++---- backend/routers/transcribe.py | 18 ++++++++++++++---- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/backend/routers/pusher.py b/backend/routers/pusher.py index 99695ac9ab..0c73db8d84 100644 --- a/backend/routers/pusher.py +++ b/backend/routers/pusher.py @@ -345,15 +345,20 @@ async def receive_tasks(): # Audio bytes if header_type == 101: - audiobuffer.extend(data[4:]) - trigger_audiobuffer.extend(data[4:]) + # Parse: header(4) | timestamp(8 bytes double) | audio_data + buffer_start_timestamp = struct.unpack("d", data[4:12])[0] + audio_data = data[12:] + + audiobuffer.extend(audio_data) + trigger_audiobuffer.extend(audio_data) # Private cloud sync if private_cloud_sync_enabled and current_conversation_id: if private_cloud_chunk_start_time is None: - private_cloud_chunk_start_time = time.time() + # Use timestamp from first buffer of this 5-second chunk + private_cloud_chunk_start_time = buffer_start_timestamp - private_cloud_sync_buffer.extend(data[4:]) + private_cloud_sync_buffer.extend(audio_data) # Save chunk every 5 seconds (sample_rate * 2 bytes per sample * 5 seconds) if len(private_cloud_sync_buffer) >= sample_rate * 2 * private_cloud_sync_delay_seconds: chunk_data = bytes(private_cloud_sync_buffer) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index 7c81a1b331..aa93f4d998 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -928,16 +928,19 @@ async def transcript_consume(): # Audio bytes audio_buffers = bytearray() + audio_buffer_last_received: float = None # Track when last audio was received audio_bytes_enabled = ( bool(get_audio_bytes_webhook_seconds(uid)) or is_audio_bytes_app_enabled(uid) or private_cloud_sync_enabled ) - def audio_bytes_send(audio_bytes): - nonlocal audio_buffers + def audio_bytes_send(audio_bytes: bytes, received_at: float): + nonlocal audio_buffers, audio_buffer_last_received audio_buffers.extend(audio_bytes) + audio_buffer_last_received = received_at async def _audio_bytes_flush(auto_reconnect: bool = True): nonlocal audio_buffers + nonlocal audio_buffer_last_received nonlocal pusher_ws nonlocal pusher_connected nonlocal last_synced_conversation_id @@ -964,9 +967,16 @@ async def _audio_bytes_flush(auto_reconnect: bool = True): # Send audio bytes if pusher_connected and pusher_ws and len(audio_buffers) > 0: try: - # 101|data + # Calculate buffer start time: + # buffer_start = last_received_time - buffer_duration + # buffer_duration = buffer_length_bytes / (sample_rate * 2 bytes per sample) + 
buffer_duration_seconds = len(audio_buffers) / (sample_rate * 2) + buffer_start_time = (audio_buffer_last_received or time.time()) - buffer_duration_seconds + + # 101|timestamp(8 bytes double)|audio_data data = bytearray() data.extend(struct.pack("I", 101)) + data.extend(struct.pack("d", buffer_start_time)) data.extend(audio_buffers.copy()) audio_buffers = bytearray() # reset await pusher_ws.send(data) @@ -1546,7 +1556,7 @@ async def close_soniox_profile(): await flush_stt_buffer() if audio_bytes_send is not None: - audio_bytes_send(data) + audio_bytes_send(data, last_audio_received_time) elif message.get("text") is not None: try: From 72176a3fce0c29e2fd93463942aa5ed51c501ce7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 28 Dec 2025 17:46:28 +0700 Subject: [PATCH 03/14] Simplify speaker sample request by sending only segment IDs --- backend/routers/pusher.py | 60 +++++++++++++++++++++++------------ backend/routers/transcribe.py | 43 +++++++------------------ 2 files changed, 50 insertions(+), 53 deletions(-) diff --git a/backend/routers/pusher.py b/backend/routers/pusher.py index 0c73db8d84..d27fccf023 100644 --- a/backend/routers/pusher.py +++ b/backend/routers/pusher.py @@ -42,15 +42,13 @@ async def _extract_speaker_samples( uid: str, person_id: str, conversation_id: str, - started_at_ts: float, - segments: List[dict], - chunks: List[dict], + segment_ids: List[str], sample_rate: int = 16000, ): """ Extract speech samples from segments and store as speaker profiles. + Fetches conversation from DB to get started_at and segment details. Processes each segment one by one, stops when sample limit reached. - Chunks are passed in from the caller (already verified to exist). """ try: # Check current sample count once @@ -61,13 +59,43 @@ async def _extract_speaker_samples( print(f"Person {person_id} already has {sample_count} samples, skipping", uid, conversation_id) return + # Fetch conversation to get started_at and segment details + conversation = await asyncio.to_thread( + conversations_db.get_conversation, uid, conversation_id + ) + if not conversation: + print(f"Conversation {conversation_id} not found", uid) + return + + started_at = conversation.get('started_at') + if not started_at: + print(f"Conversation {conversation_id} has no started_at", uid) + return + + started_at_ts = started_at.timestamp() if hasattr(started_at, 'timestamp') else float(started_at) + + # Build segment lookup from conversation's transcript_segments + conv_segments = conversation.get('transcript_segments', []) + segment_map = {s.get('id'): s for s in conv_segments if s.get('id')} + + # List chunks from storage + chunks = await asyncio.to_thread(list_audio_chunks, uid, conversation_id) + if not chunks: + print(f"No chunks found for {conversation_id}, skipping speaker sample extraction", uid) + return + samples_added = 0 max_samples_to_add = 5 - sample_count - for seg in segments: + for seg_id in segment_ids: if samples_added >= max_samples_to_add: break + seg = segment_map.get(seg_id) + if not seg: + print(f"Segment {seg_id} not found in conversation", uid, conversation_id) + continue + segment_start = seg.get('start') segment_end = seg.get('end') if segment_start is None or segment_end is None: @@ -258,22 +286,14 @@ async def process_speaker_sample_queue(): for request in ready_requests: person_id = request['person_id'] conv_id = request['conversation_id'] - started_at_ts = request['started_at'] - segments = request['segments'] + segment_ids = request['segment_ids'] try: - chunks = await 
asyncio.to_thread(list_audio_chunks, uid, conv_id) - if not chunks: - print(f"No chunks found for {conv_id}, skipping speaker sample extraction", uid) - continue - await _extract_speaker_samples( uid=uid, person_id=person_id, conversation_id=conv_id, - started_at_ts=started_at_ts, - segments=segments, - chunks=chunks, + segment_ids=segment_ids, sample_rate=sample_rate, ) except Exception as e: @@ -330,15 +350,13 @@ async def receive_tasks(): res = json.loads(bytes(data[4:]).decode("utf-8")) person_id = res.get('person_id') conv_id = res.get('conversation_id') - started_at_ts = res.get('started_at') - segments = res.get('segments', []) - if person_id and conv_id and started_at_ts is not None and segments: - print(f"Queued speaker sample request: person={person_id}, {len(segments)} segments", uid) + segment_ids = res.get('segment_ids', []) + if person_id and conv_id and segment_ids: + print(f"Queued speaker sample request: person={person_id}, {len(segment_ids)} segments", uid) speaker_sample_queue.append({ 'person_id': person_id, 'conversation_id': conv_id, - 'started_at': started_at_ts, - 'segments': segments, + 'segment_ids': segment_ids, 'queued_at': time.time(), }) continue diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index aa93f4d998..44c5315665 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -1094,10 +1094,9 @@ async def close(code: int = 1000): async def send_speaker_sample_request( person_id: str, conv_id: str, - started_at_ts: float, - segments: List[dict], + segment_ids: List[str], ): - """Send speaker sample extraction request to pusher with list of segments.""" + """Send speaker sample extraction request to pusher with segment IDs.""" nonlocal pusher_ws, pusher_connected if not pusher_connected or not pusher_ws: return @@ -1107,11 +1106,10 @@ async def send_speaker_sample_request( data.extend(bytes(json.dumps({ "person_id": person_id, "conversation_id": conv_id, - "started_at": started_at_ts, - "segments": segments, + "segment_ids": segment_ids, }), "utf-8")) await pusher_ws.send(data) - print(f"Sent speaker sample request to pusher: person={person_id}, {len(segments)} segments", uid, session_id) + print(f"Sent speaker sample request to pusher: person={person_id}, {len(segment_ids)} segments", uid, session_id) except Exception as e: print(f"Failed to send speaker sample request: {e}", uid, session_id) @@ -1608,32 +1606,13 @@ async def close_soniox_profile(): and send_speaker_sample_request is not None and current_conversation_id ): - # Get conversation for started_at and segment info - conv_data = conversations_db.get_conversation(uid, current_conversation_id) - if conv_data and conv_data.get('started_at'): - started_at = conv_data['started_at'] - started_at_ts = started_at.timestamp() if hasattr(started_at, 'timestamp') else started_at - conv_segments = conv_data.get('transcript_segments', []) - - # Collect segments with valid start/end - segments_to_extract = [] - for sid in segment_ids: - seg = next((s for s in conv_segments if s.get('id') == sid), None) - if seg and seg.get('start') is not None and seg.get('end') is not None: - segments_to_extract.append({ - 'start': seg['start'], - 'end': seg['end'], - }) - - if segments_to_extract: - asyncio.create_task( - send_speaker_sample_request( - person_id=person_id, - conv_id=current_conversation_id, - started_at_ts=started_at_ts, - segments=segments_to_extract, - ) - ) + asyncio.create_task( + send_speaker_sample_request( + person_id=person_id, + 
conv_id=current_conversation_id, + segment_ids=segment_ids, + ) + ) else: print( "Speaker assignment ignored: no segment_ids or no speech-profile-processed segments.", From 1fd1270944b0026eaf7c9e0fd531f0eb8d5dd04d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 28 Dec 2025 17:51:20 +0700 Subject: [PATCH 04/14] Enhance speaker sample extraction by expanding adjacent segments with same speaker --- backend/routers/pusher.py | 115 +++++++++++++++++++++------------- backend/routers/transcribe.py | 22 +++++++ 2 files changed, 92 insertions(+), 45 deletions(-) diff --git a/backend/routers/pusher.py b/backend/routers/pusher.py index d27fccf023..6c5f289d9d 100644 --- a/backend/routers/pusher.py +++ b/backend/routers/pusher.py @@ -14,7 +14,11 @@ from database.redis_db import get_cached_user_geolocation from models.conversation import Conversation, ConversationStatus, Geolocation from utils.apps import is_audio_bytes_app_enabled -from utils.app_integrations import trigger_realtime_integrations, trigger_realtime_audio_bytes, trigger_external_integrations +from utils.app_integrations import ( + trigger_realtime_integrations, + trigger_realtime_audio_bytes, + trigger_external_integrations, +) from utils.conversations.location import get_google_maps_location from utils.conversations.process_conversation import process_conversation from utils.webhooks import ( @@ -32,7 +36,7 @@ router = APIRouter() # Constants for speaker sample extraction -SPEAKER_SAMPLE_MIN_SEGMENT_DURATION = 2.0 # Minimum segment duration in seconds +SPEAKER_SAMPLE_MIN_SEGMENT_DURATION = 10.0 # Minimum segment duration in seconds SPEAKER_SAMPLE_PROCESS_INTERVAL = 5.0 # seconds between queue checks SPEAKER_SAMPLE_MIN_AGE = 10.0 # seconds to wait before processing a request PRIVATE_CLOUD_CHUNK_DURATION = 5.0 # Duration of each audio chunk in seconds @@ -52,17 +56,13 @@ async def _extract_speaker_samples( """ try: # Check current sample count once - sample_count = await asyncio.to_thread( - users_db.get_person_speech_samples_count, uid, person_id - ) + sample_count = await asyncio.to_thread(users_db.get_person_speech_samples_count, uid, person_id) if sample_count >= 5: print(f"Person {person_id} already has {sample_count} samples, skipping", uid, conversation_id) return # Fetch conversation to get started_at and segment details - conversation = await asyncio.to_thread( - conversations_db.get_conversation, uid, conversation_id - ) + conversation = await asyncio.to_thread(conversations_db.get_conversation, uid, conversation_id) if not conversation: print(f"Conversation {conversation_id} not found", uid) return @@ -87,6 +87,10 @@ async def _extract_speaker_samples( samples_added = 0 max_samples_to_add = 5 - sample_count + # Build ordered list with index lookup for expansion + ordered_segments = [s for s in conv_segments if s.get('id')] + segment_index_map = {s.get('id'): i for i, s in enumerate(ordered_segments)} + for seg_id in segment_ids: if samples_added >= max_samples_to_add: break @@ -102,8 +106,44 @@ async def _extract_speaker_samples( continue seg_duration = segment_end - segment_start + speaker_id = seg.get('speaker_id') + + # If segment is too short, try expanding to adjacent segments with same speaker + if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION and speaker_id is not None: + seg_idx = segment_index_map.get(seg_id) + if seg_idx is not None: + # Expand backward + i = seg_idx - 1 + while i >= 0: + prev_seg = ordered_segments[i] + if prev_seg.get('speaker_id') != speaker_id: + break + prev_start = 
prev_seg.get('start') + if prev_start is not None: + segment_start = min(segment_start, prev_start) + i -= 1 + + # Expand forward + i = seg_idx + 1 + while i < len(ordered_segments): + next_seg = ordered_segments[i] + if next_seg.get('speaker_id') != speaker_id: + break + next_end = next_seg.get('end') + if next_end is not None: + segment_end = max(segment_end, next_end) + i += 1 + + seg_duration = segment_end - segment_start + if seg_duration >= SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: + print( + f"Expanded segment to {seg_duration:.1f}s by including adjacent segments", + uid, + conversation_id, + ) + if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: - print(f"Segment too short ({seg_duration:.1f}s), skipping", uid, conversation_id) + print(f"Segment too short ({seg_duration:.1f}s) even after expansion, skipping", uid, conversation_id) continue # Calculate absolute timestamps @@ -112,9 +152,9 @@ async def _extract_speaker_samples( # Find overlapping chunks relevant_timestamps = [ - c['timestamp'] for c in chunks - if (c['timestamp'] + PRIVATE_CLOUD_CHUNK_DURATION) >= abs_start - and c['timestamp'] <= abs_end + c['timestamp'] + for c in chunks + if (c['timestamp'] + PRIVATE_CLOUD_CHUNK_DURATION) >= abs_start and c['timestamp'] <= abs_end ] if not relevant_timestamps: @@ -122,9 +162,7 @@ async def _extract_speaker_samples( continue # Download, merge, and extract - merged = await asyncio.to_thread( - download_audio_chunks_and_merge, uid, conversation_id, relevant_timestamps - ) + merged = await asyncio.to_thread(download_audio_chunks_and_merge, uid, conversation_id, relevant_timestamps) buffer_start = min(relevant_timestamps) bytes_per_second = sample_rate * 2 # 16-bit mono @@ -143,9 +181,7 @@ async def _extract_speaker_samples( upload_person_speech_sample_from_bytes, sample_audio, uid, person_id, sample_rate ) - success = await asyncio.to_thread( - users_db.add_person_speech_sample, uid, person_id, path - ) + success = await asyncio.to_thread(users_db.add_person_speech_sample, uid, person_id, path) if success: samples_added += 1 print(f"Stored speech sample {samples_added} for person {person_id}: {path}", uid, conversation_id) @@ -163,10 +199,7 @@ async def _process_conversation_task(uid: str, conversation_id: str, language: s conversation_data = conversations_db.get_conversation(uid, conversation_id) if not conversation_data: # Send error response - response = { - "conversation_id": conversation_id, - "error": "conversation_not_found" - } + response = {"conversation_id": conversation_id, "error": "conversation_not_found"} data = bytearray() data.extend(struct.pack("I", 201)) data.extend(bytes(json.dumps(response), "utf-8")) @@ -174,7 +207,7 @@ async def _process_conversation_task(uid: str, conversation_id: str, language: s return conversation = Conversation(**conversation_data) - + if conversation.status != ConversationStatus.processing: conversations_db.update_conversation_status(uid, conversation.id, ConversationStatus.processing) conversation.status = ConversationStatus.processing @@ -187,12 +220,8 @@ async def _process_conversation_task(uid: str, conversation_id: str, language: s conversation.geolocation = get_google_maps_location(geolocation.latitude, geolocation.longitude) # Run blocking operations in thread pool to avoid blocking event loop - conversation = await asyncio.to_thread( - process_conversation, uid, language, conversation - ) - messages = await asyncio.to_thread( - trigger_external_integrations, uid, conversation - ) + conversation = await 
asyncio.to_thread(process_conversation, uid, language, conversation) + messages = await asyncio.to_thread(trigger_external_integrations, uid, conversation) except Exception as e: print(f"Error processing conversation: {e}", uid, conversation_id) conversations_db.set_conversation_as_discarded(uid, conversation.id) @@ -200,21 +229,15 @@ async def _process_conversation_task(uid: str, conversation_id: str, language: s messages = [] # Send success response back (minimal - transcribe will fetch from DB) - response = { - "conversation_id": conversation_id, - "success": True - } + response = {"conversation_id": conversation_id, "success": True} data = bytearray() data.extend(struct.pack("I", 201)) data.extend(bytes(json.dumps(response), "utf-8")) await websocket.send_bytes(data) - + except Exception as e: print(f"Error in _process_conversation_task: {e}", uid, conversation_id) - response = { - "conversation_id": conversation_id, - "error": str(e) - } + response = {"conversation_id": conversation_id, "error": str(e)} data = bytearray() data.extend(struct.pack("I", 201)) data.extend(bytes(json.dumps(response), "utf-8")) @@ -353,12 +376,14 @@ async def receive_tasks(): segment_ids = res.get('segment_ids', []) if person_id and conv_id and segment_ids: print(f"Queued speaker sample request: person={person_id}, {len(segment_ids)} segments", uid) - speaker_sample_queue.append({ - 'person_id': person_id, - 'conversation_id': conv_id, - 'segment_ids': segment_ids, - 'queued_at': time.time(), - }) + speaker_sample_queue.append( + { + 'person_id': person_id, + 'conversation_id': conv_id, + 'segment_ids': segment_ids, + 'queued_at': time.time(), + } + ) continue # Audio bytes @@ -366,7 +391,7 @@ async def receive_tasks(): # Parse: header(4) | timestamp(8 bytes double) | audio_data buffer_start_timestamp = struct.unpack("d", data[4:12])[0] audio_data = data[12:] - + audiobuffer.extend(audio_data) trigger_audiobuffer.extend(audio_data) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index 44c5315665..b9c4ceef78 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -1272,6 +1272,28 @@ async def stream_transcript_process(): transcript_segments = [] if segments_to_process: last_transcript_time = time.time() + + # Log segment times BEFORE any modification + if first_audio_byte_timestamp: + for seg in segments_to_process: + abs_start = first_audio_byte_timestamp + seg["start"] + abs_end = first_audio_byte_timestamp + seg["end"] + print( + f"[SEGMENT_TIMING] raw_start={seg['start']:.3f}s raw_end={seg['end']:.3f}s | " + f"abs_start={abs_start:.3f} abs_end={abs_end:.3f} | " + f"abs_start_dt={datetime.fromtimestamp(abs_start, tz=timezone.utc).isoformat()} | " + f"text={seg.get('text', '')[:50]}", + uid, session_id + ) + else: + for seg in segments_to_process: + print( + f"[SEGMENT_TIMING] raw_start={seg['start']:.3f}s raw_end={seg['end']:.3f}s | " + f"first_audio_byte_timestamp=None | " + f"text={seg.get('text', '')[:50]}", + uid, session_id + ) + if seconds_to_trim is None: seconds_to_trim = segments_to_process[0]["start"] From 3e61af63cfdabb7e00f3284caa21d4be56af49cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Mon, 29 Dec 2025 10:22:01 +0700 Subject: [PATCH 05/14] Simplify transcript time offset with unified approach --- backend/routers/transcribe.py | 104 ++++++++++++---------------------- 1 file changed, 36 insertions(+), 68 deletions(-) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index b9c4ceef78..068130305e 
100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -281,8 +281,6 @@ def onboarding_stream_transcript(segments: List[dict]): last_usage_record_timestamp: Optional[float] = None words_transcribed_since_last_record: int = 0 last_transcript_time: Optional[float] = None - seconds_to_trim = None - seconds_to_add = None current_conversation_id = None async def _record_usage_periodically(): @@ -492,8 +490,6 @@ def send_last_conversation(): # Create new stub conversation for next batch async def _create_new_in_progress_conversation(): - nonlocal seconds_to_trim - nonlocal seconds_to_add nonlocal current_conversation_id conversation_source = ConversationSource.omi @@ -561,8 +557,6 @@ async def _create_new_in_progress_conversation(): redis_db.set_conversation_meeting_id(new_conversation_id, detected_meeting_id) current_conversation_id = new_conversation_id - seconds_to_trim = None - seconds_to_add = None print(f"Created new stub conversation: {new_conversation_id}", uid, session_id) @@ -583,7 +577,6 @@ async def _process_conversation(conversation_id: str): # Process existing conversations async def _prepare_in_progess_conversations(): - nonlocal seconds_to_add nonlocal current_conversation_id if existing_conversation := retrieve_in_progress_conversation(uid): @@ -600,14 +593,8 @@ async def _prepare_in_progess_conversations(): # Continue with the existing conversation current_conversation_id = existing_conversation['id'] - started_at = datetime.fromisoformat(existing_conversation['started_at'].isoformat()) - seconds_to_add = ( - (datetime.now(timezone.utc) - started_at).total_seconds() - if existing_conversation['transcript_segments'] - else None - ) print( - f"Resuming conversation {current_conversation_id} with {(seconds_to_add if seconds_to_add else 0):.1f}s offset. Will timeout in {conversation_creation_timeout - seconds_since_last_segment:.1f}s", + f"Resuming conversation {current_conversation_id}. 
Will timeout in {conversation_creation_timeout - seconds_since_last_segment:.1f}s", uid, session_id, ) @@ -634,24 +621,11 @@ def _process_speaker_assigned_segments(transcript_segments: List[TranscriptSegme segment.person_id = person_id def _update_in_progress_conversation( - conversation_id: str, segments: List[TranscriptSegment], photos: List[ConversationPhoto], finished_at: datetime + conversation: Conversation, segments: List[TranscriptSegment], photos: List[ConversationPhoto], finished_at: datetime ): - """Update the current in-progress conversation with new segments/photos.""" - conversation_data = conversations_db.get_conversation(uid, conversation_id) - if not conversation_data: - print(f"Warning: conversation {conversation_id} not found", uid, session_id) - return None, (0, 0) - - conversation = Conversation(**conversation_data) starts, ends = (0, 0) if segments: - # If conversation has no segments yet but we're adding some, update started_at - if not conversation.transcript_segments: - started_at = finished_at - timedelta(seconds=max(0, segments[-1].end)) - conversations_db.update_conversation(uid, conversation.id, {'started_at': started_at}) - conversation.started_at = started_at - conversation.transcript_segments, (starts, ends) = TranscriptSegment.combine_segments( conversation.transcript_segments, segments ) @@ -1252,7 +1226,7 @@ async def conversation_lifecycle_manager(): await _create_new_in_progress_conversation() async def stream_transcript_process(): - nonlocal websocket_active, realtime_segment_buffers, realtime_photo_buffers, websocket, seconds_to_trim + nonlocal websocket_active, realtime_segment_buffers, realtime_photo_buffers, websocket nonlocal current_conversation_id, translation_enabled, speaker_to_person_map, suggested_segments, words_transcribed_since_last_record, last_transcript_time while websocket_active or len(realtime_segment_buffers) > 0 or len(realtime_photo_buffers) > 0: @@ -1269,44 +1243,40 @@ async def stream_transcript_process(): finished_at = datetime.now(timezone.utc) + # Get conversation + conversation_data = conversations_db.get_conversation(uid, current_conversation_id) + if not conversation_data: + print(f"Warning: conversation {current_conversation_id} not found during segment processing", uid, session_id) + continue + + # Guard first_audio_byte_timestamp must be set + if not first_audio_byte_timestamp: + print(f"Warning: first_audio_byte_timestamp not set, skipping segment processing", uid, +session_id) + continue + transcript_segments = [] if segments_to_process: last_transcript_time = time.time() - # Log segment times BEFORE any modification - if first_audio_byte_timestamp: - for seg in segments_to_process: - abs_start = first_audio_byte_timestamp + seg["start"] - abs_end = first_audio_byte_timestamp + seg["end"] - print( - f"[SEGMENT_TIMING] raw_start={seg['start']:.3f}s raw_end={seg['end']:.3f}s | " - f"abs_start={abs_start:.3f} abs_end={abs_end:.3f} | " - f"abs_start_dt={datetime.fromtimestamp(abs_start, tz=timezone.utc).isoformat()} | " - f"text={seg.get('text', '')[:50]}", - uid, session_id - ) - else: - for seg in segments_to_process: - print( - f"[SEGMENT_TIMING] raw_start={seg['start']:.3f}s raw_end={seg['end']:.3f}s | " - f"first_audio_byte_timestamp=None | " - f"text={seg.get('text', '')[:50]}", - uid, session_id - ) - - if seconds_to_trim is None: - seconds_to_trim = segments_to_process[0]["start"] - - if seconds_to_add: - for i, segment in enumerate(segments_to_process): - segment["start"] += seconds_to_add - segment["end"] += 
seconds_to_add - segments_to_process[i] = segment - elif seconds_to_trim: - for i, segment in enumerate(segments_to_process): - segment["start"] -= seconds_to_trim - segment["end"] -= seconds_to_trim - segments_to_process[i] = segment + # If conversation has no segments yet, set started_at based on when first speech occurred + if not conversation_data.get('transcript_segments'): + first_speech_timestamp = first_audio_byte_timestamp + segments_to_process[0]["start"] + new_started_at = datetime.fromtimestamp(first_speech_timestamp, tz=timezone.utc) + conversations_db.update_conversation(uid, current_conversation_id, {'started_at': new_started_at}) + conversation_data['started_at'] = new_started_at + + # Calculate unified time offset: audio stream start relative to conversation start + conversation_started_at = conversation_data['started_at'] + if isinstance(conversation_started_at, str): + conversation_started_at = datetime.fromisoformat(conversation_started_at) + time_offset = first_audio_byte_timestamp - conversation_started_at.timestamp() + + # Apply offset to all segments + for i, segment in enumerate(segments_to_process): + segment["start"] += time_offset + segment["end"] += time_offset + segments_to_process[i] = segment newly_processed_segments = [] for s in segments_to_process: @@ -1323,12 +1293,10 @@ async def stream_transcript_process(): current_session_segments[seg.id] = seg.speech_profile_processed transcript_segments, _ = TranscriptSegment.combine_segments([], newly_processed_segments) - if not current_conversation_id: - print("Warning: No current conversation ID", uid, session_id) - continue - + # Update transcript segments + conversation = Conversation(**conversation_data) result = _update_in_progress_conversation( - current_conversation_id, transcript_segments, photos_to_process, finished_at + conversation, transcript_segments, photos_to_process, finished_at ) if not result or not result[0]: continue From edee14378a7d36397ae73f615556bbfbc63deab1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Mon, 29 Dec 2025 11:05:22 +0700 Subject: [PATCH 06/14] Simplify segment expansion logic in speaker sample extraction --- backend/routers/pusher.py | 81 ++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/backend/routers/pusher.py b/backend/routers/pusher.py index 6c5f289d9d..bbfb2bcc93 100644 --- a/backend/routers/pusher.py +++ b/backend/routers/pusher.py @@ -108,39 +108,27 @@ async def _extract_speaker_samples( seg_duration = segment_end - segment_start speaker_id = seg.get('speaker_id') - # If segment is too short, try expanding to adjacent segments with same speaker - if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION and speaker_id is not None: - seg_idx = segment_index_map.get(seg_id) - if seg_idx is not None: - # Expand backward - i = seg_idx - 1 - while i >= 0: - prev_seg = ordered_segments[i] - if prev_seg.get('speaker_id') != speaker_id: - break - prev_start = prev_seg.get('start') - if prev_start is not None: - segment_start = min(segment_start, prev_start) - i -= 1 - - # Expand forward - i = seg_idx + 1 - while i < len(ordered_segments): - next_seg = ordered_segments[i] - if next_seg.get('speaker_id') != speaker_id: - break - next_end = next_seg.get('end') - if next_end is not None: - segment_end = max(segment_end, next_end) - i += 1 - - seg_duration = segment_end - segment_start - if seg_duration >= SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: - print( - f"Expanded segment to {seg_duration:.1f}s by including adjacent 
segments", - uid, - conversation_id, - ) + # # If segment is too short, try expanding to adjacent segments with same speaker + # if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION and speaker_id is not None: + # seg_idx = segment_index_map.get(seg_id) + # if seg_idx is not None: + # i = seg_idx - 1 + # while i >= 0: + # prev_seg = ordered_segments[i] + # if prev_seg.get('speaker_id') != speaker_id: + # break + # prev_start = prev_seg.get('start') + # if prev_start is not None: + # segment_start = min(segment_start, prev_start) + # seg_duration = segment_end - segment_start + # if seg_duration >= SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: + # print( + # f"Expanded segment to {seg_duration:.1f}s by including adjacent segments", + # uid, + # conversation_id, + # ) + # break + # i -= 1 if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: print(f"Segment too short ({seg_duration:.1f}s) even after expansion, skipping", uid, conversation_id) @@ -150,12 +138,24 @@ async def _extract_speaker_samples( abs_start = started_at_ts + segment_start abs_end = started_at_ts + segment_end - # Find overlapping chunks - relevant_timestamps = [ - c['timestamp'] - for c in chunks - if (c['timestamp'] + PRIVATE_CLOUD_CHUNK_DURATION) >= abs_start and c['timestamp'] <= abs_end - ] + # Find relevant chunks + sorted_chunks = sorted(chunks, key=lambda c: c['timestamp']) + + # Find first chunk that starts at or before abs_start + first_idx = 0 + for i, chunk in enumerate(sorted_chunks): + if chunk['timestamp'] <= abs_start: + first_idx = i + else: + break + + # Collect from first_idx up to abs_end + relevant_timestamps = [] + for chunk in sorted_chunks[first_idx:]: + if chunk['timestamp'] <= abs_end: + relevant_timestamps.append(chunk['timestamp']) + else: + break if not relevant_timestamps: print(f"No relevant chunks for segment {segment_start:.1f}-{segment_end:.1f}s", uid, conversation_id) @@ -184,7 +184,8 @@ async def _extract_speaker_samples( success = await asyncio.to_thread(users_db.add_person_speech_sample, uid, person_id, path) if success: samples_added += 1 - print(f"Stored speech sample {samples_added} for person {person_id}: {path}", uid, conversation_id) + seg_text = seg.get('text', '')[:100] # Truncate to 100 chars + print(f"Stored speech sample {samples_added} for person {person_id}: segment_id={seg_id}, file={path}, text={seg_text}", uid, conversation_id) else: print(f"Failed to add speech sample for person {person_id}", uid, conversation_id) break # Likely hit limit From ad752fe9ceea41b68965e366ee5a5ca136eaf46b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Mon, 29 Dec 2025 19:48:17 +0700 Subject: [PATCH 07/14] Improve private cloud sync with queue-based processing and retry logic --- backend/routers/pusher.py | 143 ++++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 44 deletions(-) diff --git a/backend/routers/pusher.py b/backend/routers/pusher.py index bbfb2bcc93..981a63fd79 100644 --- a/backend/routers/pusher.py +++ b/backend/routers/pusher.py @@ -36,10 +36,14 @@ router = APIRouter() # Constants for speaker sample extraction -SPEAKER_SAMPLE_MIN_SEGMENT_DURATION = 10.0 # Minimum segment duration in seconds -SPEAKER_SAMPLE_PROCESS_INTERVAL = 5.0 # seconds between queue checks -SPEAKER_SAMPLE_MIN_AGE = 10.0 # seconds to wait before processing a request -PRIVATE_CLOUD_CHUNK_DURATION = 5.0 # Duration of each audio chunk in seconds +SPEAKER_SAMPLE_PROCESS_INTERVAL = 5.0 +SPEAKER_SAMPLE_MIN_SEGMENT_DURATION = 10.0 +SPEAKER_SAMPLE_MIN_AGE = 15.0 + +# Constants for private 
cloud sync +PRIVATE_CLOUD_SYNC_PROCESS_INTERVAL = 1.0 +PRIVATE_CLOUD_CHUNK_DURATION = 5.0 +PRIVATE_CLOUD_SYNC_MAX_RETRIES = 3 async def _extract_speaker_samples( @@ -56,13 +60,13 @@ async def _extract_speaker_samples( """ try: # Check current sample count once - sample_count = await asyncio.to_thread(users_db.get_person_speech_samples_count, uid, person_id) + sample_count = users_db.get_person_speech_samples_count(uid, person_id) if sample_count >= 5: print(f"Person {person_id} already has {sample_count} samples, skipping", uid, conversation_id) return # Fetch conversation to get started_at and segment details - conversation = await asyncio.to_thread(conversations_db.get_conversation, uid, conversation_id) + conversation = conversations_db.get_conversation(uid, conversation_id) if not conversation: print(f"Conversation {conversation_id} not found", uid) return @@ -79,7 +83,7 @@ async def _extract_speaker_samples( segment_map = {s.get('id'): s for s in conv_segments if s.get('id')} # List chunks from storage - chunks = await asyncio.to_thread(list_audio_chunks, uid, conversation_id) + chunks = list_audio_chunks(uid, conversation_id) if not chunks: print(f"No chunks found for {conversation_id}, skipping speaker sample extraction", uid) return @@ -108,27 +112,27 @@ async def _extract_speaker_samples( seg_duration = segment_end - segment_start speaker_id = seg.get('speaker_id') - # # If segment is too short, try expanding to adjacent segments with same speaker - # if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION and speaker_id is not None: - # seg_idx = segment_index_map.get(seg_id) - # if seg_idx is not None: - # i = seg_idx - 1 - # while i >= 0: - # prev_seg = ordered_segments[i] - # if prev_seg.get('speaker_id') != speaker_id: - # break - # prev_start = prev_seg.get('start') - # if prev_start is not None: - # segment_start = min(segment_start, prev_start) - # seg_duration = segment_end - segment_start - # if seg_duration >= SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: - # print( - # f"Expanded segment to {seg_duration:.1f}s by including adjacent segments", - # uid, - # conversation_id, - # ) - # break - # i -= 1 + # If segment is too short, try expanding to adjacent segments with same speaker + if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION and speaker_id is not None: + seg_idx = segment_index_map.get(seg_id) + if seg_idx is not None: + i = seg_idx - 1 + while i >= 0: + prev_seg = ordered_segments[i] + if prev_seg.get('speaker_id') != speaker_id: + break + prev_start = prev_seg.get('start') + if prev_start is not None: + segment_start = min(segment_start, prev_start) + seg_duration = segment_end - segment_start + if seg_duration >= SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: + print( + f"Expanded segment to {seg_duration:.1f}s by including adjacent segments", + uid, + conversation_id, + ) + break + i -= 1 if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: print(f"Segment too short ({seg_duration:.1f}s) even after expansion, skipping", uid, conversation_id) @@ -181,11 +185,15 @@ async def _extract_speaker_samples( upload_person_speech_sample_from_bytes, sample_audio, uid, person_id, sample_rate ) - success = await asyncio.to_thread(users_db.add_person_speech_sample, uid, person_id, path) + success = users_db.add_person_speech_sample(uid, person_id, path) if success: samples_added += 1 seg_text = seg.get('text', '')[:100] # Truncate to 100 chars - print(f"Stored speech sample {samples_added} for person {person_id}: segment_id={seg_id}, file={path}, text={seg_text}", uid, conversation_id) + 
print( + f"Stored speech sample {samples_added} for person {person_id}: segment_id={seg_id}, file={path}, text={seg_text}", + uid, + conversation_id, + ) else: print(f"Failed to add speech sample for person {person_id}", uid, conversation_id) break # Likely hit limit @@ -272,15 +280,48 @@ async def _websocket_util_trigger( audio_bytes_trigger_delay_seconds = 4 has_audio_apps_enabled = is_audio_bytes_app_enabled(uid) private_cloud_sync_enabled = users_db.get_user_private_cloud_sync_enabled(uid) - private_cloud_sync_delay_seconds = 5 - async def save_audio_chunk(chunk_data: bytes, uid: str, conversation_id: str, timestamp: float): - upload_audio_chunk(chunk_data, uid, conversation_id, timestamp) - - # task # Queue for pending speaker sample extraction requests speaker_sample_queue: List[dict] = [] + # Queue for pending private cloud sync chunks + private_cloud_queue: List[dict] = [] + + async def process_private_cloud_queue(): + """Background task that processes private cloud sync uploads with retry logic.""" + nonlocal websocket_active, private_cloud_queue + + while websocket_active or len(private_cloud_queue) > 0: + await asyncio.sleep(PRIVATE_CLOUD_SYNC_PROCESS_INTERVAL) + + if not private_cloud_queue: + continue + + # Process all pending chunks + chunks_to_process = private_cloud_queue.copy() + private_cloud_queue = [] + + for chunk_info in chunks_to_process: + chunk_data = chunk_info['data'] + conv_id = chunk_info['conversation_id'] + timestamp = chunk_info['timestamp'] + retries = chunk_info.get('retries', 0) + + try: + await asyncio.to_thread(upload_audio_chunk, chunk_data, uid, conv_id, timestamp) + except Exception as e: + if retries < PRIVATE_CLOUD_SYNC_MAX_RETRIES: + # Re-queue with incremented retry count + chunk_info['retries'] = retries + 1 + private_cloud_queue.append(chunk_info) + print(f"Private cloud upload failed (retry {retries + 1}): {e}", uid, conv_id) + else: + print( + f"Private cloud upload failed after {PRIVATE_CLOUD_SYNC_MAX_RETRIES} retries, dropping chunk: {e}", + uid, + conv_id, + ) + async def process_speaker_sample_queue(): """Background task that processes speaker sample extraction requests.""" nonlocal websocket_active, speaker_sample_queue @@ -396,20 +437,22 @@ async def receive_tasks(): audiobuffer.extend(audio_data) trigger_audiobuffer.extend(audio_data) - # Private cloud sync + # Private cloud sync - queue chunks for background processing if private_cloud_sync_enabled and current_conversation_id: if private_cloud_chunk_start_time is None: # Use timestamp from first buffer of this 5-second chunk private_cloud_chunk_start_time = buffer_start_timestamp private_cloud_sync_buffer.extend(audio_data) - # Save chunk every 5 seconds (sample_rate * 2 bytes per sample * 5 seconds) - if len(private_cloud_sync_buffer) >= sample_rate * 2 * private_cloud_sync_delay_seconds: - chunk_data = bytes(private_cloud_sync_buffer) - timestamp = private_cloud_chunk_start_time - conv_id = current_conversation_id - asyncio.run_coroutine_threadsafe( - save_audio_chunk(chunk_data, uid, conv_id, timestamp), loop + # Queue chunk every 5 seconds (sample_rate * 2 bytes per sample * 5 seconds) + if len(private_cloud_sync_buffer) >= sample_rate * 2 * PRIVATE_CLOUD_CHUNK_DURATION: + private_cloud_queue.append( + { + 'data': bytes(private_cloud_sync_buffer), + 'conversation_id': current_conversation_id, + 'timestamp': private_cloud_chunk_start_time, + 'retries': 0, + } ) private_cloud_sync_buffer = bytearray() private_cloud_chunk_start_time = None @@ -438,12 +481,24 @@ async def 
receive_tasks(): print(f'Could not process audio: error {e}') websocket_close_code = 1011 finally: + # Flush any remaining private cloud sync buffer before shutdown + if private_cloud_sync_enabled and current_conversation_id and len(private_cloud_sync_buffer) > 0: + private_cloud_queue.append( + { + 'data': bytes(private_cloud_sync_buffer), + 'conversation_id': current_conversation_id, + 'timestamp': private_cloud_chunk_start_time or time.time(), + 'retries': 0, + } + ) + print(f"Flushed final private cloud buffer: {len(private_cloud_sync_buffer)} bytes", uid) websocket_active = False try: receive_task = asyncio.create_task(receive_tasks()) speaker_sample_task = asyncio.create_task(process_speaker_sample_queue()) - await asyncio.gather(receive_task, speaker_sample_task) + private_cloud_task = asyncio.create_task(process_private_cloud_queue()) + await asyncio.gather(receive_task, speaker_sample_task, private_cloud_task) except Exception as e: print(f"Error during WebSocket operation: {e}") From ebd3b35c1cdda681da2526436eb342f5fda645cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Tue, 30 Dec 2025 11:02:36 +0700 Subject: [PATCH 08/14] Segment-level accuracy for the conversation audio player --- app/lib/pages/capture/widgets/widgets.dart | 4 + app/lib/pages/conversation_detail/page.dart | 38 +++- app/lib/widgets/conversation_bottom_bar.dart | 222 +++++++++++++++++-- app/lib/widgets/transcript.dart | 51 ++++- backend/routers/sync.py | 13 +- backend/utils/other/storage.py | 69 +++++- 6 files changed, 352 insertions(+), 45 deletions(-) diff --git a/app/lib/pages/capture/widgets/widgets.dart b/app/lib/pages/capture/widgets/widgets.dart index d853bfe16b..ab8ba2369f 100644 --- a/app/lib/pages/capture/widgets/widgets.dart +++ b/app/lib/pages/capture/widgets/widgets.dart @@ -189,6 +189,8 @@ getTranscriptWidget( String searchQuery = '', int currentResultIndex = -1, VoidCallback? onTapWhenSearchEmpty, + Function(double segmentStartSeconds)? onPlaySegment, + bool hasAudio = false, }) { if (conversationCreating) { return const Padding( @@ -221,6 +223,8 @@ getTranscriptWidget( searchQuery: searchQuery, currentResultIndex: currentResultIndex, onTapWhenSearchEmpty: onTapWhenSearchEmpty, + onPlaySegment: onPlaySegment, + hasAudio: hasAudio, ); } diff --git a/app/lib/pages/conversation_detail/page.dart b/app/lib/pages/conversation_detail/page.dart index dfd9ff4bcb..c8bfc57a55 100644 --- a/app/lib/pages/conversation_detail/page.dart +++ b/app/lib/pages/conversation_detail/page.dart @@ -49,6 +49,7 @@ class _ConversationDetailPageState extends State with Ti final focusTitleField = FocusNode(); final focusOverviewField = FocusNode(); final GlobalKey _shareButtonKey = GlobalKey(); + final GlobalKey _audioBarKey = GlobalKey(); TabController? 
_controller; final AppReviewService _appReviewService = AppReviewService(); ConversationTab selectedTab = ConversationTab.summary; @@ -685,17 +686,25 @@ class _ConversationDetailPageState extends State with Ti controller: _controller, physics: const NeverScrollableScrollPhysics(), children: [ - TranscriptWidgets( - searchQuery: _searchQuery, - currentResultIndex: getCurrentResultIndexForHighlighting(), - onTapWhenSearchEmpty: () { - if (_isSearching && _searchQuery.isEmpty) { - setState(() { - _isSearching = false; - _searchController.clear(); - _searchFocusNode.unfocus(); - }); - } + Consumer( + builder: (context, detailProvider, _) { + return TranscriptWidgets( + searchQuery: _searchQuery, + currentResultIndex: getCurrentResultIndexForHighlighting(), + onTapWhenSearchEmpty: () { + if (_isSearching && _searchQuery.isEmpty) { + setState(() { + _isSearching = false; + _searchController.clear(); + _searchFocusNode.unfocus(); + }); + } + }, + onPlaySegment: (double segmentStartSeconds) { + _audioBarKey.currentState?.seekAndPlay(segmentStartSeconds); + }, + hasAudio: detailProvider.conversation.hasAudio(), + ); }, ), SummaryTab( @@ -732,6 +741,7 @@ class _ConversationDetailPageState extends State with Ti final hasActionItems = conversation.structured.actionItems.where((item) => !item.deleted).isNotEmpty; return ConversationBottomBar( + key: _audioBarKey, mode: ConversationBottomBarMode.detail, selectedTab: selectedTab, conversation: conversation, @@ -1083,12 +1093,16 @@ class TranscriptWidgets extends StatefulWidget { final String searchQuery; final int currentResultIndex; final VoidCallback? onTapWhenSearchEmpty; + final Function(double segmentStartSeconds)? onPlaySegment; + final bool hasAudio; const TranscriptWidgets({ super.key, this.searchQuery = '', this.currentResultIndex = -1, this.onTapWhenSearchEmpty, + this.onPlaySegment, + this.hasAudio = false, }); @override @@ -1152,6 +1166,8 @@ class _TranscriptWidgetsState extends State with AutomaticKee searchQuery: widget.searchQuery, currentResultIndex: widget.currentResultIndex, onTapWhenSearchEmpty: widget.onTapWhenSearchEmpty, + onPlaySegment: widget.onPlaySegment, + hasAudio: widget.hasAudio, editSegment: (segmentId, speakerId) { final connectivityProvider = Provider.of(context, listen: false); if (!connectivityProvider.isConnected) { diff --git a/app/lib/widgets/conversation_bottom_bar.dart b/app/lib/widgets/conversation_bottom_bar.dart index 449923c31e..8e96ffb967 100644 --- a/app/lib/widgets/conversation_bottom_bar.dart +++ b/app/lib/widgets/conversation_bottom_bar.dart @@ -1,3 +1,6 @@ +import 'dart:convert'; +import 'dart:typed_data'; + import 'package:cached_network_image/cached_network_image.dart'; import 'package:collection/collection.dart'; import 'package:flutter/material.dart'; @@ -42,10 +45,10 @@ class ConversationBottomBar extends StatefulWidget { }); @override - State createState() => _ConversationBottomBarState(); + State createState() => ConversationBottomBarState(); } -class _ConversationBottomBarState extends State { +class ConversationBottomBarState extends State { // Audio player for inline controls AudioPlayer? _audioPlayer; bool _isAudioLoading = false; @@ -56,14 +59,97 @@ class _ConversationBottomBarState extends State { @override void initState() { super.initState(); - _calculateTotalDuration(); + _calculateTotalDurationWithGaps(); + } + + /// Creates a silent audio source of the specified duration. + /// Uses minimal WAV format: 16kHz, mono, 16-bit PCM with zero samples. 
+ AudioSource _createSilenceSource(Duration duration) { + const int sampleRate = 16000; + const int numChannels = 1; + const int bitsPerSample = 16; + const int bytesPerSample = bitsPerSample ~/ 8; + + final int numSamples = (duration.inMilliseconds * sampleRate / 1000).round(); + final int dataSize = numSamples * numChannels * bytesPerSample; + final int fileSize = 36 + dataSize; + + final buffer = Uint8List(44 + dataSize); + final byteData = ByteData.view(buffer.buffer); + + // RIFF header + buffer[0] = 0x52; // 'R' + buffer[1] = 0x49; // 'I' + buffer[2] = 0x46; // 'F' + buffer[3] = 0x46; // 'F' + byteData.setUint32(4, fileSize, Endian.little); + buffer[8] = 0x57; // 'W' + buffer[9] = 0x41; // 'A' + buffer[10] = 0x56; // 'V' + buffer[11] = 0x45; // 'E' + + // fmt subchunk + buffer[12] = 0x66; // 'f' + buffer[13] = 0x6D; // 'm' + buffer[14] = 0x74; // 't' + buffer[15] = 0x20; // ' ' + byteData.setUint32(16, 16, Endian.little); + byteData.setUint16(20, 1, Endian.little); + byteData.setUint16(22, numChannels, Endian.little); + byteData.setUint32(24, sampleRate, Endian.little); + byteData.setUint32(28, sampleRate * numChannels * bytesPerSample, Endian.little); + byteData.setUint16(32, numChannels * bytesPerSample, Endian.little); + byteData.setUint16(34, bitsPerSample, Endian.little); + + // data subchunk + buffer[36] = 0x64; // 'd' + buffer[37] = 0x61; // 'a' + buffer[38] = 0x74; // 't' + buffer[39] = 0x61; // 'a' + byteData.setUint32(40, dataSize, Endian.little); + + // Audio data (bytes 44+) is zeros = silence + + final base64Data = base64Encode(buffer); + return AudioSource.uri(Uri.parse('data:audio/wav;base64,$base64Data')); + } + + /// Seek to a specific segment time (in conversation-relative seconds) and start playback. + /// Since playlist now includes silence gaps, segment time maps directly to playlist position. + Future seekAndPlay(double segmentStartSeconds) async { + if (!_isAudioInitialized && !_isAudioLoading) { + await _initAudioIfNeeded(); + } + if (!mounted) return; + if (_audioPlayer == null) return; + + final conversation = widget.conversation; + if (conversation == null || conversation.audioFiles.isEmpty) return; + + // With silence gaps in playlist, segment time = playlist position directly + final targetPosition = Duration(milliseconds: (segmentStartSeconds * 1000).toInt()); + + // Clamp to valid range + final clampedPosition = targetPosition > _totalDuration ? _totalDuration : targetPosition; + final finalPosition = clampedPosition.isNegative ? Duration.zero : clampedPosition; + + // Track play event + MixpanelManager().audioPlaybackStarted( + conversationId: conversation.id, + durationSeconds: _totalDuration.inSeconds > 0 ? _totalDuration.inSeconds : null, + ); + + // Seek using combined position which handles track selection + await _seekToCombinedPosition(finalPosition); + await _audioPlayer!.play(); + if (mounted) setState(() {}); } @override void didUpdateWidget(ConversationBottomBar oldWidget) { super.didUpdateWidget(oldWidget); if (widget.conversation?.id != oldWidget.conversation?.id) { - _calculateTotalDuration(); + _calculateTotalDurationWithGaps(); } } @@ -73,15 +159,53 @@ class _ConversationBottomBarState extends State { super.dispose(); } - void _calculateTotalDuration() { + /// Calculates total duration including gaps between audio files. + /// This builds _trackStartOffsets to include silence tracks. 
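+  /// Worked example (assumed timings): two 30 s files recorded at 10:00:00 and
+  /// 10:00:45 yield offsets [0 s, 30 s (15 s silence), 45 s] and a 75 s total,
+  /// so a transcript time of 50 s maps to 5 s into the second file.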
+ void _calculateTotalDurationWithGaps() { if (widget.conversation == null) return; - double totalSeconds = 0; + + final conversation = widget.conversation!; + final conversationStartedAt = conversation.startedAt; + + // Sort audio files by startedAt + final sortedAudioFiles = conversation.audioFiles.where((af) => af.startedAt != null).toList() + ..sort((a, b) => a.startedAt!.compareTo(b.startedAt!)); + + if (sortedAudioFiles.isEmpty) { + _totalDuration = Duration.zero; + _trackStartOffsets = []; + return; + } + _trackStartOffsets = []; - for (final audioFile in widget.conversation!.audioFiles) { - _trackStartOffsets.add(Duration(milliseconds: (totalSeconds * 1000).toInt())); - totalSeconds += audioFile.duration; + double currentTimeMs = 0; + + // Reference point for calculating gaps + DateTime? referenceStart = conversationStartedAt ?? sortedAudioFiles.first.startedAt; + DateTime? previousEndTime = referenceStart; + + for (final audioFile in sortedAudioFiles) { + final fileStart = audioFile.startedAt!; + + // Calculate gap from previous end to this file's start + if (previousEndTime != null) { + final gapMs = fileStart.difference(previousEndTime).inMilliseconds; + if (gapMs > 100) { + // Add offset for silence track + _trackStartOffsets.add(Duration(milliseconds: currentTimeMs.toInt())); + currentTimeMs += gapMs; + } + } + + // Add offset for audio file + _trackStartOffsets.add(Duration(milliseconds: currentTimeMs.toInt())); + currentTimeMs += audioFile.duration * 1000; + + // Update previous end time + previousEndTime = fileStart.add(Duration(milliseconds: (audioFile.duration * 1000).toInt())); } - _totalDuration = Duration(milliseconds: (totalSeconds * 1000).toInt()); + + _totalDuration = Duration(milliseconds: currentTimeMs.toInt()); } Duration _getCombinedPosition(int? currentIndex, Duration trackPosition) { @@ -101,39 +225,78 @@ class _ConversationBottomBarState extends State { _isAudioLoading = true; }); - _calculateTotalDuration(); - try { _audioPlayer = AudioPlayer(); - final signedUrlInfos = await getConversationAudioSignedUrls(widget.conversation!.id); - final audioFileIds = widget.conversation!.audioFiles.map((af) => af.id).toList(); + final conversation = widget.conversation!; + final conversationStartedAt = conversation.startedAt; - List audioSources = []; + // Sort audio files by startedAt + final sortedAudioFiles = conversation.audioFiles.where((af) => af.startedAt != null).toList() + ..sort((a, b) => a.startedAt!.compareTo(b.startedAt!)); + + if (sortedAudioFiles.isEmpty) { + debugPrint('No audio files with startedAt found'); + return; + } + + // Fetch signed URLs for all audio files + final signedUrlInfos = await getConversationAudioSignedUrls(conversation.id); Map? fallbackHeaders; - for (final fileId in audioFileIds) { - // Find matching signed URL info + // Build playlist with silence gaps + List audioSources = []; + _trackStartOffsets = []; + double currentTimeMs = 0; + + // Reference point for calculating gaps + DateTime? referenceStart = conversationStartedAt ?? sortedAudioFiles.first.startedAt; + DateTime? 
previousEndTime = referenceStart; + + for (final audioFile in sortedAudioFiles) { + final fileStart = audioFile.startedAt!; + + // Calculate gap from previous end to this file's start + if (previousEndTime != null) { + final gapMs = fileStart.difference(previousEndTime).inMilliseconds; + if (gapMs > 100) { + // Add silence track for gap + _trackStartOffsets.add(Duration(milliseconds: currentTimeMs.toInt())); + audioSources.add(_createSilenceSource(Duration(milliseconds: gapMs))); + currentTimeMs += gapMs; + debugPrint('Added silence gap: ${gapMs}ms before audio file ${audioFile.id}'); + } + } + + // Add offset for audio file + _trackStartOffsets.add(Duration(milliseconds: currentTimeMs.toInt())); + + // Get audio source for this file final urlInfo = signedUrlInfos.firstWhere( - (info) => info.id == fileId, - orElse: () => AudioFileUrlInfo(id: fileId, status: 'pending', duration: 0), + (info) => info.id == audioFile.id, + orElse: () => AudioFileUrlInfo(id: audioFile.id, status: 'pending', duration: 0), ); if (urlInfo.isCached && urlInfo.signedUrl != null) { - // Use signed URL directly audioSources.add(AudioSource.uri(Uri.parse(urlInfo.signedUrl!))); } else { - // Fall back to API URL fallbackHeaders ??= await getAudioHeaders(); final apiUrl = getAudioStreamUrl( - conversationId: widget.conversation!.id, - audioFileId: fileId, + conversationId: conversation.id, + audioFileId: audioFile.id, format: 'wav', ); audioSources.add(AudioSource.uri(Uri.parse(apiUrl), headers: fallbackHeaders)); } + + currentTimeMs += audioFile.duration * 1000; + + // Update previous end time + previousEndTime = fileStart.add(Duration(milliseconds: (audioFile.duration * 1000).toInt())); } + _totalDuration = Duration(milliseconds: currentTimeMs.toInt()); + final playlist = ConcatenatingAudioSource( useLazyPreparation: true, children: audioSources, @@ -141,6 +304,19 @@ class _ConversationBottomBarState extends State { await _audioPlayer!.setAudioSource(playlist, preload: true); _isAudioInitialized = true; + + // Seek to first segment start position + // With silence gaps, segment time = playlist position directly + if (conversation.transcriptSegments.isNotEmpty) { + final firstSegmentStart = conversation.transcriptSegments.first.start; + final targetPosition = Duration(milliseconds: (firstSegmentStart * 1000).toInt()); + + // Clamp to valid range + final clampedPosition = targetPosition > _totalDuration ? Duration.zero : targetPosition; + final finalPosition = clampedPosition.isNegative ? Duration.zero : clampedPosition; + + await _seekToCombinedPosition(finalPosition); + } } catch (e) { debugPrint('Error initializing audio: $e'); } finally { diff --git a/app/lib/widgets/transcript.dart b/app/lib/widgets/transcript.dart index 03d096debd..498cc1d9bf 100644 --- a/app/lib/widgets/transcript.dart +++ b/app/lib/widgets/transcript.dart @@ -28,6 +28,8 @@ class TranscriptWidget extends StatefulWidget { final int currentResultIndex; final Function(ScrollController)? onScrollControllerReady; final VoidCallback? onTapWhenSearchEmpty; + final Function(double segmentStartSeconds)? 
onPlaySegment; + final bool hasAudio; const TranscriptWidget({ super.key, @@ -46,6 +48,8 @@ class TranscriptWidget extends StatefulWidget { this.currentResultIndex = -1, this.onScrollControllerReady, this.onTapWhenSearchEmpty, + this.onPlaySegment, + this.hasAudio = false, }); @override @@ -657,12 +661,55 @@ class _TranscriptWidgetState extends State { const SizedBox(height: 4), _buildTranslationNotice(), ], - // Timestamp and provider (only shown when toggled) - if (_showSpeakerNames && (widget.canDisplaySeconds || data.sttProvider != null)) ...[ + // Timestamp, provider, and play button (only shown when toggled) + if (_showSpeakerNames && + (widget.canDisplaySeconds || + data.sttProvider != null || + (widget.hasAudio && widget.onPlaySegment != null))) ...[ const SizedBox(height: 4), Row( mainAxisAlignment: MainAxisAlignment.end, children: [ + // Play button for audio playback + if (widget.hasAudio && widget.onPlaySegment != null) ...[ + GestureDetector( + onTap: () { + widget.onPlaySegment?.call(data.start); + }, + child: Row( + mainAxisSize: MainAxisSize.min, + children: [ + Icon( + Icons.play_arrow_rounded, + color: isUser + ? Colors.white.withValues(alpha: 0.7) + : Colors.grey.shade400, + size: 14, + ), + const SizedBox(width: 2), + Text( + 'Play', + style: TextStyle( + color: isUser + ? Colors.white.withValues(alpha: 0.7) + : Colors.grey.shade400, + fontSize: 11, + ), + ), + ], + ), + ), + if (widget.canDisplaySeconds || data.sttProvider != null) + Text( + ' ยท ', + style: TextStyle( + color: isUser + ? Colors.white.withValues(alpha: 0.5) + : Colors.grey.shade500, + fontSize: 10, + ), + ), + ], if (data.sttProvider != null) ...[ Text( SttProviderConfig.getDisplayName(data.sttProvider), diff --git a/backend/routers/sync.py b/backend/routers/sync.py index 57b153751f..d0e684333c 100644 --- a/backend/routers/sync.py +++ b/backend/routers/sync.py @@ -28,6 +28,9 @@ get_or_create_merged_audio, get_merged_audio_signed_url, ) + +# Audio constants +AUDIO_SAMPLE_RATE = 16000 from utils import encryption from utils.stt.pre_recorded import deepgram_prerecorded, postprocess_words from utils.stt.vad import vad_is_empty @@ -102,7 +105,7 @@ def parse_range_header(range_header: str, file_size: int) -> tuple[int, int] | N # ********************************************** -def _precache_audio_file(uid: str, conversation_id: str, audio_file: dict): +def _precache_audio_file(uid: str, conversation_id: str, audio_file: dict, fill_gaps: bool = True): """Pre-cache a single audio file.""" try: audio_file_id = audio_file.get('id') @@ -116,6 +119,8 @@ def _precache_audio_file(uid: str, conversation_id: str, audio_file: dict): audio_file_id=audio_file_id, timestamps=timestamps, pcm_to_wav_func=pcm_to_wav, + fill_gaps=fill_gaps, + sample_rate=AUDIO_SAMPLE_RATE, ) print(f"Pre-cached audio file: {audio_file_id}") except Exception as e: @@ -310,11 +315,15 @@ def download_audio_file_endpoint( audio_file_id=audio_file_id, timestamps=audio_file['chunk_timestamps'], pcm_to_wav_func=pcm_to_wav, + fill_gaps=True, + sample_rate=AUDIO_SAMPLE_RATE, ) content_type = "audio/wav" extension = "wav" else: - audio_data = download_audio_chunks_and_merge(uid, conversation_id, audio_file['chunk_timestamps']) + audio_data = download_audio_chunks_and_merge( + uid, conversation_id, audio_file['chunk_timestamps'], fill_gaps=True, sample_rate=AUDIO_SAMPLE_RATE + ) content_type = "application/octet-stream" extension = "pcm" except FileNotFoundError: diff --git a/backend/utils/other/storage.py b/backend/utils/other/storage.py index 
5833783295..68efa715ba 100644 --- a/backend/utils/other/storage.py +++ b/backend/utils/other/storage.py @@ -381,7 +381,13 @@ def delete_conversation_audio_files(uid: str, conversation_id: str) -> None: blob.delete() -def download_audio_chunks_and_merge(uid: str, conversation_id: str, timestamps: List[float]) -> bytes: +def download_audio_chunks_and_merge( + uid: str, + conversation_id: str, + timestamps: List[float], + fill_gaps: bool = True, + sample_rate: int = 16000, +) -> bytes: """ Download and merge audio chunks on-demand, handling mixed encryption states. Downloads chunks in parallel. @@ -391,6 +397,9 @@ def download_audio_chunks_and_merge(uid: str, conversation_id: str, timestamps: uid: User ID conversation_id: Conversation ID timestamps: List of chunk timestamps to merge + fill_gaps: If True, insert silence (zero bytes) between chunks to maintain + continuous time-aligned audio. Default True. + sample_rate: Audio sample rate in Hz (default 16000) Returns: Merged audio bytes (PCM16) @@ -441,9 +450,39 @@ def download_single_chunk(timestamp: float) -> tuple[float, bytes | None]: # Merge chunks merged_data = bytearray() - for timestamp in timestamps: - if timestamp in chunk_results: - merged_data.extend(chunk_results[timestamp]) + + if fill_gaps and timestamps and chunk_results: + # Sort timestamps to ensure proper ordering + sorted_timestamps = sorted(timestamps) + first_timestamp = sorted_timestamps[0] + current_time = first_timestamp # Track current audio end time in seconds + + for timestamp in sorted_timestamps: + if timestamp not in chunk_results: + continue + + pcm_data = chunk_results[timestamp] + + # Calculate gap from current position to this chunk's start + gap_seconds = timestamp - current_time + if gap_seconds > 0: + # Insert silence: 16-bit mono = 2 bytes per sample + gap_samples = int(gap_seconds * sample_rate) + silence_bytes = bytes(gap_samples * 2) # Zero bytes for silence + merged_data.extend(silence_bytes) + print(f"Filled {gap_seconds:.3f}s gap ({len(silence_bytes)} bytes) before chunk at {timestamp}") + + merged_data.extend(pcm_data) + + # Update current time based on chunk duration + # PCM16 mono: 2 bytes per sample + chunk_duration = len(pcm_data) / (sample_rate * 2) + current_time = timestamp + chunk_duration + else: + # Original behavior - just concatenate without gap filling + for timestamp in timestamps: + if timestamp in chunk_results: + merged_data.extend(chunk_results[timestamp]) if not merged_data: raise FileNotFoundError(f"No chunks found for conversation {conversation_id}") @@ -457,7 +496,13 @@ def get_cached_merged_audio_path(uid: str, conversation_id: str, audio_file_id: def get_or_create_merged_audio( - uid: str, conversation_id: str, audio_file_id: str, timestamps: List[float], pcm_to_wav_func + uid: str, + conversation_id: str, + audio_file_id: str, + timestamps: List[float], + pcm_to_wav_func, + fill_gaps: bool = True, + sample_rate: int = 16000, ) -> tuple[bytes, bool]: """ Get merged audio from cache or create it. @@ -469,6 +514,8 @@ def get_or_create_merged_audio( audio_file_id: Audio file ID timestamps: List of chunk timestamps pcm_to_wav_func: Function to convert PCM to WAV + fill_gaps: If True, insert silence between chunks to maintain time alignment. Default True. 
+ sample_rate: Audio sample rate in Hz (default 16000) Returns: Tuple of (audio_data_bytes, was_cached) @@ -500,7 +547,9 @@ def get_or_create_merged_audio( print(f"Cache miss, merging audio for: {cache_path}") # Download and merge chunks - pcm_data = download_audio_chunks_and_merge(uid, conversation_id, timestamps) + pcm_data = download_audio_chunks_and_merge( + uid, conversation_id, timestamps, fill_gaps=fill_gaps, sample_rate=sample_rate + ) # Convert to WAV wav_data = pcm_to_wav_func(pcm_data) @@ -574,7 +623,9 @@ def _pcm_to_wav(pcm_data: bytes, sample_rate: int = 16000, channels: int = 1) -> return wav_buffer.getvalue() -def precache_conversation_audio(uid: str, conversation_id: str, audio_files: list) -> None: +def precache_conversation_audio( + uid: str, conversation_id: str, audio_files: list, fill_gaps: bool = True, sample_rate: int = 16000 +) -> None: """ Pre-cache all audio files for a conversation in a background thread. @@ -582,6 +633,8 @@ def precache_conversation_audio(uid: str, conversation_id: str, audio_files: lis uid: User ID conversation_id: Conversation ID audio_files: List of audio file dicts with 'id' and 'chunk_timestamps' + fill_gaps: If True, insert silence between chunks to maintain time alignment. Default True. + sample_rate: Audio sample rate in Hz (default 16000) """ if not audio_files: return @@ -600,6 +653,8 @@ def _cache_single(af): audio_file_id=audio_file_id, timestamps=timestamps, pcm_to_wav_func=_pcm_to_wav, + fill_gaps=fill_gaps, + sample_rate=sample_rate, ) except Exception as e: print(f"[PRECACHE] Error caching audio file {af.get('id')}: {e}") From 6c0adaa9e8899b6617368e44464b4021b74358b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Tue, 30 Dec 2025 22:34:15 +0700 Subject: [PATCH 09/14] Speaker embeding --- backend/Dockerfile | 62 +++-- backend/compose.yaml | 43 +++ backend/database/users.py | 58 +++- backend/pusher.Dockerfile | 74 +++++ backend/routers/conversations.py | 13 + backend/routers/pusher.py | 188 ++----------- backend/routers/transcribe.py | 342 ++++++++++++++++++++++-- backend/utils/speaker_identification.py | 314 +++++++++++++++++++++- backend/utils/stt/speaker_embedding.py | 176 ++++++++++++ 9 files changed, 1061 insertions(+), 209 deletions(-) create mode 100644 backend/compose.yaml create mode 100644 backend/pusher.Dockerfile create mode 100644 backend/utils/stt/speaker_embedding.py diff --git a/backend/Dockerfile b/backend/Dockerfile index 71ff23dd1a..6f5c0b43c3 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,9 +1,7 @@ -FROM python:3.11 AS builder +# Builder stage - compile liblc3 +FROM tiangolo/uvicorn-gunicorn:python3.11 as builder -ENV PATH="/opt/venv/bin:$PATH" -RUN python -m venv /opt/venv - -# Install build dependencies for liblc3 +# Install build dependencies RUN apt-get update && apt-get install -y \ git \ gcc \ @@ -24,20 +22,18 @@ RUN git clone https://github.com/google/liblc3.git && \ cd /tmp/liblc3 && \ python3 -m pip wheel --no-cache-dir --wheel-dir /tmp/wheels . 
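+# The wheel directory built above is copied into the runtime stage below and
+# installed there, so the build toolchain never ships in the final image.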
-# Install Python requirements -WORKDIR /opt/venv -COPY backend/requirements.txt /tmp/requirements.txt -RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt - -FROM python:3.11-slim - -WORKDIR /app -ENV PATH="/opt/venv/bin:$PATH" -ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH +# Runtime stage - minimal image +FROM tiangolo/uvicorn-gunicorn:python3.11 -RUN apt-get update && apt-get -y install ffmpeg curl unzip && rm -rf /var/lib/apt/lists/* +# Only install runtime dependencies +RUN apt-get update && apt-get install -y \ + ffmpeg \ + curl \ + unzip \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* -# Copy compiled liblc3 library and wheel from builder +# Copy compiled library and wheel from builder COPY --from=builder /usr/local/lib/liblc3.so* /usr/local/lib/ COPY --from=builder /tmp/wheels /tmp/wheels @@ -46,8 +42,34 @@ RUN ldconfig && \ pip install --no-cache-dir /tmp/wheels/*.whl && \ rm -rf /tmp/wheels -COPY --from=builder /opt/venv /opt/venv -COPY backend/ . +ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + +# Install Python requirements +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +WORKDIR /app + +# COPY . . +COPY ./routers ./routers +COPY ./pretrained_models ./pretrained_models +COPY ./database ./database +COPY ./migrations ./migrations +COPY ./memories-tuner ./tuner +COPY ./pusher ./pusher +COPY ./typesense ./typesense +COPY ./charts ./charts +COPY ./utils ./utils +COPY ./models ./models +COPY ./testing ./testing +COPY ./scripts ./scripts +COPY ./templates ./templates +COPY ./modal ./modal +COPY ./migration ./migration +COPY google-credentials.json ./ EXPOSE 8080 -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] + +CMD uvicorn main:app --host 0.0.0.0 --port 8080 #--limit-concurrency 10 +#CMD gunicorn main:app -k uvicorn.workers.UvicornWorker --workers 2 --bind 0.0.0.0:8080 diff --git a/backend/compose.yaml b/backend/compose.yaml new file mode 100644 index 0000000000..a57e40ac02 --- /dev/null +++ b/backend/compose.yaml @@ -0,0 +1,43 @@ +services: + api: + build: + context: . + dockerfile: Dockerfile + ports: + - 8088:8080 + volumes: + - .:/app + env_file: + - .env + mem_limit: 2g + pusher: + build: + context: . + dockerfile: pusher.Dockerfile + ports: + - 8098:8080 + volumes: + - .:/app + env_file: + - .env + mem_limit: 1g + vad: + build: + context: . + dockerfile: vad.Dockerfile + mem_limit: 3g + ports: + - 8188:8080 + env_file: + - .env + plugins: + build: + context: ../plugins/example + dockerfile: Dockerfile + ports: + - 8189:8000 + volumes: + - ../plugins/example:/app + env_file: + - ../plugins/example/.env + mem_limit: 512m diff --git a/backend/database/users.py b/backend/database/users.py index 32239fc148..92ed91d519 100644 --- a/backend/database/users.py +++ b/backend/database/users.py @@ -127,10 +127,12 @@ def add_person_speech_sample(uid: str, person_id: str, sample_path: str, max_sam if len(current_samples) >= max_samples: return False - person_ref.update({ - 'speech_samples': firestore.ArrayUnion([sample_path]), - 'updated_at': datetime.now(timezone.utc), - }) + person_ref.update( + { + 'speech_samples': firestore.ArrayUnion([sample_path]), + 'updated_at': datetime.now(timezone.utc), + } + ) return True @@ -146,6 +148,54 @@ def get_person_speech_samples_count(uid: str, person_id: str) -> int: return len(person_data.get('speech_samples', [])) +def set_person_speaker_embedding(uid: str, person_id: str, embedding: list) -> bool: + """ + Store speaker embedding for a person. 
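+    The embedding is persisted as a plain list of floats so it round-trips through
+    Firestore; callers convert numpy arrays via embedding.flatten().tolist().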
+ + Args: + uid: User ID + person_id: Person ID + embedding: List of floats representing the speaker embedding + + Returns: + True if stored successfully, False if person not found + """ + person_ref = db.collection('users').document(uid).collection('people').document(person_id) + person_doc = person_ref.get() + + if not person_doc.exists: + return False + + person_ref.update( + { + 'speaker_embedding': embedding, + 'updated_at': datetime.now(timezone.utc), + } + ) + return True + + +def get_person_speaker_embedding(uid: str, person_id: str) -> Optional[list]: + """ + Get speaker embedding for a person. + + Args: + uid: User ID + person_id: Person ID + + Returns: + List of floats representing the embedding, or None if not found + """ + person_ref = db.collection('users').document(uid).collection('people').document(person_id) + person_doc = person_ref.get() + + if not person_doc.exists: + return None + + person_data = person_doc.to_dict() + return person_data.get('speaker_embedding') + + def delete_user_data(uid: str): user_ref = db.collection('users').document(uid) if not user_ref.get().exists: diff --git a/backend/pusher.Dockerfile b/backend/pusher.Dockerfile new file mode 100644 index 0000000000..a59f47e10c --- /dev/null +++ b/backend/pusher.Dockerfile @@ -0,0 +1,74 @@ +# Builder stage - compile liblc3 +FROM tiangolo/uvicorn-gunicorn:python3.11 as builder + +# Install build dependencies +RUN apt-get update && apt-get install -y \ + git \ + gcc \ + g++ \ + meson \ + ninja-build \ + python3-dev \ + && rm -rf /var/lib/apt/lists/* + +# Build liblc3 and create wheel +WORKDIR /tmp +RUN git clone https://github.com/google/liblc3.git && \ + cd liblc3 && \ + meson setup build && \ + cd build && \ + meson install && \ + ldconfig && \ + cd /tmp/liblc3 && \ + python3 -m pip wheel --no-cache-dir --wheel-dir /tmp/wheels . + +# Runtime stage - minimal image +FROM tiangolo/uvicorn-gunicorn:python3.11 + +# Only install runtime dependencies +RUN apt-get update && apt-get install -y \ + ffmpeg \ + curl \ + unzip \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Copy compiled library and wheel from builder +COPY --from=builder /usr/local/lib/liblc3.so* /usr/local/lib/ +COPY --from=builder /tmp/wheels /tmp/wheels + +# Install liblc3 Python package and set library path +RUN ldconfig && \ + pip install --no-cache-dir /tmp/wheels/*.whl && \ + rm -rf /tmp/wheels + +ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + +# Install Python requirements (now including lc3py if present) +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +WORKDIR /app + +#COPY . . 
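+# Copy only the packages the pusher service needs rather than the whole source tree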
+COPY ./routers ./routers +COPY ./pretrained_models ./pretrained_models +COPY ./database ./database +COPY ./migrations ./migrations +COPY ./memories-tuner ./tuner +COPY ./pusher ./pusher +COPY ./typesense ./typesense +COPY ./charts ./charts +COPY ./utils ./utils +COPY ./models ./models +COPY ./testing ./testing +COPY ./scripts ./scripts +COPY ./templates ./templates +COPY ./modal ./modal +COPY ./migration ./migration +COPY google-credentials.json ./ + + +EXPOSE 8080 + +CMD uvicorn pusher.main:app --host 0.0.0.0 --port 8080 --limit-concurrency 16 --backlog 32 diff --git a/backend/routers/conversations.py b/backend/routers/conversations.py index fc8e5dee82..78f814d380 100644 --- a/backend/routers/conversations.py +++ b/backend/routers/conversations.py @@ -32,6 +32,7 @@ from utils.conversations.process_conversation import process_conversation, retrieve_in_progress_conversation from utils.conversations.search import search_conversations from utils.llm.conversation_processing import generate_summary_with_prompt +from utils.speaker_identification import extract_speaker_samples from utils.other import endpoints as auth from utils.other.storage import get_conversation_recording_if_exists from utils.app_integrations import trigger_external_integrations @@ -495,6 +496,7 @@ def set_assignee_conversation_segment( def assign_segments_bulk( conversation_id: str, data: BulkAssignSegmentsRequest, + background_tasks: BackgroundTasks, uid: str = Depends(auth.get_current_user_uid), ): conversation = _get_valid_conversation_by_id(uid, conversation_id) @@ -521,6 +523,17 @@ def assign_segments_bulk( conversations_db.update_conversation_segments( uid, conversation_id, [segment.dict() for segment in conversation.transcript_segments] ) + + # Trigger speaker sample extraction when assigning to a person + if data.assign_type == 'person_id' and value: + background_tasks.add_task( + extract_speaker_samples, + uid=uid, + person_id=value, + conversation_id=conversation_id, + segment_ids=data.segment_ids, + ) + return conversation diff --git a/backend/routers/pusher.py b/backend/routers/pusher.py index 981a63fd79..ea2d2c6372 100644 --- a/backend/routers/pusher.py +++ b/backend/routers/pusher.py @@ -26,19 +26,14 @@ realtime_transcript_webhook, get_audio_bytes_webhook_seconds, ) -from utils.other.storage import ( - upload_audio_chunk, - list_audio_chunks, - download_audio_chunks_and_merge, - upload_person_speech_sample_from_bytes, -) +from utils.other.storage import upload_audio_chunk +from utils.speaker_identification import extract_speaker_samples router = APIRouter() # Constants for speaker sample extraction -SPEAKER_SAMPLE_PROCESS_INTERVAL = 5.0 -SPEAKER_SAMPLE_MIN_SEGMENT_DURATION = 10.0 -SPEAKER_SAMPLE_MIN_AGE = 15.0 +SPEAKER_SAMPLE_PROCESS_INTERVAL = 15.0 +SPEAKER_SAMPLE_MIN_AGE = 120.0 # Constants for private cloud sync PRIVATE_CLOUD_SYNC_PROCESS_INTERVAL = 1.0 @@ -46,162 +41,6 @@ PRIVATE_CLOUD_SYNC_MAX_RETRIES = 3 -async def _extract_speaker_samples( - uid: str, - person_id: str, - conversation_id: str, - segment_ids: List[str], - sample_rate: int = 16000, -): - """ - Extract speech samples from segments and store as speaker profiles. - Fetches conversation from DB to get started_at and segment details. - Processes each segment one by one, stops when sample limit reached. 
- """ - try: - # Check current sample count once - sample_count = users_db.get_person_speech_samples_count(uid, person_id) - if sample_count >= 5: - print(f"Person {person_id} already has {sample_count} samples, skipping", uid, conversation_id) - return - - # Fetch conversation to get started_at and segment details - conversation = conversations_db.get_conversation(uid, conversation_id) - if not conversation: - print(f"Conversation {conversation_id} not found", uid) - return - - started_at = conversation.get('started_at') - if not started_at: - print(f"Conversation {conversation_id} has no started_at", uid) - return - - started_at_ts = started_at.timestamp() if hasattr(started_at, 'timestamp') else float(started_at) - - # Build segment lookup from conversation's transcript_segments - conv_segments = conversation.get('transcript_segments', []) - segment_map = {s.get('id'): s for s in conv_segments if s.get('id')} - - # List chunks from storage - chunks = list_audio_chunks(uid, conversation_id) - if not chunks: - print(f"No chunks found for {conversation_id}, skipping speaker sample extraction", uid) - return - - samples_added = 0 - max_samples_to_add = 5 - sample_count - - # Build ordered list with index lookup for expansion - ordered_segments = [s for s in conv_segments if s.get('id')] - segment_index_map = {s.get('id'): i for i, s in enumerate(ordered_segments)} - - for seg_id in segment_ids: - if samples_added >= max_samples_to_add: - break - - seg = segment_map.get(seg_id) - if not seg: - print(f"Segment {seg_id} not found in conversation", uid, conversation_id) - continue - - segment_start = seg.get('start') - segment_end = seg.get('end') - if segment_start is None or segment_end is None: - continue - - seg_duration = segment_end - segment_start - speaker_id = seg.get('speaker_id') - - # If segment is too short, try expanding to adjacent segments with same speaker - if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION and speaker_id is not None: - seg_idx = segment_index_map.get(seg_id) - if seg_idx is not None: - i = seg_idx - 1 - while i >= 0: - prev_seg = ordered_segments[i] - if prev_seg.get('speaker_id') != speaker_id: - break - prev_start = prev_seg.get('start') - if prev_start is not None: - segment_start = min(segment_start, prev_start) - seg_duration = segment_end - segment_start - if seg_duration >= SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: - print( - f"Expanded segment to {seg_duration:.1f}s by including adjacent segments", - uid, - conversation_id, - ) - break - i -= 1 - - if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: - print(f"Segment too short ({seg_duration:.1f}s) even after expansion, skipping", uid, conversation_id) - continue - - # Calculate absolute timestamps - abs_start = started_at_ts + segment_start - abs_end = started_at_ts + segment_end - - # Find relevant chunks - sorted_chunks = sorted(chunks, key=lambda c: c['timestamp']) - - # Find first chunk that starts at or before abs_start - first_idx = 0 - for i, chunk in enumerate(sorted_chunks): - if chunk['timestamp'] <= abs_start: - first_idx = i - else: - break - - # Collect from first_idx up to abs_end - relevant_timestamps = [] - for chunk in sorted_chunks[first_idx:]: - if chunk['timestamp'] <= abs_end: - relevant_timestamps.append(chunk['timestamp']) - else: - break - - if not relevant_timestamps: - print(f"No relevant chunks for segment {segment_start:.1f}-{segment_end:.1f}s", uid, conversation_id) - continue - - # Download, merge, and extract - merged = await 
asyncio.to_thread(download_audio_chunks_and_merge, uid, conversation_id, relevant_timestamps) - buffer_start = min(relevant_timestamps) - bytes_per_second = sample_rate * 2 # 16-bit mono - - start_byte = max(0, int((abs_start - buffer_start) * bytes_per_second)) - end_byte = min(len(merged), int((abs_end - buffer_start) * bytes_per_second)) - sample_audio = merged[start_byte:end_byte] - - # Ensure minimum sample length (0.5 seconds) - min_sample_bytes = int(sample_rate * 0.5 * 2) - if len(sample_audio) < min_sample_bytes: - print(f"Sample too short ({len(sample_audio)} bytes), skipping", uid, conversation_id) - continue - - # Upload and store - path = await asyncio.to_thread( - upload_person_speech_sample_from_bytes, sample_audio, uid, person_id, sample_rate - ) - - success = users_db.add_person_speech_sample(uid, person_id, path) - if success: - samples_added += 1 - seg_text = seg.get('text', '')[:100] # Truncate to 100 chars - print( - f"Stored speech sample {samples_added} for person {person_id}: segment_id={seg_id}, file={path}, text={seg_text}", - uid, - conversation_id, - ) - else: - print(f"Failed to add speech sample for person {person_id}", uid, conversation_id) - break # Likely hit limit - - except Exception as e: - print(f"Error extracting speaker samples: {e}", uid, conversation_id) - - async def _process_conversation_task(uid: str, conversation_id: str, language: str, websocket: WebSocket): """Process a conversation and send result back to _listen via websocket.""" try: @@ -301,6 +140,8 @@ async def process_private_cloud_queue(): chunks_to_process = private_cloud_queue.copy() private_cloud_queue = [] + successful_conversation_ids = set() # Track conversations with successful uploads + for chunk_info in chunks_to_process: chunk_data = chunk_info['data'] conv_id = chunk_info['conversation_id'] @@ -309,6 +150,7 @@ async def process_private_cloud_queue(): try: await asyncio.to_thread(upload_audio_chunk, chunk_data, uid, conv_id, timestamp) + successful_conversation_ids.add(conv_id) except Exception as e: if retries < PRIVATE_CLOUD_SYNC_MAX_RETRIES: # Re-queue with incremented retry count @@ -322,6 +164,20 @@ async def process_private_cloud_queue(): conv_id, ) + # Update audio_files for conversations with successful uploads + for conv_id in successful_conversation_ids: + try: + audio_files = await asyncio.to_thread(conversations_db.create_audio_files_from_chunks, uid, conv_id) + if audio_files: + await asyncio.to_thread( + conversations_db.update_conversation, + uid, + conv_id, + {'audio_files': [af.dict() for af in audio_files]}, + ) + except Exception as e: + print(f"Error updating audio files: {e}", uid, conv_id) + async def process_speaker_sample_queue(): """Background task that processes speaker sample extraction requests.""" nonlocal websocket_active, speaker_sample_queue @@ -354,7 +210,7 @@ async def process_speaker_sample_queue(): segment_ids = request['segment_ids'] try: - await _extract_speaker_samples( + await extract_speaker_samples( uid=uid, person_id=person_id, conversation_id=conv_id, diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index 068130305e..c579a8769c 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -5,11 +5,13 @@ import struct import time import uuid +import wave from datetime import datetime, timedelta, timezone from enum import Enum from typing import Dict, List, Optional, Set, Tuple, Callable import av +import numpy as np import opuslib # type: ignore import webrtcvad # type: ignore @@ -81,12 
+83,79 @@ from utils.webhooks import get_audio_bytes_webhook_seconds from utils.onboarding import OnboardingHandler +from utils.stt.speaker_embedding import ( + extract_embedding_from_bytes, + compare_embeddings, + SPEAKER_MATCH_THRESHOLD, +) + + router = APIRouter() PUSHER_ENABLED = bool(os.getenv('HOSTED_PUSHER_API_URL')) +class AudioRingBuffer: + """Circular buffer storing last N seconds of PCM16 mono audio with timestamp tracking.""" + + def __init__(self, duration_seconds: float, sample_rate: int): + self.sample_rate = sample_rate + self.bytes_per_second = sample_rate * 2 # PCM16 mono + self.capacity = int(duration_seconds * self.bytes_per_second) + self.buffer = bytearray(self.capacity) + self.write_pos = 0 + self.total_bytes_written = 0 + self.last_write_timestamp: Optional[float] = None + + def write(self, data: bytes, timestamp: float): + """Append audio data with timestamp.""" + for byte in data: + self.buffer[self.write_pos] = byte + self.write_pos = (self.write_pos + 1) % self.capacity + self.total_bytes_written += len(data) + self.last_write_timestamp = timestamp + + def get_time_range(self) -> Optional[Tuple[float, float]]: + """Return (start_ts, end_ts) of audio currently in buffer.""" + if self.last_write_timestamp is None: + return None + bytes_in_buffer = min(self.total_bytes_written, self.capacity) + buffer_duration = bytes_in_buffer / self.bytes_per_second + return (self.last_write_timestamp - buffer_duration, self.last_write_timestamp) + + def extract(self, start_ts: float, end_ts: float) -> Optional[bytes]: + """Extract audio for absolute timestamp range.""" + time_range = self.get_time_range() + if time_range is None: + return None + + buffer_start_ts, buffer_end_ts = time_range + actual_start = max(start_ts, buffer_start_ts) + actual_end = min(end_ts, buffer_end_ts) + + if actual_start >= actual_end: + return None + + bytes_in_buffer = min(self.total_bytes_written, self.capacity) + buffer_logical_start = (self.write_pos - bytes_in_buffer) % self.capacity + + start_offset = int((actual_start - buffer_start_ts) * self.bytes_per_second) + end_offset = int((actual_end - buffer_start_ts) * self.bytes_per_second) + + # Ensure even number of bytes (PCM16) + length = ((end_offset - start_offset) // 2) * 2 + if length <= 0: + return None + + result = bytearray(length) + for i in range(length): + pos = (buffer_logical_start + start_offset + i) % self.capacity + result[i] = self.buffer[pos] + + return bytes(result) + + class CustomSttMode(str, Enum): disabled = "disabled" enabled = "enabled" @@ -253,10 +322,21 @@ async def _listen( # Initialize segment buffers early (before onboarding handler needs them) realtime_segment_buffers = [] realtime_photo_buffers: list[ConversationPhoto] = [] - + + # === Speaker Identification State === + RING_BUFFER_DURATION = 60.0 # seconds + SPEAKER_ID_MIN_AUDIO = 2.0 + SPEAKER_ID_TARGET_AUDIO = 4.0 + + audio_ring_buffer: Optional[AudioRingBuffer] = None + speaker_id_segment_queue: asyncio.Queue[dict] = asyncio.Queue(maxsize=100) + person_embeddings_cache: Dict[str, dict] = {} # person_id -> {embedding, name} + speaker_id_enabled = False # Will be set after private_cloud_sync_enabled is known + # Onboarding handler onboarding_handler: Optional[OnboardingHandler] = None if onboarding_mode: + async def send_onboarding_event(event: dict): if websocket_active and websocket.client_state == WebSocketState.CONNECTED: try: @@ -271,7 +351,7 @@ def onboarding_stream_transcript(segments: List[dict]): onboarding_handler = OnboardingHandler(uid, 
send_onboarding_event, onboarding_stream_transcript) asyncio.create_task(onboarding_handler.send_current_question()) - + locked_conversation_ids: Set[str] = set() speaker_to_person_map: Dict[int, Tuple[str, str]] = {} segment_person_assignment_map: Dict[str, str] = {} @@ -415,6 +495,11 @@ async def send_heartbeat(): # Create or get conversation ID early for audio chunk storage private_cloud_sync_enabled = user_db.get_user_private_cloud_sync_enabled(uid) + # Enable speaker identification if not custom STT and private cloud sync is enabled + speaker_id_enabled = not use_custom_stt and private_cloud_sync_enabled + if speaker_id_enabled: + audio_ring_buffer = AudioRingBuffer(RING_BUFFER_DURATION, sample_rate) + # Conversation timeout (to process the conversation after x seconds of silence) # Max: 4h, min 2m conversation_creation_timeout = conversation_timeout @@ -621,7 +706,10 @@ def _process_speaker_assigned_segments(transcript_segments: List[TranscriptSegme segment.person_id = person_id def _update_in_progress_conversation( - conversation: Conversation, segments: List[TranscriptSegment], photos: List[ConversationPhoto], finished_at: datetime + conversation: Conversation, + segments: List[TranscriptSegment], + photos: List[ConversationPhoto], + finished_at: datetime, ): starts, ends = (0, 0) @@ -946,7 +1034,7 @@ async def _audio_bytes_flush(auto_reconnect: bool = True): # buffer_duration = buffer_length_bytes / (sample_rate * 2 bytes per sample) buffer_duration_seconds = len(audio_buffers) / (sample_rate * 2) buffer_start_time = (audio_buffer_last_received or time.time()) - buffer_duration_seconds - + # 101|timestamp(8 bytes double)|audio_data data = bytearray() data.extend(struct.pack("I", 101)) @@ -1077,13 +1165,24 @@ async def send_speaker_sample_request( try: data = bytearray() data.extend(struct.pack("I", 105)) - data.extend(bytes(json.dumps({ - "person_id": person_id, - "conversation_id": conv_id, - "segment_ids": segment_ids, - }), "utf-8")) + data.extend( + bytes( + json.dumps( + { + "person_id": person_id, + "conversation_id": conv_id, + "segment_ids": segment_ids, + } + ), + "utf-8", + ) + ) await pusher_ws.send(data) - print(f"Sent speaker sample request to pusher: person={person_id}, {len(segment_ids)} segments", uid, session_id) + print( + f"Sent speaker sample request to pusher: person={person_id}, {len(segment_ids)} segments", + uid, + session_id, + ) except Exception as e: print(f"Failed to send speaker sample request: {e}", uid, session_id) @@ -1225,6 +1324,180 @@ async def conversation_lifecycle_manager(): await _process_conversation(current_conversation_id) await _create_new_in_progress_conversation() + def _pcm_to_wav_bytes(pcm_data: bytes, sr: int) -> bytes: + """Convert PCM16 mono to WAV format using av.""" + output_buffer = io.BytesIO() + output_container = av.open(output_buffer, mode='w', format='wav') + output_stream = output_container.add_stream('pcm_s16le', rate=sr) + output_stream.layout = 'mono' + + samples = np.frombuffer(pcm_data, dtype=np.int16) + frame = av.AudioFrame.from_ndarray(samples.reshape(1, -1), format='s16', layout='mono') + frame.rate = sr + + for packet in output_stream.encode(frame): + output_container.mux(packet) + for packet in output_stream.encode(): + output_container.mux(packet) + + output_container.close() + return output_buffer.getvalue() + + async def speaker_identification_task(): + """Consume segment queue, accumulate per speaker, trigger match when ready.""" + nonlocal websocket_active, speaker_to_person_map + nonlocal 
person_embeddings_cache, audio_ring_buffer + + if not speaker_id_enabled: + return + + # Load person embeddings + try: + people = user_db.get_people(uid) + for person in people: + emb = person.get('speaker_embedding') + if emb: + person_embeddings_cache[person['id']] = { + 'embedding': np.array(emb, dtype=np.float32).reshape(1, -1), + 'name': person['name'], + } + print(f"Speaker ID: loaded {len(person_embeddings_cache)} person embeddings", uid, session_id) + except Exception as e: + print(f"Speaker ID: failed to load embeddings: {e}", uid, session_id) + return + + if not person_embeddings_cache: + print("Speaker ID: no stored embeddings, task disabled", uid, session_id) + return + + # Consume loop + while websocket_active: + try: + seg = await asyncio.wait_for(speaker_id_segment_queue.get(), timeout=2.0) + except asyncio.TimeoutError: + continue + + speaker_id = seg['speaker_id'] + + # Skip if already resolved + if speaker_id in speaker_to_person_map: + continue + + duration = seg['duration'] + if duration >= SPEAKER_ID_MIN_AUDIO: + asyncio.create_task(_match_speaker_embedding(speaker_id, seg)) + + print("Speaker ID task ended", uid, session_id) + + async def _match_speaker_embedding(speaker_id: int, segment: dict): + """Extract audio from ring buffer and match against stored embeddings.""" + nonlocal speaker_to_person_map, segment_person_assignment_map, audio_ring_buffer + + try: + seg_start = segment['abs_start'] + seg_end = segment['abs_end'] + duration = segment['duration'] + + if duration < SPEAKER_ID_MIN_AUDIO: + print(f"Speaker ID: segment too short ({duration:.1f}s)", uid, session_id) + return + + # Get buffer time range + time_range = audio_ring_buffer.get_time_range() + if time_range is None: + print(f"Speaker ID: buffer empty", uid, session_id) + return + + buffer_start_ts, buffer_end_ts = time_range + + # Calculate extraction range - stay within segment bounds, max 10 seconds from center + MAX_EXTRACT_DURATION = 10.0 + + if duration <= MAX_EXTRACT_DURATION: + # Segment fits within max duration, use full segment + extract_start = seg_start + extract_end = seg_end + else: + # Segment is longer than max, extract 10s from center + center = (seg_start + seg_end) / 2 + half_duration = MAX_EXTRACT_DURATION / 2 + extract_start = center - half_duration + extract_end = center + half_duration + + # Clamp to buffer availability + extract_start = max(buffer_start_ts, extract_start) + extract_end = min(buffer_end_ts, extract_end) + + if extract_end <= extract_start: + print(f"Speaker ID: no audio to extract", uid, session_id) + return + + # Extract only the needed bytes directly from ring buffer + pcm_data = audio_ring_buffer.extract(extract_start, extract_end) + if not pcm_data: + print(f"Speaker ID: failed to extract audio", uid, session_id) + return + + # Convert PCM to numpy for WAV encoding + samples = np.frombuffer(pcm_data, dtype=np.int16) + + # Convert PCM to WAV using av + output_buffer = io.BytesIO() + output_container = av.open(output_buffer, mode='w', format='wav') + output_stream = output_container.add_stream('pcm_s16le', rate=sample_rate) + output_stream.layout = 'mono' + + frame = av.AudioFrame.from_ndarray(samples.reshape(1, -1), format='s16', layout='mono') + frame.rate = sample_rate + + for packet in output_stream.encode(frame): + output_container.mux(packet) + for packet in output_stream.encode(): + output_container.mux(packet) + + output_container.close() + wav_bytes = output_buffer.getvalue() + + # Extract embedding (API call) + query_embedding = await 
asyncio.to_thread(extract_embedding_from_bytes, wav_bytes, "query.wav") + + # Find best match + best_match = None + best_distance = float('inf') + + for person_id, data in person_embeddings_cache.items(): + distance = compare_embeddings(query_embedding, data['embedding']) + if distance < best_distance: + best_distance = distance + best_match = (person_id, data['name']) + + if best_match and best_distance < SPEAKER_MATCH_THRESHOLD: + person_id, person_name = best_match + print( + f"Speaker ID: speaker {speaker_id} -> {person_name} (distance={best_distance:.3f})", uid, session_id + ) + + # Store for session consistency + speaker_to_person_map[speaker_id] = (person_id, person_name) + + # Auto-assign processed segment + segment_person_assignment_map[segment['id']] = person_id + + # Notify client + _send_message_event( + SpeakerLabelSuggestionEvent( + speaker_id=speaker_id, + person_id=person_id, + person_name=person_name, + segment_id=segment['id'], + ) + ) + else: + print(f"Speaker ID: speaker {speaker_id} no match (best={best_distance:.3f})", uid, session_id) + + except Exception as e: + print(f"Speaker ID: match error for speaker {speaker_id}: {e}", uid, session_id) + async def stream_transcript_process(): nonlocal websocket_active, realtime_segment_buffers, realtime_photo_buffers, websocket nonlocal current_conversation_id, translation_enabled, speaker_to_person_map, suggested_segments, words_transcribed_since_last_record, last_transcript_time @@ -1246,13 +1519,16 @@ async def stream_transcript_process(): # Get conversation conversation_data = conversations_db.get_conversation(uid, current_conversation_id) if not conversation_data: - print(f"Warning: conversation {current_conversation_id} not found during segment processing", uid, session_id) + print( + f"Warning: conversation {current_conversation_id} not found during segment processing", + uid, + session_id, + ) continue # Guard first_audio_byte_timestamp must be set if not first_audio_byte_timestamp: - print(f"Warning: first_audio_byte_timestamp not set, skipping segment processing", uid, -session_id) + print(f"Warning: first_audio_byte_timestamp not set, skipping segment processing", uid, session_id) continue transcript_segments = [] @@ -1295,9 +1571,7 @@ async def stream_transcript_process(): # Update transcript segments conversation = Conversation(**conversation_data) - result = _update_in_progress_conversation( - conversation, transcript_segments, photos_to_process, finished_at - ) + result = _update_in_progress_conversation(conversation, transcript_segments, photos_to_process, finished_at) if not result or not result[0]: continue conversation, (starts, ends) = result @@ -1321,8 +1595,8 @@ async def stream_transcript_process(): if segment.person_id or segment.is_user or segment.id in suggested_segments: continue + # Session consistency speaker identification if speech_profile_complete.is_set(): - # Session consistency if segment.speaker_id in speaker_to_person_map: person_id, person_name = speaker_to_person_map[segment.speaker_id] _send_message_event( @@ -1336,6 +1610,31 @@ async def stream_transcript_process(): suggested_segments.add(segment.id) continue + # Embedding-based speaker identification + if speaker_id_enabled and person_embeddings_cache: + started_at_ts = conversation.started_at.timestamp() + if ( + segment.speaker_id is not None + and not segment.person_id + and not segment.is_user + and segment.speaker_id not in speaker_to_person_map + ): + try: + speaker_id_segment_queue.put_nowait( + { + 'id': segment.id,
'speaker_id': segment.speaker_id, + 'abs_start': first_audio_byte_timestamp + + segment.start + - time_offset, # raw start/end + 'abs_end': first_audio_byte_timestamp + segment.end - time_offset, + 'duration': segment.end - segment.start, + 'text': segment.text, # TODO: remove + } + ) + except asyncio.QueueFull: + pass # Drop if queue is full + # Text-based detection detected_name = detect_speaker_from_text(segment.text) if detected_name: @@ -1413,7 +1712,7 @@ async def handle_image_chunk( async def receive_data(dg_socket, dg_profile_socket, soniox_sock, soniox_profile_sock, speechmatics_sock): nonlocal websocket_active, websocket_close_code, last_audio_received_time, last_activity_time, current_conversation_id nonlocal realtime_photo_buffers, speaker_to_person_map, first_audio_byte_timestamp, last_usage_record_timestamp - nonlocal soniox_profile_socket, deepgram_profile_socket + nonlocal soniox_profile_socket, deepgram_profile_socket, audio_ring_buffer timer_start = time.time() last_audio_received_time = timer_start @@ -1539,6 +1838,10 @@ async def close_soniox_profile(): ) continue + # Feed ring buffer for speaker identification + if audio_ring_buffer is not None: + audio_ring_buffer.write(data, last_audio_received_time) + if not use_custom_stt: stt_audio_buffer.extend(data) await flush_stt_buffer() @@ -1671,6 +1974,7 @@ async def close_soniox_profile(): record_usage_task = asyncio.create_task(_record_usage_periodically()) lifecycle_manager_task = asyncio.create_task(conversation_lifecycle_manager()) pending_conversations_task = asyncio.create_task(process_pending_conversations(timed_out_conversation_id)) + speaker_id_task = asyncio.create_task(speaker_identification_task()) _send_message_event(MessageServiceStatusEvent(status="ready")) @@ -1681,6 +1985,7 @@ async def close_soniox_profile(): record_usage_task, lifecycle_manager_task, pending_conversations_task, + speaker_id_task, ] + pusher_tasks # Add speech profile task to run concurrently (sends profile audio in background) @@ -1742,6 +2047,7 @@ async def close_soniox_profile(): realtime_segment_buffers.clear() realtime_photo_buffers.clear() image_chunks.clear() + person_embeddings_cache.clear() except NameError as e: # Variables might not be defined if an error occurred early print(f"Cleanup error (safe to ignore): {e}", uid, session_id) diff --git a/backend/utils/speaker_identification.py b/backend/utils/speaker_identification.py index d7ba43c677..faa4754ee7 100644 --- a/backend/utils/speaker_identification.py +++ b/backend/utils/speaker_identification.py @@ -1,5 +1,115 @@ +import asyncio +import io import re -from typing import Optional +import wave +from typing import List, Optional + +import av +import numpy as np + +from database import conversations as conversations_db +from database import users as users_db +from utils.other.storage import ( + download_audio_chunks_and_merge, + upload_person_speech_sample_from_bytes, +) +from utils.stt.speaker_embedding import extract_embedding_from_bytes + + +def _pcm_to_wav_bytes(pcm_data: bytes, sample_rate: int) -> bytes: + """ + Convert PCM16 mono audio to WAV format bytes. 
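+    Uses the stdlib wave module; for 16 kHz mono PCM16 input of N bytes the result is
+    the same N sample bytes behind a standard 44-byte RIFF/WAVE header.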
+ + Args: + pcm_data: Raw PCM16 mono audio bytes + sample_rate: Audio sample rate in Hz + + Returns: + WAV format bytes + """ + wav_buffer = io.BytesIO() + with wave.open(wav_buffer, 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(pcm_data) + return wav_buffer.getvalue() + + +def _trim_pcm_audio(pcm_data: bytes, sample_rate: int, start_sec: float, end_sec: float) -> bytes: + """ + Trim PCM16 mono audio using av for sample-accurate cutting. + + Args: + pcm_data: Raw PCM16 mono audio bytes + sample_rate: Audio sample rate in Hz + start_sec: Start time in seconds (relative to pcm_data start) + end_sec: End time in seconds (relative to pcm_data start) + + Returns: + Trimmed PCM16 mono audio bytes + """ + # Create WAV container for av to read + wav_buffer = io.BytesIO() + with wave.open(wav_buffer, 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(pcm_data) + wav_buffer.seek(0) + + # Use av to extract trimmed audio with sample-accurate boundaries + trimmed_samples = [] + with av.open(wav_buffer, mode='r') as container: + stream = container.streams.audio[0] + + for frame in container.decode(stream): + if frame.pts is None: + continue + + frame_time = float(frame.pts * stream.time_base) + frame_duration = frame.samples / sample_rate + frame_end_time = frame_time + frame_duration + + # Skip frames entirely before our start + if frame_end_time <= start_sec: + continue + # Stop once we're past the end + if frame_time >= end_sec: + break + + # Convert frame to numpy array + arr = frame.to_ndarray() + # For mono pcm_s16le, arr shape is (1, samples) + if arr.ndim == 2: + arr = arr[0] + + # Calculate which samples from this frame to include + frame_start_sample = 0 + frame_end_sample = len(arr) + + if frame_time < start_sec: + # Trim beginning of frame + skip_samples = int((start_sec - frame_time) * sample_rate) + frame_start_sample = skip_samples + + if frame_end_time > end_sec: + # Trim end of frame + keep_duration = end_sec - max(frame_time, start_sec) + frame_end_sample = frame_start_sample + int(keep_duration * sample_rate) + + if frame_start_sample < frame_end_sample: + trimmed_samples.append(arr[frame_start_sample:frame_end_sample]) + + if not trimmed_samples: + return b'' + + return np.concatenate(trimmed_samples).astype(np.int16).tobytes() + + +# Constants for speaker sample extraction +SPEAKER_SAMPLE_MIN_SEGMENT_DURATION = 10.0 +SPEAKER_SAMPLE_WINDOW_HALF = SPEAKER_SAMPLE_MIN_SEGMENT_DURATION / 2 # Language-specific patterns for speaker identification from text # Each pattern should have a capture group for the name. @@ -123,3 +233,205 @@ def detect_speaker_from_text(text: str) -> Optional[str]: if name and len(name) >= 2: return name.capitalize() return None + + +async def extract_speaker_samples( + uid: str, + person_id: str, + conversation_id: str, + segment_ids: List[str], + sample_rate: int = 16000, +): + """ + Extract speech samples from segments and store as speaker profiles. + Fetches conversation from DB to get started_at and segment details. + Processes each segment one by one, stops when sample limit reached. 
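+    A sample window of at most SPEAKER_SAMPLE_MIN_SEGMENT_DURATION seconds is cut from
+    the center of each qualifying segment; samples shorter than 8 s after trimming are
+    discarded, and extraction stops once the person already has a stored sample.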
+ """ + try: + # Check current sample count once + sample_count = users_db.get_person_speech_samples_count(uid, person_id) + if sample_count >= 1: + print(f"Person {person_id} already has {sample_count} samples, skipping", uid, conversation_id) + return + + # Fetch conversation to get started_at and segment details + conversation = conversations_db.get_conversation(uid, conversation_id) + if not conversation: + print(f"Conversation {conversation_id} not found", uid) + return + + started_at = conversation.get('started_at') + if not started_at: + print(f"Conversation {conversation_id} has no started_at", uid) + return + + started_at_ts = started_at.timestamp() if hasattr(started_at, 'timestamp') else float(started_at) + + # Build segment lookup from conversation's transcript_segments + conv_segments = conversation.get('transcript_segments', []) + segment_map = {s.get('id'): s for s in conv_segments if s.get('id')} + + # Get chunks from audio_files instead of storage listing + audio_files = conversation.get('audio_files', []) + if not audio_files: + print(f"No audio files found for {conversation_id}, skipping speaker sample extraction", uid) + return + + # Collect all chunk timestamps from audio files + all_timestamps = [] + for af in audio_files: + timestamps = af.get('chunk_timestamps', []) + all_timestamps.extend(timestamps) + + if not all_timestamps: + print(f"No chunk timestamps found for {conversation_id}, skipping speaker sample extraction", uid) + return + + # Build chunks list in expected format + chunks = [{'timestamp': ts} for ts in sorted(set(all_timestamps))] + + samples_added = 0 + max_samples_to_add = 1 - sample_count + + # Build ordered list with index lookup for expansion + ordered_segments = [s for s in conv_segments if s.get('id')] + segment_index_map = {s.get('id'): i for i, s in enumerate(ordered_segments)} + + for seg_id in segment_ids: + if samples_added >= max_samples_to_add: + break + + seg = segment_map.get(seg_id) + if not seg: + print(f"Segment {seg_id} not found in conversation", uid, conversation_id) + continue + + segment_start = seg.get('start') + segment_end = seg.get('end') + if segment_start is None or segment_end is None: + continue + + seg_duration = segment_end - segment_start + speaker_id = seg.get('speaker_id') + + # If segment is too short, try expanding to adjacent segments with same speaker + if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION and speaker_id is not None: + seg_idx = segment_index_map.get(seg_id) + if seg_idx is not None: + i = seg_idx - 1 + while i >= 0: + prev_seg = ordered_segments[i] + if prev_seg.get('speaker_id') != speaker_id: + break + prev_start = prev_seg.get('start') + if prev_start is not None: + segment_start = min(segment_start, prev_start) + seg_duration = segment_end - segment_start + if seg_duration >= SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: + print( + f"Expanded segment to {seg_duration:.1f}s by including adjacent segments", + uid, + conversation_id, + ) + break + i -= 1 + + if seg_duration < SPEAKER_SAMPLE_MIN_SEGMENT_DURATION: + print(f"Segment too short ({seg_duration:.1f}s) even after expansion, skipping", uid, conversation_id) + continue + + # Extract centered sample window (10 seconds max from center of segment) + seg_center = (segment_start + segment_end) / 2 + sample_start = max(segment_start, seg_center - SPEAKER_SAMPLE_WINDOW_HALF) + sample_end = min(segment_end, seg_center + SPEAKER_SAMPLE_WINDOW_HALF) + + # Calculate absolute timestamps using the sample window + abs_start = started_at_ts + sample_start + 
abs_end = started_at_ts + sample_end + + # Find relevant chunks + sorted_chunks = sorted(chunks, key=lambda c: c['timestamp']) + + # Find first chunk that starts at or before abs_start + first_idx = 0 + for i, chunk in enumerate(sorted_chunks): + if chunk['timestamp'] <= abs_start: + first_idx = i + else: + break + + # Collect from first_idx up to abs_end + relevant_timestamps = [] + for chunk in sorted_chunks[first_idx:]: + if chunk['timestamp'] <= abs_end: + relevant_timestamps.append(chunk['timestamp']) + else: + break + + if not relevant_timestamps: + print(f"No relevant chunks for segment {segment_start:.1f}-{segment_end:.1f}s", uid, conversation_id) + continue + + # Download, merge, and extract + merged = await asyncio.to_thread( + download_audio_chunks_and_merge, + uid, + conversation_id, + relevant_timestamps, + fill_gaps=True, + sample_rate=sample_rate, + ) + buffer_start = min(relevant_timestamps) + + # Use av for sample-accurate trimming + trim_start = abs_start - buffer_start + trim_end = abs_end - buffer_start + sample_audio = _trim_pcm_audio(merged, sample_rate, trim_start, trim_end) + + # Ensure minimum sample length (8 seconds) + min_sample_seconds = 8.0 + min_sample_bytes = int(sample_rate * min_sample_seconds * 2) + if len(sample_audio) < min_sample_bytes: + actual_seconds = len(sample_audio) / (sample_rate * 2) + print( + f"Sample too short ({actual_seconds:.1f}s), need {min_sample_seconds}s, skipping", + uid, + conversation_id, + ) + continue + + # Upload and store + path = await asyncio.to_thread( + upload_person_speech_sample_from_bytes, sample_audio, uid, person_id, sample_rate + ) + + success = users_db.add_person_speech_sample(uid, person_id, path) + if success: + samples_added += 1 + seg_text = seg.get('text', '')[:100] # Truncate to 100 chars + print( + f"Stored speech sample {samples_added} for person {person_id}: segment_id={seg_id}, file={path}, text={seg_text}", + uid, + conversation_id, + ) + + # Extract and store speaker embedding + try: + wav_bytes = _pcm_to_wav_bytes(sample_audio, sample_rate) + embedding = await asyncio.to_thread(extract_embedding_from_bytes, wav_bytes, "sample.wav") + # Convert numpy array to list for Firestore storage + embedding_list = embedding.flatten().tolist() + users_db.set_person_speaker_embedding(uid, person_id, embedding_list) + print( + f"Stored speaker embedding for person {person_id} (dim={len(embedding_list)})", + uid, + conversation_id, + ) + except Exception as emb_err: + print(f"Failed to extract/store speaker embedding: {emb_err}", uid, conversation_id) + else: + print(f"Failed to add speech sample for person {person_id}", uid, conversation_id) + break # Likely hit limit + + except Exception as e: + print(f"Error extracting speaker samples: {e}", uid, conversation_id) diff --git a/backend/utils/stt/speaker_embedding.py b/backend/utils/stt/speaker_embedding.py new file mode 100644 index 0000000000..fe193e9799 --- /dev/null +++ b/backend/utils/stt/speaker_embedding.py @@ -0,0 +1,176 @@ +import os +from typing import Optional, Tuple + +import numpy as np +import requests +from scipy.spatial.distance import cdist + +# Cosine distance threshold for speaker matching +# Based on VoxCeleb 1 test set EER of 2.8% +SPEAKER_MATCH_THRESHOLD = 0.35 + + +def _get_api_url() -> str: + """Get the speaker embedding API URL from environment.""" + url = os.getenv('HOSTED_SPEAKER_EMBEDDING_API_URL') + if not url: + raise ValueError("HOSTED_SPEAKER_EMBEDDING_API_URL environment variable not set") + return url + + +def 
extract_embedding(audio_path: str) -> np.ndarray: + """ + Extract speaker embedding from an audio file using hosted API. + + Args: + audio_path: Path to audio file (wav format recommended) + + Returns: + numpy array of shape (1, D) where D is embedding dimension + """ + api_url = _get_api_url() + + with open(audio_path, 'rb') as f: + files = {'file': (os.path.basename(audio_path), f, 'audio/wav')} + response = requests.post(f"{api_url}/v1/embedding", files=files) + response.raise_for_status() + + result = response.json() + + # Handle both formats: direct array or {"embedding": [...]} + if isinstance(result, list): + embedding = np.array(result, dtype=np.float32) + else: + embedding = np.array(result['embedding'], dtype=np.float32) + + # Ensure shape is (1, D) + if embedding.ndim == 1: + embedding = embedding.reshape(1, -1) + + return embedding + + +def extract_embedding_from_bytes(audio_data: bytes, filename: str = "audio.wav") -> np.ndarray: + """ + Extract speaker embedding from audio bytes using hosted API. + + Args: + audio_data: Raw audio bytes (wav format) + filename: Filename to use in the request + + Returns: + numpy array of shape (1, D) where D is embedding dimension + """ + api_url = _get_api_url() + + files = {'file': (filename, audio_data, 'audio/wav')} + response = requests.post(f"{api_url}/v1/embedding", files=files) + response.raise_for_status() + + result = response.json() + + # Handle both formats: direct array or {"embedding": [...]} + if isinstance(result, list): + embedding = np.array(result, dtype=np.float32) + else: + embedding = np.array(result['embedding'], dtype=np.float32) + + # Ensure shape is (1, D) + if embedding.ndim == 1: + embedding = embedding.reshape(1, -1) + + return embedding + + +def compare_embeddings(embedding1: np.ndarray, embedding2: np.ndarray) -> float: + """ + Compare two speaker embeddings using cosine distance. + + Args: + embedding1: First embedding array (1, D) + embedding2: Second embedding array (1, D) + + Returns: + Cosine distance (0.0 = identical, 2.0 = opposite) + Lower values indicate more similar speakers + """ + distance = cdist(embedding1, embedding2, metric="cosine")[0, 0] + return float(distance) + + +def is_same_speaker( + embedding1: np.ndarray, embedding2: np.ndarray, threshold: float = SPEAKER_MATCH_THRESHOLD +) -> Tuple[bool, float]: + """ + Determine if two embeddings belong to the same speaker. + + Args: + embedding1: First embedding array + embedding2: Second embedding array + threshold: Cosine distance threshold for matching + + Returns: + Tuple of (is_match, distance) + """ + distance = compare_embeddings(embedding1, embedding2) + return distance < threshold, distance + + +def embedding_to_bytes(embedding: np.ndarray) -> bytes: + """ + Serialize embedding to bytes for storage. + + Args: + embedding: numpy array embedding + + Returns: + Bytes representation of the embedding + """ + return embedding.astype(np.float32).tobytes() + + +def bytes_to_embedding(data: bytes, dim: int = 512) -> np.ndarray: + """ + Deserialize embedding from bytes. + + Args: + data: Bytes representation of embedding + dim: Embedding dimension (default 512 for pyannote/embedding) + + Returns: + numpy array of shape (1, D) + """ + embedding = np.frombuffer(data, dtype=np.float32) + return embedding.reshape(1, -1) + + +def find_best_match( + query_embedding: np.ndarray, candidate_embeddings: list[np.ndarray], threshold: float = SPEAKER_MATCH_THRESHOLD +) -> Optional[Tuple[int, float]]: + """ + Find the best matching speaker from a list of candidates. 
+ + Args: + query_embedding: Embedding to match + candidate_embeddings: List of candidate embeddings + threshold: Maximum distance for a valid match + + Returns: + Tuple of (best_index, distance) or None if no match found + """ + if not candidate_embeddings: + return None + + best_idx = -1 + best_distance = float('inf') + + for idx, candidate in enumerate(candidate_embeddings): + distance = compare_embeddings(query_embedding, candidate) + if distance < best_distance: + best_distance = distance + best_idx = idx + + if best_distance < threshold: + return best_idx, best_distance + + return None From 1034a597bf9fbd623ece004bcd8a935ad6d6a79e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Tue, 30 Dec 2025 22:34:41 +0700 Subject: [PATCH 10/14] Without apps changes --- app/lib/pages/capture/widgets/widgets.dart | 4 - app/lib/pages/conversation_detail/page.dart | 38 +--- app/lib/widgets/conversation_bottom_bar.dart | 222 ++----------------- app/lib/widgets/transcript.dart | 51 +---- 4 files changed, 36 insertions(+), 279 deletions(-) diff --git a/app/lib/pages/capture/widgets/widgets.dart b/app/lib/pages/capture/widgets/widgets.dart index ab8ba2369f..d853bfe16b 100644 --- a/app/lib/pages/capture/widgets/widgets.dart +++ b/app/lib/pages/capture/widgets/widgets.dart @@ -189,8 +189,6 @@ getTranscriptWidget( String searchQuery = '', int currentResultIndex = -1, VoidCallback? onTapWhenSearchEmpty, - Function(double segmentStartSeconds)? onPlaySegment, - bool hasAudio = false, }) { if (conversationCreating) { return const Padding( @@ -223,8 +221,6 @@ getTranscriptWidget( searchQuery: searchQuery, currentResultIndex: currentResultIndex, onTapWhenSearchEmpty: onTapWhenSearchEmpty, - onPlaySegment: onPlaySegment, - hasAudio: hasAudio, ); } diff --git a/app/lib/pages/conversation_detail/page.dart b/app/lib/pages/conversation_detail/page.dart index c8bfc57a55..dfd9ff4bcb 100644 --- a/app/lib/pages/conversation_detail/page.dart +++ b/app/lib/pages/conversation_detail/page.dart @@ -49,7 +49,6 @@ class _ConversationDetailPageState extends State with Ti final focusTitleField = FocusNode(); final focusOverviewField = FocusNode(); final GlobalKey _shareButtonKey = GlobalKey(); - final GlobalKey _audioBarKey = GlobalKey(); TabController? 
_controller; final AppReviewService _appReviewService = AppReviewService(); ConversationTab selectedTab = ConversationTab.summary; @@ -686,25 +685,17 @@ class _ConversationDetailPageState extends State with Ti controller: _controller, physics: const NeverScrollableScrollPhysics(), children: [ - Consumer( - builder: (context, detailProvider, _) { - return TranscriptWidgets( - searchQuery: _searchQuery, - currentResultIndex: getCurrentResultIndexForHighlighting(), - onTapWhenSearchEmpty: () { - if (_isSearching && _searchQuery.isEmpty) { - setState(() { - _isSearching = false; - _searchController.clear(); - _searchFocusNode.unfocus(); - }); - } - }, - onPlaySegment: (double segmentStartSeconds) { - _audioBarKey.currentState?.seekAndPlay(segmentStartSeconds); - }, - hasAudio: detailProvider.conversation.hasAudio(), - ); + TranscriptWidgets( + searchQuery: _searchQuery, + currentResultIndex: getCurrentResultIndexForHighlighting(), + onTapWhenSearchEmpty: () { + if (_isSearching && _searchQuery.isEmpty) { + setState(() { + _isSearching = false; + _searchController.clear(); + _searchFocusNode.unfocus(); + }); + } }, ), SummaryTab( @@ -741,7 +732,6 @@ class _ConversationDetailPageState extends State with Ti final hasActionItems = conversation.structured.actionItems.where((item) => !item.deleted).isNotEmpty; return ConversationBottomBar( - key: _audioBarKey, mode: ConversationBottomBarMode.detail, selectedTab: selectedTab, conversation: conversation, @@ -1093,16 +1083,12 @@ class TranscriptWidgets extends StatefulWidget { final String searchQuery; final int currentResultIndex; final VoidCallback? onTapWhenSearchEmpty; - final Function(double segmentStartSeconds)? onPlaySegment; - final bool hasAudio; const TranscriptWidgets({ super.key, this.searchQuery = '', this.currentResultIndex = -1, this.onTapWhenSearchEmpty, - this.onPlaySegment, - this.hasAudio = false, }); @override @@ -1166,8 +1152,6 @@ class _TranscriptWidgetsState extends State with AutomaticKee searchQuery: widget.searchQuery, currentResultIndex: widget.currentResultIndex, onTapWhenSearchEmpty: widget.onTapWhenSearchEmpty, - onPlaySegment: widget.onPlaySegment, - hasAudio: widget.hasAudio, editSegment: (segmentId, speakerId) { final connectivityProvider = Provider.of(context, listen: false); if (!connectivityProvider.isConnected) { diff --git a/app/lib/widgets/conversation_bottom_bar.dart b/app/lib/widgets/conversation_bottom_bar.dart index 8e96ffb967..449923c31e 100644 --- a/app/lib/widgets/conversation_bottom_bar.dart +++ b/app/lib/widgets/conversation_bottom_bar.dart @@ -1,6 +1,3 @@ -import 'dart:convert'; -import 'dart:typed_data'; - import 'package:cached_network_image/cached_network_image.dart'; import 'package:collection/collection.dart'; import 'package:flutter/material.dart'; @@ -45,10 +42,10 @@ class ConversationBottomBar extends StatefulWidget { }); @override - State createState() => ConversationBottomBarState(); + State createState() => _ConversationBottomBarState(); } -class ConversationBottomBarState extends State { +class _ConversationBottomBarState extends State { // Audio player for inline controls AudioPlayer? _audioPlayer; bool _isAudioLoading = false; @@ -59,97 +56,14 @@ class ConversationBottomBarState extends State { @override void initState() { super.initState(); - _calculateTotalDurationWithGaps(); - } - - /// Creates a silent audio source of the specified duration. - /// Uses minimal WAV format: 16kHz, mono, 16-bit PCM with zero samples. 
- AudioSource _createSilenceSource(Duration duration) { - const int sampleRate = 16000; - const int numChannels = 1; - const int bitsPerSample = 16; - const int bytesPerSample = bitsPerSample ~/ 8; - - final int numSamples = (duration.inMilliseconds * sampleRate / 1000).round(); - final int dataSize = numSamples * numChannels * bytesPerSample; - final int fileSize = 36 + dataSize; - - final buffer = Uint8List(44 + dataSize); - final byteData = ByteData.view(buffer.buffer); - - // RIFF header - buffer[0] = 0x52; // 'R' - buffer[1] = 0x49; // 'I' - buffer[2] = 0x46; // 'F' - buffer[3] = 0x46; // 'F' - byteData.setUint32(4, fileSize, Endian.little); - buffer[8] = 0x57; // 'W' - buffer[9] = 0x41; // 'A' - buffer[10] = 0x56; // 'V' - buffer[11] = 0x45; // 'E' - - // fmt subchunk - buffer[12] = 0x66; // 'f' - buffer[13] = 0x6D; // 'm' - buffer[14] = 0x74; // 't' - buffer[15] = 0x20; // ' ' - byteData.setUint32(16, 16, Endian.little); - byteData.setUint16(20, 1, Endian.little); - byteData.setUint16(22, numChannels, Endian.little); - byteData.setUint32(24, sampleRate, Endian.little); - byteData.setUint32(28, sampleRate * numChannels * bytesPerSample, Endian.little); - byteData.setUint16(32, numChannels * bytesPerSample, Endian.little); - byteData.setUint16(34, bitsPerSample, Endian.little); - - // data subchunk - buffer[36] = 0x64; // 'd' - buffer[37] = 0x61; // 'a' - buffer[38] = 0x74; // 't' - buffer[39] = 0x61; // 'a' - byteData.setUint32(40, dataSize, Endian.little); - - // Audio data (bytes 44+) is zeros = silence - - final base64Data = base64Encode(buffer); - return AudioSource.uri(Uri.parse('data:audio/wav;base64,$base64Data')); - } - - /// Seek to a specific segment time (in conversation-relative seconds) and start playback. - /// Since playlist now includes silence gaps, segment time maps directly to playlist position. - Future seekAndPlay(double segmentStartSeconds) async { - if (!_isAudioInitialized && !_isAudioLoading) { - await _initAudioIfNeeded(); - } - if (!mounted) return; - if (_audioPlayer == null) return; - - final conversation = widget.conversation; - if (conversation == null || conversation.audioFiles.isEmpty) return; - - // With silence gaps in playlist, segment time = playlist position directly - final targetPosition = Duration(milliseconds: (segmentStartSeconds * 1000).toInt()); - - // Clamp to valid range - final clampedPosition = targetPosition > _totalDuration ? _totalDuration : targetPosition; - final finalPosition = clampedPosition.isNegative ? Duration.zero : clampedPosition; - - // Track play event - MixpanelManager().audioPlaybackStarted( - conversationId: conversation.id, - durationSeconds: _totalDuration.inSeconds > 0 ? _totalDuration.inSeconds : null, - ); - - // Seek using combined position which handles track selection - await _seekToCombinedPosition(finalPosition); - await _audioPlayer!.play(); - if (mounted) setState(() {}); + _calculateTotalDuration(); } @override void didUpdateWidget(ConversationBottomBar oldWidget) { super.didUpdateWidget(oldWidget); if (widget.conversation?.id != oldWidget.conversation?.id) { - _calculateTotalDurationWithGaps(); + _calculateTotalDuration(); } } @@ -159,53 +73,15 @@ class ConversationBottomBarState extends State { super.dispose(); } - /// Calculates total duration including gaps between audio files. - /// This builds _trackStartOffsets to include silence tracks. 
- void _calculateTotalDurationWithGaps() { + void _calculateTotalDuration() { if (widget.conversation == null) return; - - final conversation = widget.conversation!; - final conversationStartedAt = conversation.startedAt; - - // Sort audio files by startedAt - final sortedAudioFiles = conversation.audioFiles.where((af) => af.startedAt != null).toList() - ..sort((a, b) => a.startedAt!.compareTo(b.startedAt!)); - - if (sortedAudioFiles.isEmpty) { - _totalDuration = Duration.zero; - _trackStartOffsets = []; - return; - } - + double totalSeconds = 0; _trackStartOffsets = []; - double currentTimeMs = 0; - - // Reference point for calculating gaps - DateTime? referenceStart = conversationStartedAt ?? sortedAudioFiles.first.startedAt; - DateTime? previousEndTime = referenceStart; - - for (final audioFile in sortedAudioFiles) { - final fileStart = audioFile.startedAt!; - - // Calculate gap from previous end to this file's start - if (previousEndTime != null) { - final gapMs = fileStart.difference(previousEndTime).inMilliseconds; - if (gapMs > 100) { - // Add offset for silence track - _trackStartOffsets.add(Duration(milliseconds: currentTimeMs.toInt())); - currentTimeMs += gapMs; - } - } - - // Add offset for audio file - _trackStartOffsets.add(Duration(milliseconds: currentTimeMs.toInt())); - currentTimeMs += audioFile.duration * 1000; - - // Update previous end time - previousEndTime = fileStart.add(Duration(milliseconds: (audioFile.duration * 1000).toInt())); + for (final audioFile in widget.conversation!.audioFiles) { + _trackStartOffsets.add(Duration(milliseconds: (totalSeconds * 1000).toInt())); + totalSeconds += audioFile.duration; } - - _totalDuration = Duration(milliseconds: currentTimeMs.toInt()); + _totalDuration = Duration(milliseconds: (totalSeconds * 1000).toInt()); } Duration _getCombinedPosition(int? currentIndex, Duration trackPosition) { @@ -225,78 +101,39 @@ class ConversationBottomBarState extends State { _isAudioLoading = true; }); + _calculateTotalDuration(); + try { _audioPlayer = AudioPlayer(); - final conversation = widget.conversation!; - final conversationStartedAt = conversation.startedAt; - - // Sort audio files by startedAt - final sortedAudioFiles = conversation.audioFiles.where((af) => af.startedAt != null).toList() - ..sort((a, b) => a.startedAt!.compareTo(b.startedAt!)); - - if (sortedAudioFiles.isEmpty) { - debugPrint('No audio files with startedAt found'); - return; - } - - // Fetch signed URLs for all audio files - final signedUrlInfos = await getConversationAudioSignedUrls(conversation.id); - Map? fallbackHeaders; + final signedUrlInfos = await getConversationAudioSignedUrls(widget.conversation!.id); + final audioFileIds = widget.conversation!.audioFiles.map((af) => af.id).toList(); - // Build playlist with silence gaps List audioSources = []; - _trackStartOffsets = []; - double currentTimeMs = 0; - - // Reference point for calculating gaps - DateTime? referenceStart = conversationStartedAt ?? sortedAudioFiles.first.startedAt; - DateTime? 
previousEndTime = referenceStart; - - for (final audioFile in sortedAudioFiles) { - final fileStart = audioFile.startedAt!; - - // Calculate gap from previous end to this file's start - if (previousEndTime != null) { - final gapMs = fileStart.difference(previousEndTime).inMilliseconds; - if (gapMs > 100) { - // Add silence track for gap - _trackStartOffsets.add(Duration(milliseconds: currentTimeMs.toInt())); - audioSources.add(_createSilenceSource(Duration(milliseconds: gapMs))); - currentTimeMs += gapMs; - debugPrint('Added silence gap: ${gapMs}ms before audio file ${audioFile.id}'); - } - } - - // Add offset for audio file - _trackStartOffsets.add(Duration(milliseconds: currentTimeMs.toInt())); + Map? fallbackHeaders; - // Get audio source for this file + for (final fileId in audioFileIds) { + // Find matching signed URL info final urlInfo = signedUrlInfos.firstWhere( - (info) => info.id == audioFile.id, - orElse: () => AudioFileUrlInfo(id: audioFile.id, status: 'pending', duration: 0), + (info) => info.id == fileId, + orElse: () => AudioFileUrlInfo(id: fileId, status: 'pending', duration: 0), ); if (urlInfo.isCached && urlInfo.signedUrl != null) { + // Use signed URL directly audioSources.add(AudioSource.uri(Uri.parse(urlInfo.signedUrl!))); } else { + // Fall back to API URL fallbackHeaders ??= await getAudioHeaders(); final apiUrl = getAudioStreamUrl( - conversationId: conversation.id, - audioFileId: audioFile.id, + conversationId: widget.conversation!.id, + audioFileId: fileId, format: 'wav', ); audioSources.add(AudioSource.uri(Uri.parse(apiUrl), headers: fallbackHeaders)); } - - currentTimeMs += audioFile.duration * 1000; - - // Update previous end time - previousEndTime = fileStart.add(Duration(milliseconds: (audioFile.duration * 1000).toInt())); } - _totalDuration = Duration(milliseconds: currentTimeMs.toInt()); - final playlist = ConcatenatingAudioSource( useLazyPreparation: true, children: audioSources, @@ -304,19 +141,6 @@ class ConversationBottomBarState extends State { await _audioPlayer!.setAudioSource(playlist, preload: true); _isAudioInitialized = true; - - // Seek to first segment start position - // With silence gaps, segment time = playlist position directly - if (conversation.transcriptSegments.isNotEmpty) { - final firstSegmentStart = conversation.transcriptSegments.first.start; - final targetPosition = Duration(milliseconds: (firstSegmentStart * 1000).toInt()); - - // Clamp to valid range - final clampedPosition = targetPosition > _totalDuration ? Duration.zero : targetPosition; - final finalPosition = clampedPosition.isNegative ? Duration.zero : clampedPosition; - - await _seekToCombinedPosition(finalPosition); - } } catch (e) { debugPrint('Error initializing audio: $e'); } finally { diff --git a/app/lib/widgets/transcript.dart b/app/lib/widgets/transcript.dart index 498cc1d9bf..03d096debd 100644 --- a/app/lib/widgets/transcript.dart +++ b/app/lib/widgets/transcript.dart @@ -28,8 +28,6 @@ class TranscriptWidget extends StatefulWidget { final int currentResultIndex; final Function(ScrollController)? onScrollControllerReady; final VoidCallback? onTapWhenSearchEmpty; - final Function(double segmentStartSeconds)? 
onPlaySegment; - final bool hasAudio; const TranscriptWidget({ super.key, @@ -48,8 +46,6 @@ class TranscriptWidget extends StatefulWidget { this.currentResultIndex = -1, this.onScrollControllerReady, this.onTapWhenSearchEmpty, - this.onPlaySegment, - this.hasAudio = false, }); @override @@ -661,55 +657,12 @@ class _TranscriptWidgetState extends State { const SizedBox(height: 4), _buildTranslationNotice(), ], - // Timestamp, provider, and play button (only shown when toggled) - if (_showSpeakerNames && - (widget.canDisplaySeconds || - data.sttProvider != null || - (widget.hasAudio && widget.onPlaySegment != null))) ...[ + // Timestamp and provider (only shown when toggled) + if (_showSpeakerNames && (widget.canDisplaySeconds || data.sttProvider != null)) ...[ const SizedBox(height: 4), Row( mainAxisAlignment: MainAxisAlignment.end, children: [ - // Play button for audio playback - if (widget.hasAudio && widget.onPlaySegment != null) ...[ - GestureDetector( - onTap: () { - widget.onPlaySegment?.call(data.start); - }, - child: Row( - mainAxisSize: MainAxisSize.min, - children: [ - Icon( - Icons.play_arrow_rounded, - color: isUser - ? Colors.white.withValues(alpha: 0.7) - : Colors.grey.shade400, - size: 14, - ), - const SizedBox(width: 2), - Text( - 'Play', - style: TextStyle( - color: isUser - ? Colors.white.withValues(alpha: 0.7) - : Colors.grey.shade400, - fontSize: 11, - ), - ), - ], - ), - ), - if (widget.canDisplaySeconds || data.sttProvider != null) - Text( - ' ยท ', - style: TextStyle( - color: isUser - ? Colors.white.withValues(alpha: 0.5) - : Colors.grey.shade500, - fontSize: 10, - ), - ), - ], if (data.sttProvider != null) ...[ Text( SttProviderConfig.getDisplayName(data.sttProvider), From 2e63f538f13606a0769a63f22112d47dec2f750a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Tue, 30 Dec 2025 22:39:22 +0700 Subject: [PATCH 11/14] Without ops --- backend/Dockerfile | 62 +++++++++++--------------------- backend/compose.yaml | 43 ----------------------- backend/pusher.Dockerfile | 74 --------------------------------------- 3 files changed, 20 insertions(+), 159 deletions(-) delete mode 100644 backend/compose.yaml delete mode 100644 backend/pusher.Dockerfile diff --git a/backend/Dockerfile b/backend/Dockerfile index 6f5c0b43c3..71ff23dd1a 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,7 +1,9 @@ -# Builder stage - compile liblc3 -FROM tiangolo/uvicorn-gunicorn:python3.11 as builder +FROM python:3.11 AS builder -# Install build dependencies +ENV PATH="/opt/venv/bin:$PATH" +RUN python -m venv /opt/venv + +# Install build dependencies for liblc3 RUN apt-get update && apt-get install -y \ git \ gcc \ @@ -22,18 +24,20 @@ RUN git clone https://github.com/google/liblc3.git && \ cd /tmp/liblc3 && \ python3 -m pip wheel --no-cache-dir --wheel-dir /tmp/wheels . 
-# Runtime stage - minimal image -FROM tiangolo/uvicorn-gunicorn:python3.11 +# Install Python requirements +WORKDIR /opt/venv +COPY backend/requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt -# Only install runtime dependencies -RUN apt-get update && apt-get install -y \ - ffmpeg \ - curl \ - unzip \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* +FROM python:3.11-slim + +WORKDIR /app +ENV PATH="/opt/venv/bin:$PATH" +ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + +RUN apt-get update && apt-get -y install ffmpeg curl unzip && rm -rf /var/lib/apt/lists/* -# Copy compiled library and wheel from builder +# Copy compiled liblc3 library and wheel from builder COPY --from=builder /usr/local/lib/liblc3.so* /usr/local/lib/ COPY --from=builder /tmp/wheels /tmp/wheels @@ -42,34 +46,8 @@ RUN ldconfig && \ pip install --no-cache-dir /tmp/wheels/*.whl && \ rm -rf /tmp/wheels -ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - -# Install Python requirements -WORKDIR /app -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -WORKDIR /app - -# COPY . . -COPY ./routers ./routers -COPY ./pretrained_models ./pretrained_models -COPY ./database ./database -COPY ./migrations ./migrations -COPY ./memories-tuner ./tuner -COPY ./pusher ./pusher -COPY ./typesense ./typesense -COPY ./charts ./charts -COPY ./utils ./utils -COPY ./models ./models -COPY ./testing ./testing -COPY ./scripts ./scripts -COPY ./templates ./templates -COPY ./modal ./modal -COPY ./migration ./migration -COPY google-credentials.json ./ +COPY --from=builder /opt/venv /opt/venv +COPY backend/ . EXPOSE 8080 - -CMD uvicorn main:app --host 0.0.0.0 --port 8080 #--limit-concurrency 10 -#CMD gunicorn main:app -k uvicorn.workers.UvicornWorker --workers 2 --bind 0.0.0.0:8080 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/backend/compose.yaml b/backend/compose.yaml deleted file mode 100644 index a57e40ac02..0000000000 --- a/backend/compose.yaml +++ /dev/null @@ -1,43 +0,0 @@ -services: - api: - build: - context: . - dockerfile: Dockerfile - ports: - - 8088:8080 - volumes: - - .:/app - env_file: - - .env - mem_limit: 2g - pusher: - build: - context: . - dockerfile: pusher.Dockerfile - ports: - - 8098:8080 - volumes: - - .:/app - env_file: - - .env - mem_limit: 1g - vad: - build: - context: . - dockerfile: vad.Dockerfile - mem_limit: 3g - ports: - - 8188:8080 - env_file: - - .env - plugins: - build: - context: ../plugins/example - dockerfile: Dockerfile - ports: - - 8189:8000 - volumes: - - ../plugins/example:/app - env_file: - - ../plugins/example/.env - mem_limit: 512m diff --git a/backend/pusher.Dockerfile b/backend/pusher.Dockerfile deleted file mode 100644 index a59f47e10c..0000000000 --- a/backend/pusher.Dockerfile +++ /dev/null @@ -1,74 +0,0 @@ -# Builder stage - compile liblc3 -FROM tiangolo/uvicorn-gunicorn:python3.11 as builder - -# Install build dependencies -RUN apt-get update && apt-get install -y \ - git \ - gcc \ - g++ \ - meson \ - ninja-build \ - python3-dev \ - && rm -rf /var/lib/apt/lists/* - -# Build liblc3 and create wheel -WORKDIR /tmp -RUN git clone https://github.com/google/liblc3.git && \ - cd liblc3 && \ - meson setup build && \ - cd build && \ - meson install && \ - ldconfig && \ - cd /tmp/liblc3 && \ - python3 -m pip wheel --no-cache-dir --wheel-dir /tmp/wheels . 
- -# Runtime stage - minimal image -FROM tiangolo/uvicorn-gunicorn:python3.11 - -# Only install runtime dependencies -RUN apt-get update && apt-get install -y \ - ffmpeg \ - curl \ - unzip \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# Copy compiled library and wheel from builder -COPY --from=builder /usr/local/lib/liblc3.so* /usr/local/lib/ -COPY --from=builder /tmp/wheels /tmp/wheels - -# Install liblc3 Python package and set library path -RUN ldconfig && \ - pip install --no-cache-dir /tmp/wheels/*.whl && \ - rm -rf /tmp/wheels - -ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH - -# Install Python requirements (now including lc3py if present) -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -WORKDIR /app - -#COPY . . -COPY ./routers ./routers -COPY ./pretrained_models ./pretrained_models -COPY ./database ./database -COPY ./migrations ./migrations -COPY ./memories-tuner ./tuner -COPY ./pusher ./pusher -COPY ./typesense ./typesense -COPY ./charts ./charts -COPY ./utils ./utils -COPY ./models ./models -COPY ./testing ./testing -COPY ./scripts ./scripts -COPY ./templates ./templates -COPY ./modal ./modal -COPY ./migration ./migration -COPY google-credentials.json ./ - - -EXPOSE 8080 - -CMD uvicorn pusher.main:app --host 0.0.0.0 --port 8080 --limit-concurrency 16 --backlog 32 From 0b53782366a097d565bb742020783b57076c3f6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Tue, 30 Dec 2025 22:46:51 +0700 Subject: [PATCH 12/14] Clean it up --- backend/routers/transcribe.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index c579a8769c..07570c12ea 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -1324,25 +1324,6 @@ async def conversation_lifecycle_manager(): await _process_conversation(current_conversation_id) await _create_new_in_progress_conversation() - def _pcm_to_wav_bytes(pcm_data: bytes, sr: int) -> bytes: - """Convert PCM16 mono to WAV format using av.""" - output_buffer = io.BytesIO() - output_container = av.open(output_buffer, mode='w', format='wav') - output_stream = output_container.add_stream('pcm_s16le', rate=sr) - output_stream.layout = 'mono' - - samples = np.frombuffer(pcm_data, dtype=np.int16) - frame = av.AudioFrame.from_ndarray(samples.reshape(1, -1), format='s16', layout='mono') - frame.rate = sr - - for packet in output_stream.encode(frame): - output_container.mux(packet) - for packet in output_stream.encode(): - output_container.mux(packet) - - output_container.close() - return output_buffer.getvalue() - async def speaker_identification_task(): """Consume segment queue, accumulate per speaker, trigger match when ready.""" nonlocal websocket_active, speaker_to_person_map From 55c883ccae81e9684ca56b18406ce9f141b9943a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Tue, 30 Dec 2025 22:56:55 +0700 Subject: [PATCH 13/14] Optimize people API to use Firestore speech sample paths directly --- app/lib/backend/http/api/users.dart | 12 +++++++ app/lib/pages/settings/people.dart | 22 ++++-------- app/lib/providers/people_provider.dart | 25 ++++++-------- backend/database/users.py | 25 ++++++++++++++ backend/routers/users.py | 48 +++++++++++++++++++++----- backend/utils/other/storage.py | 21 +++++++++++ 6 files changed, 113 insertions(+), 40 deletions(-) diff --git a/app/lib/backend/http/api/users.dart b/app/lib/backend/http/api/users.dart index 26262db021..37dbffe016 100644 --- 
a/app/lib/backend/http/api/users.dart +++ b/app/lib/backend/http/api/users.dart @@ -243,6 +243,18 @@ Future deletePerson(String personId) async { return response.statusCode == 204; } +Future deletePersonSpeechSample(String personId, int sampleIndex) async { + var response = await makeApiCall( + url: '${Env.apiBaseUrl}v1/users/people/$personId/speech-samples/$sampleIndex', + headers: {}, + method: 'DELETE', + body: '', + ); + if (response == null) return false; + debugPrint('deletePersonSpeechSample response: ${response.body}'); + return response.statusCode == 200; +} + Future getFollowUpQuestion({String conversationId = '0'}) async { var response = await makeApiCall( url: '${Env.apiBaseUrl}v1/joan/$conversationId/followup-question', diff --git a/app/lib/pages/settings/people.dart b/app/lib/pages/settings/people.dart index 38e0c3d145..de40c6ba75 100644 --- a/app/lib/pages/settings/people.dart +++ b/app/lib/pages/settings/people.dart @@ -8,7 +8,6 @@ import 'package:omi/providers/people_provider.dart'; import 'package:omi/providers/connectivity_provider.dart'; import 'package:omi/widgets/dialog.dart'; import 'package:omi/widgets/extensions/functions.dart'; -import 'package:just_audio/just_audio.dart'; import 'package:omi/utils/l10n_extensions.dart'; import 'package:provider/provider.dart'; @@ -161,7 +160,7 @@ class _UserPeoplePageState extends State<_UserPeoplePage> { ); } - Future _confirmDeleteSample(int peopleIdx, Person person, String url, PeopleProvider provider) async { + Future _confirmDeleteSample(int peopleIdx, Person person, int sampleIdx, PeopleProvider provider) async { final connectivityProvider = Provider.of(context, listen: false); if (!connectivityProvider.isConnected) { ConnectivityProvider.showNoInternetDialog(context); @@ -180,7 +179,7 @@ class _UserPeoplePageState extends State<_UserPeoplePage> { ); if (confirmed == true) { - provider.deletePersonSample(peopleIdx, url); + await provider.deletePersonSample(peopleIdx, sampleIdx); } } @@ -297,20 +296,11 @@ class _UserPeoplePageState extends State<_UserPeoplePage> { ), onPressed: () => provider.playPause(index, j, sample), ), - title: Text(index == 0 + title: Text(j == 0 ? 
context.l10n.speechProfile - : context.l10n.sampleNumber(index)), - onTap: () => _confirmDeleteSample(index, person, sample, provider), - subtitle: FutureBuilder( - future: AudioPlayer().setUrl(sample), - builder: (context, snapshot) { - if (snapshot.hasData) { - return Text(context.l10n.secondsCount(snapshot.data!.inSeconds)); - } else { - return Text(context.l10n.loadingDuration); - } - }, - ), + : context.l10n.sampleNumber(j)), + onTap: () => _confirmDeleteSample(index, person, j, provider), + subtitle: Text('Tap to delete'), )), ], ), diff --git a/app/lib/providers/people_provider.dart b/app/lib/providers/people_provider.dart index 1554f15854..d1b8fccd1c 100644 --- a/app/lib/providers/people_provider.dart +++ b/app/lib/providers/people_provider.dart @@ -1,5 +1,4 @@ import 'package:flutter/cupertino.dart'; -import 'package:omi/backend/http/api/speech_profile.dart'; import 'package:omi/backend/http/api/users.dart'; import 'package:omi/backend/preferences.dart'; import 'package:omi/backend/schema/person.dart'; @@ -106,21 +105,17 @@ class PeopleProvider extends BaseProvider { notifyListeners(); } - String _getFileNameFromUrl(String url) { - Uri uri = Uri.parse(url); - String fileName = uri.pathSegments.last; - return fileName.split('.').first; - } + Future deletePersonSample(int personIdx, int sampleIdx) async { + String personId = people[personIdx].id; - void deletePersonSample(int personIdx, String url) { - String name = _getFileNameFromUrl(url); - var parts = name.split('_segment_'); - String conversationId = parts[0]; - int segmentIdx = int.parse(parts[1]); - deleteProfileSample(conversationId, segmentIdx, personId: people[personIdx].id); - people[personIdx].speechSamples!.remove(url); - SharedPreferencesUtil().replaceCachedPerson(people[personIdx]); - notifyListeners(); + bool success = await deletePersonSpeechSample(personId, sampleIdx); + if (success) { + people[personIdx].speechSamples!.removeAt(sampleIdx); + SharedPreferencesUtil().replaceCachedPerson(people[personIdx]); + notifyListeners(); + } else { + debugPrint('Failed to delete speech sample at index: $sampleIdx'); + } } void deletePersonProvider(Person person) { diff --git a/backend/database/users.py b/backend/database/users.py index 92ed91d519..668efdf699 100644 --- a/backend/database/users.py +++ b/backend/database/users.py @@ -148,6 +148,31 @@ def get_person_speech_samples_count(uid: str, person_id: str) -> int: return len(person_data.get('speech_samples', [])) +def remove_person_speech_sample(uid: str, person_id: str, sample_path: str) -> bool: + """ + Remove a speech sample path from person's speech_samples list. + + Args: + uid: User ID + person_id: Person ID + sample_path: GCS path to remove + + Returns: + True if removed, False if person not found + """ + person_ref = db.collection('users').document(uid).collection('people').document(person_id) + person_doc = person_ref.get() + + if not person_doc.exists: + return False + + person_ref.update({ + 'speech_samples': firestore.ArrayRemove([sample_path]), + 'updated_at': datetime.now(timezone.utc), + }) + return True + + def set_person_speaker_embedding(uid: str, person_id: str, embedding: list) -> bool: """ Store speaker embedding for a person. 
diff --git a/backend/routers/users.py b/backend/routers/users.py index 2afe4c1678..bd2bf19bbd 100644 --- a/backend/routers/users.py +++ b/backend/routers/users.py @@ -53,8 +53,9 @@ from utils.other import endpoints as auth from utils.other.storage import ( delete_all_conversation_recordings, - get_user_person_speech_samples, + get_speech_sample_signed_urls, delete_user_person_speech_samples, + delete_user_person_speech_sample, ) from utils.webhooks import webhook_first_time_setup @@ -242,7 +243,9 @@ def get_single_person( if not person: raise HTTPException(status_code=404, detail="Person not found") if include_speech_samples: - person['speech_samples'] = get_user_person_speech_samples(uid, person['id']) + # Convert stored GCS paths to signed URLs + stored_paths = person.get('speech_samples', []) + person['speech_samples'] = get_speech_sample_signed_urls(stored_paths) return person @@ -251,13 +254,10 @@ def get_all_people(include_speech_samples: bool = True, uid: str = Depends(auth. print('get_all_people', include_speech_samples) people = get_people(uid) if include_speech_samples: - - def single(person): - person['speech_samples'] = get_user_person_speech_samples(uid, person['id']) - - threads = [threading.Thread(target=single, args=(person,)) for person in people] - [t.start() for t in threads] - [t.join() for t in threads] + # Convert stored GCS paths to signed URLs for each person + for person in people: + stored_paths = person.get('speech_samples', []) + person['speech_samples'] = get_speech_sample_signed_urls(stored_paths) return people @@ -278,6 +278,36 @@ def delete_person_endpoint(person_id: str, uid: str = Depends(auth.get_current_u return {'status': 'ok'} +@router.delete('/v1/users/people/{person_id}/speech-samples/{sample_index}', tags=['v1']) +def delete_person_speech_sample_endpoint( + person_id: str, + sample_index: int, + uid: str = Depends(auth.get_current_user_uid), +): + """Delete a specific speech sample for a person by index.""" + person = get_person(uid, person_id) + if not person: + raise HTTPException(status_code=404, detail="Person not found") + + speech_samples = person.get('speech_samples', []) + if sample_index < 0 or sample_index >= len(speech_samples): + raise HTTPException(status_code=404, detail="Sample not found") + + path_to_delete = speech_samples[sample_index] + + # Extract filename from path for GCS deletion + filename = path_to_delete.split('/')[-1] + + # Delete from GCS + delete_user_person_speech_sample(uid, person_id, filename) + + # Remove from Firestore + from database.users import remove_person_speech_sample + remove_person_speech_sample(uid, person_id, path_to_delete) + + return {'status': 'ok'} + + # ********************************************************** # ************* RANDOM JOAN SPECIFIC FEATURES ************** # ********************************************************** diff --git a/backend/utils/other/storage.py b/backend/utils/other/storage.py index 68efa715ba..8089b9a8fa 100644 --- a/backend/utils/other/storage.py +++ b/backend/utils/other/storage.py @@ -186,6 +186,27 @@ def get_user_person_speech_samples(uid: str, person_id: str, download: bool = Fa return [_get_signed_url(blob, 60) for blob in blobs] +def get_speech_sample_signed_urls(paths: List[str]) -> List[str]: + """ + Generate signed URLs for speech samples given their GCS paths. + Uses the paths stored in Firestore instead of listing GCS blobs. 
+ + Args: + paths: List of GCS paths (e.g., '{uid}/people_profiles/{person_id}/{filename}') + + Returns: + List of signed URLs + """ + if not paths: + return [] + bucket = storage_client.bucket(speech_profiles_bucket) + signed_urls = [] + for path in paths: + blob = bucket.blob(path) + signed_urls.append(_get_signed_url(blob, 60)) + return signed_urls + + # ******************************************** # ************* POST PROCESSING ************** # ******************************************** From cf7e41f506873df0c5da0bd570302bc8747c71b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Tue, 30 Dec 2025 23:21:52 +0700 Subject: [PATCH 14/14] Add HOSTED_SPEAKER_EMBEDDING_API_URL to deployment charts --- .../charts/backend-listen/dev_omi_backend_listen_values.yaml | 2 ++ .../charts/backend-listen/prod_omi_backend_listen_values.yaml | 2 ++ backend/charts/pusher/dev_omi_pusher_values.yaml | 2 ++ backend/charts/pusher/prod_omi_pusher_values.yaml | 2 ++ 4 files changed, 8 insertions(+) diff --git a/backend/charts/backend-listen/dev_omi_backend_listen_values.yaml b/backend/charts/backend-listen/dev_omi_backend_listen_values.yaml index 258163c720..a09d67717b 100644 --- a/backend/charts/backend-listen/dev_omi_backend_listen_values.yaml +++ b/backend/charts/backend-listen/dev_omi_backend_listen_values.yaml @@ -114,6 +114,8 @@ env: value: "http://34.172.155.20:80/v1/vad" - name: HOSTED_SPEECH_PROFILE_API_URL value: "http://34.172.155.20:80/v1/speaker-identification" + - name: HOSTED_SPEAKER_EMBEDDING_API_URL + value: "http://34.172.155.20:80" - name: PINECONE_API_KEY valueFrom: secretKeyRef: diff --git a/backend/charts/backend-listen/prod_omi_backend_listen_values.yaml b/backend/charts/backend-listen/prod_omi_backend_listen_values.yaml index e82ce287cb..6cf3b68fa6 100644 --- a/backend/charts/backend-listen/prod_omi_backend_listen_values.yaml +++ b/backend/charts/backend-listen/prod_omi_backend_listen_values.yaml @@ -107,6 +107,8 @@ env: value: "http://172.16.128.101:8080/v1/vad" - name: HOSTED_SPEECH_PROFILE_API_URL value: "http://172.16.128.101:8080/v1/speaker-identification" + - name: HOSTED_SPEAKER_EMBEDDING_API_URL + value: "http://diarizer.omi.me:80" - name: PINECONE_API_KEY valueFrom: secretKeyRef: diff --git a/backend/charts/pusher/dev_omi_pusher_values.yaml b/backend/charts/pusher/dev_omi_pusher_values.yaml index 083c4eb867..5d5f2df4ab 100644 --- a/backend/charts/pusher/dev_omi_pusher_values.yaml +++ b/backend/charts/pusher/dev_omi_pusher_values.yaml @@ -106,6 +106,8 @@ env: value: "http://34.172.155.20:80/v1/vad" - name: HOSTED_SPEECH_PROFILE_API_URL value: "http://34.172.155.20:80/v1/speaker-identification" + - name: HOSTED_SPEAKER_EMBEDDING_API_URL + value: "http://34.172.155.20:80" - name: PINECONE_API_KEY valueFrom: secretKeyRef: diff --git a/backend/charts/pusher/prod_omi_pusher_values.yaml b/backend/charts/pusher/prod_omi_pusher_values.yaml index c62369931e..d3d8e9416f 100644 --- a/backend/charts/pusher/prod_omi_pusher_values.yaml +++ b/backend/charts/pusher/prod_omi_pusher_values.yaml @@ -111,6 +111,8 @@ env: value: "http://vad.omi.me:80/v1/vad" - name: HOSTED_SPEECH_PROFILE_API_URL value: "http://vad.omi.me:80/v1/speaker-identification" + - name: HOSTED_SPEAKER_EMBEDDING_API_URL + value: "http://diarizer.omi.me:80" - name: PINECONE_API_KEY valueFrom: secretKeyRef: