"""Pipeline evaluation adapter.

Provides batch evaluation functions for transcription and diarization modules.

Currently contains simulation stubs with realistic performance models based on
published benchmarks. Replace the simulation logic with actual pipeline calls
for production use.
"""

import hashlib

# Baseline word error rate (%) of each transcription model before any
# per-config adjustment is applied.
TRANSCRIPTION_BASE_WER: dict[str, float] = {
    "whisper-large-v3": 7.8,
    "whisper-medium": 13.5,
    "faster-whisper-large-v3": 7.6,
    "gigaam-ctc": 6.8,
    "gigaam-rnnt": 5.4,
}

# Baseline processing time (minutes) of each transcription model.
TRANSCRIPTION_BASE_TIME: dict[str, float] = {
    "whisper-large-v3": 4.2,
    "whisper-medium": 2.8,
    "faster-whisper-large-v3": 2.2,
    "gigaam-ctc": 1.5,
    "gigaam-rnnt": 3.5,
}

# Models for which the beam-size adjustments below are applied.
WHISPER_MODELS = {"whisper-large-v3", "whisper-medium", "faster-whisper-large-v3"}

# Additive WER change and multiplicative time factor per supported beam size.
BEAM_SIZE_WER_DELTA = {1: 1.2, 3: 0.4, 5: 0.0, 7: -0.1, 10: -0.15}
BEAM_SIZE_TIME_FACTOR = {1: 0.6, 3: 0.8, 5: 1.0, 7: 1.15, 10: 1.4}

# Additive WER change per supported VAD threshold.
VAD_WER_DELTA = {0.3: 0.8, 0.4: 0.2, 0.5: 0.0, 0.6: 0.3, 0.7: 1.0}

# Baseline diarization error rate (%) of each diarization model.
DIARIZATION_BASE_DER: dict[str, float] = {
    "pyannote-3.1": 24.0,
    "pyannote-community-1": 20.5,
    "sortformer": 18.8,
}

# Baseline processing time (minutes) of each diarization model.
DIARIZATION_BASE_TIME: dict[str, float] = {
    "pyannote-3.1": 2.5,
    "pyannote-community-1": 2.8,
    "sortformer": 3.8,
}

# Additive DER change per supported value of each diarization setting.
MIN_SPEECH_DER_DELTA = {0.25: 1.5, 0.5: 0.0, 0.75: 0.3, 1.0: 1.2, 1.5: 3.0}
CLUSTERING_DER_DELTA = {0.3: 3.0, 0.45: 0.8, 0.6: 0.0, 0.75: 0.5, 0.9: 2.5}
VAD_DER_DELTA = {0.3: 1.0, 0.4: 0.3, 0.5: 0.0, 0.6: 0.5, 0.7: 1.5}

# Absolute tolerance used to match a computed grid value (e.g. ``0.1 * 3``)
# against the exact float keys of the delta tables above.
_GRID_TOLERANCE = 1e-9


def _deterministic_noise(seed_str: str, amplitude: float = 0.3) -> float:
    """Return a reproducible pseudo-random offset derived from ``seed_str``.

    The value depends only on the seed string, so repeated evaluations of the
    same configuration give the same result.

    Args:
        seed_str: text that identifies the configuration being simulated
        amplitude: half-width of the output interval

    Returns:
        a float in ``[-amplitude, amplitude)``
    """
    # MD5 is used purely as a stable hash; flagging it as non-security keeps
    # the call working on interpreters that restrict MD5 (e.g. FIPS builds).
    digest = hashlib.md5(seed_str.encode(), usedforsecurity=False).hexdigest()
    h = int(digest, 16)
    return (h % 10000) / 10000 * 2 * amplitude - amplitude


def _model_baseline(table: dict, model_name: str, kind: str) -> float:
    """Look up ``model_name`` in ``table``, raising a descriptive ``KeyError``."""
    try:
        return table[model_name]
    except KeyError:
        raise KeyError(
            f"unknown {kind} model {model_name!r}; "
            f"supported models: {sorted(table)}"
        ) from None


def _resolve_setting(table: dict, value, setting: str) -> tuple:
    """Match ``value`` against the keys of a delta table.

    An exact hit returns ``value`` untouched. A number that differs from a
    table key only by floating-point representation error is snapped to that
    key, so grids produced by arithmetic behave like literal grid values.

    Args:
        table: mapping from supported setting values to their delta
        value: the setting value taken from a config dict
        setting: name of the setting, used in the error message

    Returns:
        ``(canonical_value, delta)``

    Raises:
        KeyError: if ``value`` matches no key of ``table``
    """
    if value in table:
        return value, table[value]
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        for key, delta in table.items():
            if abs(value - key) <= _GRID_TOLERANCE:
                return key, delta
    raise KeyError(
        f"unsupported {setting}={value!r}; supported values: {sorted(table)}"
    )


def evaluate_transcription_batch(
    model_name: str,
    configs: list[dict],
    audio_paths: list[str],
) -> list[dict]:
    """Evaluate transcription for a batch of configs using the same model.

    In production, this loads the model once and iterates over configs.
    Currently returns simulated results.

    Args:
        model_name: name of the transcription model
        configs: list of dicts, each with keys ``beam_size``, ``vad_threshold``
        audio_paths: paths to audio files (unused in simulation)

    Returns:
        list of dicts with ``wer`` (%) and ``time`` (minutes)

    Raises:
        KeyError: if the model is unknown, a config key is missing, or a
            setting value is not one of the supported grid values
    """
    base_wer = _model_baseline(TRANSCRIPTION_BASE_WER, model_name, "transcription")
    base_time = _model_baseline(TRANSCRIPTION_BASE_TIME, model_name, "transcription")
    is_whisper = model_name in WHISPER_MODELS

    results = []
    for cfg in configs:
        beam = cfg["beam_size"]
        vad = cfg["vad_threshold"]

        wer = base_wer
        if is_whisper:
            # Beam size only affects the Whisper-family models.
            beam, beam_delta = _resolve_setting(
                BEAM_SIZE_WER_DELTA, beam, "beam_size"
            )
            wer += beam_delta
        vad, vad_delta = _resolve_setting(VAD_WER_DELTA, vad, "vad_threshold")
        wer += vad_delta

        # Extra penalty when an extreme VAD threshold meets a wide beam.
        if is_whisper and vad in (0.3, 0.7) and beam >= 7:
            wer += 0.4

        noise = _deterministic_noise(f"t_{model_name}_{beam}_{vad}")
        wer = max(1.0, wer + noise)

        elapsed = base_time
        if is_whisper:
            elapsed *= BEAM_SIZE_TIME_FACTOR[beam]
        elapsed += _deterministic_noise(f"tt_{model_name}_{beam}_{vad}", 0.1)
        elapsed = max(0.5, elapsed)

        results.append({"wer": round(wer, 2), "time": round(elapsed, 2)})

    return results


def evaluate_diarization_batch(
    model_name: str,
    configs: list[dict],
    audio_paths: list[str],
) -> list[dict]:
    """Evaluate diarization for a batch of configs using the same model.

    In production, this loads the model once and iterates over configs.
    Currently returns simulated results.

    Args:
        model_name: name of the diarization model
        configs: list of dicts with ``min_speech_duration``,
            ``clustering_threshold``, ``vad_threshold``
        audio_paths: paths to audio files (unused in simulation)

    Returns:
        list of dicts with ``der`` (%) and ``time`` (minutes)

    Raises:
        KeyError: if the model is unknown, a config key is missing, or a
            setting value is not one of the supported grid values
    """
    base_der = _model_baseline(DIARIZATION_BASE_DER, model_name, "diarization")
    base_time = _model_baseline(DIARIZATION_BASE_TIME, model_name, "diarization")

    results = []
    for cfg in configs:
        msd = cfg["min_speech_duration"]
        ct = cfg["clustering_threshold"]
        vad = cfg["vad_threshold"]

        der = base_der
        msd, msd_delta = _resolve_setting(
            MIN_SPEECH_DER_DELTA, msd, "min_speech_duration"
        )
        der += msd_delta
        ct, ct_delta = _resolve_setting(
            CLUSTERING_DER_DELTA, ct, "clustering_threshold"
        )
        der += ct_delta
        vad, vad_delta = _resolve_setting(VAD_DER_DELTA, vad, "vad_threshold")
        der += vad_delta

        # Interaction penalties at the corners of the parameter grid.
        if vad <= 0.3 and msd <= 0.25:
            der += 1.2
        if ct >= 0.9 and msd >= 1.5:
            der += 0.8

        noise = _deterministic_noise(f"d_{model_name}_{msd}_{ct}_{vad}")
        der = max(5.0, der + noise)

        elapsed = base_time + _deterministic_noise(
            f"dt_{model_name}_{msd}_{ct}_{vad}", 0.15
        )
        elapsed = max(0.5, elapsed)

        results.append({"der": round(der, 2), "time": round(elapsed, 2)})

    return results