| | import gradio as gr |
| | import pandas as pd |
| | import plotly.graph_objects as go |
| | import plotly.express as px |
| | from pathlib import Path |
| |
|
| | |
# --- Reference transcription (ground truth) ---
# Load the text every model's output was scored against. Falls back to a
# placeholder so the UI still renders when the file is missing (e.g. on a
# fresh clone before assets are added).
reference_text_path = Path("text/reference.txt")
if reference_text_path.exists():
    # Explicit encoding: bare open() uses the platform default, which can
    # mis-decode UTF-8 text on some systems; read_text is the pathlib idiom.
    reference_text = reference_text_path.read_text(encoding="utf-8")
else:
    reference_text = "Reference text not available"
| |
|
| | |
# --- Test audio clip ---
# The single sample clip each model transcribed; the Overview tab embeds a
# player for it when the file is present.
audio_path = Path("audio") / "001.wav"
audio_exists = audio_path.exists()
| |
|
| | |
# --- Per-model benchmark results (one row per Whisper size) ---
# Kept row-wise for readability, then transposed into the column layout
# pandas expects: (model, WER %, inference seconds, parameter count).
_WER_ROWS = [
    ("tiny", 15.05, 2.73, "39M"),
    ("base", 9.95, 5.01, "74M"),
    ("small", 11.17, 5.14, "244M"),
    ("medium", 6.07, 19.42, "769M"),
    ("large-v3-turbo", 7.04, 33.08, "809M"),
]
wer_data = {
    "Model": [row[0] for row in _WER_ROWS],
    "WER (%)": [row[1] for row in _WER_ROWS],
    "Speed (s)": [row[2] for row in _WER_ROWS],
    "Model Size": [row[3] for row in _WER_ROWS],
}
df_wer = pd.DataFrame(wer_data)
| |
|
| | |
# --- Engine comparison (all using the base-size model) ---
# Row-wise records transposed into pandas column layout:
# (engine, WER %, inference seconds).
_ENGINE_ROWS = [
    ("faster-whisper", 9.95, 4.87),
    ("openai-whisper", 9.95, 6.51),
    ("distil-whisper", 21.6, 38.49),
]
engine_data = {
    "Engine": [row[0] for row in _ENGINE_ROWS],
    "WER (%)": [row[1] for row in _ENGINE_ROWS],
    "Speed (s)": [row[2] for row in _ENGINE_ROWS],
}
df_engine = pd.DataFrame(engine_data)
| |
|
| | |
# --- Chart: WER by model size (bar) ---
# One bar per model; colors are one-per-model and match the scatter chart.
_WER_PALETTE = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
fig_wer = go.Figure(
    go.Bar(
        x=df_wer["Model"],
        y=df_wer["WER (%)"],
        text=df_wer["WER (%)"].round(2),
        textposition='auto',
        marker_color=_WER_PALETTE,
        customdata=df_wer["Model Size"],
        # Hover shows model name, WER, and parameter count.
        hovertemplate='<b>%{x}</b><br>WER: %{y:.2f}%<br>Size: %{customdata}<extra></extra>',
    )
)
fig_wer.update_layout(
    template="plotly_white",
    height=400,
    title="Word Error Rate by Model Size",
    xaxis_title="Model",
    yaxis_title="WER (%)",
)
| |
|
| | |
# --- Chart: speed vs accuracy tradeoff (scatter, one point per model) ---
_SCATTER_PALETTE = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
fig_scatter = go.Figure(
    go.Scatter(
        mode='markers+text',
        x=df_wer["Speed (s)"],
        y=df_wer["WER (%)"],
        text=df_wer["Model"],
        textposition="top center",
        marker=dict(size=15, color=_SCATTER_PALETTE),
        # Hover shows model name, inference time, and WER.
        hovertemplate='<b>%{text}</b><br>Speed: %{x:.2f}s<br>WER: %{y:.2f}%<extra></extra>',
    )
)
fig_scatter.update_layout(
    template="plotly_white",
    height=400,
    title="Speed vs Accuracy Tradeoff",
    xaxis_title="Inference Time (seconds)",
    yaxis_title="WER (%)",
)
| |
|
| | |
# --- Chart: WER per engine, all running the base-size model (bar) ---
fig_engine = go.Figure(
    go.Bar(
        x=df_engine["Engine"],
        y=df_engine["WER (%)"],
        name="WER (%)",
        text=df_engine["WER (%)"].round(2),
        textposition='auto',
        marker_color='#4ECDC4',
    )
)
fig_engine.update_layout(
    template="plotly_white",
    height=400,
    title="WER by Engine (Base Model)",
    xaxis_title="Engine",
    yaxis_title="WER (%)",
)
| |
|
| | |
# CSS injected into the Gradio page via gr.Blocks(css=...).
# NOTE(review): .limitation-box and .question-box are styled here but no
# component in this file attaches them via elem_classes — confirm they are
# used, or wire them to the intended Markdown blocks.
custom_css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.limitation-box {
    background-color: #FFF3CD;
    border-left: 4px solid #FFC107;
    padding: 15px;
    margin: 10px 0;
}
.question-box {
    background-color: #E3F2FD;
    border-left: 4px solid #2196F3;
    padding: 15px;
    margin: 15px 0;
}
"""
| |
|
| | |
# ---------------------------------------------------------------------------
# UI definition. Layout only: all figures and dataframes are built above at
# import time, so the app displays static benchmark results.
# ---------------------------------------------------------------------------
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown(
        """
        # Local ASR/STT Benchmark Evaluation
        ### A Single Sample Evaluation on Local Hardware

        Testing different Whisper model sizes to find the optimal balance between accuracy and speed for daily transcription workflow.
        """
    )

    with gr.Tabs():
        # --- Tab 1: what was tested, the sample, and the caveats ---
        with gr.Tab("📊 Overview"):
            gr.Markdown(
                """
                ## About This Evaluation

                This was a "back of the envelope" style experiment to determine which Whisper model size works best
                for daily transcription on local hardware, focusing on the tradeoff between accuracy (WER) and inference speed.
                """
            )

            gr.Markdown("### 🎯 Test Sample")

            # Embed the audio player only when the sample file shipped with
            # the app; otherwise show a placeholder note.
            if audio_exists:
                gr.Audio(
                    value=str(audio_path),
                    label="Test Audio (001.wav)",
                    type="filepath"
                )
            else:
                gr.Markdown("**Note:** Audio file will be added soon.")

            gr.Markdown("### 📝 Reference Text (Ground Truth)")
            # Display-only: the ground-truth transcription should not be
            # editable by visitors.
            gr.Textbox(
                value=reference_text,
                label="Reference Transcription",
                lines=10,
                max_lines=15,
                interactive=False
            )

            gr.Markdown(
                """
                ### ⚠️ Important Limitations

                - **Quick experiment**: Not a definitive scientific evaluation
                - **Hardware specific**: AMD GPU with ROCm (not ideal for STT), using CPU inference
                - **Single sample**: Results based on one audio clip
                - **Variable conditions**: ASR accuracy depends on mic quality, background noise, speaking style
                - **Personal use case**: Optimized for one user's voice and workflow
                """
            )

        # --- Tab 2: headline findings, interactive charts, exported PNGs ---
        with gr.Tab("📈 Results"):
            gr.Markdown("## Key Findings")

            # Two-column summary: hard numbers on the left, takeaways right.
            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        """
                        ### Best Accuracy
                        **medium** model
                        - 6.07% WER
                        - 19.42s inference

                        ### Fastest
                        **tiny** model
                        - 15.05% WER
                        - 2.73s inference

                        ### Recommended for Daily Use
                        **base** model (faster-whisper)
                        - 9.95% WER
                        - ~5s inference
                        - Good balance
                        """
                    )

                with gr.Column():
                    gr.Markdown(
                        """
                        ### Key Takeaways

                        1. **Biggest jump**: tiny → base (15% → 10% WER)
                        2. **Diminishing returns**: After base, accuracy gains are smaller
                        3. **faster-whisper**: Same accuracy as OpenAI, 1.2x faster
                        4. **distil-whisper**: Unexpectedly slower AND less accurate on this sample
                        """
                    )

            gr.Markdown("## Interactive Visualizations")

            # Plotly figures built at module scope above.
            with gr.Row():
                gr.Plot(fig_wer, label="WER by Model Size")

            with gr.Row():
                gr.Plot(fig_scatter, label="Speed vs Accuracy")

            with gr.Row():
                gr.Plot(fig_engine, label="Engine Comparison")

            gr.Markdown("## Original Charts from Benchmark")

            # Static PNGs presumably exported by the benchmark scripts —
            # NOTE(review): confirm the results/ directory ships with the
            # Space; missing files render as broken images.
            with gr.Row():
                with gr.Column():
                    gr.Image("results/wer_by_size.png", label="WER by Size")
                with gr.Column():
                    gr.Image("results/speed_by_size.png", label="Speed by Size")

            with gr.Row():
                with gr.Column():
                    gr.Image("results/accuracy_speed_tradeoff.png", label="Accuracy vs Speed")
                with gr.Column():
                    gr.Image("results/engine_comparison.png", label="Engine Comparison")

            with gr.Row():
                gr.Image("results/variants_comparison.png", label="All Variants Tested")

        # --- Tab 3: narrative Q&A (single markdown document) ---
        # NOTE(review): the timings quoted in Q2 (5.01s / 6.17s) do not match
        # engine_data above (4.87s / 6.51s) — confirm which benchmark run
        # these numbers came from and align them.
        with gr.Tab("❓ Questions & Answers"):
            gr.Markdown(
                """
                # Research Questions & Findings

                ## Q1: How much does model size actually matter for accuracy?

                **Answer:** On my hardware, diminishing returns set in around **medium**.

                The biggest accuracy jump was from tiny (15.05% WER) → base (9.95% WER). After that, improvements are smaller:
                - tiny → base: 5.1% improvement
                - base → medium: 3.88% improvement
                - medium → large-v3-turbo: Actually worse (1% regression)

                The "sweet spot" depends on your use case:
                - **Live transcription**: Even small lags matter → base or small
                - **Batch processing**: Can afford slower → medium or large

                ---

                ## Q2: Is faster-whisper really as good as OpenAI Whisper?

                **Answer:** Yes! On this test, identical accuracy with better speed.

                Testing the base model:
                - **faster-whisper**: 9.95% WER in 5.01s
                - **openai-whisper**: 9.95% WER in 6.17s

                faster-whisper was ~1.2x faster with no accuracy loss. Clear winner for my use case.

                ---

                ## Q3: What's the speed vs. accuracy tradeoff?

                **Answer:** For daily transcription of my own voice, base or small hits the sweet spot.

                - **tiny**: 2.73s but 15% WER is too rough
                - **base**: 5s with 10% WER - acceptable for daily use
                - **small**: Similar to base, slightly slower
                - **medium**: 6% WER but 7x slower than tiny
                - **large-v3-turbo**: 33s for 7% WER - overkill for casual use

                ---

                ## Q4: Which model should I use for my daily STT workflow?

                **My personal answer:** base model with faster-whisper

                **Why it works for me:**
                - ~10% WER is acceptable for dictation (I can quickly fix errors)
                - 5 seconds per clip is fast enough
                - 140MB model size is manageable
                - Good balance for daily workflow

                **When I'd use something else:**
                - **tiny**: Quick tests or very long recordings where speed matters most
                - **medium/large**: Publishing or professional work needing better accuracy

                ---

                ## Bonus Finding: distil-whisper

                I tested distil-whisper expecting it to be faster, but on my sample:
                - **distil-whisper**: 21.6% WER in 38.49s ✗

                Both slower AND less accurate than the standard models. Unexpected, but that's the data.
                """
            )

        # --- Tab 4: test environment and reproduction instructions ---
        with gr.Tab("💻 Hardware & Setup"):
            gr.Markdown(
                """
                ## Test Environment

                ### Hardware
                - **GPU**: AMD Radeon RX 7700 XT (ROCm available but using CPU inference)
                - **CPU**: Intel Core i7-12700F (12 cores, 20 threads)
                - **RAM**: 64 GB
                - **OS**: Ubuntu 25.04

                ### Why CPU Inference?
                - AMD GPU with ROCm isn't ideal for STT workloads
                - CPU inference provided more consistent results
                - Your performance will differ based on your hardware

                ### Models Tested

                **Whisper model sizes:**
                - tiny (39M params)
                - base (74M params)
                - small (244M params)
                - medium (769M params)
                - large-v3-turbo (809M params)

                **Engines compared:**
                - OpenAI Whisper (original implementation)
                - faster-whisper (optimized CTranslate2)
                - distil-whisper (distilled variant)

                ### Metrics
                - **WER (Word Error Rate)**: Lower is better - percentage of words transcribed incorrectly
                - **Inference Time**: How long it takes to transcribe the audio sample

                ## Running Your Own Tests

                Want to benchmark on your own voice and hardware?

                1. Clone the repository: [github.com/danielrosehill/Local-ASR-STT-Benchmark](https://github.com/danielrosehill/Local-ASR-STT-Benchmark)
                2. Set up the conda environment (see `setup.md`)
                3. Record your own audio and create reference transcriptions
                4. Run the benchmark scripts
                5. Generate visualizations

                Your results will likely differ based on:
                - Your hardware (GPU/CPU)
                - Your voice characteristics
                - Your microphone quality
                - Background noise conditions
                - Speaking style and pace
                """
            )

        # --- Tab 5: project background and license ---
        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                """
                ## About This Project

                ### Motivation

                I was tired of guessing which Whisper model size to use for speech-to-text. There are plenty of
                benchmarks out there, but they're often:
                - Run on different hardware than mine
                - Tested on different voice characteristics
                - Using different microphones and conditions

                So I decided to run my own evaluation on my actual setup with my actual voice.

                ### Why This Matters

                If you're doing hours of transcription per day (like I am), optimizing your STT setup is worth it:
                - Faster models = less waiting
                - More accurate models = less editing
                - Finding the sweet spot = better workflow

                ### Next Steps

                For a more robust evaluation, I'd want to:
                - Test on multiple audio samples
                - Include different speaking styles (casual, technical, professional)
                - Test on different microphones
                - Evaluate punctuation and capitalization accuracy
                - Compare ASR (Automatic Speech Recognition) vs traditional STT
                - Test GPU inference on NVIDIA hardware

                ### Repository

                Full benchmark code and results:
                [github.com/danielrosehill/Local-ASR-STT-Benchmark](https://github.com/danielrosehill/Local-ASR-STT-Benchmark)

                ### License

                MIT License - Feel free to use and adapt for your own benchmarks!

                ---

                *Built with Gradio • Whisper models by OpenAI • Hosted on Hugging Face Spaces*
                """
            )

    # Footer rendered below the tab set.
    gr.Markdown(
        """
        ---
        ### 📧 Questions or feedback?
        Visit the [GitHub repository](https://github.com/danielrosehill/Local-ASR-STT-Benchmark) to open an issue or contribute.
        """
    )

    # Badge/link footer. NOTE(review): the /file/badge.png URL relies on
    # Gradio's file serving — confirm badge.png exists and is within the
    # app's allowed file paths, otherwise the image 404s.
    gr.HTML(
        """
        <div style="text-align: center; margin-top: 20px;">
            <a href="https://danielrosehill.com" target="_blank">
                <img src="/file/badge.png" alt="Daniel Rosehill" style="width: 480px;">
            </a>
        </div>
        """
    )
| |
|
# Start the Gradio server only when run as a script (not when imported,
# e.g. by Hugging Face Spaces tooling or tests).
if __name__ == "__main__":
    demo.launch()
| |
|