import gradio as gr
import librosa
import numpy as np
import torch
from transformers import pipeline
import tempfile
import json
import soundfile as sf
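
# Dependencies (from the imports above): gradio, librosa, numpy, torch, transformers
# and soundfile, plus torchaudio if the pipelines have to resample the 8 kHz windows
# to a model's expected rate.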

# ==== Parameters ====
SAMPLE_RATE = 8000      # downsample to save compute
WINDOW = 5              # length of each analysis window (seconds)
STEP = 2                # hop between consecutive windows (seconds)
MUSIC_THRESHOLD = 0.4
VOICE_THRESHOLD = 0.3
MIN_SING_DURATION = 8   # shortest singing segment to keep (seconds)

# ==== Model initialization ====
music_pipe = pipeline("audio-classification", model="AI-Music-Detection/ai_music_detection_large_60s")
voice_pipe = pipeline("audio-classification", model="superb/hubert-large-superb-sid")
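# NOTE: label names differ between checkpoints; the score filters in detect_singing
# look for "music" / "speech" substrings in the predicted labels and fall back to 0
# when nothing matches, so adjust those keywords if you swap in other models.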


def detect_singing(audio_path):
    """Core routine: detect singing timestamps."""
    wav, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    duration = len(wav) / SAMPLE_RATE
    results = []
    for start in np.arange(0, duration - WINDOW, STEP):
        end = start + WINDOW
        snippet = wav[int(start * SAMPLE_RATE):int(end * SAMPLE_RATE)]
        # Music probability; pass the raw array together with its sampling rate so
        # the pipeline can resample it to the model's expected rate.
        music_pred = music_pipe({"array": snippet, "sampling_rate": SAMPLE_RATE})
        music_score = max([p['score'] for p in music_pred if 'music' in p['label'].lower()] or [0])
        # Voice probability (voice activity present).
        voice_pred = voice_pipe({"array": snippet, "sampling_rate": SAMPLE_RATE})
        voice_score = max([p['score'] for p in voice_pred if 'speech' in p['label'].lower()] or [0])
        if music_score > MUSIC_THRESHOLD and voice_score > VOICE_THRESHOLD:
            results.append((float(start), float(end)))
    # Merge contiguous/overlapping windows into segments.
    merged = []
    for seg in results:
        if not merged or seg[0] > merged[-1][1]:
            merged.append(list(seg))
        else:
            merged[-1][1] = seg[1]
    merged = [(s, e) for s, e in merged if e - s >= MIN_SING_DURATION]
    return merged
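
# Example (hypothetical values): detect_singing("song.wav") could return
# [(12.0, 27.0), (44.0, 58.0)], i.e. merged (start_sec, end_sec) pairs that each
# last at least MIN_SING_DURATION seconds.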


def analyze_audio(file):
    """Gradio interface function."""
    if file is None:
        return "Please upload an audio file", None
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        # gr.Audio(type="filepath") hands us a path string; re-encode it to WAV at SAMPLE_RATE.
        data, sr = librosa.load(file, sr=SAMPLE_RATE)
        sf.write(tmp.name, data, sr)
    segments = detect_singing(tmp.name)
    if not segments:
        return "No clear singing segments detected", json.dumps([], indent=2)
    json_output = json.dumps(
        [{"start": s, "end": e, "duration": round(e - s, 2)} for s, e in segments],
        indent=2
    )
    return f"Detected {len(segments)} singing segment(s)", json_output


# ==== Gradio UI ====
with gr.Blocks(title="🎵 Singing Segment Detector") as demo:
    gr.Markdown("# 🎤 Automatic Singing Segment Detection (Hugging Face Space)\nUpload an audio file and get the detected singing time ranges as JSON.")
    audio_in = gr.Audio(type="filepath", label="Audio file (extracted from a video)")
    btn = gr.Button("Start analysis")
    status = gr.Textbox(label="Analysis result", interactive=False)
    json_out = gr.Code(label="Singing segment timestamps (JSON)", language="json")
    btn.click(fn=analyze_audio, inputs=[audio_in], outputs=[status, json_out])

demo.launch()