86 lines
2.1 KiB
Python
Executable File
86 lines
2.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
语音识别示例 - 使用 faster-whisper
|
|
"""
|
|
|
|
from faster_whisper import WhisperModel
|
|
import os
|
|
|
|
# Default configuration: fallbacks used by transcribe() when the caller
# passes no (or an empty) model size / language.
MODEL_SIZE = "base"  # model size name handed to WhisperModel
LANGUAGE = "zh"  # language code handed to the model's transcribe call
|
|
|
|
def transcribe(audio_path, model_size=None, language=None, *,
               device="cpu", compute_type="int8", beam_size=5):
    """Transcribe an audio file to text with faster-whisper.

    Progress information and every recognized segment are printed to
    stdout while the transcription runs.

    Args:
        audio_path: Path of the audio file passed to the model.
        model_size: Model size name; falls back to MODEL_SIZE when falsy.
        language: Language code; falls back to LANGUAGE when falsy.
        device: Device passed to WhisperModel (keyword-only).
        compute_type: Compute type passed to WhisperModel (keyword-only).
        beam_size: Beam size passed to the model's transcribe call
            (keyword-only).

    Returns:
        The text of all segments concatenated in order, with leading and
        trailing whitespace stripped.
    """
    # `or` (rather than an `is None` check) so that an empty string coming
    # from the command line also selects the default.
    model_size = model_size or MODEL_SIZE
    language = language or LANGUAGE

    print("\n🎤 开始识别...")
    print(f" 模型: {model_size}")
    print(f" 语言: {language}")
    print(f" 文件: {audio_path}")

    # Load the model.
    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    # Run recognition.
    segments, info = model.transcribe(
        audio_path,
        language=language,
        beam_size=beam_size,
    )

    print("\n📝 识别结果:")
    print(f" 语言: {info.language} (置信度: {info.language_probability:.2f})")
    print("-" * 50)

    # Collect the pieces and join once instead of repeated `+=`; each
    # segment is still printed as soon as it is yielded.
    pieces = []
    for segment in segments:
        pieces.append(segment.text)
        print(f" {segment.text}")

    return "".join(pieces).strip()
|
|
|
|
def _parse_options(args):
    """Parse the model/language options that follow the audio path.

    Leading dashes are stripped before matching, so ``--model``, ``-m``,
    ``--lang`` and ``-l`` are all recognized.

    Args:
        args: Command-line tokens after the audio file argument.

    Returns:
        A ``(options, messages)`` tuple. ``options`` maps ``"model"`` and
        ``"lang"`` to the supplied value, or None when not given.
        ``messages`` lists a warning for every unknown option and every
        option that is missing its value.
    """
    targets = {"model": "model", "m": "model", "lang": "lang", "l": "lang"}
    options = {"model": None, "lang": None}
    messages = []

    index = 0
    while index < len(args):
        token = args[index]
        target = targets.get(token.lstrip("-"))
        if target is None:
            # Skip only the unknown token so later options stay aligned.
            messages.append(f"⚠️ 未知选项: {token}")
            index += 1
        elif index + 1 >= len(args) or args[index + 1].startswith("-"):
            # No value follows (end of input or another option): keep the default.
            messages.append(f"⚠️ 选项缺少值: {token}")
            index += 1
        else:
            options[target] = args[index + 1]
            index += 2

    return options, messages


def main():
    """Command-line entry point: transcribe the audio file named in sys.argv.

    Prints a banner, then usage help when no audio file is given. Otherwise
    parses the optional model/language options, checks that the file exists
    and prints the transcription together with its character count.
    Returns None in every case.
    """
    import sys

    print(f"\n{'='*60}")
    print("🎤 语音识别 (faster-whisper)")
    print(f"{'='*60}")

    if len(sys.argv) < 2:
        print("\n用法:")
        print(" python whisper_example.py <音频文件>")
        print(" python whisper_example.py audio.mp3 --model base --lang zh")
        print("\n选项:")
        print(" --model, -m 模型大小: tiny, base, small, medium, large")
        print(" --lang, -l 语言: zh, en, ja, ko 等")
        return

    audio_file = sys.argv[1]

    # Parse the optional arguments and surface anything that was ignored.
    options, messages = _parse_options(sys.argv[2:])
    for message in messages:
        print(message)

    if not os.path.exists(audio_file):
        print(f"❌ 文件不存在: {audio_file}")
        return

    # None lets transcribe() fall back to the module-level defaults.
    text = transcribe(audio_file, options["model"], options["lang"])
    print(f"\n✅ 识别完成: {len(text)} 字符")
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|