1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
"""
Whisper 语音识别 - 生成 ASS 字幕文件
"""
from faster_whisper import WhisperModel
def format_ass_time(seconds):
    """Convert an offset in seconds to the ASS timestamp format ``H:MM:SS.CC``.

    Args:
        seconds: Offset from the start of the media, in seconds (int or
            float). Negative values are clamped to zero.

    Returns:
        The timestamp as a string: unpadded hours, two-digit minutes,
        two-digit seconds and two-digit centiseconds.
    """
    # Work in whole centiseconds. Rounding once up front avoids float
    # artefacts (0.57 * 100 == 56.99999999999999 would truncate to 56) and
    # lets a value such as 59.999 carry cleanly into the next minute.
    total_centisecs = max(0, int(round(seconds * 100)))
    total_secs, centisecs = divmod(total_centisecs, 100)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}.{centisecs:02d}"
def transcribe(
    audio_file,
    output_file=None,
    *,
    model_size="tiny",
    device="cpu",
    compute_type="int8",
    language="zh",
):
    """Transcribe an audio file and render the result as an ASS subtitle document.

    Args:
        audio_file: Audio input passed straight to ``WhisperModel.transcribe``.
        output_file: Optional destination path. When truthy, the document is
            written there and a confirmation line is printed.
        model_size: First argument given to ``WhisperModel`` (default
            ``"tiny"``).
        device: ``device`` argument given to ``WhisperModel``.
        compute_type: ``compute_type`` argument given to ``WhisperModel``.
        language: ``language`` argument given to ``WhisperModel.transcribe``.

    Returns:
        The complete ASS document as a string: the fixed header followed by
        one ``Dialogue`` line per segment that has non-empty text.
    """
    model = WhisperModel(model_size, device=device, compute_type=compute_type)
    # The second return value (transcription info) is not needed here.
    segments, _info = model.transcribe(
        audio_file,
        language=language,
        word_timestamps=True,
    )

    # ASS file header: script info, one default style, and the event format.
    header_lines = (
        "[Script Info]",
        "Title: Whisper Subtitle",
        "ScriptType: v4.00+",
        "WrapStyle: 0",
        "PlayResX: 1920",
        "PlayResY: 1080",
        "[V4+ Styles]",
        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
        "Style: Default,Arial,48,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,0,0,0,0,100,100,0,0,1,2,2,2,10,10,20,1",
        "[Events]",
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
    )
    ass_header = "\n".join(header_lines) + "\n"

    # Build one Dialogue event per segment.
    events = []
    for segment in segments:
        # ASS uses the literal two-character sequence \N as a line break.
        text = segment.text.strip().replace("\n", "\\N")
        if not text:
            # A segment with no text would only add an empty Dialogue event.
            continue
        start = format_ass_time(segment.start)
        end = format_ass_time(segment.end)
        events.append(f"Dialogue: 0,{start},{end},Default,,0,0,0,,{text}")

    result = ass_header + "\n".join(events)

    # Save to file when a destination was given.
    if output_file:
        # utf-8-sig writes a leading BOM, which the original author chose for ASS.
        with open(output_file, "w", encoding="utf-8-sig") as f:
            f.write(result)
        print(f"✅ 字幕已保存: {output_file}")

    return result
# Usage example: transcribe the bundled Chinese test clip, write the
# subtitles next to it, and echo the generated ASS document to stdout.
if __name__ == "__main__":
    audio_file, output_file = "audio/test_zh.wav", "audio/test_zh.ass"
    ass_document = transcribe(audio_file, output_file)
    print(ass_document)
|