Whisper 语音识别 - 生成 ASS 字幕文件

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
"""
Whisper 语音识别 - 生成 ASS 字幕文件
"""

from faster_whisper import WhisperModel

def format_ass_time(seconds):
    """Format an offset in seconds as an ASS timestamp ``H:MM:SS.CC``.

    The value is rounded to the nearest centisecond in one step and then
    split with integer arithmetic. This avoids the float-truncation error of
    computing the fraction separately (``0.29 * 100`` evaluates to
    ``28.999...``, which truncation would render as ``.28``) and lets a
    rounded-up fraction carry into seconds, minutes and hours.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The formatted timestamp, e.g. ``"1:01:01.50"`` for ``3661.5``.
    """
    # Work in whole centiseconds so every later step is exact integer math.
    total_centisecs = max(0, int(round(float(seconds) * 100)))
    total_secs, centisecs = divmod(total_centisecs, 100)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}.{centisecs:02d}"

def transcribe(audio_file, output_file=None, *, model_size="tiny",
               device="cpu", compute_type="int8", language="zh"):
    """Transcribe an audio file and return the result as ASS subtitle text.

    Args:
        audio_file: Path of the audio file handed to ``model.transcribe``.
        output_file: Optional path. When truthy, the subtitle text is also
            written there as UTF-8 with a BOM and a confirmation is printed.
        model_size: First argument passed to ``WhisperModel``.
        device: ``device`` argument passed to ``WhisperModel``.
        compute_type: ``compute_type`` argument passed to ``WhisperModel``.
        language: ``language`` argument passed to ``model.transcribe``.

    Returns:
        The complete ASS document (header plus one ``Dialogue`` line per
        non-empty segment) as a string.
    """
    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    # The second element of the returned pair is not needed here.
    segments, _ = model.transcribe(
        audio_file,
        language=language,
        word_timestamps=True
    )

    # ASS file header
    ass_header = """[Script Info]
Title: Whisper Subtitle
ScriptType: v4.00+
WrapStyle: 0
PlayResX: 1920
PlayResY: 1080

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,48,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,0,0,0,0,100,100,0,0,1,2,2,2,10,10,20,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

    # Build one Dialogue event per segment that actually contains text.
    events = []
    for segment in segments:
        text = segment.text.strip()
        if not text:
            # An empty Dialogue line carries no subtitle; leave it out.
            continue
        # A raw line break would end the Dialogue line early, so every
        # newline flavour is normalised and replaced by the ASS "\N" break.
        text = (
            text.replace("\r\n", "\n")
            .replace("\r", "\n")
            .replace("\n", "\\N")
        )
        start = format_ass_time(segment.start)
        end = format_ass_time(segment.end)
        events.append(f"Dialogue: 0,{start},{end},Default,,0,0,0,,{text}")

    # Terminate every event line, including the last, with a newline.
    result = ass_header + "".join(f"{event}\n" for event in events)

    # Save to file
    if output_file:
        # utf-8-sig writes a BOM at the start of the file.
        with open(output_file, "w", encoding="utf-8-sig") as f:
            f.write(result)
        print(f"✅ 字幕已保存: {output_file}")

    return result

# Usage example: transcribe the sample clip, save the subtitles next to it,
# and echo the generated ASS text to stdout.
if __name__ == "__main__":
    subtitles = transcribe("audio/test_zh.wav", "audio/test_zh.ass")
    print(subtitles)