-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwhisper_transcribe.py
101 lines (84 loc) · 3.61 KB
/
whisper_transcribe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import json
import torch
import yt_dlp
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
# Model setup: load Whisper large-v3 once at import time so every
# transcription call below reuses the same weights (loading is expensive).
model_name = "openai/whisper-large-v3"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
# Shared ASR pipeline used by all transcription functions in this module.
# device=0 selects the first CUDA GPU; -1 runs on CPU.
asr_pipeline = pipeline(
task="automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
device=0 if torch.cuda.is_available() else -1,
return_timestamps=True
)
def transcribe_audio_segments(audio_path):
    """
    Transcribe an audio file into sentence-level segments.

    Parameters
    ----------
    audio_path : str
        Path to an audio file readable by librosa.

    Returns
    -------
    list[dict]
        Segments of the form ``{"start": float, "end": float, "text": str}``,
        split at sentence-ending punctuation (``.`` ``!`` ``?`` or newline).
    """
    # Local import keeps librosa an optional dependency of this module.
    import librosa

    # Whisper expects 16 kHz mono input.
    audio, _ = librosa.load(audio_path, sr=16000, mono=True)

    # Word-level timestamps let us rebuild sentence boundaries precisely.
    result = asr_pipeline(audio, return_timestamps="word")

    return _group_words_into_segments(result["chunks"])


def _group_words_into_segments(chunks):
    """Group word chunks into sentence segments, splitting on . ! ? or newline."""
    segments = []
    current = {"start": None, "end": None, "text": ""}

    def _flush(seg):
        # Single finalization path (the original duplicated this logic).
        seg["text"] = seg["text"].strip()
        segments.append(seg)

    for word in chunks:
        start, end = word["timestamp"]
        if current["start"] is None:
            current["start"] = start
        # NOTE(review): the HF pipeline can emit None as the end timestamp of
        # the final word chunk; keep the last known end rather than storing None.
        if end is not None:
            current["end"] = end
        current["text"] += word["text"] + " "
        if word["text"].endswith((".", "!", "?", "\n")):
            _flush(current)
            current = {"start": None, "end": None, "text": ""}

    # Trailing words without closing punctuation still form a segment.
    if current["text"]:
        _flush(current)
    return segments
def save_transcriptions_to_json(transcriptions, output_path):
    """
    Write the transcription segments to *output_path* as pretty-printed JSON.

    The file is UTF-8 encoded; non-ASCII text is written verbatim
    (``ensure_ascii=False``) so CJK transcripts stay readable.
    """
    payload = {"segments": transcriptions}
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=4)
def download_and_transcribe_youtube(url, output_path):
    """
    Download a YouTube video's audio, transcribe it into segments, and save
    the result as JSON.

    Parameters
    ----------
    url : str
        YouTube video URL.
    output_path : str
        Destination path for the JSON transcript.
    """
    audio_file = "temp_audio.mp3"
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": "temp_audio.%(ext)s",
        # Extract the audio track and re-encode it to 192 kbps MP3.
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3", "preferredquality": "192"}],
        "quiet": True,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        transcriptions = transcribe_audio_segments(audio_file)
        save_transcriptions_to_json(transcriptions, output_path)
    finally:
        # Remove the temporary download even when transcription or saving
        # fails (the original leaked the file on any exception).
        if os.path.exists(audio_file):
            os.remove(audio_file)
def transcribe_local_file(file_path, output_path):
    """
    Transcribe a local audio/video file into segments and save them as JSON.

    Thin convenience wrapper: transcribe, then persist in one step.
    """
    save_transcriptions_to_json(transcribe_audio_segments(file_path), output_path)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Whisper Transcription Tool with Accurate Segments")
    # Exactly one input source must be supplied; argparse enforces this and
    # exits with a usage error and nonzero status (the original printed a
    # message and exited 0 when neither option was given).
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--youtube", type=str, help="YouTube video URL to transcribe")
    source.add_argument("--file", type=str, help="Local audio/video file path to transcribe")
    parser.add_argument("--output", type=str, required=True, help="Output JSON file path")
    args = parser.parse_args()

    if args.youtube:
        download_and_transcribe_youtube(args.youtube, args.output)
    else:
        transcribe_local_file(args.file, args.output)