Jun 30, 2024
This lecture walks through the steps to download a YouTube video, extract its audio, transcribe it using OpenAI's Whisper, preprocess the transcript, and embed the subtitles back into the video using Python libraries.
.wav file
HH:MM:SS,SSS
.srt file with indices, time codes, and text
requirements.txt
PBE
to download a YouTube video
import os
from pytube import YouTube
# Replace the placeholder with the address of the video to subtitle.
url = 'YOUR_YOUTUBE_VIDEO_URL'
yt = YouTube(url)

# Keep only progressive mp4 streams and sort them by resolution, highest
# first, so that first() picks the best available one.
stream = (
    yt.streams
    .filter(progressive=True, file_extension='mp4')
    .order_by('resolution')
    .desc()
    .first()
)

# first() yields None when no stream matches the filter; fail with a clear
# message instead of an AttributeError on None.
if stream is None:
    raise RuntimeError(f"No progressive mp4 stream available for {url}")

stream.download()
Extract the audio: convert the .mp4 to .wav
import ffmpeg
def extract_audio(video_path, audio_path='output_audio.wav'):
    """Extract the audio of a video into a separate file using ffmpeg.

    Args:
        video_path: Path of the input video file.
        audio_path: Where to write the extracted audio. Defaults to
            'output_audio.wav'. An existing file at that path is
            overwritten.

    Returns:
        The path the audio was written to (the value of ``audio_path``).
    """
    stream = ffmpeg.input(video_path)
    stream = ffmpeg.output(stream, audio_path)
    # overwrite_output=True lets the step be re-run without ffmpeg stopping
    # to ask whether the existing file may be replaced.
    ffmpeg.run(stream, overwrite_output=True)
    return audio_path


audio_file = extract_audio('your_video.mp4')
from faster_whisper import WhisperModel
def transcribe_audio(audio_path):
    """Transcribe an audio file with the 'small' faster-whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(language, segments)`` tuple: the language reported by the
        model and the list of transcribed segments.
    """
    model = WhisperModel('small')
    segments, info = model.transcribe(audio_path)
    # The transcription info is an object, not a dict, so the language is
    # read as an attribute; subscripting it with a string key fails.
    language = info.language
    # transcribe() hands back a lazy generator. Materialise it so the
    # segments can be iterated more than once by later steps.
    segments = list(segments)
    return language, segments


language, segments = transcribe_audio('output_audio.wav')
import math
def format_time_for_srt(seconds):
    """Format a time offset in seconds as an SRT time code.

    Args:
        seconds: Non-negative offset in seconds (int or float).

    Returns:
        The offset formatted as ``HH:MM:SS,mmm``.
    """
    # Round once, on the total number of milliseconds. Rounding only the
    # fractional part can produce 1000 ms (e.g. 1.9996 s) without carrying
    # into the seconds field, which yields an invalid time code.
    total_ms = int(round(seconds * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
Generate the .srt file from segments
def _segment_field(segment, name):
    """Return field *name* of a segment given as an object or as a dict."""
    if isinstance(segment, dict):
        return segment[name]
    return getattr(segment, name)


def generate_srt_file(language, segments, output_file):
    """Write transcription segments to an SRT subtitle file.

    Each segment becomes one SRT entry: a 1-based index, a
    ``start --> end`` time-code line, the text, and a blank separator line.

    Args:
        language: Language of the transcript. Accepted for interface
            compatibility; it is not used when writing the file.
        segments: Iterable of segments exposing ``start``, ``end`` and
            ``text`` either as attributes or as dict keys.
        output_file: Path of the .srt file to create or overwrite.
    """
    # Write UTF-8 explicitly so non-ASCII transcripts do not depend on the
    # platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        for index, segment in enumerate(segments, start=1):
            start = format_time_for_srt(_segment_field(segment, 'start'))
            end = format_time_for_srt(_segment_field(segment, 'end'))
            text = _segment_field(segment, 'text')
            f.write(f"{index}\n{start} --> {end}\n{text}\n\n")


srt_file = 'your_subtitle.srt'
generate_srt_file(language, segments, srt_file)
def add_subtitles_to_video(video_path, srt_path, output_path):
    """Render a subtitle file onto a video and write the result.

    The input is passed through ffmpeg's 'subtitles' filter, recombined with
    the input's audio stream, and written to ``output_path``, replacing any
    existing file there.

    Args:
        video_path: Path of the source video.
        srt_path: Path of the subtitle file handed to the filter.
        output_path: Path of the video file to produce.
    """
    source = ffmpeg.input(video_path)
    soundtrack = source.audio
    subtitled = source.filter('subtitles', srt_path)
    # One video stream plus one audio stream form the single concat segment.
    combined = ffmpeg.concat(subtitled, soundtrack, v=1, a=1)
    combined.output(output_path).run(overwrite_output=True)


output_video = 'output_with_subtitles.mp4'
add_subtitles_to_video('your_video.mp4', 'your_subtitle.srt', output_video)
Thank you!