图片与语音对应关系的python实现

十二月 20, 2024

同一个算法问题，我分别问过 ChatGPT、Claude、Gemini、Grok 和通义千问，发现 Copilot 的代码能力更强一些。怎么能说微软的大模型不行呢？感觉微软作为老三正在“猥琐发育”呢，哈哈~

from pydub import AudioSegment

import pysrt

import pytesseract

from PIL import Image

def get_word_timestamps(srt_file):
    """Return every subtitle word paired with the start time of its cue.

    Args:
        srt_file: Path to the SRT subtitle file, passed to ``pysrt.open``.

    Returns:
        A list of ``(word, start_seconds)`` tuples in subtitle order. Words
        are obtained by whitespace-splitting each cue's text, and every word
        of a cue is given that cue's start time, in seconds as a float.
    """
    word_timestamps = []
    for sub in pysrt.open(srt_file):
        start = sub.start
        # Flatten the h/m/s/ms components into one seconds value.
        start_time = (
            start.hours * 3600
            + start.minutes * 60
            + start.seconds
            + start.milliseconds / 1000.0
        )
        word_timestamps.extend((word, start_time) for word in sub.text.split())
    return word_timestamps

def get_word_positions(image_file):
    """OCR an image and return each recognised word with its bounding box.

    Args:
        image_file: Path to the image, passed to ``PIL.Image.open``.

    Returns:
        A list of ``(word, (x, y, width, height))`` tuples in the order
        pytesseract reports them. Entries whose text is empty or only
        whitespace are skipped.
    """
    # The context manager closes the image's file handle once OCR is done.
    with Image.open(image_file) as image:
        data = pytesseract.image_to_data(
            image, output_type=pytesseract.Output.DICT
        )

    # The result dict holds parallel lists; zip them into per-word records.
    boxes = zip(data['left'], data['top'], data['width'], data['height'])
    return [
        (word, box)
        for word, box in zip(data['text'], boxes)
        if word.strip()
    ]

def combine_results(srt_file, image_file):
    """Pair subtitle words with the OCR boxes of the same words in an image.

    Subtitle words are matched to OCR words by exact string equality while
    scanning forward through the OCR results, so matches keep their order and
    each OCR word is used at most once.

    Args:
        srt_file: Path to the SRT subtitle file.
        image_file: Path to the image to run OCR on.

    Returns:
        A list of ``(word, timestamp, (x, y, width, height))`` tuples, one
        for each subtitle word that was found in the image.
    """
    word_timestamps = get_word_timestamps(srt_file)
    word_positions = get_word_positions(image_file)

    combined_results = []
    position_index = 0  # first OCR word not yet consumed by a match

    for word, timestamp in word_timestamps:
        # Look ahead without committing: if this subtitle word is absent from
        # the OCR output, position_index must stay where it is, otherwise a
        # single miss would exhaust the OCR list and drop every later word.
        for candidate in range(position_index, len(word_positions)):
            ocr_word, position = word_positions[candidate]
            if ocr_word == word:
                combined_results.append((word, timestamp, position))
                position_index = candidate + 1
                break

    return combined_results

# Example usage: replace the placeholder paths with real files.
srt_file = 'path_to_your_subtitles.srt'
image_file = 'path_to_your_image.png'

combined_results = combine_results(srt_file, image_file)

# Print every matched word with its start time and bounding box.
for word, timestamp, (x, y, width, height) in combined_results:
    print(f'{word}: time {timestamp:.2f} seconds, position (x: {x}, y: {y}, width: {width}, height: {height})')

...

from pydub import AudioSegment

import pysrt

import pytesseract

from PIL import Image

def get_word_timestamps(srt_file):
    """Return every subtitle word paired with the start time of its cue.

    Args:
        srt_file: Path to the SRT subtitle file, passed to ``pysrt.open``.

    Returns:
        A list of ``(word, start_seconds)`` tuples in subtitle order. Words
        are obtained by whitespace-splitting each cue's text, and every word
        of a cue is given that cue's start time, in seconds as a float.
    """
    subs = pysrt.open(srt_file)

    word_timestamps = []
    for sub in subs:
        # Flatten the h/m/s/ms components into one seconds value.
        start_time = (
            sub.start.hours * 3600
            + sub.start.minutes * 60
            + sub.start.seconds
            + sub.start.milliseconds / 1000.0
        )
        for word in sub.text.split():
            word_timestamps.append((word, start_time))

    return word_timestamps

def get_word_positions(image_file):
    """OCR an image and return each recognised word with its bounding box.

    Args:
        image_file: Path to the image, passed to ``PIL.Image.open``.

    Returns:
        A list of ``(word, (x, y, width, height))`` tuples in the order
        pytesseract reports them. Entries whose text is empty or only
        whitespace are skipped.
    """
    # Close the image's file handle as soon as OCR has finished.
    with Image.open(image_file) as image:
        data = pytesseract.image_to_data(
            image, output_type=pytesseract.Output.DICT
        )

    word_positions = []
    # The result dict holds parallel lists; walk them in lockstep.
    for word, x, y, width, height in zip(
        data['text'], data['left'], data['top'], data['width'], data['height']
    ):
        if word.strip():
            word_positions.append((word, (x, y, width, height)))

    return word_positions

def combine_results(srt_file, image_file):
    """Pair subtitle words with the OCR boxes of the same words in an image.

    Subtitle words are matched to OCR words by exact string equality while
    scanning forward through the OCR results, so matches keep their order and
    each OCR word is used at most once.

    Args:
        srt_file: Path to the SRT subtitle file.
        image_file: Path to the image to run OCR on.

    Returns:
        A list of ``(word, timestamp, (x, y, width, height))`` tuples, one
        for each subtitle word that was found in the image.
    """
    word_timestamps = get_word_timestamps(srt_file)
    word_positions = get_word_positions(image_file)

    combined_results = []
    position_index = 0  # first OCR word not yet consumed by a match

    for word, timestamp in word_timestamps:
        # Search from the current position for the next OCR word equal to
        # this subtitle word. The shared index only advances on a hit, so a
        # subtitle word missing from the image cannot swallow the remaining
        # OCR words and block all later matches.
        scan = position_index
        while scan < len(word_positions):
            if word_positions[scan][0] == word:
                position = word_positions[scan][1]
                combined_results.append((word, timestamp, position))
                position_index = scan + 1
                break
            scan += 1

    return combined_results

# Example usage: replace the placeholder paths with real files.
srt_file = 'path_to_your_subtitles.srt'
image_file = 'path_to_your_image.png'

combined_results = combine_results(srt_file, image_file)

# Print every matched word with its start time and bounding box.
for word, timestamp, (x, y, width, height) in combined_results:
    print(f'{word}: time {timestamp:.2f} seconds, position (x: {x}, y: {y}, width: {width}, height: {height})')

搜索此博客

civilpy

图片与语音对应关系的python实现

评论

发表评论

此博客中的热门博文

布林带收窄后的主力行为逻辑

2025-5-29 部署一个跨境站

2025-07-01 视频经验总结