Mapping image text to speech timing: a Python implementation
I asked ChatGPT, Claude, Gemini, Grok, and Qwen for the same algorithm, and Copilot's code came out the strongest. So who says Microsoft's large models can't cut it? It feels like Microsoft, the perennial number three, is quietly leveling up. Haha~
import pysrt
import pytesseract
from PIL import Image

def get_word_timestamps(srt_file):
    # Read the subtitle file
    subs = pysrt.open(srt_file)
    # Collect (word, time) tuples here
    word_timestamps = []
    # Walk through every subtitle entry
    for sub in subs:
        start_time = sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds + sub.start.milliseconds / 1000.0
        words = sub.text.split()
        # Record each word together with the entry's start time
        for word in words:
            word_timestamps.append((word, start_time))
    return word_timestamps

def get_word_positions(image_file):
    # Open the image file
    image = Image.open(image_file)
    # Run OCR with pytesseract and get each word plus its bounding box
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    word_positions = []
    for i in range(len(data['text'])):
        word = data['text'][i]
        if word.strip():  # keep only non-empty tokens
            x = data['left'][i]
            y = data['top'][i]
            width = data['width'][i]
            height = data['height'][i]
            word_positions.append((word, (x, y, width, height)))
    return word_positions

def combine_results(srt_file, image_file):
    # Words and timestamps from the SRT subtitles
    word_timestamps = get_word_timestamps(srt_file)
    # Words and bounding boxes from OCR on the image
    word_positions = get_word_positions(image_file)
    # Merged results go here
    combined_results = []
    # Index tracking our position in the OCR word list
    position_index = 0
    # Merge the two sequences
    for word, timestamp in word_timestamps:
        # Scan forward until the OCR word matches the current subtitle word
        while position_index < len(word_positions) and word_positions[position_index][0] != word:
            # Current OCR word does not match; keep looking
            position_index += 1
        if position_index < len(word_positions):
            # Found a match; take its bounding box
            position = word_positions[position_index][1]
            # Store word, timestamp, and position together
            combined_results.append((word, timestamp, position))
            # Advance past this OCR word
            position_index += 1
    return combined_results

# Example usage
srt_file = 'path_to_your_subtitles.srt'
image_file = 'path_to_your_image.png'
combined_results = combine_results(srt_file, image_file)

# Print each word with its time and position
for word, timestamp, (x, y, width, height) in combined_results:
    print(f'{word}: time {timestamp:.2f} seconds, position (x: {x}, y: {y}, width: {width}, height: {height})')
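One caveat in get_word_timestamps above: every word in a subtitle line receives that line's start time. Since pysrt also exposes each entry's end time, a rough refinement is to spread the entry's duration evenly over its words. A minimal sketch, assuming a uniform speaking rate within each line (the helper name interpolate_word_timestamps is mine, not from the original):

import pysrt

def interpolate_word_timestamps(srt_file):
    # Spread each subtitle entry's duration evenly over its words.
    # Sketch only: assumes words are spoken at a uniform rate,
    # which is just an approximation.
    subs = pysrt.open(srt_file)
    word_timestamps = []
    for sub in subs:
        # SubRipTime.ordinal is the timestamp expressed in milliseconds
        start = sub.start.ordinal / 1000.0
        end = sub.end.ordinal / 1000.0
        words = sub.text.split()
        if not words:
            continue
        step = (end - start) / len(words)
        for i, word in enumerate(words):
            word_timestamps.append((word, start + i * step))
    return word_timestamps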
...
import pysrt
import pytesseract
from PIL import Image

def get_word_timestamps(srt_file):
    # Read the subtitle file
    subs = pysrt.open(srt_file)
    # Collect (word, time) tuples here
    word_timestamps = []
    # Walk through every subtitle entry
    for sub in subs:
        start_time = sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds + sub.start.milliseconds / 1000.0
        words = sub.text.split()
        # Record each word together with the entry's start time
        for word in words:
            word_timestamps.append((word, start_time))
    return word_timestamps

def get_word_positions(image_file):
    # Open the image file
    image = Image.open(image_file)
    # Run OCR with pytesseract and get each word plus its bounding box
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    word_positions = []
    for i in range(len(data['text'])):
        word = data['text'][i]
        if word.strip():  # keep only non-empty tokens
            x = data['left'][i]
            y = data['top'][i]
            width = data['width'][i]
            height = data['height'][i]
            word_positions.append((word, (x, y, width, height)))
    return word_positions

def combine_results(srt_file, image_file):
    # Words and timestamps from the SRT subtitles
    word_timestamps = get_word_timestamps(srt_file)
    # Words and bounding boxes from OCR on the image
    word_positions = get_word_positions(image_file)
    # Merged results go here
    combined_results = []
    # Index tracking our position in the OCR word list
    position_index = 0
    # Merge the two sequences
    for word, timestamp in word_timestamps:
        # Find the next position in the image that matches the current subtitle word
        while position_index < len(word_positions):
            if word_positions[position_index][0] == word:
                position = word_positions[position_index][1]
                combined_results.append((word, timestamp, position))
                position_index += 1  # advance past the match
                break
            position_index += 1
    return combined_results

# Example usage
srt_file = 'path_to_your_subtitles.srt'
image_file = 'path_to_your_image.png'
combined_results = combine_results(srt_file, image_file)

# Print each word with its time and position
for word, timestamp, (x, y, width, height) in combined_results:
    print(f'{word}: time {timestamp:.2f} seconds, position (x: {x}, y: {y}, width: {width}, height: {height})')
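In both versions, combine_results compares OCR tokens and subtitle tokens with exact string equality, which OCR noise, case differences, and trailing punctuation will often break. A small normalization step before the comparison makes the greedy matching more forgiving; normalize_word is a hypothetical helper, not part of the original code:

import string

def normalize_word(word):
    # Lowercase and strip surrounding punctuation so that OCR's
    # 'Hello,' still matches the subtitle token 'hello'.
    return word.strip(string.punctuation).lower()

# In combine_results, the equality test would then become:
#     normalize_word(word_positions[position_index][0]) == normalize_word(word)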
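To eyeball whether the alignment is sensible, the matched boxes can be drawn back onto the image with Pillow's ImageDraw (Pillow is already a dependency here). A minimal sketch; the output filename and colors are my own choices:

from PIL import Image, ImageDraw

def draw_word_boxes(image_file, combined_results, out_file='annotated.png'):
    # Draw each matched word's OCR box plus its timestamp on the image
    image = Image.open(image_file).convert('RGB')
    draw = ImageDraw.Draw(image)
    for word, timestamp, (x, y, width, height) in combined_results:
        draw.rectangle([x, y, x + width, y + height], outline='red', width=2)
        draw.text((x, max(0, y - 12)), f'{timestamp:.1f}s', fill='red')
    image.save(out_file)

# Example: draw_word_boxes(image_file, combined_results)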