Spaces:

RafaG
/

legen

Running

App Files Files Community

legen / subtitle_utils.py

RafaG

Upload 24 files

5fa5566 verified 4 months ago

raw

history blame contribute delete

6.35 kB

	import os
	import re
	import tkinter as tk
	from pathlib import Path

	import pysrt


	def SaveSegmentsToSrt(segments: list, output_path: Path):
	    """
	    Write ``segments`` to ``output_path`` as a SubRip (.srt) subtitle file.

	    Args:
	        segments: Sequence of mappings, each providing ``"start"`` and
	            ``"end"`` (passed to ``pysrt.SubRipTime`` as seconds) and
	            ``"text"`` (the subtitle body).
	        output_path: Destination file. Missing parent directories are
	            created. A plain string path is accepted as well.
	    """
	    # Accept str paths too; ``.parent`` below needs a Path object.
	    output_path = Path(output_path)

	    subs = pysrt.SubRipFile()
	    # SRT cue numbers are 1-based, hence ``start=1``.
	    for cue_number, segment in enumerate(segments, start=1):
	        subs.append(pysrt.SubRipItem(
	            index=cue_number,
	            start=pysrt.SubRipTime(seconds=segment["start"]),
	            end=pysrt.SubRipTime(seconds=segment["end"]),
	            text=segment["text"],
	        ))

	    # Make sure the target directory exists before saving the .srt
	    os.makedirs(output_path.parent, exist_ok=True)
	    subs.save(output_path)


	def string_width(text, font_name="Jost", font_size=18):
	    """
	    Determine the width of a string using tkinter.

	    The text is measured with a bold font of family ``font_name`` and size
	    ``font_size``. Measuring requires creating a Tk root; if that or the
	    measurement fails on every attempt, the width is estimated as 60% of
	    ``font_size`` per character.

	    Args:
	        text: String to measure.
	        font_name: Font family to measure with.
	        font_size: Font size handed to tkinter, also used for the estimate.

	    Returns:
	        The width reported by tkinter (int), or the estimated width (float)
	        when tkinter could not be used.
	    """
	    # ``import tkinter`` alone does not load the ``font`` submodule, so it is
	    # imported explicitly instead of being reached through ``tk.font``.
	    import tkinter.font as tk_font

	    for _ in range(5):
	        try:
	            root = tk.Tk()
	        except (tk.TclError, RuntimeError):
	            # Typically: no display available. Try again, then fall back.
	            continue
	        try:
	            root.withdraw()  # never show the helper window
	            # ``family`` selects the typeface; ``name`` would only label the
	            # Tk font object and leave the default family in use.
	            measuring_font = tk_font.Font(
	                root=root, family=font_name, size=font_size, weight="bold")
	            return measuring_font.measure(text)
	        except (tk.TclError, RuntimeError):
	            continue
	        finally:
	            # Always release the root, even when measuring failed.
	            root.destroy()

	    # all failed, return 60% of height per char
	    return len(text) * font_size * 0.60


	def is_punctuation_end(word):
	    """Return True if ``word`` ends with one of ``. , ! ? : ;``."""
	    # str.endswith accepts a tuple of suffixes, so no explicit loop is needed.
	    return word.endswith(('.', ',', '!', '?', ':', ';'))


	def _segment_from_words(words, parent):
	    """Build one output segment from a run of ``words`` taken from ``parent``."""
	    return {
	        'text': ' '.join(entry['word'] for entry in words),
	        # First word carrying a 'start'; otherwise the parent segment's start.
	        'start': next((entry['start'] for entry in words if 'start' in entry), parent['start']),
	        # Last word carrying an 'end'; otherwise the parent segment's end.
	        'end': next((entry['end'] for entry in reversed(words) if 'end' in entry), parent['end']),
	        'words': list(words),
	    }


	def split_segments(segments, max_width_px=1440, font_name="Jost", font_size=18):
	    """
	    Split segments based on the max width provided.

	    Each segment's ``'words'`` list is walked in order and words are
	    accumulated until adding the next one would reach ``max_width_px``
	    (as measured by ``string_width``); the accumulated words then become a
	    new segment. A word is kept on the current segment regardless of width
	    when the segment is still empty, when the word ends with punctuation and
	    the previous word does not, or when the previous word is short
	    (3 characters or fewer) and the one before it is not.

	    Args:
	        segments: Iterable of mappings with ``'start'``, ``'end'`` and
	            ``'words'``; every word is a mapping with ``'word'`` and,
	            optionally, ``'start'`` / ``'end'``.
	        max_width_px: Width limit for the text of one output segment.
	        font_name: Font passed to ``string_width``.
	        font_size: Font size passed to ``string_width``.

	    Returns:
	        A new list of segments, each with ``'text'``, ``'start'``, ``'end'``
	        and ``'words'``.
	    """
	    new_segments = []
	    for segment in segments:
	        current_words = []
	        current_width = 0

	        for word in segment['words']:
	            # Calculate the width with a space after the word
	            added_width = string_width(
	                word['word'] + " ", font_name, font_size)
	            # Keep a punctuation-terminated word with the words before it,
	            # unless the previous word already ended with punctuation.
	            isolated_sentence_ending = is_punctuation_end(word['word']) and not (
	                current_words and is_punctuation_end(current_words[-1]['word']))
	            # Avoid breaking right after a short word that follows a longer one.
	            possible_logical_break_point = (
	                len(current_words) >= 2
	                and len(current_words[-1]['word']) <= 3
	                and len(current_words[-2]['word']) > 3
	            )

	            if (
	                current_width + added_width < max_width_px
	                or not current_words
	                or isolated_sentence_ending
	                or possible_logical_break_point
	            ):
	                current_words.append(word)
	                current_width += added_width
	            else:
	                new_segments.append(_segment_from_words(current_words, segment))
	                current_words = [word]
	                current_width = added_width

	        # For any remaining words
	        if current_words:
	            new_segments.append(_segment_from_words(current_words, segment))

	    return new_segments


	def split_string_to_max_lines(text, max_width=720, max_lines=2, font_name="Jost", font_size=18):
	    """
	    Break ``text`` into at most ``max_lines`` lines of roughly equal width.

	    The text is returned as a single line when its measured width does not
	    exceed 80% of ``max_width`` or when ``max_lines`` is below 2. Otherwise
	    words are accumulated until a line would reach the total width divided
	    by ``max_lines``. A word stays on the current line regardless of width
	    when the line is empty, when the word ends with punctuation and the
	    previous word does not, or when the previous word is short
	    (3 characters or fewer) and the one before it is not. Once
	    ``max_lines - 1`` lines are complete, all remaining words form the
	    last line.

	    Returns:
	        A list of line strings.
	    """
	    full_width = string_width(text, font_name, font_size)
	    if full_width <= max_width * 0.8 or max_lines < 2:
	        return [text]

	    tokens = text.split()
	    per_line_budget = full_width / max_lines
	    finished = []
	    pending = []
	    pending_width = 0

	    for position, token in enumerate(tokens):
	        token_width = string_width(token + ' ', font_name, font_size)

	        previous_ends_sentence = bool(pending) and is_punctuation_end(pending[-1])
	        closes_sentence = is_punctuation_end(token) and not previous_ends_sentence
	        follows_short_word = (
	            len(pending) >= 2
	            and len(pending[-1]) <= 3
	            and len(pending[-2]) > 3
	        )

	        stays_on_line = (
	            pending_width + token_width < per_line_budget
	            or not pending
	            or closes_sentence
	            or follows_short_word
	        )
	        if stays_on_line:
	            pending.append(token)
	            pending_width += token_width
	            continue

	        # The pending line is complete; the current token opens the next one.
	        finished.append(' '.join(pending))
	        pending = [token]
	        pending_width = token_width

	        # Only one line left: it takes every remaining word.
	        if len(finished) == max_lines - 1:
	            finished.append(' '.join(tokens[position:]))
	            break

	    if pending and len(finished) < max_lines:
	        finished.append(' '.join(pending))

	    return finished


	def adjust_times(segments, extra_end_time=1.0):
	    """
	    Stretch each segment's end time toward the start of the next segment.

	    For every segment except the last, the gap to the next segment's start
	    is compared with ``1.5 + extra_end_time``:

	    * gap at or above that threshold: the end is moved later by
	      ``extra_end_time``;
	    * gap below the threshold: the end is set to the next segment's start.

	    The last segment is left untouched because it has no successor.

	    Args:
	        segments: List of mappings with ``'start'`` and ``'end'``; modified
	            in place.
	        extra_end_time: Amount added to the end time when the gap is large.

	    Returns:
	        The same ``segments`` list.
	    """
	    threshold = 1.5 + extra_end_time

	    for current, following in zip(segments, segments[1:]):
	        gap = following['start'] - current['end']

	        # ``>=`` so a gap exactly equal to the threshold is handled too,
	        # instead of matching neither branch and staying unextended.
	        if gap >= threshold:
	            current['end'] = current['end'] + extra_end_time
	        else:
	            current['end'] = following['start']

	    return segments


	def format_segments(segments: list, max_line_width_px: int = 380, max_lines_per_segment: int = 2):
	    """
	    Split, wrap and re-time segments for display as subtitles.

	    Steps, in order:
	      1. ``split_segments`` breaks segments so each fits within
	         ``max_line_width_px * max_lines_per_segment``.
	      2. ``split_string_to_max_lines`` wraps every segment's text, and the
	         lines are joined with newlines.
	      3. ``adjust_times`` stretches the end times.

	    A status message is printed while this runs and blanked out afterwards.

	    Args:
	        segments: Segments in the form expected by ``split_segments``.
	        max_line_width_px: Width limit for a single subtitle line.
	        max_lines_per_segment: Maximum number of lines per subtitle.

	    Returns:
	        The list of formatted segments.
	    """
	    status = 'Formatting segments...'
	    print(status, end='', flush=True)

	    segments = split_segments(
	        segments, max_line_width_px * max_lines_per_segment)

	    for segment in segments:
	        wrapped_lines = split_string_to_max_lines(
	            text=segment["text"],
	            max_width=max_line_width_px,
	            max_lines=max_lines_per_segment,
	        )
	        segment["text"] = "\n".join(wrapped_lines)

	    segments = adjust_times(segments)

	    # Overwrite the whole status message with spaces (a single space would
	    # leave most of it visible), then return the cursor to column 0.
	    print('\r' + ' ' * len(status), end='\r', flush=True)

	    return segments