Spaces:
Running
How to save the transcribed text and timestamp as an SRT file
Your website can transcribe the text and timestamp of videos, which is very impressive. Can you add a function to save as an SRT file?
did you figure it out?
Here's a Python script that does this, for anyone who needs it. Just change the input_file location:
import re
def time_to_seconds(timestamp):
    """Convert a colon-separated timestamp into a number of seconds.

    Accepts ``MM:SS.mmm`` as well as ``HH:MM:SS.mmm`` (and a bare ``SS.mmm``):
    every field is worth 60 times the field to its right.

    Args:
        timestamp: Timestamp text such as ``"21:32.545"`` or
            ``"01:03:52.328"``.

    Returns:
        The timestamp expressed as a float number of seconds.

    Raises:
        ValueError: If any field is not a valid number.
    """
    total = 0.0
    # Fold left to right so the same loop handles two or three fields.
    for field in timestamp.split(':'):
        total = total * 60 + float(field)
    return total
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp (``hh:mm:ss,mmm``).

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        The SRT timestamp string, e.g. ``"00:01:02,500"`` for ``62.5``.
    """
    # Round to whole milliseconds first: truncating the fractional part
    # loses a millisecond to float error (1.001 would become ",000").
    total_ms = round(seconds * 1000)
    whole_seconds, milliseconds = divmod(total_ms, 1000)
    minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
# Input and output file names
input_file = r"C:\Documents\Python scripts\input.txt"
output_file = r"C:\Documents\Python scripts\output.srt"

# Matches a Whisper cue prefix such as "[00:01.000 -> 00:04.500]".
timestamp_pattern = re.compile(r'\[(\d+:\d+\.\d+) -> (\d+:\d+\.\d+)]')

# Use explicit UTF-8 so non-English transcripts convert the same way on every
# platform ("utf-8-sig" also tolerates a leading byte-order mark on input).
with open(input_file, "r", encoding="utf-8-sig") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    # SRT cues are numbered from 1
    subtitle_index = 1

    for line in infile:
        time_match = timestamp_pattern.search(line)
        if not time_match:
            # Skip lines without time information
            continue

        start_time = time_to_seconds(time_match.group(1))
        end_time = time_to_seconds(time_match.group(2))

        # Remove only the matched timestamp. A greedy r'\[.*\]' would also
        # delete subtitle text up to the last "]" on the line.
        text = (line[:time_match.start()] + line[time_match.end():]).strip()

        outfile.write(f"{subtitle_index}\n")
        outfile.write(f"{format_time(start_time)} --> {format_time(end_time)}\n")
        outfile.write(f"{text}\n\n")
        subtitle_index += 1

print(f"Conversion completed. The output has been saved to {output_file}")
const fs = require('fs');
// Convert Whisper-style transcript text into SRT subtitle text.
// Each cue line looks like "[MM:SS.mmm -> MM:SS.mmm] text"; lines that carry
// no timestamp are skipped. Returns the SRT document as a single string.
function convertWhisperToSrt(data) {
    // Whisper prints "->" between the two times while SRT uses "-->";
    // accept either so the input file does not have to be edited by hand.
    const cuePattern = /\[(\d+:\d+\.\d+) --?> (\d+:\d+\.\d+)\](.*)/;
    const cues = [];

    // Split on LF or CRLF so Windows line endings do not leak into the text
    for (const line of data.split(/\r?\n/)) {
        const match = line.match(cuePattern);
        if (!match) {
            continue;
        }
        const startTime = formatTime(match[1]);
        const endTime = formatTime(match[2]);
        const text = match[3].trim();
        // SRT cues are numbered from 1
        cues.push(`${cues.length + 1}\n${startTime} --> ${endTime}\n${text}\n\n`);
    }

    return cues.join('');
}
// Convert a Whisper timestamp ("MM:SS.mmm" or "HH:MM:SS.mmm") into the SRT
// form "HH:MM:SS,mmm". Minutes of 60 or more are carried into the hours field.
function formatTime(timestamp) {
    const [clock, fraction = ''] = timestamp.split('.');

    // Fold the colon-separated fields into total seconds so two- and
    // three-field timestamps share one code path.
    const totalSeconds = clock
        .split(':')
        .reduce((acc, field) => acc * 60 + Number(field), 0);

    const hours = Math.floor(totalSeconds / 3600);
    const minutes = Math.floor((totalSeconds % 3600) / 60);
    const seconds = totalSeconds % 60;

    // SRT needs exactly three fractional digits
    const milliseconds = fraction.slice(0, 3).padEnd(3, '0');
    const twoDigits = (value) => String(value).padStart(2, '0');

    return `${twoDigits(hours)}:${twoDigits(minutes)}:${twoDigits(seconds)},${milliseconds}`;
}
// Left-pad a value with zeros until it is at least `length` characters long.
function pad(number, length = 2) {
    const digits = number.toString();
    return digits.padStart(length, '0');
}
// Read the Whisper transcript from "whisper.txt" and convert it
const inputFile = "whisper.txt";
try {
    const data = fs.readFileSync(inputFile, 'utf8');
    const srtOutput = convertWhisperToSrt(data);
    // Write the converted subtitles to "german2.srt"
    const outputFileName = "german2.srt";
    fs.writeFileSync(outputFileName, srtOutput, 'utf8');
    console.log(`Plik "${outputFileName}" został zapisany w formacie SRT.`);
} catch (err) {
    // The message must be a template literal (backticks); without them this
    // line is a syntax error and the script never runs.
    console.error(`Błąd podczas odczytu/zapisu plików: ${err.message}`);
}
Here is a Node.js script that converts a txt file to SRT. Remember to change "->" to "-->" in the txt file first; after that it will work easily.
I just copy and paste the output into a text file, and then I load that text file into the free Subtitle Edit program - https://www.nikse.dk/subtitleedit - Then I run "Fix common errors" to have it auto-fix any issues and then export it as an SRT.
I adjusted CuddleMaster's code above to match both formats, (Hours:Minutes:Seconds.Milliseconds) (01:03:52.328) or (Minutes:Seconds.Milliseconds) (21:32.545),
and removed the time conversion.
import re
# Input and output file names
input_file = r"./input.txt"
output_file = r"./output.srt"

# Matches a cue prefix in either format:
# (Hours:Minutes:Seconds.Milliseconds), e.g. [01:03:52.328 -> 01:03:55.000]
# (Minutes:Seconds.Milliseconds), e.g. [21:32.545 -> 21:35.000]
timestamp_pattern = re.compile(
    r'\[((\d+:){1,2}\d+\.\d+) -> ((\d+:){1,2}\d+\.\d+)]'
)


def to_srt_timestamp(raw):
    """Turn a Whisper timestamp into SRT form (comma before milliseconds).

    Args:
        raw: ``MM:SS.mmm`` or ``HH:MM:SS.mmm`` text taken from the transcript.

    Returns:
        The timestamp with ``,`` as the decimal separator and a leading
        ``00:`` hour field added when the input had none.
    """
    stamp = raw.replace(".", ",")
    # Count the colons rather than the string length: a length test
    # misclassifies inputs such as "1:03:52,3".
    if stamp.count(":") == 1:
        stamp = "00:" + stamp
    return stamp


# Use explicit UTF-8 so non-English transcripts convert the same way on every
# platform ("utf-8-sig" also tolerates a leading byte-order mark on input).
with open(input_file, "r", encoding="utf-8-sig") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    # SRT cues are numbered from 1
    subtitle_index = 1

    for line in infile:
        time_match = timestamp_pattern.search(line)
        if not time_match:
            # Skip lines without time information
            continue

        start_time = to_srt_timestamp(time_match.group(1))
        end_time = to_srt_timestamp(time_match.group(3))

        # Remove only the matched timestamp. A greedy r'\[.*\]' would also
        # delete subtitle text up to the last "]" on the line.
        text = (line[:time_match.start()] + line[time_match.end():]).strip()

        outfile.write(f"{subtitle_index}\n")
        outfile.write(f"{start_time} --> {end_time}\n")
        outfile.write(f"{text}\n\n")
        subtitle_index += 1

print(f"Conversion completed. The output has been saved to {output_file}")
Here's a Python script that does this, for anyone who needs it. Just change the input_file location:
import re


def time_to_seconds(timestamp):
    """Convert ``MM:SS.mmm`` or ``HH:MM:SS.mmm`` text to float seconds.

    Raises:
        ValueError: If any colon-separated field is not a valid number.
    """
    total = 0.0
    # Every field is worth 60 times the field to its right.
    for field in timestamp.split(':'):
        total = total * 60 + float(field)
    return total


def format_time(seconds):
    """Format a non-negative duration in seconds as SRT ``hh:mm:ss,mmm``."""
    # Round to whole milliseconds first: truncating the fractional part
    # loses a millisecond to float error (1.001 would become ",000").
    total_ms = round(seconds * 1000)
    whole_seconds, milliseconds = divmod(total_ms, 1000)
    minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"


# Input and output file names
input_file = r"C:\Documents\Python scripts\input.txt"
output_file = r"C:\Documents\Python scripts\output.srt"

# Matches a cue prefix such as "[00:01.000 -> 00:04.500]", with or without
# a leading hours field.
timestamp_pattern = re.compile(
    r'\[((?:\d+:){1,2}\d+\.\d+) -> ((?:\d+:){1,2}\d+\.\d+)]'
)


def convert(input_path, output_path):
    """Read a Whisper transcript and write it out as an SRT file.

    Lines without a ``[start -> end]`` prefix are skipped.

    Args:
        input_path: Path of the transcript text file to read.
        output_path: Path of the SRT file to create or overwrite.
    """
    # Use explicit UTF-8 so non-English transcripts convert the same way on
    # every platform ("utf-8-sig" also tolerates a byte-order mark on input).
    with open(input_path, "r", encoding="utf-8-sig") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:
        # SRT cues are numbered from 1
        subtitle_index = 1
        for line in infile:
            time_match = timestamp_pattern.search(line)
            if not time_match:
                continue
            start_time = time_to_seconds(time_match.group(1))
            end_time = time_to_seconds(time_match.group(2))
            # Remove only the matched timestamp so bracketed words in the
            # subtitle text itself are kept.
            text = (line[:time_match.start()] + line[time_match.end():]).strip()
            outfile.write(f"{subtitle_index}\n")
            outfile.write(f"{format_time(start_time)} --> {format_time(end_time)}\n")
            outfile.write(f"{text}\n\n")
            subtitle_index += 1


if __name__ == "__main__":
    convert(input_file, output_file)
    print(f"Conversion completed. The output has been saved to {output_file}")
Good lookin' out!
I adjusted CuddleMaster's code above to match both formats, (Hours:Minutes:Seconds.Milliseconds) (01:03:52.328) or (Minutes:Seconds.Milliseconds) (21:32.545),
and removed the time conversion.
import re

# Input and output file names
input_file = r"./input.txt"
output_file = r"./output.srt"

# Matches a cue prefix in either format:
# (Hours:Minutes:Seconds.Milliseconds), e.g. [01:03:52.328 -> 01:03:55.000]
# (Minutes:Seconds.Milliseconds), e.g. [21:32.545 -> 21:35.000]
timestamp_pattern = re.compile(
    r'\[((\d+:){1,2}\d+\.\d+) -> ((\d+:){1,2}\d+\.\d+)]'
)


def to_srt_timestamp(raw):
    """Turn a Whisper timestamp into SRT form (comma before milliseconds).

    Args:
        raw: ``MM:SS.mmm`` or ``HH:MM:SS.mmm`` text taken from the transcript.

    Returns:
        The timestamp with ``,`` as the decimal separator and a leading
        ``00:`` hour field added when the input had none.
    """
    stamp = raw.replace(".", ",")
    # Count the colons rather than the string length: a length test
    # misclassifies inputs such as "1:03:52,3".
    if stamp.count(":") == 1:
        stamp = "00:" + stamp
    return stamp


def convert(input_path, output_path):
    """Read a Whisper transcript and write it out as an SRT file.

    Lines without a ``[start -> end]`` prefix are skipped.

    Args:
        input_path: Path of the transcript text file to read.
        output_path: Path of the SRT file to create or overwrite.
    """
    # Use explicit UTF-8 so non-English transcripts convert the same way on
    # every platform ("utf-8-sig" also tolerates a byte-order mark on input).
    with open(input_path, "r", encoding="utf-8-sig") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:
        # SRT cues are numbered from 1
        subtitle_index = 1
        for line in infile:
            time_match = timestamp_pattern.search(line)
            if not time_match:
                continue
            start_time = to_srt_timestamp(time_match.group(1))
            end_time = to_srt_timestamp(time_match.group(3))
            # Remove only the matched timestamp so bracketed words in the
            # subtitle text itself are kept.
            text = (line[:time_match.start()] + line[time_match.end():]).strip()
            outfile.write(f"{subtitle_index}\n")
            outfile.write(f"{start_time} --> {end_time}\n")
            outfile.write(f"{text}\n\n")
            subtitle_index += 1


if __name__ == "__main__":
    convert(input_file, output_file)
    print(f"Conversion completed. The output has been saved to {output_file}")
Nice, thanks
Yes, I encountered the same problem while transcribing foreign youtube videos with whisper-jax and decided to develop the following software. The application is very simple to use and has multi-process support. At the moment it's just focusing on solving this problem, maybe in the future when I have some free time I might consider adding more features to make it more global and feature rich. For more information about the software you can check the following github address: https://github.com/xeloxa/WTOSRT
Download the exe and dmg files of the software now:
https://github.com/xeloxa/WTOSRT/releases/tag/whisper