Spaces:
Running
Running
File size: 3,138 Bytes
2f6de57 d788666 2f6de57 d788666 2f6de57 ead35f9 2f6de57 ead35f9 2f6de57 d788666 2f6de57 59d39d4 2f6de57 d788666 2f6de57 31e8063 2f6de57 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
from dataclasses import dataclass
from datetime import time as dt_time
from datetime import timedelta
import pandas as pd
from config import STORE_DIR, DATA_DIR, divider_time, RE_PODCAST_SRT_FILE
@dataclass
class SplitedText:
    """One time-bounded chunk of an episode transcript."""

    # Running number of the chunk inside its episode.
    part: int
    # Chunk boundaries, expressed as integers (make_episode stores whole seconds).
    start: int
    end: int
    # Subtitle text belonging to the chunk.
    text: str
@dataclass
class Episode:
    """A podcast episode together with its chunked transcript."""

    # Numeric identifier of the episode.
    id_: int
    # Episode title; may be None when no title is available.
    title: str | None
    # Transcript chunks, in the order they were produced.
    texts: list[SplitedText]
def str_to_timedelta(s: str) -> timedelta:
    """Convert an ``HH:MM:SS[,fff]`` timestamp into a whole-second timedelta.

    SRT files separate the fractional seconds with a comma
    (``00:01:02,500``).  ``time.fromisoformat`` only understands that form
    on Python 3.11+, so the comma is normalised to a dot before parsing.
    The fractional part is then discarded: only the hour, minute and second
    fields contribute to the result.

    Args:
        s: Timestamp text without surrounding whitespace.

    Returns:
        The offset from 00:00:00, truncated to whole seconds.

    Raises:
        ValueError: If *s* is not a valid time of day (this includes hour
            values above 23, which ``datetime.time`` cannot represent).
    """
    t = dt_time.fromisoformat(s.replace(",", "."))
    return timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)
def _append_part(
    episode: Episode, start: timedelta, end: timedelta, lines: list[str]
) -> None:
    """Store the buffered subtitle lines on *episode* as its next numbered part."""
    episode.texts.append(
        SplitedText(
            part=len(episode.texts) + 1,  # parts are numbered from 1
            start=int(start.total_seconds()),
            end=int(end.total_seconds()),
            text="\n".join(lines),
        )
    )


def make_episode(id_: int, title: str | None, srt_filename: str) -> Episode:
    """Parse an SRT subtitle file into an Episode split into timed parts.

    Consecutive cues are merged into one part until the span from the
    part's first cue start to its latest cue end exceeds ``divider_time``
    (imported from config; it is compared against a timedelta).  Each part
    holds the text of every cue inside its own time range, and whatever is
    still buffered at the end of the file is emitted as a final, possibly
    shorter, part.

    Args:
        id_: Episode identifier stored on the result.
        title: Episode title, or None when it is not known.
        srt_filename: Path of the SRT file; anything ``open`` accepts works.

    Returns:
        The Episode with its ``texts`` filled in file order.

    Raises:
        ValueError: If a timing line does not contain exactly one ``-->``
            or one of its timestamps cannot be parsed.
    """
    episode = Episode(id_=id_, title=title, texts=[])

    start = None  # start of the first cue of the part being built
    end = None  # end of the most recent cue of that part
    lines = []  # subtitle lines gathered for that part

    # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise keep the first cue counter from being recognised as digits.
    with open(srt_filename, encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.isdigit():
                # Blank separator or cue counter: carries no content.
                continue
            if "-->" not in line:
                lines.append(line)
                continue

            # A timing line opens a new cue, so the text of every earlier cue
            # is complete.  Decide now whether those cues already form a part;
            # doing it here keeps a part's text and its time range aligned.
            # Explicit None checks matter: timedelta(0) is falsy but valid.
            if start is not None and lines and abs(end - start) > divider_time:
                _append_part(episode, start, end, lines)
                start = None
                lines = []

            first_str, second_str = line.split("-->")
            cue_start = str_to_timedelta(first_str.strip())
            end = str_to_timedelta(second_str.strip())
            if start is None:
                start = cue_start

    # Do not lose the tail of the transcript that never crossed the threshold.
    if start is not None and lines:
        _append_part(episode, start, end, lines)

    print(len(episode.texts))
    return episode
def make_df(episode: Episode) -> pd.DataFrame:
    """Flatten an episode's parts into a DataFrame, one row per part.

    The columns are ``id`` (the episode id, repeated on every row),
    ``part``, ``start``, ``end_`` and ``text``.
    """
    columns = ["id", "part", "start", "end_", "text"]
    rows = [
        [episode.id_, piece.part, piece.start, piece.end, piece.text]
        for piece in episode.texts
    ]
    return pd.DataFrame(rows, columns=columns)
def get_srt_files():
    """List the podcast subtitle files found directly inside DATA_DIR.

    Every ``*.srt`` file whose name matches RE_PODCAST_SRT_FILE yields one
    dict: ``"id"`` is the pattern's first capture group converted to int and
    ``"srt"`` is the bare file name.  Files that do not match are ignored.
    """
    found = []
    for srt_path in DATA_DIR.glob("*.srt"):
        matched = RE_PODCAST_SRT_FILE.search(srt_path.name)
        if matched is None:
            continue
        found.append({"id": int(matched.group(1)), "srt": srt_path.name})
    return found
def main():
    """Convert every discovered SRT file into a per-episode parquet file.

    Episodes are processed in ascending id order; each one is written to
    ``STORE_DIR / "podcast-<id>.parquet"``.
    """
    lst = get_srt_files()
    lst.sort(key=lambda entry: entry["id"])
    print(f"{len(lst)=}")
    for item in lst:
        print(item["id"])
        srt_path = DATA_DIR / item["srt"]
        # The discovered entries carry no "title" key, so this passes None.
        episode = make_episode(item["id"], item.get("title"), srt_path)
        target = STORE_DIR / f"podcast-{item['id']}.parquet"
        make_df(episode).to_parquet(target)


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|