Spaces:

terapyon
/

podcast-search

Running

File size: 3,138 Bytes

from dataclasses import dataclass
from datetime import time as dt_time
from datetime import timedelta
import pandas as pd
from config import STORE_DIR, DATA_DIR, divider_time, RE_PODCAST_SRT_FILE


@dataclass
class SplitedText:
    """One contiguous chunk of subtitle text cut out of an episode."""

    # Sequential chunk number within the episode; make_episode starts at 1.
    part: int
    # Chunk start offset, in whole seconds.
    start: int
    # Chunk end offset, in whole seconds.
    end: int
    # Subtitle text of the chunk; individual lines are joined with "\n".
    text: str


@dataclass
class Episode:
    """A podcast episode together with its time-sliced transcript chunks."""

    # Numeric episode identifier (main() takes it from the SRT file name).
    id_: int
    # Episode title; may be None when no title is known.
    title: str | None
    # Transcript chunks in the order they were appended.
    texts: list[SplitedText]


def str_to_timedelta(s: str) -> timedelta:
    """Convert an ``HH:MM:SS[.fff]`` / ``HH:MM:SS[,fff]`` string to a timedelta.

    SRT timestamps separate the milliseconds with a comma
    (``00:01:02,500``).  ``time.fromisoformat`` only accepts that form
    on Python 3.11+, so the comma is normalised to a period first; this
    makes the function behave the same on older interpreters.

    Args:
        s: Timestamp text without surrounding whitespace.

    Returns:
        The offset truncated to whole seconds (the fractional part is
        deliberately discarded, as before).

    Raises:
        ValueError: If *s* is not a valid ISO-style time of day.
    """
    t = dt_time.fromisoformat(s.replace(",", "."))
    return timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)


def _read_srt_cues(srt_filename) -> list[tuple[timedelta, timedelta, str]]:
    """Parse an SRT file into ``(start, end, text)`` cues in file order.

    Blank lines and digit-only lines (cue counters) are skipped, a line
    containing ``-->`` opens a new cue, and every other line is text of
    the cue currently being read.  Text that appears before the first
    timing line is kept with the first cue.
    """
    cues = []
    cue_start = None
    cue_end = None
    cue_lines = []
    # "utf-8-sig" reads plain UTF-8 and additionally drops a leading BOM,
    # which would otherwise stop the first counter line from being
    # recognised as digits.
    with open(srt_filename, encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.isdigit():
                continue
            if "-->" in line:
                # A new timing line closes the cue read so far.
                if cue_start is not None:
                    cues.append((cue_start, cue_end, "\n".join(cue_lines)))
                    cue_lines = []
                first_str, second_str = line.split("-->")
                cue_start = str_to_timedelta(first_str.strip())
                cue_end = str_to_timedelta(second_str.strip())
            else:
                cue_lines.append(line)
    if cue_start is not None:
        cues.append((cue_start, cue_end, "\n".join(cue_lines)))
    return cues


def _append_chunk(episode, start: timedelta, end: timedelta,
                  texts: list[str]) -> None:
    """Append one chunk to ``episode.texts``, numbering parts from 1."""
    episode.texts.append(
        SplitedText(part=len(episode.texts) + 1,
                    start=int(start.total_seconds()),
                    end=int(end.total_seconds()),
                    text="\n".join(texts))
    )


def make_episode(id_: int, title: str | None, srt_filename: str) -> Episode:
    """Build an Episode by grouping the cues of an SRT file into chunks.

    Consecutive cues are accumulated until the span from the first
    cue's start to the current cue's end exceeds ``divider_time``; the
    accumulated cues then become one ``SplitedText``.  The cue that
    crosses the threshold is included in the chunk it closes, so each
    chunk's text matches its ``start``/``end`` range.  Whatever remains
    after the last split is emitted as a final, shorter chunk instead
    of being dropped.

    Args:
        id_: Episode identifier stored on the result.
        title: Episode title, or None when unknown.
        srt_filename: Path of the SRT file; it is handed to ``open()``.

    Returns:
        The populated Episode.  The number of chunks is also printed.
    """
    episode = Episode(
        id_=id_,
        title=title,
        texts=[]
    )
    chunk_start = None
    chunk_end = None
    chunk_texts = []

    for cue_start, cue_end, cue_text in _read_srt_cues(srt_filename):
        # Compare with None explicitly: a cue starting at 00:00:00 is
        # timedelta(0), which is falsy.
        if chunk_start is None:
            chunk_start = cue_start
        chunk_end = cue_end
        if cue_text:
            chunk_texts.append(cue_text)

        if chunk_texts and abs(cue_end - chunk_start) > divider_time:
            _append_chunk(episode, chunk_start, chunk_end, chunk_texts)
            chunk_start = None
            chunk_end = None
            chunk_texts = []

    # Flush the tail that never reached divider_time.
    if chunk_texts and chunk_start is not None:
        _append_chunk(episode, chunk_start, chunk_end, chunk_texts)

    print(len(episode.texts))
    return episode


def make_df(episode: Episode) -> pd.DataFrame:
    """Flatten an episode's chunks into a DataFrame, one row per chunk.

    The columns are ``id``, ``part``, ``start``, ``end_`` and ``text``;
    ``id`` repeats the episode id on every row.
    """
    rows = [
        [episode.id_, chunk.part, chunk.start, chunk.end, chunk.text]
        for chunk in episode.texts
    ]
    return pd.DataFrame(rows, columns=["id", "part", "start", "end_", "text"])


def get_srt_files():
    """List the ``*.srt`` files in DATA_DIR that look like podcast episodes.

    A file qualifies when RE_PODCAST_SRT_FILE matches its name.  Each
    result is a dict with ``"id"`` (the first regex group as an int) and
    ``"srt"`` (the bare file name).  Entries follow the order in which
    ``glob`` yields them.
    """
    return [
        {"id": int(matched.group(1)), "srt": srt_path.name}
        for srt_path in DATA_DIR.glob("*.srt")
        if (matched := RE_PODCAST_SRT_FILE.search(srt_path.name)) is not None
    ]


def main():
    """Convert every discovered SRT file into a parquet file in STORE_DIR.

    Files are processed in ascending episode-id order.  The number of
    files and each id are printed as progress output, and each episode
    is written to ``podcast-<id>.parquet``.
    """
    srt_entries = get_srt_files()
    srt_entries.sort(key=lambda entry: entry["id"])
    # The label is spelled out so the printed text stays "len(lst)=<n>".
    print(f"len(lst)={len(srt_entries)!r}")

    for entry in srt_entries:
        episode_id = entry["id"]
        print(episode_id)
        # Entries carry no "title" key unless a caller adds one, so this
        # is normally None.
        episode = make_episode(
            episode_id,
            entry.get("title"),
            DATA_DIR / entry["srt"],
        )
        target = STORE_DIR / f"podcast-{episode_id}.parquet"
        make_df(episode).to_parquet(target)


# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()