podcast-search / src /titles.py
terapyon's picture
length, audio を追加した
b1bf02b
from datetime import datetime, date
from pathlib import Path
from zoneinfo import ZoneInfo
import pandas as pd
# Directory that contains this module; the script entry point resolves its
# data and output paths relative to the parent of this directory.
HERE = Path(__file__).parent
# Time zone used when reducing episode timestamps to calendar dates.
JST = ZoneInfo("Asia/Tokyo")
def parse_ja_date(dt: datetime) -> date:
    """Return the calendar date of *dt* as observed in Japan (Asia/Tokyo).

    The timestamp is first converted to the ``JST`` zone, so an instant late
    in the evening UTC maps to the following day's date.

    Args:
        dt: The timestamp to convert. NOTE(review): callers in this file pass
            timezone-aware values; behaviour for naive values depends on the
            ``astimezone`` implementation of the object passed in.

    Returns:
        The date component of the timestamp after conversion to ``JST``.
    """
    in_japan = dt.astimezone(JST)
    return in_japan.date()
def parse_csv(filename: str | Path) -> pd.DataFrame:
    """Load the headerless episode CSV and add parsed helper columns.

    The file is read without a header row and its six columns are named
    ``id``, ``day_import``, ``date_import``, ``length``, ``audio`` and
    ``title_import``. Three derived columns are then appended:

    * ``datetime`` - ``date_import`` parsed as a timestamp,
    * ``date`` - the Japan-local calendar date of ``datetime``,
    * ``title`` - ``title_import`` cast to the pandas ``string`` dtype.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The loaded frame with the original and the derived columns.
    """
    raw_columns = ["id", "day_import", "date_import", "length", "audio", "title_import"]
    df = pd.read_csv(filename, header=None)
    df.columns = raw_columns

    # The format starts with a space, so each raw value must too.
    # NOTE(review): presumably the weekday was split off at a comma - confirm
    # against the input data.
    timestamps = pd.to_datetime(df["date_import"], format=" %d %b %Y %H:%M:%S %Z")
    df["datetime"] = timestamps
    df["date"] = df["datetime"].apply(parse_ja_date)
    df["title"] = df["title_import"].astype("string")
    return df
def ignore_columns(df: pd.DataFrame, limit: str | None) -> pd.DataFrame:
if limit is not None:
df_out = df.loc[df.loc[:, "datetime"] > limit, ["id", "date", "length", "audio", "title"]]
else:
df_out = df.loc[:, ["id", "date", "length", "audio", "title"]]
return df_out
def main(input: str | Path, output: str | Path, limit: str | None):
    """Convert the episode CSV into a parquet file of episode titles.

    Args:
        input: Path of the source CSV, read with ``parse_csv``.
        output: Destination path of the parquet file.
        limit: Cutoff forwarded to ``ignore_columns``; ``None`` keeps all rows.
    """
    episodes = parse_csv(input)
    selected = ignore_columns(episodes, limit)
    # The index carries no information here, so it is not written out.
    selected.to_parquet(output, index=False)
if __name__ == "__main__":
    # Script entry point: convert the bundled episode list into the parquet
    # title list, keeping only episodes later than the cutoff below.
    # The names are chosen so the built-in ``input`` is not rebound at
    # module scope.
    project_root = HERE.parent
    csv_path = project_root / "data" / "episode-list-202501.csv"
    parquet_path = project_root / "store" / "podcast-title-list-202301-202501.parquet"
    cutoff = "2023-01-01 00:00:00+9:00"
    main(csv_path, parquet_path, cutoff)