Spaces:
Sleeping
Sleeping
from datetime import datetime, date | |
from pathlib import Path | |
from zoneinfo import ZoneInfo | |
import pandas as pd | |
# Directory that contains this source file; data paths are built relative to it.
HERE = Path(__file__).parent

# Time zone used when converting timestamps to calendar dates.
JST = ZoneInfo("Asia/Tokyo")
def parse_ja_date(dt: datetime) -> date:
    """Return the calendar date of *dt* as observed in the ``JST`` zone.

    The datetime is first converted to ``JST`` (Asia/Tokyo) and the date
    part of the converted value is returned.

    NOTE: ``datetime.astimezone`` interprets a naive datetime as system
    local time, so callers should pass timezone-aware values.
    """
    in_tokyo = dt.astimezone(JST)
    return in_tokyo.date()
def parse_csv(filename: str | Path) -> pd.DataFrame:
    """Load a header-less CSV and add parsed date and title columns.

    The file is read without a header row and its columns are named
    ``id``, ``day_import``, ``date_import``, ``length``, ``audio`` and
    ``title_import``. Three derived columns are then appended:

    * ``datetime`` -- ``date_import`` parsed with the format
      ``" %d %b %Y %H:%M:%S %Z"`` (note the leading space).
    * ``date`` -- the calendar date of ``datetime`` via ``parse_ja_date``.
    * ``title`` -- ``title_import`` cast to the pandas ``string`` dtype.

    Returns the resulting DataFrame (imported and derived columns).
    """
    column_names = ["id", "day_import", "date_import", "length", "audio", "title_import"]

    frame = pd.read_csv(filename, header=None)
    # Assigning the names raises if the file does not have exactly six columns.
    frame.columns = column_names

    frame["datetime"] = pd.to_datetime(
        frame["date_import"],
        format=" %d %b %Y %H:%M:%S %Z",
    )
    frame["date"] = frame["datetime"].apply(parse_ja_date)
    frame["title"] = frame["title_import"].astype("string")
    return frame
def ignore_columns(df: pd.DataFrame, limit: str | None) -> pd.DataFrame: | |
if limit is not None: | |
df_out = df.loc[df.loc[:, "datetime"] > limit, ["id", "date", "length", "audio", "title"]] | |
else: | |
df_out = df.loc[:, ["id", "date", "length", "audio", "title"]] | |
return df_out | |
def main(input: str | Path, output: str | Path, limit: str | None):
    """Convert the CSV at *input* into a Parquet file at *output*.

    The CSV is loaded with ``parse_csv``, reduced with ``ignore_columns``
    (which receives *limit*), and the result is written to *output* as
    Parquet without the DataFrame index.
    """
    ignore_columns(parse_csv(input), limit).to_parquet(output, index=False)
if __name__ == "__main__":
    # Input and output live in sibling directories of the one holding this file.
    project_root = HERE.parent
    main(
        input=project_root / "data" / "episode-list-202501.csv",
        output=project_root / "store" / "podcast-title-list-202301-202501.parquet",
        # Passed through to ignore_columns, which keeps rows strictly after it.
        limit="2023-01-01 00:00:00+9:00",
    )