|
import gradio as gr |
|
from datasets import load_dataset |
|
import pandas as pd |
|
import sys |
|
import subprocess |
|
from datetime import datetime |
|
from huggingface_hub import HfApi |
|
|
|
def get_newest_file(repo_id, prefix):
    """Return the most recently dated file under ``prefix`` in a HuggingFace dataset repo.

    The date is taken from the last underscore-separated token of each file
    name (with everything from the first dot onward removed) and must be in
    ``YYYYMMDD`` form. Names without such a token are ignored.

    Args:
        repo_id: Identifier of the dataset repository whose files are listed.
        prefix: Path prefix a file name must start with to be considered.

    Returns:
        The name of the matching file with the latest date, or ``None`` when
        no file matches the prefix or no match carries a parsable date.
    """
    repo_files = HfApi().list_repo_files(repo_id, repo_type="dataset")

    dated_candidates = []
    for name in repo_files:
        if not name.startswith(prefix):
            continue
        try:
            # Last "_" token without its extension is expected to be the date stamp.
            stamp = name.split('_')[-1].split('.')[0]
            parsed = datetime.strptime(stamp, '%Y%m%d')
        except (IndexError, ValueError):
            # Not a dated file name; skip it.
            continue
        dated_candidates.append((parsed, name))

    if not dated_candidates:
        return None

    # max() keeps the first entry among equal dates, like a stable descending sort.
    _, newest_name = max(dated_candidates, key=lambda pair: pair[0])
    return newest_name
|
|
|
def load_data(repo_id, file_path):
    """Load one file of a HuggingFace dataset repo and return a three-row preview.

    Args:
        repo_id: Identifier of the dataset repository.
        file_path: Path of the data file inside the repository; it is loaded
            as the ``train`` split.

    Returns:
        A DataFrame holding the first three rows of the file. If anything
        goes wrong while loading, a one-row DataFrame with a single ``Error``
        column containing the exception text is returned instead.
    """
    try:
        records = load_dataset(repo_id, data_files={'train': file_path}, split='train')
        preview = pd.DataFrame(records).head(3)
    except Exception as exc:
        # Surface the failure in the table itself rather than raising.
        return pd.DataFrame({'Error': [str(exc)]})
    return preview
|
|
|
def praw_new_data():
    """Run ``praw.py`` and return a status message plus fresh table previews.

    Returns:
        A 3-tuple ``(status, crawled_df, merged_df)``:

        * ``status`` is the success or error message from running the script,
          or ``"No crawled data files found"`` when no dated file exists
          under ``submissions/df_``.
        * ``crawled_df`` is the preview of the newest crawled file, or an
          empty DataFrame when none was found.
        * ``merged_df`` is the preview returned by ``load_merged_data()``.
    """
    try:
        # Run the crawler with the interpreter that is running this app.
        subprocess.run([sys.executable, "praw.py"], check=True)
        # \u2705 is the "white heavy check mark" emoji; written as an escape
        # so the literal cannot be corrupted by a wrong source encoding.
        success_message = "\u2705 Successfully crawled new data!"
    except Exception as e:
        # \u274c is the "cross mark" emoji.
        success_message = f"\u274c Error executing praw.py: {e}"

    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    newest_file = get_newest_file(repo_id, "submissions/df_")

    if newest_file:
        df = load_data(repo_id, newest_file)
        return success_message, df, load_merged_data()[1]

    return "No crawled data files found", pd.DataFrame(), load_merged_data()[1]
|
|
|
def merge_data():
    """Run ``merge.py`` and return a status message plus fresh table previews.

    Returns:
        A 3-tuple ``(status, crawled_df, merged_df)`` where ``status`` is the
        success or error message from running the script, ``crawled_df`` is
        the preview returned by ``load_crawled_data()`` and ``merged_df`` is
        the preview returned by ``load_merged_data()``.
    """
    try:
        # Run the merge script with the interpreter that is running this app.
        subprocess.run([sys.executable, "merge.py"], check=True)
        # \u2705 is the "white heavy check mark" emoji; written as an escape
        # so the literal cannot be corrupted by a wrong source encoding.
        success_message = "\u2705 Successfully merged data!"
    except Exception as e:
        # \u274c is the "cross mark" emoji.
        success_message = f"\u274c Error executing merge.py: {e}"

    # Reload both previews regardless of whether the script succeeded.
    merged_df = load_merged_data()[1]
    crawled_df = load_crawled_data()[1]
    return success_message, crawled_df, merged_df
|
|
|
def load_crawled_data():
    """Fetch a preview of the newest crawled file.

    Returns:
        A ``(label, dataframe)`` pair. When a dated file exists under
        ``submissions/df_24`` the label names that file and the DataFrame is
        its preview from ``load_data``; otherwise the label is
        ``"No crawled data available"`` and the DataFrame is empty.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    latest = get_newest_file(repo_id, "submissions/df_24")

    # Guard clause: nothing to preview.
    if not latest:
        return "No crawled data available", pd.DataFrame()

    label = f"Latest crawled data ({latest}):"
    return label, load_data(repo_id, latest)
|
|
|
def load_merged_data():
    """Fetch a preview of the merged dataset file.

    Returns:
        A ``(label, dataframe)`` pair: a label naming the merged file and the
        preview produced by ``load_data`` for it. If loading fails,
        ``load_data`` reports the error inside the returned DataFrame.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    # The merged file lives at a fixed path, so there is no "not found" case
    # to branch on here (the previous truthiness check on this constant could
    # never fail).
    # NOTE(review): the crawled-data loaders in this file read from
    # "submissions/" (plural); confirm the singular "submission/" directory
    # is intended.
    merged_path = "submission/merged_reddit_data.csv"
    return f"Latest merged data ({merged_path}):", load_data(repo_id, merged_path)
|
|
|
|
|
# Build the Gradio interface: a status box, two action buttons, and two
# preview tables that the button callbacks refresh.
with gr.Blocks(title="Reddit Data Processing") as iface:
    gr.Markdown("# Reddit Data Processing Interface")

    # Read-only box that receives the status message of either callback.
    status_text = gr.Textbox(label="Status", interactive=False)

    # One row holding the two action buttons, each in its own column.
    with gr.Row():
        with gr.Column():
            praw_button = gr.Button("Crawl New Data", variant="primary")
        with gr.Column():
            merge_button = gr.Button("Merge Data", variant="primary")

    # Preview of the newest crawled file, loaded once while the UI is built.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Latest Crawled Data (Top 3 Rows)")
            crawled_table = gr.Dataframe(
                headers=[
                    "title", "score", "id", "url",
                    "comms_num", "created", "body", "subreddit",
                ],
                value=load_crawled_data()[1],
                wrap=True,
            )

    # Preview of the merged file, loaded once while the UI is built.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Latest Merged Data (Top 3 Rows)")
            merged_table = gr.Dataframe(
                headers=[
                    "title", "score", "id", "url", "num_comments",
                    "created", "body", "content", "subreddit",
                ],
                value=load_merged_data()[1],
                wrap=True,
            )

    # Both callbacks return (status, crawled preview, merged preview), so
    # they are wired to the same three output components.
    for _button, _callback in (
        (praw_button, praw_new_data),
        (merge_button, merge_data),
    ):
        _button.click(
            fn=_callback,
            outputs=[status_text, crawled_table, merged_table],
        )

    gr.Markdown("""
    ## The full dataset storage at https://huggingface.co/datasets/Vera-ZWY/reddite2024elections_submissions/
    ### Instructions:
    1. Click 'Crawl New Data' to fetch new Reddit data
    2. Click 'Merge Data' to merge the latest datasets
    3. Tables will automatically update to show the latest data
    """)


# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()