File size: 5,041 Bytes
a725a50
 
 
 
 
8b96174
a725a50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b96174
 
a725a50
 
 
8b96174
a725a50
8b96174
a725a50
 
 
 
 
 
8b96174
a725a50
8b96174
 
 
 
 
 
 
 
 
 
 
a725a50
 
 
 
 
 
8b96174
a725a50
8b96174
 
 
 
 
 
a725a50
8b96174
 
a725a50
8b96174
a725a50
8b96174
 
 
 
 
 
 
 
a725a50
 
8b96174
 
a725a50
 
 
 
 
8b96174
 
 
 
 
 
 
 
 
a725a50
 
8b96174
 
 
 
 
 
a725a50
8b96174
 
 
 
 
 
 
 
a725a50
 
8b96174
 
 
 
 
 
 
 
 
a725a50
 
8b96174
a725a50
 
 
8b96174
a725a50
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
from datasets import load_dataset
import pandas as pd
import sys
import subprocess
from datetime import datetime
from huggingface_hub import HfApi

def get_newest_file(repo_id, prefix):
    """Return the most recently dated file under ``prefix`` in a dataset repo.

    Lists the files of the HuggingFace dataset repo ``repo_id`` and keeps
    those whose path starts with ``prefix``. The date of a file is taken from
    the last ``_``-separated token of its path (the text before the first
    ``.`` in that token) and parsed as ``YYYYMMDD``. Paths without a
    parseable date are ignored.

    Returns:
        The path of the newest file, or ``None`` when no file matches the
        prefix or none of the matches carries a valid date.
    """
    repo_files = HfApi().list_repo_files(repo_id, repo_type="dataset")

    dated = []
    for path in repo_files:
        if not path.startswith(prefix):
            continue
        try:
            stamp = path.split('_')[-1].split('.')[0]
            parsed = datetime.strptime(stamp, '%Y%m%d')
        except (IndexError, ValueError):
            # Name does not end in a YYYYMMDD token; not a dated file.
            continue
        dated.append((parsed, path))

    if not dated:
        return None

    # max() returns the first entry among equal dates, which matches taking
    # the head of a stable descending sort.
    return max(dated, key=lambda pair: pair[0])[1]

def load_data(repo_id, file_path):
    """Fetch one data file from a HuggingFace dataset repo and preview it.

    The file is loaded as the ``train`` split of ``repo_id`` and converted to
    a DataFrame, of which only the first three rows are returned.

    Returns:
        The three-row preview DataFrame. If anything raises while loading or
        converting, a one-row DataFrame with a single ``Error`` column holding
        the exception text is returned instead.
    """
    try:
        records = load_dataset(
            repo_id,
            data_files={'train': file_path},
            split='train',
        )
        return pd.DataFrame(records).head(3)
    except Exception as exc:
        # Surface the failure in the table rather than crashing the UI.
        return pd.DataFrame({'Error': [str(exc)]})

def praw_new_data():
    """Run ``praw.py`` and return refreshed previews of both tables.

    ``praw.py`` is executed with the current interpreter. Whether it succeeds
    or fails, the newest file under ``submissions/df_`` and the merged file
    are then loaded so the UI reflects the current repo state.

    Returns:
        A ``(status, crawled_df, merged_df)`` tuple. ``status`` reports the
        outcome of running ``praw.py``; when no dated crawl file is found,
        that note is appended to the status (so a script failure is not
        hidden) and ``crawled_df`` is an empty DataFrame.
    """
    try:
        subprocess.run([sys.executable, "praw.py"], check=True)
        success_message = "✅ Successfully crawled new data!"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        success_message = f"❌ Error executing praw.py: {str(e)}"

    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    newest_file = get_newest_file(repo_id, "submissions/df_")

    # The merged preview is returned on every path, so load it once.
    merged_df = load_merged_data()[1]

    if newest_file:
        return success_message, load_data(repo_id, newest_file), merged_df

    # Keep the script outcome visible alongside the "no files" note.
    return f"{success_message} No crawled data files found", pd.DataFrame(), merged_df

def merge_data():
    """Run ``merge.py`` and return refreshed previews of both tables.

    ``merge.py`` is executed with the current interpreter. Whether it
    succeeds or fails, the merged and crawled previews are reloaded.

    Returns:
        A ``(status, crawled_df, merged_df)`` tuple, where ``status`` reports
        the outcome of running ``merge.py``.
    """
    try:
        subprocess.run([sys.executable, "merge.py"], check=True)
        success_message = "✅ Successfully merged data!"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        success_message = f"❌ Error executing merge.py: {str(e)}"

    # Reload both previews; only the DataFrame half of each result is needed.
    merged_df = load_merged_data()[1]
    crawled_df = load_crawled_data()[1]
    return success_message, crawled_df, merged_df

def load_crawled_data():
    """Load a preview of the newest crawled file.

    Looks for the newest dated file whose path starts with
    ``submissions/df_24`` in the submissions dataset repo.

    Returns:
        A ``(caption, dataframe)`` tuple. When no such file exists the
        caption says so and the DataFrame is empty.
    """
    dataset_repo = "Vera-ZWY/reddite2024elections_submissions"
    latest = get_newest_file(dataset_repo, "submissions/df_24")

    if not latest:
        return "No crawled data available", pd.DataFrame()

    caption = f"Latest crawled data ({latest}):"
    return caption, load_data(dataset_repo, latest)

def load_merged_data():
    """Load a preview of the merged dataset file.

    The merged file lives at a fixed path in the submissions dataset repo,
    so there is no lookup step and no "not found" outcome here; a load
    failure is reported inside the DataFrame returned by ``load_data``.

    Returns:
        A ``(caption, dataframe)`` tuple.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    # NOTE(review): this path is under "submission/" while crawled files are
    # looked up under "submissions/" - confirm the difference is intentional.
    newest_merged = "submission/merged_reddit_data.csv"
    return f"Latest merged data ({newest_merged}):", load_data(repo_id, newest_merged)

# Column headers for the two preview tables
crawled_columns = ["title", "score", "id", "url", "comms_num", "created", "body", "subreddit"]
merged_columns = ["title", "score", "id", "url", "num_comments", "created", "body", "content", "subreddit"]

# Build the Gradio UI; components render in the order they are created.
with gr.Blocks(title="Reddit Data Processing") as iface:
    gr.Markdown("# Reddit Data Processing Interface")

    # Read-only box that reports the outcome of the last button press
    status_text = gr.Textbox(label="Status", interactive=False)

    # Action buttons, side by side
    with gr.Row():
        with gr.Column():
            praw_button = gr.Button("Crawl New Data", variant="primary")
        with gr.Column():
            merge_button = gr.Button("Merge Data", variant="primary")

    # Preview of the newest crawled file, loaded once while the UI is built
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Latest Crawled Data (Top 3 Rows)")
            crawled_table = gr.Dataframe(
                headers=crawled_columns,
                value=load_crawled_data()[1],
                wrap=True,
            )

    # Preview of the merged file, loaded once while the UI is built
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Latest Merged Data (Top 3 Rows)")
            merged_table = gr.Dataframe(
                headers=merged_columns,
                value=load_merged_data()[1],
                wrap=True,
            )

    # Both buttons refresh the same three components: status, crawled, merged.
    for button, handler in ((praw_button, praw_new_data), (merge_button, merge_data)):
        button.click(
            fn=handler,
            outputs=[status_text, crawled_table, merged_table],
        )

    gr.Markdown("""
    ## The full dataset storage at https://huggingface.co/datasets/Vera-ZWY/reddite2024elections_submissions/
    ### Instructions:
    1. Click 'Crawl New Data' to fetch new Reddit data
    2. Click 'Merge Data' to merge the latest datasets
    3. Tables will automatically update to show the latest data
    """)

# Start the app only when run as a script, not when imported
if __name__ == "__main__":
    iface.launch()