# Gradio app for loading a Hugging Face dataset, editing its "transcription"
# column in a table, and pushing the edited dataset back to the Hub.

import gradio as gr
import os
from datasets import load_dataset, Dataset
import pandas as pd
from huggingface_hub import login

def load_huggingface_dataset(dataset_link, token):
    """
    Load the "train" split of a Hugging Face dataset for editing.

    Args:
        dataset_link: Dataset identifier passed straight to ``load_dataset``.
        token: Hugging Face access token. An empty value is forwarded as
            ``None`` so the library's default authentication applies.

    Returns:
        A ``(df, dataset)`` tuple. ``df`` is a DataFrame with an ``index``
        column (the row position in the split) and a ``transcription``
        column; ``dataset`` is the loaded split itself.
    """
    # Forward the token so private or gated repositories can be read.
    dataset = load_dataset(dataset_link, split="train", token=token or None)

    # Build the table from the transcription column only, so the other
    # columns of the split are never copied into pandas.
    df = pd.DataFrame(
        {
            "index": range(len(dataset)),
            "transcription": list(dataset["transcription"]),
        }
    )
    return df, dataset
    

def _normalise_transcription(value):
    """Return *value* as text, mapping missing values (None/NaN) to ""."""
    if isinstance(value, str):
        return value
    if value is None or pd.isna(value):
        return ""
    return str(value)


def update_transcriptions(df, dataset, token,dataset_link):
    """
    Update the transcriptions in the dataset and push it back to the Hugging Face Hub.

    Args:
        df: DataFrame whose ``transcription`` column holds the edited text,
            one row per dataset row, in dataset order.
        dataset: The loaded dataset split whose ``transcription`` column is
            replaced.
        token: Hugging Face access token used to log in before pushing.
        dataset_link: Repository id the updated dataset is pushed to.

    Returns:
        A confirmation message for display in the UI.

    Raises:
        ValueError: If ``df`` has fewer rows than ``dataset``, because rows
            are matched by position and some would have no replacement.
    """
    if len(df) < len(dataset):
        raise ValueError(
            f"Edited table has {len(df)} rows but the dataset has "
            f"{len(dataset)}; cannot match transcriptions by position."
        )

    # Materialise the edited column once. Looking the column up inside the
    # map callback would rebuild it for every row.
    transcriptions = [_normalise_transcription(value) for value in df["transcription"]]

    # Replace the original transcription column in the dataset
    dataset = dataset.map(
        lambda example, idx: {"transcription": transcriptions[idx]},
        with_indices=True,
    )

    # Filter out rows with empty or whitespace-only transcriptions
    dataset = dataset.filter(
        lambda example: example["transcription"].strip() != "",
    )

    login(token)

    dataset.push_to_hub(dataset_link)

    return "Dataset updated and changes submitted to the Hugging Face Hub!"

# Gradio Interface
def main():
    """
    Build the Gradio dataset-editor UI and launch it.

    The loaded dataset and its editable DataFrame are kept in closure
    variables shared by the event handlers defined below.
    """
    dataset = None  # Loaded dataset object, set by the "Load Dataset" handler

    original_df = None  # Full (unfiltered) table; search results are views of it

    def load_dataset_and_show_table(dataset_link, token):
        """
        Load the dataset and return the DataFrame to display in Gradio.
        """
        nonlocal dataset, original_df
        original_df, dataset = load_huggingface_dataset(dataset_link, token)
        return original_df

    def search_transcriptions(search_term):
        """
        Search the transcription column and filter the table based on the search term.
        """
        if original_df is None:
            return pd.DataFrame(columns=["index", "transcription"])  # Empty table if no dataset is loaded
        # Match the term literally: with the default regex=True, input such
        # as "(" or "?" is interpreted as a pattern and can raise.
        matches = original_df["transcription"].str.contains(
            search_term or "", case=False, na=False, regex=False
        )
        return original_df[matches]

    def update_original(df):
        """
        Copy edited transcriptions from the displayed table into the full table.
        """
        if original_df is None:
            return "No dataset loaded to update."
        # Merge modified DataFrame into original DataFrame
        for _, row in df.iterrows():
            # Locate the row in the original DataFrame with the same index
            original_df.loc[original_df["index"] == row["index"], "transcription"] = row["transcription"]

        return "update Successful"

    def submit_changes(df, token,dataset_link):
        """
        Submit updated changes to the Hugging Face Hub.
        """
        if dataset is None:
            return "No dataset loaded to update."

        # A table shorter than the dataset is a filtered search view: merge
        # its edits into the full table and push that instead.
        if len(df) < len(dataset):
            update_original(df)
            return update_transcriptions(original_df, dataset, token,dataset_link)

        return update_transcriptions(df, dataset, token,dataset_link)

    # Gradio Interface
    with gr.Blocks(css=".dataframe-row { height: 200px; }") as interface:
        gr.Markdown("## Hugging Face Audio Dataset Editor")

        # Input fields for dataset link and token
        dataset_link = gr.Textbox(label="Hugging Face Dataset Link")
        hf_token = gr.Textbox(label="Hugging Face Token", type="password")

        # Button to load dataset
        load_button = gr.Button("Load Dataset")

        # Search bar
        search_box = gr.Textbox(label="Search Transcriptions", placeholder="Enter a search term...")

        # Table to display and edit dataset
        table = gr.Dataframe(
            headers=["index", "transcription"],
            datatype=["number", "str"],
            interactive=True,
            label="Edit Dataset (Transcriptions are RTL)",
        )

        update_button = gr.Button("Update Table")
        # Button to submit changes
        submit_button = gr.Button("Submit Changes")
        update_message = gr.Textbox(label="update message")
        output_message = gr.Textbox(label="Message")

        # RTL styling for transcription column
        # NOTE(review): this only assigns an attribute on the component; confirm
        # that it has any effect on rendering in the Gradio version in use.
        table.style = {"transcription": {"direction": "rtl"}}

        # Button functionality
        load_button.click(load_dataset_and_show_table, [dataset_link, hf_token], table)
        search_box.change(search_transcriptions, search_box, table)
        update_button.click(update_original, [table], update_message)
        submit_button.click(submit_changes, [table, hf_token,dataset_link], output_message)

    # Launch Gradio Interface
    interface.launch()
    
    

# Start the editor UI only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()